On  9.02.2018 09:44, Qu Wenruo wrote:
> Kernel uses a delayed chunk allocation behavior for metadata chunks
> 
> KERNEL:
> btrfs_alloc_chunk()
> |- __btrfs_alloc_chunk():     Only allocate chunk mapping
>    |- btrfs_make_block_group():       Add corresponding bg to fs_info->new_bgs
> 
> Then at transaction commit time, it finishes the remaining work:
> btrfs_start_dirty_block_groups():
> |- btrfs_create_pending_block_groups()
>    |- btrfs_insert_item():    To insert block group item
>    |- btrfs_finish_chunk_alloc(): Insert chunk items/dev extents
> 
> Although btrfs-progs btrfs_alloc_chunk() does all the work in one
> function, it can still benefit from function refactor like:
> btrfs-progs:
> btrfs_alloc_chunk():  Wrapper for both normal and convert chunks
> |- __btrfs_alloc_chunk():     Only alloc chunk mapping
> |  |- btrfs_make_block_group(): <<Insert bg items into extent tree>>
> |- btrfs_finish_chunk_alloc():  Insert chunk items/dev extents
> 
> With such refactor, the following functions can share most of its code
> with kernel now:
> __btrfs_alloc_chunk()
> btrfs_finish_chunk_alloc()
> btrfs_alloc_dev_extent()
> 
> Signed-off-by: Qu Wenruo <w...@suse.com>
> ---
>  volumes.c | 421 
> ++++++++++++++++++++++++++++++++++++++------------------------
>  1 file changed, 260 insertions(+), 161 deletions(-)
> 
> diff --git a/volumes.c b/volumes.c
> index cff54c612872..e89520326314 100644
> --- a/volumes.c
> +++ b/volumes.c
> @@ -523,55 +523,40 @@ static int find_free_dev_extent(struct btrfs_device 
> *device, u64 num_bytes,
>       return find_free_dev_extent_start(device, num_bytes, 0, start, len);
>  }
>  
> -static int btrfs_insert_dev_extents(struct btrfs_trans_handle *trans,
> -                                 struct btrfs_fs_info *fs_info,
> -                                 struct map_lookup *map, u64 stripe_size)
> +static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
> +                               struct btrfs_device *device,
> +                               u64 chunk_offset, u64 physical,
> +                               u64 stripe_size)
>  {
> -     int ret = 0;
> -     struct btrfs_path path;
> +     int ret;
> +     struct btrfs_path *path;
> +     struct btrfs_fs_info *fs_info = device->fs_info;
>       struct btrfs_root *root = fs_info->dev_root;
>       struct btrfs_dev_extent *extent;
>       struct extent_buffer *leaf;
>       struct btrfs_key key;
> -     int i;
>  
> -     btrfs_init_path(&path);
> -
> -     for (i = 0; i < map->num_stripes; i++) {
> -             struct btrfs_device *device = map->stripes[i].dev;
> -
> -             key.objectid = device->devid;
> -             key.offset = map->stripes[i].physical;
> -             key.type = BTRFS_DEV_EXTENT_KEY;
> +     path = btrfs_alloc_path();
> +     if (!path)
> +             return -ENOMEM;
>  
> -             ret = btrfs_insert_empty_item(trans, root, &path, &key,
> -                                           sizeof(*extent));
> -             if (ret < 0)
> -                     goto out;
> -             leaf = path.nodes[0];
> -             extent = btrfs_item_ptr(leaf, path.slots[0],
> -                                     struct btrfs_dev_extent);
> -             btrfs_set_dev_extent_chunk_tree(leaf, extent,
> +     key.objectid = device->devid;
> +     key.offset = physical;
> +     key.type = BTRFS_DEV_EXTENT_KEY;
> +     ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
> +     if (ret)
> +             goto out;
> +     leaf = path->nodes[0];
> +     extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
> +     btrfs_set_dev_extent_chunk_tree(leaf, extent,
>                                       BTRFS_CHUNK_TREE_OBJECTID);
> -             btrfs_set_dev_extent_chunk_objectid(leaf, extent,
> -                                     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
> -             btrfs_set_dev_extent_chunk_offset(leaf, extent, map->ce.start);
> -
> -             write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
> -                     (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
> -                     BTRFS_UUID_SIZE);
> -
> -             btrfs_set_dev_extent_length(leaf, extent, stripe_size);
> -             btrfs_mark_buffer_dirty(leaf);
> -             btrfs_release_path(&path);
> -
> -             device->bytes_used += stripe_size;
> -             ret = btrfs_update_device(trans, device);
> -             if (ret < 0)
> -                     goto out;
> -     }
> +     btrfs_set_dev_extent_chunk_objectid(leaf, extent,
> +                                         BTRFS_FIRST_CHUNK_TREE_OBJECTID);
> +     btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
> +     btrfs_set_dev_extent_length(leaf, extent, stripe_size);
> +     btrfs_mark_buffer_dirty(leaf);
>  out:
> -     btrfs_release_path(&path);
> +     btrfs_free_path(path);
>       return ret;
>  }
>  
> @@ -813,28 +798,28 @@ static int btrfs_cmp_device_info(const void *a, const 
> void *b)
>                               / sizeof(struct btrfs_stripe) + 1)
>  
>  /*
> - * Alloc a chunk, will insert dev extents, chunk item, and insert new
> - * block group and update space info (so that extent allocator can use
> - * newly allocated chunk).
> + * Alloc a chunk mapping.
> + * Will do chunk size calculation and free dev extent search, and insert
> + * chunk mapping into chunk mapping tree.
> + *
> + * NOTE: This function doesn't handle any chunk item/dev extent insert.
> + * chunk item/dev extent insert is handled by later 
> btrfs_finish_chunk_alloc().
> + * And for convert chunk (1:1 mapped, more flex chunk location), it's
> + * handled by __btrfs_alloc_convert_chunk().
> + *
> + * Qu: Block group item is still inserted in this function by
> + * btrfs_make_block_group(), which still differs from the kernel.
>   *

If it still differs, does that mean this function should undergo further
refactoring to unify it with the kernel, or does it mean it will never be
the same as the kernel's code? I thought the idea of unifying the
userspace/kernel code was to have only 1 set of functions which work both
in kernel and userspace?

>   * @start:   return value of allocated chunk start bytenr.
>   * @num_bytes:       return value of allocated chunk size
>   * @type:    chunk type (including both profile and type)
> - * @convert:   if the chunk is allocated for convert case.
> - *             If @convert is true, chunk allocator will skip device extent
> - *             search, but use *start and *num_bytes as chunk start/num_bytes
> - *             and devive offset, to build a 1:1 chunk mapping for convert.
>   */
> -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> -                   struct btrfs_fs_info *info, u64 *start,
> -                   u64 *num_bytes, u64 type, bool convert)
> +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> +                            struct btrfs_fs_info *info, u64 *start,
> +                            u64 *num_bytes, u64 type)
>  {
> -     struct btrfs_root *extent_root = info->extent_root;
> -     struct btrfs_root *chunk_root = info->chunk_root;
>       struct btrfs_device *device = NULL;
> -     struct btrfs_chunk *chunk;
>       struct list_head *dev_list = &info->fs_devices->devices;
> -     struct btrfs_stripe *stripe;
>       struct map_lookup *map;
>       struct btrfs_device_info *devices_info = NULL;
>       u64 percent_max;
> @@ -845,8 +830,6 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>       int index;
>       int ndevs = 0;
>       int rw_devs = 0;
> -     int stripe_len = BTRFS_STRIPE_LEN;
> -     struct btrfs_key key;
>       u64 offset;
>       int data_stripes;       /* number of stripes that counts for bg size */
>       int sub_stripes;        /* sub_stripes info for map */
> @@ -874,34 +857,6 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>       devs_increment = btrfs_raid_array[index].devs_increment;
>       ncopies = btrfs_raid_array[index].ncopies;
>  
> -     if (convert) {
> -             /* For convert, profile must be SINGLE */
> -             if (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
> -                     error("convert only suports SINGLE profile");
> -                     return -EINVAL;
> -             }
> -             if (!IS_ALIGNED(*start, info->sectorsize)) {
> -                     error("chunk start not aligned, start=%llu 
> sectorsize=%u",
> -                             *start, info->sectorsize);
> -                     return -EINVAL;
> -             }
> -             if (!IS_ALIGNED(*num_bytes, info->sectorsize)) {
> -                     error("chunk size not aligned, size=%llu sectorsize=%u",
> -                             *num_bytes, info->sectorsize);
> -                     return -EINVAL;
> -             }
> -             num_stripes = 1;
> -             data_stripes = 1;
> -             offset = *start;
> -             stripe_size = *num_bytes;
> -             /*
> -              * For convert, we use 1:1 chunk mapping specified by @start and
> -              * @num_bytes, so there is no need to go through dev_extent
> -              * searching.
> -              */
> -             goto alloc_chunk;
> -     }
> -
>       /*
>        * Chunk size calculation part.
>        */
> @@ -1047,55 +1002,23 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle 
> *trans,
>       /*
>        * Fill chunk mapping and chunk stripes
>        */
> -alloc_chunk:
> -     if (!convert) {
> -             ret = find_next_chunk(info, &offset);
> -             if (ret)
> -                     goto out_devices_info;
> -     }
> -     key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
> -     key.type = BTRFS_CHUNK_ITEM_KEY;
> -     key.offset = offset;
> -     *num_bytes =  stripe_size * data_stripes;
> -
> -     chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
> -     if (!chunk)
> +     ret = find_next_chunk(info, &offset);
> +     if (ret)
>               goto out_devices_info;
> +     *num_bytes =  stripe_size * data_stripes;
>  
>       map = kmalloc(btrfs_map_lookup_size(num_stripes), GFP_NOFS);
>       if (!map)
>               goto out_chunk_map;
>       map->num_stripes = num_stripes;
>  
> -     if (convert) {
> -             device = list_entry(dev_list->next, struct btrfs_device,
> -                                 dev_list);
> +     for (i = 0; i < ndevs; i++) {
> +             for (j = 0; j < dev_stripes; ++j) {
> +                     int s = i * dev_stripes + j;
>  
> -             map->stripes[0].dev = device;
> -             map->stripes[0].physical = *start;
> -             stripe = &chunk->stripe;
> -             btrfs_set_stack_stripe_devid(stripe, device->devid);
> -             btrfs_set_stack_stripe_offset(stripe, *start);
> -             memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
> -     } else {
> -             for (i = 0; i < ndevs; i++) {
> -                     for (j = 0; j < dev_stripes; ++j) {
> -                             int s = i * dev_stripes + j;
> -
> -                             device = devices_info[i].dev;
> -                             map->stripes[s].dev = device;
> -                             map->stripes[s].physical =
> -                                     devices_info[i].dev_offset +
> -                                     j * stripe_size;
> -                             stripe = &chunk->stripe + s;
> -                             btrfs_set_stack_stripe_devid(stripe,
> -                                             device->devid);
> -                             btrfs_set_stack_stripe_offset(stripe,
> -                                             devices_info[i].dev_offset +
> -                                             j * stripe_size);
> -                             memcpy(stripe->dev_uuid, device->uuid,
> -                                             BTRFS_UUID_SIZE);
> -                     }
> +                     map->stripes[s].dev = devices_info[i].dev;
> +                     map->stripes[s].physical = devices_info[i].dev_offset +
> +                                                j * stripe_size;
>               }
>       }
>       map->stripe_len = BTRFS_STRIPE_LEN;
> @@ -1104,60 +1027,236 @@ alloc_chunk:
>       map->type = type;
>       map->sub_stripes = sub_stripes;
>       map->sector_size = info->sectorsize;
> -     map->ce.start = key.offset;
> +     map->ce.start = offset;
>       map->ce.size = *num_bytes;
>  
> +     kfree(devices_info);
>  
> +     /*
> +      * Insert chunk mapping and block group
> +      */
>       ret = insert_cache_extent(&info->mapping_tree.cache_tree, &map->ce);
>       if (ret < 0)
>               goto out_chunk_map;
>  
> -     /* key was set above */
> -     btrfs_set_stack_chunk_length(chunk, *num_bytes);
> -     btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
> -     btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
> -     btrfs_set_stack_chunk_type(chunk, type);
> -     btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
> -     btrfs_set_stack_chunk_io_align(chunk, stripe_len);
> -     btrfs_set_stack_chunk_io_width(chunk, stripe_len);
> -     btrfs_set_stack_chunk_sector_size(chunk, info->sectorsize);
> -     btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
> +     ret = btrfs_make_block_group(trans, info, 0, type, offset,
> +                                  *num_bytes);
> +     *start = offset;
> +     return ret;
> +
> +out_chunk_map:
> +     kfree(map);
> +out_devices_info:
> +     kfree(devices_info);
> +     return ret;
> +}
> +
> +/*
> + * Alloc a chunk mapping for convert.
> + * Convert needs special SINGLE chunk whose logical bytenr is the same as its
> + * physical bytenr.
> + * Chunk size and start bytenr are all specified by @start and @num_bytes
> + *
> + * And just like __btrfs_alloc_chunk() this only handles chunk mapping and
> + * block group item.
> + */
> +static int __btrfs_alloc_convert_chunk(struct btrfs_trans_handle *trans,
> +                                    struct btrfs_fs_info *fs_info, u64 start,
> +                                    u64 num_bytes, u64 type)
> +{
> +     struct list_head *dev_list = &fs_info->fs_devices->devices;
> +     struct map_lookup *map;
> +     struct btrfs_device *device;
> +     int ret;
> +
> +     /* For convert, profile must be SINGLE */
> +     if (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
> +             error("convert only supports SINGLE profile");
> +             return -EINVAL;
> +     }
> +     if (!IS_ALIGNED(start, fs_info->sectorsize)) {
> +             error("chunk start not aligned, start=%llu sectorsize=%u",
> +                   start, fs_info->sectorsize);
> +             return -EINVAL;
> +     }
> +     if (!IS_ALIGNED(num_bytes, fs_info->sectorsize)) {
> +             error("chunk size not aligned, size=%llu sectorsize=%u",
> +                   num_bytes, fs_info->sectorsize);
> +             return -EINVAL;
> +     }
> +     if (list_empty(dev_list)) {
> +             error("no writable device");
> +             return -ENODEV;
> +     }
> +
> +     device = list_entry(dev_list->next, struct btrfs_device, dev_list);
> +     map = malloc(btrfs_map_lookup_size(1));
> +     if (!map)
> +             return -ENOMEM;
> +     map->num_stripes = 1;
> +     map->stripes[0].dev = device;
> +     map->stripes[0].physical = start;
> +     map->stripe_len = BTRFS_STRIPE_LEN;
> +     map->io_align = BTRFS_STRIPE_LEN;
> +     map->io_width = BTRFS_STRIPE_LEN;
> +     map->type = type;
> +     map->sub_stripes = 1;
> +     map->sector_size = fs_info->sectorsize;
> +     map->ce.start = start;
> +     map->ce.size = num_bytes;
> +
> +     ret = insert_cache_extent(&fs_info->mapping_tree.cache_tree, &map->ce);
> +     if (ret < 0)
> +             goto error;
> +     ret = btrfs_make_block_group(trans, fs_info, 0, type, start, num_bytes);
> +     return ret;
> +error:
> +     free(map);
> +     return ret;
> +}
> +
> +/*
> + * Finish the chunk allocation by inserting needed chunk item and device
> + * extents, and update device used bytes
> + */
> +static int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
> +                                 struct btrfs_fs_info *fs_info,
> +                                 u64 chunk_start, u64 chunk_size)
> +{
> +     struct btrfs_root *extent_root = fs_info->extent_root;
> +     struct btrfs_root *chunk_root = fs_info->chunk_root;
> +     struct btrfs_key key;
> +     struct btrfs_device *device;
> +     struct btrfs_chunk *chunk;
> +     struct btrfs_stripe *stripe;
> +     struct cache_extent *ce;
> +     struct map_lookup *map;
> +     size_t item_size;
> +     u64 dev_offset;
> +     u64 stripe_size;
> +     int i = 0;
> +     int ret = 0;
> +
> +     ce = search_cache_extent(&fs_info->mapping_tree.cache_tree, 
> chunk_start);
> +     if (!ce)
> +             return -ENOENT;
> +
> +     map = container_of(ce, struct map_lookup, ce);
> +     item_size = btrfs_chunk_item_size(map->num_stripes);
> +     stripe_size = calc_stripe_length(map->type, map->ce.size,
> +                                      map->num_stripes);
> +
> +     chunk = kzalloc(item_size, GFP_NOFS);
> +     if (!chunk) {
> +             ret = -ENOMEM;
> +             goto out;
> +     }
>  
>       /*
> -      * Insert chunk item and chunk mapping.
> +      * Take the device list mutex to prevent races with the final phase of
> +      * a device replace operation that replaces the device object associated
> +      * with the map's stripes, because the device object's id can change
> +      * at any time during that final phase of the device replace operation
> +      * (dev-replace.c:btrfs_dev_replace_finishing()).
>        */
> -     ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
> -                             btrfs_chunk_item_size(num_stripes));
> -     BUG_ON(ret);
> -     *start = key.offset;
> +     /* mutex_lock(&fs_info->fs_devices->device_list_mutex); */
> +     for (i = 0; i < map->num_stripes; i++) {
> +             device = map->stripes[i].dev;
> +             dev_offset = map->stripes[i].physical;
>  
> -     if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
> -             ret = btrfs_add_system_chunk(info, &key,
> -                                 chunk, btrfs_chunk_item_size(num_stripes));
> -             if (ret < 0)
> -                     goto out_chunk;
> +             device->bytes_used += stripe_size;
> +             ret = btrfs_update_device(trans, device);
> +             if (ret)
> +                     break;
> +             ret = btrfs_alloc_dev_extent(trans, device, chunk_start,
> +                                          dev_offset, stripe_size);
> +             if (ret)
> +                     break;
> +     }
> +     if (ret) {
> +             /* mutex_unlock(&fs_info->fs_devices->device_list_mutex); */
> +             goto out;
>       }
>  
> +     stripe = &chunk->stripe;
> +     for (i = 0; i < map->num_stripes; i++) {
> +             device = map->stripes[i].dev;
> +             dev_offset = map->stripes[i].physical;
> +
> +             btrfs_set_stack_stripe_devid(stripe, device->devid);
> +             btrfs_set_stack_stripe_offset(stripe, dev_offset);
> +             memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
> +             stripe++;
> +     }
> +     /* mutex_unlock(&fs_info->fs_devices->device_list_mutex); */
> +
> +     btrfs_set_stack_chunk_length(chunk, chunk_size);
> +     btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
> +     btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
> +     btrfs_set_stack_chunk_type(chunk, map->type);
> +     btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
> +     btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
> +     btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
> +     btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
> +     btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
> +
> +     key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
> +     key.type = BTRFS_CHUNK_ITEM_KEY;
> +     key.offset = chunk_start;
> +
> +     ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
> +     if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
> +             /*
> +              * TODO: Cleanup of inserted chunk root in case of
> +              * failure.
> +              */
> +             ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
> +     }
> +
> +out:
>       kfree(chunk);
> +     return ret;
> +}
> +
> +/*
> + * Alloc a chunk.
> + * Will do all the needed work including searching free device extent, insert
> + * chunk mapping, chunk item, block group item and device extents.
> + *
> + * @start:   return value of allocated chunk start bytenr.
> + * @num_bytes:       return value of allocated chunk size
> + * @type:    chunk type (including both profile and type)
> + * @convert: if the chunk is allocated for convert case.
> + *           If @convert is true, chunk allocator will skip device extent
> + *           search, but use *start and *num_bytes as chunk start/num_bytes
> + *           and device offset, to build a 1:1 chunk mapping for convert.
> + */
> +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> +                   struct btrfs_fs_info *fs_info, u64 *start, u64 *num_bytes,
> +                   u64 type, bool convert)
> +{
> +     int ret;
>  
>       /*
> -      * Insert device extents
> +      * Allocate chunk mapping
>        */
> -     ret = btrfs_insert_dev_extents(trans, info, map, stripe_size);
> -     if (ret < 0)
> -             goto out_devices_info;
> -
> -     ret = btrfs_make_block_group(trans, info, 0, type, map->ce.start,
> -                                  map->ce.size);
> -     kfree(devices_info);
> -     return ret;
> +     if (convert)
> +             ret = __btrfs_alloc_convert_chunk(trans, fs_info, *start,
> +                                               *num_bytes, type);
> +     else
> +             ret = __btrfs_alloc_chunk(trans, fs_info, start, num_bytes,
> +                                       type);
> +     if (ret < 0) {
> +             error("failed to allocate chunk mapping: %s", strerror(-ret));
> +             return ret;
> +     }
>  
> -out_chunk_map:
> -     kfree(map);
> -out_chunk:
> -     kfree(chunk);
> -out_devices_info:
> -     kfree(devices_info);
> +     /*
> +      * Insert the remaining part (insert the various items)
> +      */
> +     ret = btrfs_finish_chunk_alloc(trans, fs_info, *start, *num_bytes);
> +     if (ret < 0)
> +             error("failed to finish chunk allocation: %s", strerror(-ret));
>       return ret;
>  }
>  
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to