On 9.02.2018 09:44, Qu Wenruo wrote:
> Kernel uses a delayed chunk allocation behavior for metadata chunks
> 
> KERNEL:
> btrfs_alloc_chunk()
> |- __btrfs_alloc_chunk():	Only allocate chunk mapping
> |- btrfs_make_block_group():	Add corresponding bg to fs_info->new_bgs
> 
> Then at transaction commit time, it finishes the remaining work:
> btrfs_start_dirty_block_groups():
> |- btrfs_create_pending_block_groups()
>    |- btrfs_insert_item():		To insert block group item
>    |- btrfs_finish_chunk_alloc():	Insert chunk items/dev extents
> 
> Although btrfs-progs btrfs_alloc_chunk() does all the work in one
> function, it can still benefit from a function refactor like:
> btrfs-progs:
> btrfs_alloc_chunk():	Wrapper for both normal and convert chunks
> |- __btrfs_alloc_chunk():	Only alloc chunk mapping
> |  |- btrfs_make_block_group():	<<Insert bg items into extent tree>>
> |- btrfs_finish_chunk_alloc():	Insert chunk items/dev extents
> 
> With such a refactor, the following functions can share most of their
> code with the kernel now:
> __btrfs_alloc_chunk()
> btrfs_finish_chunk_alloc()
> btrfs_alloc_dev_extent()
> 
> Signed-off-by: Qu Wenruo <w...@suse.com>
> ---
>  volumes.c | 421 ++++++++++++++++++++++++++++++++++++++------------------------
>  1 file changed, 260 insertions(+), 161 deletions(-)
> 
> diff --git a/volumes.c b/volumes.c
> index cff54c612872..e89520326314 100644
> --- a/volumes.c
> +++ b/volumes.c
> @@ -523,55 +523,40 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
>  	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
>  }
>  
> -static int btrfs_insert_dev_extents(struct btrfs_trans_handle *trans,
> -				    struct btrfs_fs_info *fs_info,
> -				    struct map_lookup *map, u64 stripe_size)
> +static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
> +				  struct btrfs_device *device,
> +				  u64 chunk_offset, u64 physical,
> +				  u64 stripe_size)
>  {
> -	int ret = 0;
> -	struct btrfs_path path;
> +	int ret;
> +	struct btrfs_path *path;
> +	struct btrfs_fs_info *fs_info = device->fs_info;
>  	struct btrfs_root *root = fs_info->dev_root;
>  	struct btrfs_dev_extent *extent;
>  	struct extent_buffer *leaf;
>  	struct btrfs_key key;
> -	int i;
>  
> -	btrfs_init_path(&path);
> -
> -	for (i = 0; i < map->num_stripes; i++) {
> -		struct btrfs_device *device = map->stripes[i].dev;
> -
> -		key.objectid = device->devid;
> -		key.offset = map->stripes[i].physical;
> -		key.type = BTRFS_DEV_EXTENT_KEY;
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
>  
> -		ret = btrfs_insert_empty_item(trans, root, &path, &key,
> -					      sizeof(*extent));
> -		if (ret < 0)
> -			goto out;
> -		leaf = path.nodes[0];
> -		extent = btrfs_item_ptr(leaf, path.slots[0],
> -					struct btrfs_dev_extent);
> -		btrfs_set_dev_extent_chunk_tree(leaf, extent,
> +	key.objectid = device->devid;
> +	key.offset = physical;
> +	key.type = BTRFS_DEV_EXTENT_KEY;
> +	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
> +	if (ret)
> +		goto out;
> +	leaf = path->nodes[0];
> +	extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
> +	btrfs_set_dev_extent_chunk_tree(leaf, extent,
>  					BTRFS_CHUNK_TREE_OBJECTID);
> -		btrfs_set_dev_extent_chunk_objectid(leaf, extent,
> -				BTRFS_FIRST_CHUNK_TREE_OBJECTID);
> -		btrfs_set_dev_extent_chunk_offset(leaf, extent, map->ce.start);
> -
> -		write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
> -			(unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
> -			BTRFS_UUID_SIZE);
> -
> -		btrfs_set_dev_extent_length(leaf, extent, stripe_size);
> -		btrfs_mark_buffer_dirty(leaf);
> -		btrfs_release_path(&path);
> -
> -		device->bytes_used += stripe_size;
> -		ret = btrfs_update_device(trans, device);
> -		if (ret < 0)
> -			goto out;
> -	}
> +	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
> +					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
> +	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
> +	btrfs_set_dev_extent_length(leaf, extent, stripe_size);
> +	btrfs_mark_buffer_dirty(leaf);
>  out:
> -	btrfs_release_path(&path);
> +	btrfs_free_path(path);
>  	return ret;
>  }
>  
> @@ -813,28 +798,28 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
>  		/ sizeof(struct btrfs_stripe) + 1)
>  
>  /*
> - * Alloc a chunk, will insert dev extents, chunk item, and insert new
> - * block group and update space info (so that extent allocator can use
> - * newly allocated chunk).
> + * Alloc a chunk mapping.
> + * Will do chunk size calculation and free dev extent search, and insert
> + * chunk mapping into chunk mapping tree.
> + *
> + * NOTE: This function doesn't handle any chunk item/dev extent insert.
> + * Chunk item/dev extent insert is handled by the later btrfs_finish_chunk_alloc().
> + * And for convert chunks (1:1 mapped, more flexible chunk location), it's
> + * handled by __btrfs_alloc_convert_chunk().
> + *
> + * Qu: Block group item is still inserted in this function by
> + * btrfs_make_block_group(); this still differs from the kernel.
>  *
If it still differs, does that mean this function should undergo further
refactoring to unify it with the kernel, or does it mean it will never be
the same as the kernel's code? I thought the idea of unifying the
userspace/kernel code was to have only one set of functions which works in
both kernel and userspace?

>  * @start:	return value of allocated chunk start bytenr.
>  * @num_bytes:	return value of allocated chunk size
>  * @type:	chunk type (including both profile and type)
> - * @convert:	if the chunk is allocated for convert case.
> - *		If @convert is true, chunk allocator will skip device extent
> - *		search, but use *start and *num_bytes as chunk start/num_bytes
> - *		and devive offset, to build a 1:1 chunk mapping for convert.
>  */
> -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> -		      struct btrfs_fs_info *info, u64 *start,
> -		      u64 *num_bytes, u64 type, bool convert)
> +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> +			       struct btrfs_fs_info *info, u64 *start,
> +			       u64 *num_bytes, u64 type)
>  {
> -	struct btrfs_root *extent_root = info->extent_root;
> -	struct btrfs_root *chunk_root = info->chunk_root;
>  	struct btrfs_device *device = NULL;
> -	struct btrfs_chunk *chunk;
>  	struct list_head *dev_list = &info->fs_devices->devices;
> -	struct btrfs_stripe *stripe;
>  	struct map_lookup *map;
>  	struct btrfs_device_info *devices_info = NULL;
>  	u64 percent_max;
> @@ -845,8 +830,6 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>  	int index;
>  	int ndevs = 0;
>  	int rw_devs = 0;
> -	int stripe_len = BTRFS_STRIPE_LEN;
> -	struct btrfs_key key;
>  	u64 offset;
>  	int data_stripes;	/* number of stripes that counts for bg size */
>  	int sub_stripes;	/* sub_stripes info for map */
> @@ -874,34 +857,6 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>  	devs_increment = btrfs_raid_array[index].devs_increment;
>  	ncopies = btrfs_raid_array[index].ncopies;
>  
> -	if (convert) {
> -		/* For convert, profile must be SINGLE */
> -		if (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
> -			error("convert only suports SINGLE profile");
> -			return -EINVAL;
> -		}
> -		if (!IS_ALIGNED(*start, info->sectorsize)) {
> -			error("chunk start not aligned, start=%llu sectorsize=%u",
> -			      *start, info->sectorsize);
> -			return -EINVAL;
> -		}
> -		if (!IS_ALIGNED(*num_bytes, info->sectorsize)) {
> -			error("chunk size not aligned, size=%llu sectorsize=%u",
> -			      *num_bytes, info->sectorsize);
> -			return -EINVAL;
> -		}
> -		num_stripes = 1;
> -		data_stripes = 1;
> -		offset = *start;
> -		stripe_size = *num_bytes;
> -		/*
> -		 * For convert, we use 1:1 chunk mapping specified by @start and
> -		 * @num_bytes, so there is no need to go through dev_extent
> -		 * searching.
> -		 */
> -		goto alloc_chunk;
> -	}
> -
>  	/*
>  	 * Chunk size calculation part.
>  	 */
> @@ -1047,55 +1002,23 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>  	/*
>  	 * Fill chunk mapping and chunk stripes
>  	 */
> -alloc_chunk:
> -	if (!convert) {
> -		ret = find_next_chunk(info, &offset);
> -		if (ret)
> -			goto out_devices_info;
> -	}
> -	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
> -	key.type = BTRFS_CHUNK_ITEM_KEY;
> -	key.offset = offset;
> -	*num_bytes = stripe_size * data_stripes;
> -
> -	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
> -	if (!chunk)
> +	ret = find_next_chunk(info, &offset);
> +	if (ret)
>  		goto out_devices_info;
> +	*num_bytes = stripe_size * data_stripes;
>  
>  	map = kmalloc(btrfs_map_lookup_size(num_stripes), GFP_NOFS);
>  	if (!map)
>  		goto out_chunk_map;
>  	map->num_stripes = num_stripes;
>  
> -	if (convert) {
> -		device = list_entry(dev_list->next, struct btrfs_device,
> -				    dev_list);
> +	for (i = 0; i < ndevs; i++) {
> +		for (j = 0; j < dev_stripes; ++j) {
> +			int s = i * dev_stripes + j;
>  
> -		map->stripes[0].dev = device;
> -		map->stripes[0].physical = *start;
> -		stripe = &chunk->stripe;
> -		btrfs_set_stack_stripe_devid(stripe, device->devid);
> -		btrfs_set_stack_stripe_offset(stripe, *start);
> -		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
> -	} else {
> -		for (i = 0; i < ndevs; i++) {
> -			for (j = 0; j < dev_stripes; ++j) {
> -				int s = i * dev_stripes + j;
> -
> -				device = devices_info[i].dev;
> -				map->stripes[s].dev = device;
> -				map->stripes[s].physical =
> -					devices_info[i].dev_offset +
> -					j * stripe_size;
> -				stripe = &chunk->stripe + s;
> -				btrfs_set_stack_stripe_devid(stripe,
> -							     device->devid);
> -				btrfs_set_stack_stripe_offset(stripe,
> -					devices_info[i].dev_offset +
> -					j * stripe_size);
> -				memcpy(stripe->dev_uuid, device->uuid,
> -				       BTRFS_UUID_SIZE);
> -			}
> +			map->stripes[s].dev = devices_info[i].dev;
> +			map->stripes[s].physical = devices_info[i].dev_offset +
> +						   j * stripe_size;
>  		}
>  	}
>  	map->stripe_len = BTRFS_STRIPE_LEN;
> @@ -1104,60 +1027,236 @@ alloc_chunk:
>  	map->type = type;
>  	map->sub_stripes = sub_stripes;
>  	map->sector_size = info->sectorsize;
> -	map->ce.start = key.offset;
> +	map->ce.start = offset;
>  	map->ce.size = *num_bytes;
>  
> +	kfree(devices_info);
>  
> +	/*
> +	 * Insert chunk mapping and block group
> +	 */
>  	ret = insert_cache_extent(&info->mapping_tree.cache_tree, &map->ce);
>  	if (ret < 0)
>  		goto out_chunk_map;
>  
> -	/* key was set above */
> -	btrfs_set_stack_chunk_length(chunk, *num_bytes);
> -	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
> -	btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
> -	btrfs_set_stack_chunk_type(chunk, type);
> -	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
> -	btrfs_set_stack_chunk_io_align(chunk, stripe_len);
> -	btrfs_set_stack_chunk_io_width(chunk, stripe_len);
> -	btrfs_set_stack_chunk_sector_size(chunk, info->sectorsize);
> -	btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
> +	ret = btrfs_make_block_group(trans, info, 0, type, offset,
> +				     *num_bytes);
> +	*start = offset;
> +	return ret;
> +
> +out_chunk_map:
> +	kfree(map);
> +out_devices_info:
> +	kfree(devices_info);
> +	return ret;
> +}
> +
> +/*
> + * Alloc a chunk mapping for convert.
> + * Convert needs a special SINGLE chunk whose logical bytenr is the same as
> + * its physical bytenr.
> + * Chunk size and start bytenr are specified by @start and @num_bytes.
> + *
> + * And just like __btrfs_alloc_chunk() this only handles chunk mapping and
> + * block group item.
> + */
> +static int __btrfs_alloc_convert_chunk(struct btrfs_trans_handle *trans,
> +				       struct btrfs_fs_info *fs_info, u64 start,
> +				       u64 num_bytes, u64 type)
> +{
> +	struct list_head *dev_list = &fs_info->fs_devices->devices;
> +	struct map_lookup *map;
> +	struct btrfs_device *device;
> +	int ret;
> +
> +	/* For convert, profile must be SINGLE */
> +	if (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
> +		error("convert only supports SINGLE profile");
> +		return -EINVAL;
> +	}
> +	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
> +		error("chunk start not aligned, start=%llu sectorsize=%u",
> +		      start, fs_info->sectorsize);
> +		return -EINVAL;
> +	}
> +	if (!IS_ALIGNED(num_bytes, fs_info->sectorsize)) {
> +		error("chunk size not aligned, size=%llu sectorsize=%u",
> +		      num_bytes, fs_info->sectorsize);
> +		return -EINVAL;
> +	}
> +	if (list_empty(dev_list)) {
> +		error("no writable device");
> +		return -ENODEV;
> +	}
> +
> +	device = list_entry(dev_list->next, struct btrfs_device, dev_list);
> +	map = malloc(btrfs_map_lookup_size(1));
> +	if (!map)
> +		return -ENOMEM;
> +	map->num_stripes = 1;
> +	map->stripes[0].dev = device;
> +	map->stripes[0].physical = start;
> +	map->stripe_len = BTRFS_STRIPE_LEN;
> +	map->io_align = BTRFS_STRIPE_LEN;
> +	map->io_width = BTRFS_STRIPE_LEN;
> +	map->type = type;
> +	map->sub_stripes = 1;
> +	map->sector_size = fs_info->sectorsize;
> +	map->ce.start = start;
> +	map->ce.size = num_bytes;
> +
> +	ret = insert_cache_extent(&fs_info->mapping_tree.cache_tree, &map->ce);
> +	if (ret < 0)
> +		goto error;
> +	ret = btrfs_make_block_group(trans, fs_info, 0, type, start, num_bytes);
> +	return ret;
> +error:
> +	free(map);
> +	return ret;
> +}
> +
> +/*
> + * Finish the chunk allocation by inserting the needed chunk item and device
> + * extents, and updating device used bytes
> + */
> +static int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
> +				    struct btrfs_fs_info *fs_info,
> +				    u64 chunk_start, u64 chunk_size)
> +{
> +	struct btrfs_root *extent_root = fs_info->extent_root;
> +	struct btrfs_root *chunk_root = fs_info->chunk_root;
> +	struct btrfs_key key;
> +	struct btrfs_device *device;
> +	struct btrfs_chunk *chunk;
> +	struct btrfs_stripe *stripe;
> +	struct cache_extent *ce;
> +	struct map_lookup *map;
> +	size_t item_size;
> +	u64 dev_offset;
> +	u64 stripe_size;
> +	int i = 0;
> +	int ret = 0;
> +
> +	ce = search_cache_extent(&fs_info->mapping_tree.cache_tree, chunk_start);
> +	if (!ce)
> +		return -ENOENT;
> +
> +	map = container_of(ce, struct map_lookup, ce);
> +	item_size = btrfs_chunk_item_size(map->num_stripes);
> +	stripe_size = calc_stripe_length(map->type, map->ce.size,
> +					 map->num_stripes);
> +
> +	chunk = kzalloc(item_size, GFP_NOFS);
> +	if (!chunk) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
>  
>  	/*
> -	 * Insert chunk item and chunk mapping.
> +	 * Take the device list mutex to prevent races with the final phase of
> +	 * a device replace operation that replaces the device object associated
> +	 * with the map's stripes, because the device object's id can change
> +	 * at any time during that final phase of the device replace operation
> +	 * (dev-replace.c:btrfs_dev_replace_finishing()).
>  	 */
> -	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
> -				btrfs_chunk_item_size(num_stripes));
> -	BUG_ON(ret);
> -	*start = key.offset;
> +	/* mutex_lock(&fs_info->fs_devices->device_list_mutex); */
> +	for (i = 0; i < map->num_stripes; i++) {
> +		device = map->stripes[i].dev;
> +		dev_offset = map->stripes[i].physical;
>  
> -	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
> -		ret = btrfs_add_system_chunk(info, &key,
> -			     chunk, btrfs_chunk_item_size(num_stripes));
> -		if (ret < 0)
> -			goto out_chunk;
> +		device->bytes_used += stripe_size;
> +		ret = btrfs_update_device(trans, device);
> +		if (ret)
> +			break;
> +		ret = btrfs_alloc_dev_extent(trans, device, chunk_start,
> +					     dev_offset, stripe_size);
> +		if (ret)
> +			break;
> +	}
> +	if (ret) {
> +		/* mutex_unlock(&fs_info->fs_devices->device_list_mutex); */
> +		goto out;
>  	}
>  
> +	stripe = &chunk->stripe;
> +	for (i = 0; i < map->num_stripes; i++) {
> +		device = map->stripes[i].dev;
> +		dev_offset = map->stripes[i].physical;
> +
> +		btrfs_set_stack_stripe_devid(stripe, device->devid);
> +		btrfs_set_stack_stripe_offset(stripe, dev_offset);
> +		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
> +		stripe++;
> +	}
> +	/* mutex_unlock(&fs_info->fs_devices->device_list_mutex); */
> +
> +	btrfs_set_stack_chunk_length(chunk, chunk_size);
> +	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
> +	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
> +	btrfs_set_stack_chunk_type(chunk, map->type);
> +	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
> +	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
> +	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
> +	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
> +	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
> +
> +	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
> +	key.type = BTRFS_CHUNK_ITEM_KEY;
> +	key.offset = chunk_start;
> +
> +	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
> +	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
> +		/*
> +		 * TODO: Cleanup of inserted chunk root in case of
> +		 * failure.
> +		 */
> +		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
> +	}
> +
> +out:
>  	kfree(chunk);
> +	return ret;
> +}
> +
> +/*
> + * Alloc a chunk.
> + * Will do all the needed work including searching free device extent, and
> + * inserting chunk mapping, chunk item, block group item and device extents.
> + *
> + * @start:	return value of allocated chunk start bytenr.
> + * @num_bytes:	return value of allocated chunk size
> + * @type:	chunk type (including both profile and type)
> + * @convert:	if the chunk is allocated for the convert case.
> + *		If @convert is true, chunk allocator will skip device extent
> + *		search, but use *start and *num_bytes as chunk start/num_bytes
> + *		and device offset, to build a 1:1 chunk mapping for convert.
> + */
> +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> +		      struct btrfs_fs_info *fs_info, u64 *start, u64 *num_bytes,
> +		      u64 type, bool convert)
> +{
> +	int ret;
>  
>  	/*
> -	 * Insert device extents
> +	 * Allocate chunk mapping
>  	 */
> -	ret = btrfs_insert_dev_extents(trans, info, map, stripe_size);
> -	if (ret < 0)
> -		goto out_devices_info;
> -
> -	ret = btrfs_make_block_group(trans, info, 0, type, map->ce.start,
> -				     map->ce.size);
> -	kfree(devices_info);
> -	return ret;
> +	if (convert)
> +		ret = __btrfs_alloc_convert_chunk(trans, fs_info, *start,
> +						  *num_bytes, type);
> +	else
> +		ret = __btrfs_alloc_chunk(trans, fs_info, start, num_bytes,
> +					  type);
> +	if (ret < 0) {
> +		error("failed to allocate chunk mapping: %s", strerror(-ret));
> +		return ret;
> +	}
>  
> -out_chunk_map:
> -	kfree(map);
> -out_chunk:
> -	kfree(chunk);
> -out_devices_info:
> -	kfree(devices_info);
> +	/*
> +	 * Insert the remaining parts (the various on-disk items)
> +	 */
> +	ret = btrfs_finish_chunk_alloc(trans, fs_info, *start, *num_bytes);
> +	if (ret < 0)
> +		error("failed to finish chunk allocation: %s", strerror(-ret));
>  	return ret;
>  }
> 
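For reference, here is a minimal caller sketch of the refactored entry point
as I read it from this patch. This is only my illustration, not part of the
patch: example_alloc() is a made-up helper, and the convert-case start/size
values are arbitrary placeholders (they just have to be sectorsize-aligned,
per the checks in __btrfs_alloc_convert_chunk()):

static int example_alloc(struct btrfs_trans_handle *trans,
			 struct btrfs_fs_info *fs_info)
{
	u64 start, num_bytes;
	int ret;

	/* Normal chunk: the allocator picks start/size and returns them. */
	ret = btrfs_alloc_chunk(trans, fs_info, &start, &num_bytes,
				BTRFS_BLOCK_GROUP_METADATA, false);
	if (ret < 0)
		return ret;

	/*
	 * Convert chunk: the caller dictates a 1:1 logical->physical
	 * mapping via *start/*num_bytes; the profile must be SINGLE
	 * (no profile bits set in @type).
	 */
	start = SZ_1G;		/* hypothetical, sectorsize-aligned */
	num_bytes = SZ_256M;	/* hypothetical */
	return btrfs_alloc_chunk(trans, fs_info, &start, &num_bytes,
				 BTRFS_BLOCK_GROUP_DATA, true);
}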