Hi, here are my comments, after first review round, mostly on code itself, not the functionality/quality of the allocator (although after this patch the allocator code looks quite straightforward a better to read and understand).
Reviewing the patch directly does not work, Arne's advice was to look at the new code only, which did work, at the function __btrfs_alloc_chunk. I found several categories of similar issues: * move declarations from function-level to local scopes; makes the func declaration block less scary :) * BUG_ONs -- a number of new ones -- I'd rather get rid of them now * a few more comments would help * misc code transformations or identifier renames On Mon, May 02, 2011 at 10:47:42AM +0200, Arne Jansen wrote: > In a multi device setup, the chunk allocator currently always allocates > chunks on the devices in the same order. This leads to a very uneven > distribution, especially with RAID1 or RAID10 and an uneven number of > devices. > This patch always sorts the devices before allocating, and allocates the > stripes on the devices with the most available space, as long as there > is enough space available. In a low space situation, it first tries to > maximize striping. > The patch also simplifies the allocator and reduces the checks for > corner cases. > The simplification is done by several means. First, it defines the > properties of each RAID type upfront. These properties are used afterwards > instead of differentiating cases in several places. > Second, the old allocator defined a minimum stripe size for each block > group type, tried to find a large enough chunk, and if this fails just > allocates a smaller one. This is now done in one step. The largest possible > chunk (up to max_chunk_size) is searched and allocated. > Because we now have only one pass, the allocation of the map (struct > map_lookup) is moved down to the point where the number of stripes is > already known. This way we avoid reallocation of the map. > We still avoid allocating stripes that are not a multiple of STRIPE_SIZE. > > Changes from v2: > - bugfix for 'single' raid type; the initial parameter initialization lacked > a case for the 'single' type, thus leaving devs_max at the wrong value > > Signed-off-by: Arne Jansen <sensi...@gmx.net> > --- > fs/btrfs/volumes.c | 476 > +++++++++++++++++++--------------------------------- > fs/btrfs/volumes.h | 1 + > 2 files changed, 171 insertions(+), 306 deletions(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index 45c592a..77eb763 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -2268,349 +2268,213 @@ static int btrfs_add_system_chunk(struct > btrfs_trans_handle *trans, > return 0; > } > > -static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, > - int num_stripes, int sub_stripes) > -{ > - if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) > - return calc_size; > - else if (type & BTRFS_BLOCK_GROUP_RAID10) > - return calc_size * (num_stripes / sub_stripes); > - else > - return calc_size * num_stripes; > -} > - > -static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 > type, > - int *num_stripes, int *min_stripes, > - int *sub_stripes) > -{ > - *num_stripes = 1; > - *min_stripes = 1; > - *sub_stripes = 0; > - > - if (type & (BTRFS_BLOCK_GROUP_RAID0)) { > - *num_stripes = fs_devices->rw_devices; > - *min_stripes = 2; > - } > - if (type & (BTRFS_BLOCK_GROUP_DUP)) { > - *num_stripes = 2; > - *min_stripes = 2; > - } > - if (type & (BTRFS_BLOCK_GROUP_RAID1)) { > - if (fs_devices->rw_devices < 2) > - return -ENOSPC; > - *num_stripes = 2; > - *min_stripes = 2; > - } > - if (type & (BTRFS_BLOCK_GROUP_RAID10)) { > - *num_stripes = fs_devices->rw_devices; > - if (*num_stripes < 4) > - return -ENOSPC; > - *num_stripes &= ~(u32)1; > - *sub_stripes = 2; > - *min_stripes = 4; > - } > - > - return 0; > -} > - > -static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices, > - u64 proposed_size, u64 type, > - int num_stripes, int small_stripe) > -{ > - int min_stripe_size = 1 * 1024 * 1024; > - u64 calc_size = proposed_size; > - u64 max_chunk_size = calc_size; > - int ncopies = 1; > - > - if (type & (BTRFS_BLOCK_GROUP_RAID1 | > - BTRFS_BLOCK_GROUP_DUP | > - BTRFS_BLOCK_GROUP_RAID10)) > - ncopies = 2; > - > - if (type & BTRFS_BLOCK_GROUP_DATA) { > - max_chunk_size = 10 * calc_size; > - min_stripe_size = 64 * 1024 * 1024; > - } else if (type & BTRFS_BLOCK_GROUP_METADATA) { > - max_chunk_size = 256 * 1024 * 1024; > - min_stripe_size = 32 * 1024 * 1024; > - } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { > - calc_size = 8 * 1024 * 1024; > - max_chunk_size = calc_size * 2; > - min_stripe_size = 1 * 1024 * 1024; > - } > - > - /* we don't want a chunk larger than 10% of writeable space */ > - max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), > - max_chunk_size); > - > - if (calc_size * num_stripes > max_chunk_size * ncopies) { > - calc_size = max_chunk_size * ncopies; > - do_div(calc_size, num_stripes); > - do_div(calc_size, BTRFS_STRIPE_LEN); > - calc_size *= BTRFS_STRIPE_LEN; > - } > - > - /* we don't want tiny stripes */ > - if (!small_stripe) > - calc_size = max_t(u64, min_stripe_size, calc_size); > - > - /* > - * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure > - * we end up with something bigger than a stripe > - */ > - calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN); > - > - do_div(calc_size, BTRFS_STRIPE_LEN); > - calc_size *= BTRFS_STRIPE_LEN; > - > - return calc_size; > -} > - > -static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map, > - int num_stripes) > -{ > - struct map_lookup *new; > - size_t len = map_lookup_size(num_stripes); > - > - BUG_ON(map->num_stripes < num_stripes); > - > - if (map->num_stripes == num_stripes) > - return map; > - > - new = kmalloc(len, GFP_NOFS); > - if (!new) { > - /* just change map->num_stripes */ > - map->num_stripes = num_stripes; > - return map; > - } > - > - memcpy(new, map, len); > - new->num_stripes = num_stripes; > - kfree(map); > - return new; > -} > - > /* > - * helper to allocate device space from btrfs_device_info, in which we stored > - * max free space information of every device. It is used when we can not > - * allocate chunks by default size. > - * > - * By this helper, we can allocate a new chunk as larger as possible. > + * sort the devices in descending order by max_avail, total_avail > */ > -static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans, > - struct btrfs_fs_devices *fs_devices, > - struct btrfs_device_info *devices, > - int nr_device, u64 type, > - struct map_lookup **map_lookup, > - int min_stripes, u64 *stripe_size) > +static int btrfs_cmp_device_info(const void *a, const void *b) > { > - int i, index, sort_again = 0; > - int min_devices = min_stripes; > - u64 max_avail, min_free; > - struct map_lookup *map = *map_lookup; > - int ret; > - > - if (nr_device < min_stripes) > - return -ENOSPC; > - > - btrfs_descending_sort_devices(devices, nr_device); > - > - max_avail = devices[0].max_avail; > - if (!max_avail) > - return -ENOSPC; > - > - for (i = 0; i < nr_device; i++) { > - /* > - * if dev_offset = 0, it means the free space of this device > - * is less than what we need, and we didn't search max avail > - * extent on this device, so do it now. > - */ > - if (!devices[i].dev_offset) { > - ret = find_free_dev_extent(trans, devices[i].dev, > - max_avail, > - &devices[i].dev_offset, > - &devices[i].max_avail); > - if (ret != 0 && ret != -ENOSPC) > - return ret; > - sort_again = 1; > - } > - } > - > - /* we update the max avail free extent of each devices, sort again */ > - if (sort_again) > - btrfs_descending_sort_devices(devices, nr_device); > - > - if (type & BTRFS_BLOCK_GROUP_DUP) > - min_devices = 1; > - > - if (!devices[min_devices - 1].max_avail) > - return -ENOSPC; > - > - max_avail = devices[min_devices - 1].max_avail; > - if (type & BTRFS_BLOCK_GROUP_DUP) > - do_div(max_avail, 2); > - > - max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type, > - min_stripes, 1); > - if (type & BTRFS_BLOCK_GROUP_DUP) > - min_free = max_avail * 2; > - else > - min_free = max_avail; > - > - if (min_free > devices[min_devices - 1].max_avail) > - return -ENOSPC; > - > - map = __shrink_map_lookup_stripes(map, min_stripes); > - *stripe_size = max_avail; > - > - index = 0; > - for (i = 0; i < min_stripes; i++) { > - map->stripes[i].dev = devices[index].dev; > - map->stripes[i].physical = devices[index].dev_offset; > - if (type & BTRFS_BLOCK_GROUP_DUP) { > - i++; > - map->stripes[i].dev = devices[index].dev; > - map->stripes[i].physical = devices[index].dev_offset + > - max_avail; > - } > - index++; > - } > - *map_lookup = map; > - > + const struct btrfs_device_info *di_a = a; > + const struct btrfs_device_info *di_b = b; > + > + if (di_a->max_avail > di_b->max_avail) > + return -1; > + if (di_a->max_avail < di_b->max_avail) > + return 1; > + if (di_a->total_avail > di_b->total_avail) > + return -1; > + if (di_a->total_avail < di_b->total_avail) > + return 1; > return 0; > } > > static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, > struct btrfs_root *extent_root, > struct map_lookup **map_ret, map_out > - u64 *num_bytes, u64 *stripe_size, > + u64 *num_bytes, u64 *stripe_size_out, num_bytes_out there seems to be no formal convention to name outgoing parameters, but at least this makes things more consistent > u64 start, u64 type) > { > struct btrfs_fs_info *info = extent_root->fs_info; > struct btrfs_device *device = NULL; this variable can be declared in local scope block ... > struct btrfs_fs_devices *fs_devices = info->fs_devices; > struct list_head *cur; > - struct map_lookup *map; > + struct map_lookup *map = NULL; > struct extent_map_tree *em_tree; > struct extent_map *em; > - struct btrfs_device_info *devices_info; > - struct list_head private_devs; > - u64 calc_size = 1024 * 1024 * 1024; > - u64 min_free; > - u64 avail; > + struct btrfs_device_info *devices_info = NULL; > + u64 total_avail; this > + u64 max_avail; this > u64 dev_offset; and this should go to local scope > - int num_stripes; > - int min_stripes; > - int sub_stripes; > - int min_devices; /* the min number of devices we need */ > - int i; > + int num_stripes; /* total number of stripes to allocate */ > + int sub_stripes; /* sub_stripes info for map */ > + int dev_stripes; /* stripes per dev */ > + int devs_max; /* max devs to use */ > + int devs_min; /* min devs needed */ > + int devs_increment; /* ndevs has to be a multiple of this */ > + int ncopies; /* how many copies to data has */ > int ret; > + u64 max_stripe_size; > + u64 max_chunk_size; > + u64 stripe_size; > + int ndevs; > int index; ^^^^^ remove, unused or can be substituted with existing var (i) > + int i; > + int j; > > if ((type & BTRFS_BLOCK_GROUP_RAID1) && > (type & BTRFS_BLOCK_GROUP_DUP)) { > WARN_ON(1); > type &= ~BTRFS_BLOCK_GROUP_DUP; > } comment please, why this combination is bad > + (extra newline) > if (list_empty(&fs_devices->alloc_list)) > return -ENOSPC; > > - ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes, > - &min_stripes, &sub_stripes); > - if (ret) > - return ret; > + > + sub_stripes = 1; > + dev_stripes = 1; > + devs_increment = 1; > + ncopies = 1; > + devs_max = 0; /* 0 == as many as possible */ > + devs_min = 1; > + > + /* > + * define the properties of each RAID type. > + * FIXME: move this to a global table and use it in all RAID > + * calculation code > + */ agreed with the FIXME, Hugo Mills' balance patches have the refactoring code, so let's keep this as-is for now > + if (type & (BTRFS_BLOCK_GROUP_DUP)) { > + dev_stripes = 2; > + ncopies = 2; > + devs_max = 1; > + } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { > + devs_min = 2; > + } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { > + devs_increment = 2; > + ncopies = 2; > + devs_max = 2; > + devs_min = 2; > + } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { > + sub_stripes = 2; > + devs_increment = 2; > + ncopies = 2; > + devs_min = 4; > + } else { > + devs_max = 1; > + } > + > + if (type & BTRFS_BLOCK_GROUP_DATA) { > + max_stripe_size = 1024 * 1024 * 1024; > + max_chunk_size = 10 * max_stripe_size; > + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { > + max_stripe_size = 256 * 1024 * 1024; > + max_chunk_size = max_stripe_size; > + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { > + max_stripe_size = 8 * 1024 * 1024; > + max_chunk_size = 2 * max_stripe_size; > + } else { > + BUG_ON(1); a printk would be good, this may catch a bug eg. in alloc_profile bit handling > + } > + > + /* we don't want a chunk larger than 10% of writeable space */ > + max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), > + max_chunk_size); > > devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, > GFP_NOFS); > if (!devices_info) > return -ENOMEM; > > - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); > - if (!map) { > - ret = -ENOMEM; > - goto error; > - } > - map->num_stripes = num_stripes; > - > cur = fs_devices->alloc_list.next; > - index = 0; > - i = 0; > > - calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type, > - num_stripes, 0); > - > - if (type & BTRFS_BLOCK_GROUP_DUP) { > - min_free = calc_size * 2; > - min_devices = 1; > - } else { > - min_free = calc_size; > - min_devices = min_stripes; > - } > - > - INIT_LIST_HEAD(&private_devs); > - while (index < num_stripes) { > + /* > + * in the first pass through the devices list, we gather information > + * about the available holes on each device. > + */ > + ndevs = 0; > + while (cur != &fs_devices->alloc_list) { > device = list_entry(cur, struct btrfs_device, dev_alloc_list); ^^^^^^ declare locally, and add max_avail and total_avail > BUG_ON(!device->writeable); not a newly added BUG_ON, but I wonder if this is the best what can be done here, a printk would be helpful too. I do not see any locking of the device list here, and did not have a deeper look, but I think this could race with device removal, when the device is still in the list, but the racing thread managed to set device->writable to zero. In this case, a WARN would do, and skipping the device will not break things (too much). > + > + cur = cur->next; > + > + if (!device->in_fs_metadata) > + continue; > + > if (device->total_bytes > device->bytes_used) > - avail = device->total_bytes - device->bytes_used; > + total_avail = device->total_bytes - device->bytes_used; > else > - avail = 0; > - cur = cur->next; > - > - if (device->in_fs_metadata && avail >= min_free) { > - ret = find_free_dev_extent(trans, device, min_free, > - &devices_info[i].dev_offset, > - &devices_info[i].max_avail); > - if (ret == 0) { > - list_move_tail(&device->dev_alloc_list, > - &private_devs); > - map->stripes[index].dev = device; > - map->stripes[index].physical = > - devices_info[i].dev_offset; > - index++; > - if (type & BTRFS_BLOCK_GROUP_DUP) { > - map->stripes[index].dev = device; > - map->stripes[index].physical = > - devices_info[i].dev_offset + > - calc_size; > - index++; > - } > - } else if (ret != -ENOSPC) > - goto error; > - > - devices_info[i].dev = device; > - i++; > - } else if (device->in_fs_metadata && > - avail >= BTRFS_STRIPE_LEN) { > - devices_info[i].dev = device; > - devices_info[i].max_avail = avail; > - i++; > - } > - > - if (cur == &fs_devices->alloc_list) > - break; > - } > - > - list_splice(&private_devs, &fs_devices->alloc_list); > - if (index < num_stripes) { > - if (index >= min_stripes) { > - num_stripes = index; > - if (type & (BTRFS_BLOCK_GROUP_RAID10)) { > - num_stripes /= sub_stripes; > - num_stripes *= sub_stripes; > - } > - > - map = __shrink_map_lookup_stripes(map, num_stripes); > - } else if (i >= min_devices) { > - ret = __btrfs_alloc_tiny_space(trans, fs_devices, > - devices_info, i, type, > - &map, min_stripes, > - &calc_size); > - if (ret) > - goto error; > - } else { > - ret = -ENOSPC; > + total_avail = 0; > + /* avail is off by max(alloc_start, 1MB), but that is the same > + * for all devices, so it doesn't hurt the sorting later on > + */ > + > + ret = find_free_dev_extent(trans, device, > + max_stripe_size * dev_stripes, > + &dev_offset, &max_avail); > + if (ret && ret != -ENOSPC) > goto error; > + > + if (ret == 0) > + max_avail = max_stripe_size * dev_stripes; > + > + if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) > + continue; > + > + devices_info[ndevs].dev_offset = dev_offset; > + devices_info[ndevs].max_avail = max_avail; > + devices_info[ndevs].total_avail = total_avail; > + devices_info[ndevs].dev = device; > + ++ndevs; > + } > + > + /* > + * now sort the devices by hole size / available space > + */ > + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), > + btrfs_cmp_device_info, NULL); > + > + /* round down to number of usable stripes */ > + ndevs -= ndevs % devs_increment; > + > + if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { > + ret = -ENOSPC; > + goto error; > + } > + > + if (devs_max && ndevs > devs_max) > + ndevs = devs_max; > + /* > + * the primary goal is to maximize the number of stripes, so use as many > + * devices as possible, even if the stripes are not maximum sized. > + */ > + stripe_size = devices_info[ndevs-1].max_avail; > + num_stripes = ndevs * dev_stripes; > + > + if (stripe_size * num_stripes > max_chunk_size * ncopies) { > + stripe_size = max_chunk_size * ncopies; > + do_div(stripe_size, num_stripes); > + } > + > + do_div(stripe_size, dev_stripes); > + do_div(stripe_size, BTRFS_STRIPE_LEN); > + stripe_size *= BTRFS_STRIPE_LEN; > + > + BUG_ON(!stripe_size); what does this catch, in words? stripe_size was smaller than BTRFS_STRIPE_LEN. Old code made sure that this does not happen, doing calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN) before the final do_div. Is there really a reason to crash? > + > + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); > + if (!map) { > + ret = -ENOMEM; > + goto error; > + } > + map->num_stripes = num_stripes; > + > + index = 0; not used, probably intended as a linear counter of the 'int s = ... ' expression below. the compiler 'strength-reduce' pass will probably deduce this too. > + for (i = 0; i < ndevs; ++i) { > + for (j = 0; j < dev_stripes; ++j) { > + int s = i * dev_stripes + j; > + map->stripes[s].dev = devices_info[i].dev; > + map->stripes[s].physical = devices_info[i].dev_offset + > + j * stripe_size; > } > } not really an issue here, but the devices_info[i] could be moved out of the for( j ) loop (assigned to a tmp var), but an optimizing compiler will do this too :) > map->sector_size = extent_root->sectorsize; > @@ -2621,9 +2485,8 @@ static int __btrfs_alloc_chunk(struct > btrfs_trans_handle *trans, > map->sub_stripes = sub_stripes; > > *map_ret = map; > - *stripe_size = calc_size; > - *num_bytes = chunk_bytes_by_type(type, calc_size, > - map->num_stripes, sub_stripes); > + *stripe_size_out = stripe_size; > + *num_bytes = stripe_size * (num_stripes / ncopies); > > trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes); ... [inserted from patched file] 2482 index = 0; 2483 while (index < map->num_stripes) { 2484 device = map->stripes[index].dev; 2485 dev_offset = map->stripes[index].physical; 2486 2487 ret = btrfs_alloc_dev_extent(trans, device, 2488 info->chunk_root->root_key.objectid, 2489 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2490 start, dev_offset, stripe_size); 2491 BUG_ON(ret); 2492 index++; 2493 } can you please transform this to a for loop for me? I know this is not your original code, but this cleanup seems worth while changing the whole function. (and make those device and def_offset local declarations) > > @@ -2658,7 +2521,7 @@ static int __btrfs_alloc_chunk(struct > btrfs_trans_handle *trans, > ret = btrfs_alloc_dev_extent(trans, device, > info->chunk_root->root_key.objectid, > BTRFS_FIRST_CHUNK_TREE_OBJECTID, > - start, dev_offset, calc_size); > + start, dev_offset, stripe_size); > BUG_ON(ret); > index++; > } > @@ -2667,8 +2530,9 @@ static int __btrfs_alloc_chunk(struct > btrfs_trans_handle *trans, > return 0; > > error: > + kfree(devices_info); > kfree(map); > - kfree(devices_info); > + unnecessary hunk > return ret; > } > > diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h > index b502f01..37ae6e2 100644 > --- a/fs/btrfs/volumes.h > +++ b/fs/btrfs/volumes.h > @@ -144,6 +144,7 @@ struct btrfs_device_info { > struct btrfs_device *dev; > u64 dev_offset; > u64 max_avail; > + u64 total_avail; > }; > > struct map_lookup { > -- and I'm going to test this a bit too :) david -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html