2018-04-25 10:54 GMT+03:00 Misono Tomohiro <misono.tomoh...@jp.fujitsu.com>: > On 2018/04/25 9:20, Timofey Titovets wrote: >> Currently btrfs raid1/10 balancer bаlance requests to mirrors, >> based on pid % num of mirrors. >> >> Make logic understood: >> - if one of underline devices are non rotational >> - Queue leght to underline devices >> >> By default try use pid % num_mirrors guessing, but: >> - If one of mirrors are non rotational, repick optimal to it >> - If underline mirror have less queue leght then optimal, >> repick to that mirror >> >> For avoid round-robin request balancing, >> lets round down queue leght: >> - By 8 for rotational devs >> - By 2 for all non rotational devs >> >> Changes: >> v1 -> v2: >> - Use helper part_in_flight() from genhd.c >> to get queue lenght >> - Move guess code to guess_optimal() >> - Change balancer logic, try use pid % mirror by default >> Make balancing on spinning rust if one of underline devices >> are overloaded >> v2 -> v3: >> - Fix arg for RAID10 - use sub_stripes, instead of num_stripes >> v3 -> v4: >> - Rebased on latest misc-next >> >> Signed-off-by: Timofey Titovets <nefelim...@gmail.com> >> --- >> block/genhd.c | 1 + >> fs/btrfs/volumes.c | 111 ++++++++++++++++++++++++++++++++++++++++++++- >> 2 files changed, 110 insertions(+), 2 deletions(-) >> >> diff --git a/block/genhd.c b/block/genhd.c >> index 9656f9e9f99e..5ea5acc88d3c 100644 >> --- a/block/genhd.c >> +++ b/block/genhd.c >> @@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct >> hd_struct *part, >> atomic_read(&part->in_flight[1]); >> } >> } >> +EXPORT_SYMBOL_GPL(part_in_flight); >> >> struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) >> { >> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >> index c95af358b71f..fa7dd6ac087f 100644 >> --- a/fs/btrfs/volumes.c >> +++ b/fs/btrfs/volumes.c >> @@ -16,6 +16,7 @@ >> #include <linux/semaphore.h> >> #include <linux/uuid.h> >> #include <linux/list_sort.h> >> +#include <linux/genhd.h> >> #include <asm/div64.h> >> #include "ctree.h" >> #include "extent_map.h" >> @@ -5148,7 +5149,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, >> u64 logical, u64 len) >> /* >> * There could be two corrupted data stripes, we need >> * to loop retry in order to rebuild the correct data. >> - * >> + * >> * Fail a stripe at a time on every retry except the >> * stripe under reconstruction. >> */ >> @@ -5201,6 +5202,111 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info >> *fs_info, u64 logical, u64 len) >> return ret; >> } >> >> +/** >> + * bdev_get_queue_len - return rounded down in flight queue lenght of bdev >> + * >> + * @bdev: target bdev >> + * @round_down: round factor big for hdd and small for ssd, like 8 and 2 >> + */ >> +static int bdev_get_queue_len(struct block_device *bdev, int round_down) >> +{ >> + int sum; >> + struct hd_struct *bd_part = bdev->bd_part; >> + struct request_queue *rq = bdev_get_queue(bdev); >> + uint32_t inflight[2] = {0, 0}; >> + >> + part_in_flight(rq, bd_part, inflight); >> + >> + sum = max_t(uint32_t, inflight[0], inflight[1]); >> + >> + /* >> + * Try prevent switch for every sneeze >> + * By roundup output num by some value >> + */ >> + return ALIGN_DOWN(sum, round_down); >> +} >> + >> +/** >> + * guess_optimal - return guessed optimal mirror >> + * >> + * Optimal expected to be pid % num_stripes >> + * >> + * That's generaly ok for spread load >> + * Add some balancer based on queue leght to device >> + * >> + * Basic ideas: >> + * - Sequential read generate low amount of request >> + * so if load of drives are equal, use pid % num_stripes balancing >> + * - For mixed rotate/non-rotate mirrors, pick non-rotate as optimal >> + * and repick if other dev have "significant" less queue lenght > > The code looks always choosing the queue with the lowest length regardless > of the amount of queue length difference. So, this "significant" may be wrong?
yes, but before code looks at queue len, we do round_down by 8, may be you confused because i hide ALIGN_DOWN in bdev_get_queue_len() I'm not think what this is "best" leveling queue leveling solution, but that's works. In theory ABS() <> some_num, can be used, but that's will lead to random send of some requests to hdd. i.e. for ABS() > 8: ssd hdd 9 0 -> hdd 9 1 -> ssd 10 1 -> hdd 10 2 -> ssd And so on. So in general that will looks like: ssd rount_down(7, 8) = 0, hdd round_down(0, 8) = 0 ssd will be used And ssd round_down(9, 8) = 8, hdd round_down(0,8) = 0 hdd will be used, while hdd qlen < 8. i.e. 'significant' depends on round_down factor. Thanks. >> + * - Repick optimal if queue leght of other mirror are less >> + */ >> +static int guess_optimal(struct map_lookup *map, int num, int optimal) >> +{ >> + int i; >> + int round_down = 8; >> + int qlen[num]; >> + bool is_nonrot[num]; >> + bool all_bdev_nonrot = true; >> + bool all_bdev_rotate = true; >> + struct block_device *bdev; >> + >> + if (num == 1) >> + return optimal; >> + >> + /* Check accessible bdevs */ >> + for (i = 0; i < num; i++) { >> + /* Init for missing bdevs */ >> + is_nonrot[i] = false; >> + qlen[i] = INT_MAX; >> + bdev = map->stripes[i].dev->bdev; >> + if (bdev) { >> + qlen[i] = 0; >> + is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev)); >> + if (is_nonrot[i]) >> + all_bdev_rotate = false; >> + else >> + all_bdev_nonrot = false; >> + } >> + } >> + >> + /* >> + * Don't bother with computation >> + * if only one of two bdevs are accessible >> + */ >> + if (num == 2 && qlen[0] != qlen[1]) { >> + if (qlen[0] < qlen[1]) >> + return 0; >> + else >> + return 1; >> + } >> + >> + if (all_bdev_nonrot) >> + round_down = 2; >> + >> + for (i = 0; i < num; i++) { >> + if (qlen[i]) >> + continue; >> + bdev = map->stripes[i].dev->bdev; >> + qlen[i] = bdev_get_queue_len(bdev, round_down); >> + } >> + >> + /* For mixed case, pick non rotational dev as optimal */ >> + if (all_bdev_rotate == all_bdev_nonrot) { >> + for (i = 0; i < num; i++) { >> + if (is_nonrot[i]) >> + optimal = i; >> + } >> + } >> + >> + for (i = 0; i < num; i++) { >> + if (qlen[optimal] > qlen[i]) >> + optimal = i; >> + } >> + >> + return optimal; >> +} >> + >> static int find_live_mirror(struct btrfs_fs_info *fs_info, >> struct map_lookup *map, int first, >> int dev_replace_is_ongoing) >> @@ -5219,7 +5325,8 @@ static int find_live_mirror(struct btrfs_fs_info >> *fs_info, >> else >> num_stripes = map->num_stripes; >> >> - preferred_mirror = first + current->pid % num_stripes; >> + preferred_mirror = first + guess_optimal(map, num_stripes, >> + current->pid % num_stripes); >> >> if (dev_replace_is_ongoing && >> fs_info->dev_replace.cont_reading_from_srcdev_mode == >> > -- Have a nice day, Timofey. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html