From: Timofey Titovets <nefelim...@gmail.com> Currently btrfs raid1/10 balancer bаlance requests to mirrors, based on pid % num of mirrors.
Make logic understood: - if one of underline devices are non rotational - Queue length to underline devices By default try use pid % num_mirrors guessing, but: - If one of mirrors are non rotational, repick optimal to it - If underline mirror have less queue length then optimal, repick to that mirror For avoid round-robin request balancing, lets round down queue length: - By 8 for rotational devs - By 2 for all non rotational devs Some bench results from mail list (Dmitrii Tcvetkov <demfl...@demfloro.ru>): Benchmark summary (arithmetic mean of 3 runs): Mainline Patch ------------------------------------ RAID1 | 18.9 MiB/s | 26.5 MiB/s RAID10 | 30.7 MiB/s | 30.7 MiB/s ------------------------------------------------------------------------ mainline, fio got lucky to read from first HDD (quite slow HDD): Jobs: 1 (f=1): [r(1)][100.0%][r=8456KiB/s,w=0KiB/s][r=264,w=0 IOPS] read: IOPS=265, BW=8508KiB/s (8712kB/s)(499MiB/60070msec) lat (msec): min=2, max=825, avg=60.17, stdev=65.06 ------------------------------------------------------------------------ mainline, fio got lucky to read from second HDD (much more modern): Jobs: 1 (f=1): [r(1)][8.7%][r=11.9MiB/s,w=0KiB/s][r=380,w=0 IOPS] read: IOPS=378, BW=11.8MiB/s (12.4MB/s)(710MiB/60051msec) lat (usec): min=416, max=644286, avg=42312.74, stdev=48518.56 ------------------------------------------------------------------------ mainline, fio got lucky to read from an SSD: Jobs: 1 (f=1): [r(1)][100.0%][r=436MiB/s,w=0KiB/s][r=13.9k,w=0 IOPS] read: IOPS=13.9k, BW=433MiB/s (454MB/s)(25.4GiB/60002msec) lat (usec): min=343, max=16319, avg=1152.52, stdev=245.36 ------------------------------------------------------------------------ With the patch, 2 HDDs: Jobs: 1 (f=1): [r(1)][100.0%][r=17.5MiB/s,w=0KiB/s][r=560,w=0 IOPS] read: IOPS=560, BW=17.5MiB/s (18.4MB/s)(1053MiB/60052msec) lat (usec): min=435, max=341037, avg=28511.64, stdev=30000.14 ------------------------------------------------------------------------ With the patch, HDD(old one)+SSD: Jobs: 1 (f=1): [r(1)][100.0%][r=371MiB/s,w=0KiB/s][r=11.9k,w=0 IOPS] read: IOPS=11.6k, BW=361MiB/s (379MB/s)(21.2GiB/60084msec) lat (usec): min=363, max=346752, avg=1381.73, stdev=6948.32 Changes: v1 -> v2: - Use helper part_in_flight() from genhd.c to get queue length - Move guess code to guess_optimal() - Change balancer logic, try use pid % mirror by default Make balancing on spinning rust if one of underline devices are overloaded v2 -> v3: - Fix arg for RAID10 - use sub_stripes, instead of num_stripes v3 -> v4: - Rebased on latest misc-next v4 -> v5: - Rebased on latest misc-next v5 -> v6: - Fix spelling - Include bench results v6 -> v7: - Fixes based on Nikolay Borisov review: * Assume num == 2 * Remove "for" loop based on that assumption, where possible v7 -> v8: - Add comment about magic '2' num in guess function Signed-off-by: Timofey Titovets <nefelim...@gmail.com> Tested-by: Dmitrii Tcvetkov <demfl...@demfloro.ru> Reviewed-by: Dmitrii Tcvetkov <demfl...@demfloro.ru> --- block/genhd.c | 1 + fs/btrfs/volumes.c | 104 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index cff6bdf27226..4ba5ede8969e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part, atomic_read(&part->in_flight[1]); } } +EXPORT_SYMBOL_GPL(part_in_flight); void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..d9b5cf31514a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -13,6 +13,7 @@ #include <linux/raid/pq.h> #include <linux/semaphore.h> #include <linux/uuid.h> +#include <linux/genhd.h> #include <linux/list_sort.h> #include "ctree.h" #include "extent_map.h" @@ -28,6 +29,8 @@ #include "dev-replace.h" #include "sysfs.h" +#define BTRFS_RAID_1_10_MAX_MIRRORS 2 + const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, @@ -5166,6 +5169,104 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return ret; } +/** + * bdev_get_queue_len - return rounded down in flight queue length of bdev + * + * @bdev: target bdev + * @round_down: round factor big for hdd and small for ssd, like 8 and 2 + */ +static int bdev_get_queue_len(struct block_device *bdev, int round_down) +{ + int sum; + struct hd_struct *bd_part = bdev->bd_part; + struct request_queue *rq = bdev_get_queue(bdev); + uint32_t inflight[2] = {0, 0}; + + part_in_flight(rq, bd_part, inflight); + + sum = max_t(uint32_t, inflight[0], inflight[1]); + + /* + * Try prevent switch for every sneeze + * By roundup output num by some value + */ + return ALIGN_DOWN(sum, round_down); +} + +/** + * guess_optimal - return guessed optimal mirror + * + * Optimal expected to be pid % num_stripes + * + * That's generaly ok for spread load + * Add some balancer based on queue length to device + * + * Basic ideas: + * - Sequential read generate low amount of request + * so if load of drives are equal, use pid % num_stripes balancing + * - For mixed rotate/non-rotate mirrors, pick non-rotate as optimal + * and repick if other dev have "significant" less queue length + * - Repick optimal if queue length of other mirror are less + */ +static int guess_optimal(struct map_lookup *map, int num, int optimal) +{ + int i; + int round_down = 8; + /* Init for missing bdevs */ + int qlen[2] = { INT_MAX, INT_MAX }; + bool is_nonrot[2] = { false, false }; + bool all_bdev_nonrot = true; + bool all_bdev_rotate = true; + struct block_device *bdev; + + /* That function supposed to work with up to 2 mirrors */ + ASSERT(BTRFS_RAID_1_10_MAX_MIRRORS == 2); + ASSERT(BTRFS_RAID_1_10_MAX_MIRRORS == num); + + /* Check accessible bdevs */ + for (i = 0; i < 2; i++) { + bdev = map->stripes[i].dev->bdev; + if (bdev) { + qlen[i] = 0; + is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev)); + if (is_nonrot[i]) + all_bdev_rotate = false; + else + all_bdev_nonrot = false; + } + } + + /* + * Don't bother with computation + * if only one of two bdevs are accessible + */ + if (qlen[0] == INT_MAX) + return 1; + if (qlen[1] == INT_MAX) + return 0; + + if (all_bdev_nonrot) + round_down = 2; + + for (i = 0; i < 2; i++) { + bdev = map->stripes[i].dev->bdev; + qlen[i] = bdev_get_queue_len(bdev, round_down); + } + + /* For mixed case, pick non rotational dev as optimal */ + if (all_bdev_rotate == all_bdev_nonrot) { + if (is_nonrot[0]) + optimal = 0; + else + optimal = 1; + } + + if (qlen[optimal] > qlen[(optimal + 1) % 2]) + optimal = i; + + return optimal; +} + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct map_lookup *map, int first, int dev_replace_is_ongoing) @@ -5184,7 +5285,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; - preferred_mirror = first + current->pid % num_stripes; + preferred_mirror = first + guess_optimal(map, num_stripes, + current->pid % num_stripes); if (dev_replace_is_ongoing && fs_info->dev_replace.cont_reading_from_srcdev_mode == -- 2.19.1