On 20/1/21 8:14 pm, David Sterba wrote:
On Tue, Jan 19, 2021 at 11:52:05PM -0800, Anand Jain wrote:
The read policy type latency routes the read IO based on the historical
average wait-time experienced by the read IOs through the individual
device. This patch obtains the historical read IO stats from the kernel
block layer and calculates its average.

This does not say how the stripe is selected using the gathered numbers.
I.e. what is the criterion, like minimum average time; "based on" is too
vague.



Could you please add the following to the changelog? I hope this will suffice.

----------
This patch adds a new read policy, latency. This policy routes read
I/Os based on the device's average wait time for read requests.
The average is calculated by dividing the total wait time for read
requests by the total number of read I/Os processed by the device.
This policy uses the kernel disk stats to calculate the average, so
the kernel I/O stats need to be enabled. If the kernel stats are
disabled, the policy falls back to stripe 0.
This policy can be set through the read_policy sysfs interface as shown
below.

    $ echo latency > /sys/fs/btrfs/<uuid>/read_policy
    $ cat /sys/fs/btrfs/<uuid>/read_policy
         pid [latency] device roundrobin

This policy does not persist across reboots or mount/unmount cycles as
of now.

Below are a few performance test results comparing the latency policy
with the pid policy.

raid1 fio read 500m
-----------------------------------------------------
dev types   | nvme+ssd  nvme+ssd   all-nvme  all-nvme
read type   | random    sequential random    sequential
------------+------------------------------------------
pid         |  744MiB/s  809MiB/s 2225MiB/s 2155MiB/s
latency     | 2072MiB/s 2008MiB/s 1999MiB/s 1961MiB/s


raid10 fio read 500m
-----------------------------------------------------
dev types   | nvme+ssd  nvme+ssd   all-nvme  all-nvme
read type   | random    sequential random    sequential
------------+------------------------------------------
pid         | 1282MiB/s 1427MiB/s 2152MiB/s 1969MiB/s
latency     | 2073MiB/s 1871MiB/s 1975MiB/s 1984MiB/s


raid1c3 fio read 500m
-----------------------------------------------------
dev types   | nvme+ssd  nvme+ssd   all-nvme  all-nvme
read type   | random    sequential random    sequential
------------+------------------------------------------
pid         |  973MiB/s  955MiB/s 2144MiB/s 1962MiB/s
latency     | 2005MiB/s 1924MiB/s 2083MiB/s 1980MiB/s


raid1c4 fio read 500m
-----------------------------------------------------
dev types   | nvme+ssd  nvme+ssd   all-nvme  all-nvme
read type   | random    sequential random    sequential
------------+------------------------------------------
pid         | 1204MiB/s 1221MiB/s 2065MiB/s 1878MiB/s
latency     | 1990MiB/s 1920MiB/s 1945MiB/s 1865MiB/s


In the fio workloads above, there are fewer I/O merges with the latency
policy than with pid. So in the case of all-homogeneous devices, pid
performs a little better. Latency is the better choice in the case of
mixed device types. Also, if any one of the devices is underperforming
due to intermittent I/O retries, the latency policy will automatically
use the best available device.
-----------
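
As an aside, the average that the policy computes can be reproduced
from userspace, since the same counters are exported through
/sys/block/<dev>/stat (field 1 is read I/Os completed, field 4 is read
ticks in milliseconds, per Documentation/block/stat.rst). A rough,
untested sketch; the device name is only an example:

    /* Userspace sketch: average read wait per I/O from sysfs disk stats */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long rd_ios, rd_merges, rd_sectors, rd_ticks;
            FILE *f = fopen("/sys/block/sda/stat", "r"); /* example device */

            if (!f)
                    return 1;
            if (fscanf(f, "%llu %llu %llu %llu",
                       &rd_ios, &rd_merges, &rd_sectors, &rd_ticks) != 4) {
                    fclose(f);
                    return 1;
            }
            fclose(f);

            /* same division the policy does, at ms granularity here */
            if (rd_ios)
                    printf("avg read wait: %llu ms\n", rd_ticks / rd_ios);
            else
                    printf("no reads recorded\n");
            return 0;
    }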




Example usage:
  echo "latency" > /sys/fs/btrfs/$uuid/read_policy

Do you have some sample results? I remember you posted something but it
would be good to have that in the changelog too.

Thanks for the suggestion; I have now included it, as above.
Also, I can send a reroll of the patch with this changelog if needed.
Please let me know.

Thanks, Anand



Signed-off-by: Anand Jain <anand.j...@oracle.com>
Reviewed-by: Josef Bacik <jo...@toxicpanda.com>
---
v4: For btrfs_debug_rl() use fs_info instead of
     device->fs_devices->fs_info.

v3: The block layer commit 0d02129e76ed (block: merge struct block_device and
     struct hd_struct) changed the first argument of the function
     part_stat_read_all() in 5.11-rc1, so the compilation would fail. This
     version fixes that.
     Commit log updated.

v2: Use btrfs_debug_rl() instead of btrfs_info_rl().
     It is better to keep this debug output until we have tested this on
     at least a few hardware configurations.
     Drop the unrelated changes.
     Update change log.

rfc->v1: Drop part_stat_read_all; use part_stat_read instead.
     Drop inflight.

  fs/btrfs/sysfs.c   |  3 ++-
  fs/btrfs/volumes.c | 38 ++++++++++++++++++++++++++++++++++++++
  fs/btrfs/volumes.h |  2 ++
  3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4522a1c4cd08..7c0324fe97b2 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -915,7 +915,8 @@ static bool strmatch(const char *buffer, const char *string)
        return false;
  }
-static const char * const btrfs_read_policy_name[] = { "pid" };
+/* Must follow the order as in enum btrfs_read_policy */
+static const char * const btrfs_read_policy_name[] = { "pid", "latency" };
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
                                      struct kobj_attribute *a, char *buf)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 62d6a890fc50..f361f1c87eb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@
  #include <linux/semaphore.h>
  #include <linux/uuid.h>
  #include <linux/list_sort.h>
+#include <linux/part_stat.h>
  #include "misc.h"
  #include "ctree.h"
  #include "extent_map.h"
@@ -5490,6 +5491,39 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
        return ret;
  }
+static int btrfs_find_best_stripe(struct btrfs_fs_info *fs_info,

The name btrfs_find_best_stripe should be more descriptive about the
selection criteria.
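
Perhaps something like btrfs_find_lowest_avg_wait_stripe(); the name
here is only an illustration, not a requirement.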

+                                 struct map_lookup *map, int first,
+                                 int num_stripe)
+{
+       u64 est_wait = 0;
+       int best_stripe = 0;
+       int index;
+
+       for (index = first; index < first + num_stripe; index++) {
+               u64 read_wait;
+               u64 avg_wait = 0;
+               unsigned long read_ios;
+               struct btrfs_device *device = map->stripes[index].dev;
+
+               read_wait = part_stat_read(device->bdev, nsecs[READ]);

This should use STAT_READ as this is supposed to be indexing the stats
members. READ is some generic constant with the same value.
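
That would be, roughly (untested; STAT_READ comes from enum stat_group,
and the ios[READ] read below has the same issue):

    read_wait = part_stat_read(device->bdev, nsecs[STAT_READ]);
    read_ios = part_stat_read(device->bdev, ios[STAT_READ]);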

+               read_ios = part_stat_read(device->bdev, ios[READ]);
+
+               if (read_wait && read_ios && read_wait >= read_ios)
+                       avg_wait = div_u64(read_wait, read_ios);
+               else
+                       btrfs_debug_rl(fs_info,
+                       "devid: %llu avg_wait ZERO read_wait %llu read_ios %lu",
+                                      device->devid, read_wait, read_ios);
+
+               if (est_wait == 0 || est_wait > avg_wait) {
+                       est_wait = avg_wait;
+                       best_stripe = index;
+               }
+       }
+
+       return best_stripe;
+}
+
  static int find_live_mirror(struct btrfs_fs_info *fs_info,
                            struct map_lookup *map, int first,
                            int dev_replace_is_ongoing)
@@ -5519,6 +5553,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
        case BTRFS_READ_POLICY_PID:
                preferred_mirror = first + (current->pid % num_stripes);
                break;
+       case BTRFS_READ_POLICY_LATENCY:
+               preferred_mirror = btrfs_find_best_stripe(fs_info, map, first,
+                                                         num_stripes);
+               break;
        }
if (dev_replace_is_ongoing &&
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1997a4649a66..71ba1f0e93f4 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -222,6 +222,8 @@ enum btrfs_chunk_allocation_policy {
  enum btrfs_read_policy {
        /* Use process PID to choose the stripe */
        BTRFS_READ_POLICY_PID,
+       /* Find and use device with the lowest latency */
+       BTRFS_READ_POLICY_LATENCY,
        BTRFS_NR_READ_POLICY,
  };
--
2.28.0
