Re: [dm-devel] dm: dm-mpath: Provide high-resolution timer to HST with bio-mpath

2022-05-09 Thread Gabriel Krisman Bertazi
Mike Snitzer  writes:

> Overall your code was OK, but I nudged it a bit further to be
> inkeeping with how 'features' flags have been implemented elsewhere
> (e.g. dm_target_type's features) -- by using a healer to test the
> flag, etc.
>
> I also tweaked some other small implementation details.  Please see:
> https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-5.19=c06dfd124d46df9c482fbd1319b5fe19bcb1a110

looks good to me.  thanks for the fixups.

-- 
Gabriel Krisman Bertazi

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH] dm: dm-mpath: Provide high-resolution timer to HST with bio-mpath

2022-04-27 Thread Gabriel Krisman Bertazi
The precision loss of reading IO start_time with jiffies_to_nsecs
instead of using a high resolution timer degrades HST path prediction
for BIO-based mpath on high load workloads.

Below, I show the utilization percentage of a 10 disk multipath with
asymmetrical disk access cost, while being exercised by a randwrite FIO
benchmark with high submission queue depth (depth=64).  It is possible
to see that the HST path selection degrades heavily for high-iops in
BIO-mpath, underutilizing the slower paths way beyond expected.  This
seems to be caused by the start_time truncation, which makes some IO to
seem much slower than they actually is.  In this scenario ST outperforms
HST for bio-mpath, but not for mq-mpath, which already uses ktime_get_ns().

The third column shows utilization with this patch applied.  It is easy
to see that now HST prediction is much closer to the ideal distribution
(calculated considering the real cost of each path).

| |   ST | HST (orig) | HST(ktime) | Best |
| sdd | 0.17 |   0.20 |   0.17 | 0.18 |
| sde | 0.17 |   0.20 |   0.17 | 0.18 |
| sdf | 0.17 |   0.20 |   0.17 | 0.18 |
| sdg | 0.06 |   0.00 |   0.06 | 0.04 |
| sdh | 0.03 |   0.00 |   0.03 | 0.02 |
| sdi | 0.03 |   0.00 |   0.03 | 0.02 |
| sdj | 0.02 |   0.00 |   0.01 | 0.01 |
| sdk | 0.02 |   0.00 |   0.01 | 0.01 |
| sdl | 0.17 |   0.20 |   0.17 | 0.18 |
| sdm | 0.17 |   0.20 |   0.17 | 0.18 |

This issue was originally discussed [1] when we first merged HST, and
this patch was left as a low hanging fruit to be solved later.  I don't
think anyone is using HST with BIO mpath, but it'd be neat to get it
sorted out.

Regarding the implementation, as suggested by Mike in that mail thread,
in order to avoid the overhead of ktime_get_ns for other selectors, this
patch adds a flag for the selector code to request the high-resolution
timer.

I tested this using the same benchmark used in the original HST submission.

Full test and benchmark scripts are available here:

  https://people.collabora.com/~krisman/HST-BIO-MPATH/

[1] https://lore.kernel.org/lkml/85tv0am9de@collabora.com/T/

Signed-off-by: Gabriel Krisman Bertazi 
Acked-by: Gabriel Krisman Bertazi 
---
 drivers/md/dm-mpath.c  | 16 ++--
 drivers/md/dm-path-selector.h  | 13 +
 drivers/md/dm-ps-historical-service-time.c |  1 +
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f4719b65e5e3..c58cfcd87d04 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -105,6 +105,7 @@ struct multipath {
 struct dm_mpath_io {
struct pgpath *pgpath;
size_t nr_bytes;
+   u64 start_time_ns;
 };
 
 typedef int (*action_fn) (struct pgpath *pgpath);
@@ -647,6 +648,11 @@ static int __multipath_map_bio(struct multipath *m, struct 
bio *bio,
 
mpio->pgpath = pgpath;
 
+   if (pgpath->pg->ps.type->flags & PS_NEED_HR_TIMER_FL)
+   mpio->start_time_ns = ktime_get_ns();
+   else
+   mpio->start_time_ns = 0;
+
bio->bi_status = 0;
bio_set_dev(bio, pgpath->path.dev->bdev);
bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
@@ -1715,9 +1721,15 @@ static int multipath_end_io_bio(struct dm_target *ti, 
struct bio *clone,
if (pgpath) {
struct path_selector *ps = >pg->ps;
 
-   if (ps->type->end_io)
+   if (ps->type->end_io) {
+   u64 st_time = mpio->start_time_ns;
+
+   if (!st_time)
+   st_time = dm_start_time_ns_from_clone(clone);
+
ps->type->end_io(ps, >path, mpio->nr_bytes,
-dm_start_time_ns_from_clone(clone));
+st_time);
+   }
}
 
return r;
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index c47bc0e20275..46710f364286 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -26,6 +26,18 @@ struct path_selector {
void *context;
 };
 
+/* If a path selector uses this flag, a more precise timer
+ * (ktime_get_ns) is used to account for IO start time in BIO-based
+ * mpath.  This improves performance of some path selectors (i.e. HST),
+ * in exchange of a slightly higher overhead when submiting the
+ * BIO. The extra cost is usually offset by improved path selection for
+ * some benchmarks.
+ *
+ * This has no effect for request-based mpath, since it already uses a
+ * higher precision timer by default.
+ */
+#define PS_NEED_HR_TIMER_FL 0x1
+
 /* Information about a path selector type */
 struct path_selector_type {
char *name;
@@ -33,6 +45,7 @@ struct path_selector_type {
 
unsigned int table_args;
unsigned int info_args;
+ 

Re: [dm-devel] [PATCH] dm mpath: fixup sched_clock() usage in historical selector

2022-04-11 Thread Gabriel Krisman Bertazi
Khazhismel Kumykov  writes:

> mixing sched_clock() and ktime_get_ns() will give bad results, don't do
> it
>
> Fixes: 2613eab11996 ("dm mpath: add Historical Service Time Path Selector")
> Signed-off-by: Khazhismel Kumykov 

Looks good.

Reviewed-by: Gabriel Krisman Bertazi 


> ---
>  drivers/md/dm-ps-historical-service-time.c | 11 +--
>  1 file changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/md/dm-ps-historical-service-time.c 
> b/drivers/md/dm-ps-historical-service-time.c
> index 875bca30a0dd..82f2a06153dc 100644
> --- a/drivers/md/dm-ps-historical-service-time.c
> +++ b/drivers/md/dm-ps-historical-service-time.c
> @@ -27,7 +27,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  
>  
>  #define DM_MSG_PREFIX"multipath historical-service-time"
> @@ -433,7 +432,7 @@ static struct dm_path *hst_select_path(struct 
> path_selector *ps,
>  {
>   struct selector *s = ps->context;
>   struct path_info *pi = NULL, *best = NULL;
> - u64 time_now = sched_clock();
> + u64 time_now = ktime_get_ns();
>   struct dm_path *ret = NULL;
>   unsigned long flags;
>  
> @@ -474,7 +473,7 @@ static int hst_start_io(struct path_selector *ps, struct 
> dm_path *path,
>  
>  static u64 path_service_time(struct path_info *pi, u64 start_time)
>  {
> - u64 sched_now = ktime_get_ns();
> + u64 now = ktime_get_ns();
>  
>   /* if a previous disk request has finished after this IO was
>* sent to the hardware, pretend the submission happened
> @@ -483,11 +482,11 @@ static u64 path_service_time(struct path_info *pi, u64 
> start_time)
>   if (time_after64(pi->last_finish, start_time))
>   start_time = pi->last_finish;
>  
> - pi->last_finish = sched_now;
> - if (time_before64(sched_now, start_time))
> + pi->last_finish = now;
> + if (time_before64(now, start_time))
>       return 0;
>  
> - return sched_now - start_time;
> + return now - start_time;
>  }
>  
>  static int hst_end_io(struct path_selector *ps, struct dm_path *path,

-- 
Gabriel Krisman Bertazi

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel



Re: [dm-devel] [PATCH v2] multipath-tools: add info about the historical-service-time path selector to man page

2020-10-06 Thread Gabriel Krisman Bertazi
Xose Vazquez Perez  writes:

> Just minimal info.
>
> Cc: Khazhismel Kumykov 
> Cc: Gabriel Krisman Bertazi 
> Cc: Martin Wilck 
> Cc: Benjamin Marzinski 
> Cc: Christophe Varoqui 
> Cc: DM-DEVEL ML 
> Signed-off-by: Xose Vazquez Perez 

Reviewed-by: Gabriel Krisman Bertazi 

-- 
Gabriel Krisman Bertazi

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



Re: [dm-devel] dm: use noio when sending kobject event

2020-07-08 Thread Gabriel Krisman Bertazi
Mike Snitzer  writes:

> On Wed, Jul 08 2020 at  2:26pm -0400,
> Gabriel Krisman Bertazi  wrote:
>
>> If I understand it correctly, considering the deadlock you shared, this
>> doesn't solve the entire issue. For instance, kobject_uevent_env on the
>> GFP_NOIO thread waits on uevent_sock_mutex, and another thread with
>> GFP_IO holding the mutex might have triggered the shrinker from inside
>> kobject_uevent_net_broadcast.  I believe 7e7cd796f277 ("scsi: iscsi: Fix
>> deadlock on recovery path during GFP_IO reclaim") solved the one you
>> shared and other similar cases for iSCSI in a different way.
>
> I staged a different fix, from Mikulas, for 5.9 that is meant to address
> the original report, please see:
> https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-5.9=e5bfe9baf23dca211f4b794b651e871032c427ec
>
> I'd appreciate it if you could try this commit to se if it fixes the
> original issue you reported.

I reverted 7e7cd796f277 and cherry-picked e5bfe9baf23dc on my tree.
After a few iterations, I could see the conditions that formerly
triggered the deadlock happening, but this patch successfully allowed
the reclaim to succeed and the iscsi recovery thread to run.

My reproducer is a bit artificial, as I wrote it only from only the
problem description provided by google.  They were hitting this in
production and might have a better final word on the fix, though I know
they don't have a simple way to reproduce it.

>> That said, I think this patch is an improvement as we shouldn't be using
>> GFP_IO in this path to begin with, so please add:
>> 
>> Reviewed-by: Gabriel Krisman Bertazi 
>
> FYI, whilee I do appreciate your Reviewed-by I already staged this for
> 5.8 so I'd rather not rebase to add your Reviewed-by, see:
> https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-5.8=6958c1c640af8c3f40fa8a2eee3b5b905d95b677

No worries.  Actually, thank you guys for helping with this issue.

-- 
Gabriel Krisman Bertazi

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



Re: [dm-devel] [PATCH] dm: use noio when sending kobject event

2020-07-08 Thread Gabriel Krisman Bertazi
Mikulas Patocka  writes:

> kobject_uevent may allocate memory and it may be called while there are dm
> devices suspended. The allocation may recurse into a suspended device,
> causing a deadlock. We must set the noio flag when sending a uevent.

If I understand it correctly, considering the deadlock you shared, this
doesn't solve the entire issue. For instance, kobject_uevent_env on the
GFP_NOIO thread waits on uevent_sock_mutex, and another thread with
GFP_IO holding the mutex might have triggered the shrinker from inside
kobject_uevent_net_broadcast.  I believe 7e7cd796f277 ("scsi: iscsi: Fix
deadlock on recovery path during GFP_IO reclaim") solved the one you
shared and other similar cases for iSCSI in a different way.

I know this is similar to the log I shared on an earlier patch. Are you
able to reproduce the deadlock with the above patch applied?

That said, I think this patch is an improvement as we shouldn't be using
GFP_IO in this path to begin with, so please add:

Reviewed-by: Gabriel Krisman Bertazi 


> This is the observed deadlock:
>
> iSCSI-write
>  holding: rx_queue_mutex
>  waiting: uevent_sock_mutex
>
>  kobject_uevent_env+0x1bd/0x419
>  kobject_uevent+0xb/0xd
>  device_add+0x48a/0x678
>  scsi_add_host_with_dma+0xc5/0x22d
>  iscsi_host_add+0x53/0x55
>  iscsi_sw_tcp_session_create+0xa6/0x129
>  iscsi_if_rx+0x100/0x1247
>  netlink_unicast+0x213/0x4f0
>  netlink_sendmsg+0x230/0x3c0
>
> iscsi_fail iscsi_conn_failure
>  waiting: rx_queue_mutex
>
>  schedule_preempt_disabled+0x325/0x734
>  __mutex_lock_slowpath+0x18b/0x230
>  mutex_lock+0x22/0x40
>  iscsi_conn_failure+0x42/0x149
>  worker_thread+0x24a/0xbc0
>
> EventManager_
>  holding: uevent_sock_mutex
>  waiting: dm_bufio_client->lock
>
>  dm_bufio_lock+0xe/0x10
>  shrink+0x34/0xf7
>  shrink_slab+0x177/0x5d0
>  do_try_to_free_pages+0x129/0x470
>  try_to_free_mem_cgroup_pages+0x14f/0x210
>  memcg_kmem_newpage_charge+0xa6d/0x13b0
>  __alloc_pages_nodemask+0x4a3/0x1a70
>  fallback_alloc+0x1b2/0x36c
>  __kmalloc_node_track_caller+0xb9/0x10d0
>  __alloc_skb+0x83/0x2f0
>  kobject_uevent_env+0x26b/0x419
>  dm_kobject_uevent+0x70/0x79
>  dev_suspend+0x1a9/0x1e7
>  ctl_ioctl+0x3e9/0x411
>  dm_ctl_ioctl+0x13/0x17
>  do_vfs_ioctl+0xb3/0x460
>  SyS_ioctl+0x5e/0x90
>
> MemcgReclaimerD
>  holding: dm_bufio_client->lock
>  waiting: stuck io to finish (needs iscsi_fail thread to progress)
>
>  schedule at bd603618
>  io_schedule at bd603ba4
>  do_io_schedule at bdaf0d94
>  __wait_on_bit at bd6008a6
>  out_of_line_wait_on_bit at bd600960
>  wait_on_bit.constprop.10 at bdaf0f17
>  __make_buffer_clean at bdaf18ba
>  __cleanup_old_buffer at bdaf192f
>  shrink at bdaf19fd
>  do_shrink_slab at bd6ec000
>  shrink_slab at bd6ec24a
>  do_try_to_free_pages at bd6eda09
>  try_to_free_mem_cgroup_pages at bd6ede7e
>  mem_cgroup_resize_limit at bd7024c0
>  mem_cgroup_write at bd703149
>  cgroup_file_write at ffffbd6d9c6e
>  sys_write at bd6662ea
>  system_call_fastpath at bdbc34a2
>
>
> Signed-off-by: Mikulas Patocka 
> Reported-by: Khazhismel Kumykov 
> Reported-by: Tahsin Erdogan 
> Reported-by: Gabriel Krisman Bertazi 
> Cc: sta...@vger.kernel.org
>
> ---
>  drivers/md/dm.c |   15 ---
>  1 file changed, 12 insertions(+), 3 deletions(-)
>
> Index: linux-2.6/drivers/md/dm.c
> ===
> --- linux-2.6.orig/drivers/md/dm.c2020-06-29 14:50:22.0 +0200
> +++ linux-2.6/drivers/md/dm.c 2020-07-08 18:17:55.0 +0200
> @@ -12,6 +12,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -2926,17 +2927,25 @@ EXPORT_SYMBOL_GPL(dm_internal_resume_fas
>  int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
>  unsigned cookie)
>  {
> + int r;
> + unsigned noio_flag;
>   char udev_cookie[DM_COOKIE_LENGTH];
>   char *envp[] = { udev_cookie, NULL };
>  
> + noio_flag = memalloc_noio_save();
> +
>   if (!cookie)
> - return kobject_uevent(_to_dev(md->disk)->kobj, action);
> + r = kobject_uevent(_to_dev(md->disk)->kobj, action);
>   else {
>   snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
>

Re: [dm-devel] [PATCH] multipath-tools: add info to man page for the historical-service-time path selector

2020-07-07 Thread Gabriel Krisman Bertazi
Xose Vazquez Perez  writes:

> Cc: Khazhismel Kumykov 
> Cc: Gabriel Krisman Bertazi 
> Cc: Martin Wilck 
> Cc: Benjamin Marzinski 
> Cc: Christophe Varoqui 
> Cc: DM-DEVEL ML 
> Signed-off-by: Xose Vazquez Perez 
> ---
>  multipath/multipath.conf.5 | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
> index 05a5e8ff..6e637769 100644
> --- a/multipath/multipath.conf.5
> +++ b/multipath/multipath.conf.5
> @@ -205,6 +205,10 @@ of outstanding I/O to the path.
>  (Since 2.6.31 kernel) Choose the path for the next bunch of I/O based on the 
> amount
>  of outstanding I/O to the path and its relative throughput.
>  .TP
> +.I "historical-service-time 0"
> +(Since 5.8 kernel) Choose the path for the next bunch of I/O based on the 
> shortest
> +time by comparing estimated service time (based on historical service
> time).
Hi,

Thanks for doing this.

What about:

Choose the path for the next bunch of IOs through an estimation of
future service time based on the history of previous I/O submitted to
each path, in an attempt to maximize throughput.  A path's service-time
is loosely defined as the time between an IO start and its completion
and is updated through an exponential moving average (EMA) of the
historical service time of each path.

> +.TP
>  The default is: \fBservice-time 0\fR

It supports some parameters, shouldn't they be documented here?  Some
explanation for the parameters exists in hst_create() in the kernel

 /*
  * Arguments: [ []]
  *   : Base weight for ema [0, 1024) 10-bit fixed point. A
  *  value of 0 will completely ignore any history.
  *  If not given, default (HST_FIXED_95) is used.
  *   : Minimum threshold multiplier for paths to
  *  be considered different. That is, a path is
  *  considered different iff (p1 > N * p2) where p1
  *  is the path with higher service time. A threshold
  *          of 1 or 0 has no effect. Defaults to 0.
  */


>  .RE
>  .

-- 
Gabriel Krisman Bertazi

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



Re: [dm-devel] [PATCH v4 0/2] Historical Service Time Path Selector

2020-05-20 Thread Gabriel Krisman Bertazi
Xose Vazquez Perez  writes:

> On 5/11/20 6:39 PM, Gabriel Krisman Bertazi wrote:
>
>> This fourth version of HST applies the suggestion from Mikulas Patocka
>> to do the ktime_get_ns inside the mpath map_bio instead of generic
>> device-mapper code. This means that struct dm_mpath_io gained another
>> 64bit field.  For the request-based case, we continue to use the block
>> layer start time information.
>
> You should add some info to the multipath.conf.5 man page (
> https://git.opensvc.com/gitweb.cgi?p=multipath-tools/.git;a=blob;f=multipath/multipath.conf.5;h=05a5e8ffeb110d969f3b2381eb3b88d7f28380f6;hb=HEAD#l189
> ),
> or none one is going to use it.

Sure, will do.

-- 
Gabriel Krisman Bertazi

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



Re: [dm-devel] [PATCH v4 0/2] Historical Service Time Path Selector

2020-05-11 Thread Gabriel Krisman Bertazi
Mike Snitzer  writes:

> OK, that concall's issue had nothing to do with needing higher
> resolution time (was about IOPs realized with requested-based vs
> bio-based).
>
> Reality is, DM won't need anything higher resolution than jiffies until
> block core's interfaces require something other than jiffies
> (e.g. generic_end_io_acct).
>
> So feel free to proceed with the conditional time fetch solution you
> were going to run with (prior to my previous mail asking you to hold
> off).
>
> Sorry for the noise.  Thanks,
> Mike

No problem, thanks for the information.  I get started on it.


-- 
Gabriel Krisman Bertazi

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



Re: [dm-devel] [PATCH v4 0/2] Historical Service Time Path Selector

2020-05-11 Thread Gabriel Krisman Bertazi
Mike Snitzer  writes:

> On Mon, May 11 2020 at 12:39pm -0400,
> Gabriel Krisman Bertazi  wrote:
>
>> Hi,
>> 
>> This fourth version of HST applies the suggestion from Mikulas Patocka
>> to do the ktime_get_ns inside the mpath map_bio instead of generic
>> device-mapper code. This means that struct dm_mpath_io gained another
>> 64bit field.  For the request-based case, we continue to use the block
>> layer start time information.
>> 
>> With this modification, I was able obtain similar performance on  BIO
>> to request-based multipath with HST on the benchmarks shared in v1.
>> 
>> v3: https://www.redhat.com/archives/dm-devel/2020-April/msg00308.html
>> v2: https://www.redhat.com/archives/dm-devel/2020-April/msg00270.html
>> v1: https://www.redhat.com/archives/dm-devel/2020-April/msg00176.html
>
> I already staged your v3 in linux-next.  Please provide an incremental
> patch that layers on this git branch:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git/log/?h=dm-5.8
>
> I was hopeful for a flag to be set (e.g. in 'struct path_selector') to
> reflect whether the path selector expects highres start_time.  Makes
> little sense to incur that extra cost of providing the time if the path
> selector doesn't even use it.
>
> Alternatively, could split out the setting of the time needed by .end_io
> to a new path_selector_type method (e.g. .set_start_time).  And then
> only use ktime_get_ns() for bio-based if .set_start_time is defined.
> Would get a little fiddly needing to make sure a stale start_time isn't
> used... also, makes more sense to conditionally call this
> .set_start_time just after .start_io is.

Oh, my apologies, I hadn't noticed it was merged.  I will make the time fetch
conditional and submit a new patch based on that branch.

Thanks,


-- 
Gabriel Krisman Bertazi

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH v4 2/2] md: mpath: Add Historical Service Time Path Selector

2020-05-11 Thread Gabriel Krisman Bertazi
From: Khazhismel Kumykov 

This new selector keeps an exponential moving average of the service
time for each path (losely defined as delta between start_io and
end_io), and uses this along with the number of inflight requests to
estimate future service time for a path.  Since we don't have a prober
to account for temporally slow paths, re-try "slow" paths every once in
a while (num_paths * historical_service_time). To account for fast paths
transitioning to slow, if a path has not completed any request within
(num_paths * historical_service_time), limit the number of outstanding
requests.  To account for low volume situations where number of
inflights would be zero, the last finish time of each path is factored
in.

Signed-off-by: Khazhismel Kumykov 
Co-developed-by: Gabriel Krisman Bertazi 
Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/Kconfig  |  11 +
 drivers/md/Makefile |   1 +
 drivers/md/dm-historical-service-time.c | 561 
 3 files changed, 573 insertions(+)
 create mode 100644 drivers/md/dm-historical-service-time.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 37f05707c59d..5669eeb5cf6f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -452,6 +452,17 @@ config DM_MULTIPATH_ST
 
  If unsure, say N.
 
+config DM_MULTIPATH_HST
+   tristate "I/O Path Selector based on historical service time"
+   depends on DM_MULTIPATH
+   help
+ This path selector is a dynamic load balancer which selects
+ the path expected to complete the incoming I/O in the shortest
+ time by comparing estimated service time (based on historical
+ service time).
+
+ If unsure, say N.
+
 config DM_DELAY
tristate "I/O delaying target"
depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 9a2d673f94bc..31840f95cd40 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_DM_FLAKEY)   += dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)  += dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)  += dm-service-time.o
+obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o
 obj-$(CONFIG_DM_SWITCH)+= dm-switch.o
 obj-$(CONFIG_DM_SNAPSHOT)  += dm-snapshot.o
 obj-$(CONFIG_DM_PERSISTENT_DATA)   += persistent-data/
diff --git a/drivers/md/dm-historical-service-time.c 
b/drivers/md/dm-historical-service-time.c
new file mode 100644
index ..186f91e2752c
--- /dev/null
+++ b/drivers/md/dm-historical-service-time.c
@@ -0,0 +1,561 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Historical Service Time
+ *
+ *  Keeps a time-weighted exponential moving average of the historical
+ *  service time. Estimates future service time based on the historical
+ *  service time and the number of outstanding requests.
+ *
+ *  Marks paths stale if they have not finished within hst *
+ *  num_paths. If a path is stale and unused, we will send a single
+ *  request to probe in case the path has improved. This situation
+ *  generally arises if the path is so much worse than others that it
+ *  will never have the best estimated service time, or if the entire
+ *  multipath device is unused. If a path is stale and in use, limit the
+ *  number of requests it can receive with the assumption that the path
+ *  has become degraded.
+ *
+ *  To avoid repeatedly calculating exponents for time weighting, times
+ *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
+ *  ns, and the weighting is pre-calculated.
+ *
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include 
+#include 
+#include 
+
+
+#define DM_MSG_PREFIX  "multipath historical-service-time"
+#define HST_MIN_IO 1
+#define HST_VERSION "0.1.1"
+
+#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
+#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
+#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
+#define HST_FIXED_95 972
+#define HST_MAX_INFLIGHT HST_FIXED_1
+#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
+#define HST_WEIGHT_COUNT 64ULL
+
+struct selector {
+   struct list_head valid_paths;
+   struct list_head failed_paths;
+   int valid_count;
+   spinlock_t lock;
+
+   unsigned int weights[HST_WEIGHT_COUNT];
+   unsigned int threshold_multiplier;
+};
+
+struct path_info {
+   struct list_head list;
+   struct dm_path *path;
+   unsigned int repeat_count;
+
+   spinlock_t lock;
+
+   u64 historical_service_time; /* Fixed point */
+
+   u64 stale_after;
+   u64 last_finish;
+
+   u64 outstanding;
+};
+
+/**
+ * fixed_power - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By expl

[dm-devel] [PATCH v4 0/2] Historical Service Time Path Selector

2020-05-11 Thread Gabriel Krisman Bertazi
Hi,

This fourth version of HST applies the suggestion from Mikulas Patocka
to do the ktime_get_ns inside the mpath map_bio instead of generic
device-mapper code. This means that struct dm_mpath_io gained another
64bit field.  For the request-based case, we continue to use the block
layer start time information.

With this modification, I was able obtain similar performance on  BIO
to request-based multipath with HST on the benchmarks shared in v1.

v3: https://www.redhat.com/archives/dm-devel/2020-April/msg00308.html
v2: https://www.redhat.com/archives/dm-devel/2020-April/msg00270.html
v1: https://www.redhat.com/archives/dm-devel/2020-April/msg00176.html

Gabriel Krisman Bertazi (1):
  md: mpath: Pass IO start time to path selector

Khazhismel Kumykov (1):
  md: mpath: Add Historical Service Time Path Selector

 drivers/md/Kconfig  |  11 +
 drivers/md/Makefile |   1 +
 drivers/md/dm-historical-service-time.c | 561 
 drivers/md/dm-mpath.c   |  12 +-
 drivers/md/dm-path-selector.h   |   2 +-
 drivers/md/dm-queue-length.c|   2 +-
 drivers/md/dm-service-time.c|   2 +-
 7 files changed, 585 insertions(+), 6 deletions(-)
 create mode 100644 drivers/md/dm-historical-service-time.c

-- 
2.26.2


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH v4 1/2] md: mpath: Pass IO start time to path selector

2020-05-11 Thread Gabriel Krisman Bertazi
HST is going to need this information to perform path prediction. For
request-based mpath, we use the struct request io_start_time, while for
bio based, use the DM layer start_time.

Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/dm-mpath.c | 12 +---
 drivers/md/dm-path-selector.h |  2 +-
 drivers/md/dm-queue-length.c  |  2 +-
 drivers/md/dm-service-time.c  |  2 +-
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f5e7f8e88767..deb6ae5ad35b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -105,6 +105,7 @@ struct multipath {
 struct dm_mpath_io {
struct pgpath *pgpath;
size_t nr_bytes;
+   u64 start_time_ns;
 };
 
 typedef int (*action_fn) (struct pgpath *pgpath);
@@ -519,6 +520,7 @@ static int multipath_clone_and_map(struct dm_target *ti, 
struct request *rq,
 
mpio->pgpath = pgpath;
mpio->nr_bytes = nr_bytes;
+   mpio->start_time_ns = rq->io_start_time_ns;
 
bdev = pgpath->path.dev->bdev;
q = bdev_get_queue(bdev);
@@ -567,7 +569,8 @@ static void multipath_release_clone(struct request *clone,
if (pgpath && pgpath->pg->ps.type->end_io)
pgpath->pg->ps.type->end_io(>pg->ps,
>path,
-   mpio->nr_bytes);
+   mpio->nr_bytes,
+   mpio->start_time_ns);
}
 
blk_put_request(clone);
@@ -629,6 +632,7 @@ static int __multipath_map_bio(struct multipath *m, struct 
bio *bio,
}
 
mpio->pgpath = pgpath;
+   mpio->start_time_ns = ktime_get_ns();
 
bio->bi_status = 0;
bio_set_dev(bio, pgpath->path.dev->bdev);
@@ -1620,7 +1624,8 @@ static int multipath_end_io(struct dm_target *ti, struct 
request *clone,
struct path_selector *ps = >pg->ps;
 
if (ps->type->end_io)
-   ps->type->end_io(ps, >path, mpio->nr_bytes);
+   ps->type->end_io(ps, >path, mpio->nr_bytes,
+mpio->start_time_ns);
}
 
return r;
@@ -1664,7 +1669,8 @@ static int multipath_end_io_bio(struct dm_target *ti, 
struct bio *clone,
struct path_selector *ps = >pg->ps;
 
if (ps->type->end_io)
-   ps->type->end_io(ps, >path, mpio->nr_bytes);
+   ps->type->end_io(ps, >path, mpio->nr_bytes,
+mpio->start_time_ns);
}
 
return r;
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index b6eb5365b1a4..c47bc0e20275 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -74,7 +74,7 @@ struct path_selector_type {
int (*start_io) (struct path_selector *ps, struct dm_path *path,
 size_t nr_bytes);
int (*end_io) (struct path_selector *ps, struct dm_path *path,
-  size_t nr_bytes);
+  size_t nr_bytes, u64 start_time);
 };
 
 /* Register a path selector */
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 969c4f1a3633..5fd018d18418 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -227,7 +227,7 @@ static int ql_start_io(struct path_selector *ps, struct 
dm_path *path,
 }
 
 static int ql_end_io(struct path_selector *ps, struct dm_path *path,
-size_t nr_bytes)
+size_t nr_bytes, u64 start_time)
 {
struct path_info *pi = path->pscontext;
 
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index f006a9005593..9cfda665e9eb 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -309,7 +309,7 @@ static int st_start_io(struct path_selector *ps, struct 
dm_path *path,
 }
 
 static int st_end_io(struct path_selector *ps, struct dm_path *path,
-size_t nr_bytes)
+size_t nr_bytes, u64 start_time)
 {
struct path_info *pi = path->pscontext;
 
-- 
2.26.2


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH v3 1/2] md: mpath: Pass IO start time to path selector

2020-05-03 Thread Gabriel Krisman Bertazi
HST is going to need this information to perform path prediction. For
request-based mpath, we use the struct request io_start_time, while for
bio based, use the DM layer start_time.

Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/dm-mpath.c |  9 ++---
 drivers/md/dm-path-selector.h |  2 +-
 drivers/md/dm-queue-length.c  |  2 +-
 drivers/md/dm-service-time.c  |  2 +-
 drivers/md/dm.c   | 10 ++
 include/linux/device-mapper.h |  2 ++
 6 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f5e7f8e88767..08b57964719b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -567,7 +567,8 @@ static void multipath_release_clone(struct request *clone,
if (pgpath && pgpath->pg->ps.type->end_io)
pgpath->pg->ps.type->end_io(>pg->ps,
>path,
-   mpio->nr_bytes);
+   mpio->nr_bytes,
+   clone->io_start_time_ns);
}
 
blk_put_request(clone);
@@ -1620,7 +1621,8 @@ static int multipath_end_io(struct dm_target *ti, struct 
request *clone,
struct path_selector *ps = >pg->ps;
 
if (ps->type->end_io)
-   ps->type->end_io(ps, >path, mpio->nr_bytes);
+   ps->type->end_io(ps, >path, mpio->nr_bytes,
+clone->io_start_time_ns);
}
 
return r;
@@ -1664,7 +1666,8 @@ static int multipath_end_io_bio(struct dm_target *ti, 
struct bio *clone,
struct path_selector *ps = >pg->ps;
 
if (ps->type->end_io)
-   ps->type->end_io(ps, >path, mpio->nr_bytes);
+   ps->type->end_io(ps, >path, mpio->nr_bytes,
+dm_start_time_ns_from_clone(clone));
}
 
return r;
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index b6eb5365b1a4..c47bc0e20275 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -74,7 +74,7 @@ struct path_selector_type {
int (*start_io) (struct path_selector *ps, struct dm_path *path,
 size_t nr_bytes);
int (*end_io) (struct path_selector *ps, struct dm_path *path,
-  size_t nr_bytes);
+  size_t nr_bytes, u64 start_time);
 };
 
 /* Register a path selector */
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 969c4f1a3633..5fd018d18418 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -227,7 +227,7 @@ static int ql_start_io(struct path_selector *ps, struct 
dm_path *path,
 }
 
 static int ql_end_io(struct path_selector *ps, struct dm_path *path,
-size_t nr_bytes)
+size_t nr_bytes, u64 start_time)
 {
struct path_info *pi = path->pscontext;
 
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index f006a9005593..9cfda665e9eb 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -309,7 +309,7 @@ static int st_start_io(struct path_selector *ps, struct 
dm_path *path,
 }
 
 static int st_end_io(struct path_selector *ps, struct dm_path *path,
-size_t nr_bytes)
+size_t nr_bytes, u64 start_time)
 {
struct path_info *pi = path->pscontext;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index df13fdebe21f..c5deee1bea67 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -674,6 +674,16 @@ static bool md_in_flight(struct mapped_device *md)
return md_in_flight_bios(md);
 }
 
+u64 dm_start_time_ns_from_clone(struct bio *bio)
+{
+   struct dm_target_io *tio = container_of(bio, struct dm_target_io, 
clone);
+   struct dm_io *io = tio->io;
+
+   /* FIXME: convert io->start_time from jiffies to nanoseconds */
+   return jiffies_to_nsecs(io->start_time);
+}
+EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
+
 static void start_io_acct(struct dm_io *io)
 {
struct mapped_device *md = io->md;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 475668c69dbc..e2d506dd805e 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -329,6 +329,8 @@ void *dm_per_bio_data(struct bio *bio, size_t data_size);
 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size);
 unsigned dm_bio_get_target_bio_nr(const struct bio *bio);
 
+u64 dm_start_time_ns_from_clone(struct bio *bio);
+
 int dm_register_target(struct target_type *t);
 void dm_unregister_target(struct target_type *t);
 
-- 
2.26.2


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



Re: [dm-devel] [PATCH v2 0/3] Historical Service Time Path Selector

2020-05-03 Thread Gabriel Krisman Bertazi
Gabriel Krisman Bertazi  writes:

> Hi Mike,
>
> Please find an updated version of HST integrating the change you
> requested to also support BIO based multipath.  I hope you don't mind me
> folding the function you implemented into patch 2.  If you prefer, I can
> integrate a patch you provide into the series.

Hi Mike,

Sorry for the encapsulation patches, I'll pass the parameter directly on
v3.

> One interesting data point is that the selection performance on
> BIO-based is worse when compared to request-based.  I tested a bit and I
> believe the reason is that the paths are more sticky because the bucket
> is too large, and it takes longer for HST to detect differences.  By
> changing the bucket_size indirectly through the bucket_shift, the
> bio-based multipath performance improved, but I'm not sure this is a
> knob we want to expose to users.  For now, please consider the patches
> below, for review.
>

The reason for the BIO-based being worse than request-based was that we
are comparing the jiffies_to_nsecs(jiffies) with ktime_get_ns(), which is not
accurate, given the different precision of the clocks.  Problem is,
request-based mpath uses ktime_get_ns in the block layer, while
dm_io->start_time uses jiffies, and inside the path selector, we don't
have a way to distinguish those paths.  I see a few ways forward, but in
the best interest of getting it right early, I'd like to hear what you
think it is best:

1) My best suggestion would be converting dm_io->start_time to use
ktime_get_ns.  This has the advantage of simplifying dm_stats_account_io
and wouldn't affect the ABI of the accounting histogram.  The only
downside, from what I can see is that ktime_get is slightly more
expensive than reading jiffies, which might be a problem according to
Documentation/admin-guide/device-mapper/statistics.rst.  Is that really
a problem?  I see your FIXME note on the function you implemented, but
are you suggesting exactly this or simply storing
jiffies_to_nsecs(jiffies) in dm_io->start_time?

2) Alternatively, pass end_io_time to the path selector as well, and do
the time collection outside the path selector.  This makes the interface
look ugly, adds the cost anyway of doing ktime operations.

3) Alternatively, collect ktime start/end costs inside HST.  This means
we need to match the IO on start_io and end_io from inside HST, which
doesn't seem like a good design either.

4) ?

As you can see, I'm leaning towards option 1.  Would you be open to a
patch like the following?  Completely untested, still need some work,
just to show the idea.

Khazhy, any thoughts?

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 3f8577e2c13b..eb7c7fae4bc6 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -23,7 +23,7 @@ struct dm_rq_target_io {
blk_status_t error;
union map_info info;
struct dm_stats_aux stats_aux;
-   unsigned long duration_jiffies;
+   u64 duration_ns;
unsigned n_sectors;
unsigned completed;
 };
@@ -132,10 +132,10 @@ static void rq_end_stats(struct mapped_device *md, struct 
request *orig)
 {
if (unlikely(dm_stats_used(>stats))) {
struct dm_rq_target_io *tio = tio_from_request(orig);
-   tio->duration_jiffies = jiffies - tio->duration_jiffies;
+   tio->duration_ns = ktime_get_ns() - tio->duration_ns;
dm_stats_account_io(>stats, rq_data_dir(orig),
blk_rq_pos(orig), tio->n_sectors, true,
-   tio->duration_jiffies, >stats_aux);
+   tio->duration_ns, >stats_aux);
}
 }
 
@@ -451,7 +451,7 @@ static void dm_start_request(struct mapped_device *md, 
struct request *orig)
 
if (unlikely(dm_stats_used(>stats))) {
struct dm_rq_target_io *tio = tio_from_request(orig);
-   tio->duration_jiffies = jiffies;
+   tio->duration_ns = ktime_get_ns();
tio->n_sectors = blk_rq_sectors(orig);
dm_stats_account_io(>stats, rq_data_dir(orig),
blk_rq_pos(orig), tio->n_sectors, false, 0,
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 71417048256a..4353dd04ec42 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -514,7 +514,7 @@ static void dm_stat_round(struct dm_stat *s, struct 
dm_stat_shared *shared,
 static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
  int idx, sector_t len,
  struct dm_stats_aux *stats_aux, bool end,
- unsigned long duration_jiffies)
+ u64 duration_ns)
 {
struct dm_stat_shared *shared = >stat_shared[entry];
struct dm_stat_percpu *p;
@@ -553,11 +553,11 @@ static void dm_stat_for_entry(struct dm_s

[dm-devel] [PATCH v3 0/2] Historical Service Time Path Selector

2020-05-03 Thread Gabriel Krisman Bertazi
Hi,

This is the third version of HST considering the review by Mike
Snitzer.  This version drops the encapsulation patches and passes the
parameter directly to end_io.  It also uses jiffies_to_nsecs directly in
dm_start_time_ns_from_clone, instead of converting to msecs in an
intermediary step.

This didn't show any regressions compared to the benchmark I presented
on previous version:

v2: https://www.redhat.com/archives/dm-devel/2020-April/msg00270.html
v1: https://www.redhat.com/archives/dm-devel/2020-April/msg00176.html

This was tested primarily on a Google cloud SAN with real data and usage
patterns and with artificial benchmarks using fio.

Gabriel Krisman Bertazi (1):
  md: mpath: Pass IO start time to path selector

Khazhismel Kumykov (1):
  md: mpath: Add Historical Service Time Path Selector

 drivers/md/Kconfig  |  11 +
 drivers/md/Makefile |   1 +
 drivers/md/dm-historical-service-time.c | 561 
 drivers/md/dm-mpath.c   |   9 +-
 drivers/md/dm-path-selector.h   |   2 +-
 drivers/md/dm-queue-length.c|   2 +-
 drivers/md/dm-service-time.c|   2 +-
 drivers/md/dm.c |  10 +
 include/linux/device-mapper.h   |   2 +
 9 files changed, 594 insertions(+), 6 deletions(-)
 create mode 100644 drivers/md/dm-historical-service-time.c

-- 
2.26.2


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH v3 2/2] md: mpath: Add Historical Service Time Path Selector

2020-05-03 Thread Gabriel Krisman Bertazi
From: Khazhismel Kumykov 

This new selector keeps an exponential moving average of the service
time for each path (losely defined as delta between start_io and
end_io), and uses this along with the number of inflight requests to
estimate future service time for a path.  Since we don't have a prober
to account for temporally slow paths, re-try "slow" paths every once in
a while (num_paths * historical_service_time). To account for fast paths
transitioning to slow, if a path has not completed any request within
(num_paths * historical_service_time), limit the number of outstanding
requests.  To account for low volume situations where number of
inflights would be zero, the last finish time of each path is factored
in.

Signed-off-by: Khazhismel Kumykov 
Co-developed-by: Gabriel Krisman Bertazi 
Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/Kconfig  |  11 +
 drivers/md/Makefile |   1 +
 drivers/md/dm-historical-service-time.c | 561 
 3 files changed, 573 insertions(+)
 create mode 100644 drivers/md/dm-historical-service-time.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 37f05707c59d..5669eeb5cf6f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -452,6 +452,17 @@ config DM_MULTIPATH_ST
 
  If unsure, say N.
 
+config DM_MULTIPATH_HST
+   tristate "I/O Path Selector based on historical service time"
+   depends on DM_MULTIPATH
+   help
+ This path selector is a dynamic load balancer which selects
+ the path expected to complete the incoming I/O in the shortest
+ time by comparing estimated service time (based on historical
+ service time).
+
+ If unsure, say N.
+
 config DM_DELAY
tristate "I/O delaying target"
depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 9a2d673f94bc..31840f95cd40 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_DM_FLAKEY)   += dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)  += dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)  += dm-service-time.o
+obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o
 obj-$(CONFIG_DM_SWITCH)+= dm-switch.o
 obj-$(CONFIG_DM_SNAPSHOT)  += dm-snapshot.o
 obj-$(CONFIG_DM_PERSISTENT_DATA)   += persistent-data/
diff --git a/drivers/md/dm-historical-service-time.c 
b/drivers/md/dm-historical-service-time.c
new file mode 100644
index ..186f91e2752c
--- /dev/null
+++ b/drivers/md/dm-historical-service-time.c
@@ -0,0 +1,561 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Historical Service Time
+ *
+ *  Keeps a time-weighted exponential moving average of the historical
+ *  service time. Estimates future service time based on the historical
+ *  service time and the number of outstanding requests.
+ *
+ *  Marks paths stale if they have not finished within hst *
+ *  num_paths. If a path is stale and unused, we will send a single
+ *  request to probe in case the path has improved. This situation
+ *  generally arises if the path is so much worse than others that it
+ *  will never have the best estimated service time, or if the entire
+ *  multipath device is unused. If a path is stale and in use, limit the
+ *  number of requests it can receive with the assumption that the path
+ *  has become degraded.
+ *
+ *  To avoid repeatedly calculating exponents for time weighting, times
+ *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
+ *  ns, and the weighting is pre-calculated.
+ *
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include 
+#include 
+#include 
+
+
+#define DM_MSG_PREFIX  "multipath historical-service-time"
+#define HST_MIN_IO 1
+#define HST_VERSION "0.1.1"
+
+#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
+#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
+#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
+#define HST_FIXED_95 972
+#define HST_MAX_INFLIGHT HST_FIXED_1
+#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
+#define HST_WEIGHT_COUNT 64ULL
+
+struct selector {
+   struct list_head valid_paths;
+   struct list_head failed_paths;
+   int valid_count;
+   spinlock_t lock;
+
+   unsigned int weights[HST_WEIGHT_COUNT];
+   unsigned int threshold_multiplier;
+};
+
+struct path_info {
+   struct list_head list;
+   struct dm_path *path;
+   unsigned int repeat_count;
+
+   spinlock_t lock;
+
+   u64 historical_service_time; /* Fixed point */
+
+   u64 stale_after;
+   u64 last_finish;
+
+   u64 outstanding;
+};
+
+/**
+ * fixed_power - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By expl

[dm-devel] [PATCH v2 2/3] md: multipath: Pass io_start_time to the path selector

2020-04-28 Thread Gabriel Krisman Bertazi
HST need to know the IO start time in order to predict path
performance. For request-based multipath use the block layer
io_start_time, while for BIO use the dm_io start_time.

The dm_start_time_ns_from_clone function was suggested and implemented
by Mike Snitzer .

Cc: Mike Snitzer 
Cc: Khazhismel Kumykov 
Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/dm-mpath.c | 25 +++--
 drivers/md/dm-path-selector.h |  1 +
 drivers/md/dm.c   | 10 ++
 include/linux/device-mapper.h |  2 ++
 4 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 1ef4fc2e745b..7af3249948be 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -500,8 +500,9 @@ static int multipath_clone_and_map(struct dm_target *ti, 
struct request *rq,
struct dm_mpath_io *mpio = get_mpio(map_context);
struct request_queue *q;
struct request *clone;
-   struct path_selector_io_data io_data = {
+   struct path_selector_io_data ps_io_data = {
.nr_bytes = nr_bytes,
+   .io_start_time = rq->io_start_time_ns
};
 
/* Do we need to select a new pgpath? */
@@ -552,7 +553,7 @@ static int multipath_clone_and_map(struct dm_target *ti, 
struct request *rq,
if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(>pg->ps,
  >path,
- _data);
+ _io_data);
return DM_MAPIO_REMAPPED;
 }
 
@@ -568,6 +569,7 @@ static void multipath_release_clone(struct request *clone,
struct pgpath *pgpath = mpio->pgpath;
struct path_selector_io_data ps_io_data = {
.nr_bytes = mpio->nr_bytes,
+   .io_start_time = clone->io_start_time_ns,
};
 
if (pgpath && pgpath->pg->ps.type->end_io)
@@ -623,8 +625,9 @@ static int __multipath_map_bio(struct multipath *m, struct 
bio *bio,
   struct dm_mpath_io *mpio)
 {
struct pgpath *pgpath = __map_bio(m, bio);
-   struct path_selector_io_data io_data = {
+   struct path_selector_io_data ps_io_data = {
.nr_bytes = mpio->nr_bytes,
+   .io_start_time = dm_start_time_ns_from_clone(bio)
};
 
if (IS_ERR(pgpath))
@@ -646,7 +649,7 @@ static int __multipath_map_bio(struct multipath *m, struct 
bio *bio,
if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(>pg->ps,
  >path,
- _data);
+ _io_data);
return DM_MAPIO_REMAPPED;
 }
 
@@ -1627,12 +1630,13 @@ static int multipath_end_io(struct dm_target *ti, 
struct request *clone,
 
if (pgpath) {
struct path_selector *ps = >pg->ps;
-   struct path_selector_io_data io_data = {
-   .nr_bytes = mpio->nr_bytes
+   struct path_selector_io_data ps_io_data = {
+   .nr_bytes = mpio->nr_bytes,
+   .io_start_time = clone->io_start_time_ns
};
 
if (ps->type->end_io)
-   ps->type->end_io(ps, >path, _data);
+   ps->type->end_io(ps, >path, _io_data);
}
 
return r;
@@ -1674,12 +1678,13 @@ static int multipath_end_io_bio(struct dm_target *ti, 
struct bio *clone,
 done:
if (pgpath) {
struct path_selector *ps = >pg->ps;
-   struct path_selector_io_data io_data = {
-   .nr_bytes = mpio->nr_bytes
+   struct path_selector_io_data ps_io_data = {
+   .nr_bytes = mpio->nr_bytes,
+   .io_start_time = dm_start_time_ns_from_clone(clone)
};
 
if (ps->type->end_io)
-   ps->type->end_io(ps, >path, _data);
+   ps->type->end_io(ps, >path, _io_data);
}
 
return r;
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index fb582a943234..4c5fa6a2efe3 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -28,6 +28,7 @@ struct path_selector {
 
 struct path_selector_io_data {
size_t nr_bytes;
+   u64 io_start_time;
 };
 
 /* Information about a path selector type */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index df13fdebe21f..2e0637a6de9d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -674,6 +674,16 @@ static bool md_in_flight(struct mapped_device *md)
return md_in_flight_bios(md);
 }
 

[dm-devel] [PATCH] dm: multipath: Use updated MPATHF_QUEUE_IO on mapping for BIO-based mpath

2020-04-28 Thread Gabriel Krisman Bertazi
When adding devices that don't have a scsi_dh on a BIO based multipath,
I was able to consistently hit the warning below and lock-up the system.

The problem is that __map_bio reads the flag before it potentially being
modified by choose_pgpath, and ends up using the older value.

The WARN_ON below is not trivially linked to the issue. It goes like
this: The activate_path delayed_work is not initialized for non-scsi_dh
devices, but we always set MPATHF_QUEUE_IO, asking for initialization.
That is fine, since MPATHF_QUEUE_IO would be cleared in choose_pgpath.
Nevertheless, only for BIO-based mpath, we cache the flag before calling
choose_pgpath, and use the older version when deciding if we should
initialize the path.  Therefore, we end up trying to initialize the
paths, and calling the non-initialized activate_path work.

[   82.437100] [ cut here ]
[   82.437659] WARNING: CPU: 3 PID: 602 at kernel/workqueue.c:1624
  __queue_delayed_work+0x71/0x90
[   82.438436] Modules linked in:
[   82.438911] CPU: 3 PID: 602 Comm: systemd-udevd Not tainted 5.6.0-rc6+ #339
[   82.439680] RIP: 0010:__queue_delayed_work+0x71/0x90
[   82.440287] Code: c1 48 89 4a 50 81 ff 00 02 00 00 75 2a 4c 89 cf e9
94 d6 07 00 e9 7f e9 ff ff 0f 0b eb c7 0f 0b 48 81 7a 58 40 74 a8 94 74
a7 <0f> 0b 48 83 7a 48 00 74 a5 0f 0b eb a1 89 fe 4c 89 cf e9 c8 c4 07
[   82.441719] RSP: 0018:b738803977c0 EFLAGS: 00010007
[   82.442121] RAX: a086389f9740 RBX: 0002 RCX: 
[   82.442718] RDX: a086350dd930 RSI: a0863d76f600 RDI: 0200
[   82.443484] RBP: 0200 R08:  R09: a086350dd970
[   82.444128] R10:  R11:  R12: a086350dd930
[   82.444773] R13: a0863d76f600 R14:  R15: a08636738008
[   82.445427] FS:  7f6abfe9dd40() GS:a0863dd8() knlGS:0
[   82.446040] CS:  0010 DS:  ES:  CR0: 80050033
[   82.446478] CR2: 557d288db4e8 CR3: 78b36000 CR4: 06e0
[   82.447104] DR0:  DR1:  DR2: 
[   82.447561] DR3:  DR6: fffe0ff0 DR7: 0400
[   82.448012] Call Trace:
[   82.448164]  queue_delayed_work_on+0x6d/0x80
[   82.448472]  __pg_init_all_paths+0x7b/0xf0
[   82.448714]  pg_init_all_paths+0x26/0x40
[   82.448980]  __multipath_map_bio.isra.0+0x84/0x210
[   82.449267]  __map_bio+0x3c/0x1f0
[   82.449468]  __split_and_process_non_flush+0x14a/0x1b0
[   82.449775]  __split_and_process_bio+0xde/0x340
[   82.450045]  ? dm_get_live_table+0x5/0xb0
[   82.450278]  dm_process_bio+0x98/0x290
[   82.450518]  dm_make_request+0x54/0x120
[   82.450778]  generic_make_request+0xd2/0x3e0
[   82.451038]  ? submit_bio+0x3c/0x150
[   82.451278]  submit_bio+0x3c/0x150
[   82.451492]  mpage_readpages+0x129/0x160
[   82.451756]  ? bdev_evict_inode+0x1d0/0x1d0
[   82.452033]  read_pages+0x72/0x170
[   82.452260]  __do_page_cache_readahead+0x1ba/0x1d0
[   82.452624]  force_page_cache_readahead+0x96/0x110
[   82.452903]  generic_file_read_iter+0x84f/0xae0
[   82.453192]  ? __seccomp_filter+0x7c/0x670
[   82.453547]  new_sync_read+0x10e/0x190
[   82.453883]  vfs_read+0x9d/0x150
[   82.454172]  ksys_read+0x65/0xe0
[   82.454466]  do_syscall_64+0x4e/0x210
[   82.454828]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[...]
[   82.462501] ---[ end trace bb39975e9cf45daa ]---

Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/dm-mpath.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f9b6a7a68bec..f5e7f8e88767 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -589,6 +589,11 @@ static struct pgpath *__map_bio(struct multipath *m, 
struct bio *bio)
if (!pgpath || !queue_io)
pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
 
+   /*
+* MPATHF_QUEUE_IO might have been cleared by choose_pgpath.
+*/
+   queue_io = test_bit(MPATHF_QUEUE_IO, >flags);
+
if ((pgpath && queue_io) ||
(!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, >flags))) {
/* Queue for the daemon to resubmit */
-- 
2.26.2


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH v2 3/3] md: Add Historical Service Time Path Selector

2020-04-28 Thread Gabriel Krisman Bertazi
From: Khazhismel Kumykov 

This new selector keeps an exponential moving average of the service
time for each path (losely defined as delta between start_io and
end_io), and uses this along with the number of inflight requests to
estimate future service time for a path.  Since we don't have a prober
to account for temporally slow paths, re-try "slow" paths every once in
a while (num_paths * historical_service_time). To account for fast paths
transitioning to slow, if a path has not completed any request within
(num_paths * historical_service_time), limit the number of outstanding
requests.  To account for low volume situations where number of
inflights would be zero, the last finish time of each path is factored
in.

Signed-off-by: Khazhismel Kumykov 
Co-developed-by: Gabriel Krisman Bertazi 
Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/Kconfig  |  11 +
 drivers/md/Makefile |   1 +
 drivers/md/dm-historical-service-time.c | 561 
 3 files changed, 573 insertions(+)
 create mode 100644 drivers/md/dm-historical-service-time.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 37f05707c59d..5669eeb5cf6f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -452,6 +452,17 @@ config DM_MULTIPATH_ST
 
  If unsure, say N.
 
+config DM_MULTIPATH_HST
+   tristate "I/O Path Selector based on historical service time"
+   depends on DM_MULTIPATH
+   help
+ This path selector is a dynamic load balancer which selects
+ the path expected to complete the incoming I/O in the shortest
+ time by comparing estimated service time (based on historical
+ service time).
+
+ If unsure, say N.
+
 config DM_DELAY
tristate "I/O delaying target"
depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 9a2d673f94bc..31840f95cd40 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_DM_FLAKEY)   += dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)  += dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)  += dm-service-time.o
+obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o
 obj-$(CONFIG_DM_SWITCH)+= dm-switch.o
 obj-$(CONFIG_DM_SNAPSHOT)  += dm-snapshot.o
 obj-$(CONFIG_DM_PERSISTENT_DATA)   += persistent-data/
diff --git a/drivers/md/dm-historical-service-time.c 
b/drivers/md/dm-historical-service-time.c
new file mode 100644
index ..866195048ef9
--- /dev/null
+++ b/drivers/md/dm-historical-service-time.c
@@ -0,0 +1,561 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Historical Service Time
+ *
+ *  Keeps a time-weighted exponential moving average of the historical
+ *  service time. Estimates future service time based on the historical
+ *  service time and the number of outstanding requests.
+ *
+ *  Marks paths stale if they have not finished within hst *
+ *  num_paths. If a path is stale and unused, we will send a single
+ *  request to probe in case the path has improved. This situation
+ *  generally arises if the path is so much worse than others that it
+ *  will never have the best estimated service time, or if the entire
+ *  multipath device is unused. If a path is stale and in use, limit the
+ *  number of requests it can receive with the assumption that the path
+ *  has become degraded.
+ *
+ *  To avoid repeatedly calculating exponents for time weighting, times
+ *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
+ *  ns, and the weighting is pre-calculated.
+ *
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include 
+#include 
+#include 
+
+
+#define DM_MSG_PREFIX  "multipath historical-service-time"
+#define HST_MIN_IO 1
+#define HST_VERSION "0.1.1"
+
+#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
+#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
+#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
+#define HST_FIXED_95 972
+#define HST_MAX_INFLIGHT HST_FIXED_1
+#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
+#define HST_WEIGHT_COUNT 64ULL
+
+struct selector {
+   struct list_head valid_paths;
+   struct list_head failed_paths;
+   int valid_count;
+   spinlock_t lock;
+
+   unsigned int weights[HST_WEIGHT_COUNT];
+   unsigned int threshold_multiplier;
+};
+
+struct path_info {
+   struct list_head list;
+   struct dm_path *path;
+   unsigned int repeat_count;
+
+   spinlock_t lock;
+
+   u64 historical_service_time; /* Fixed point */
+
+   u64 stale_after;
+   u64 last_finish;
+
+   u64 outstanding;
+};
+
+/**
+ * fixed_power - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By expl

[dm-devel] [PATCH v2 0/3] Historical Service Time Path Selector

2020-04-28 Thread Gabriel Krisman Bertazi
Hi Mike,

Please find an updated version of HST integrating the change you
requested to also support BIO based multipath.  I hope you don't mind me
folding the function you implemented into patch 2.  If you prefer, I can
integrate a patch you provide into the series.

One interesting data point is that the selection performance on
BIO-based is worse when compared to request-based.  I tested a bit and I
believe the reason is that the paths are more sticky because the bucket
is too large, and it takes longer for HST to detect differences.  By
changing the bucket_size indirectly through the bucket_shift, the
bio-based multipath performance improved, but I'm not sure this is a
knob we want to expose to users.  For now, please consider the patches
below, for review.

By the way, the exercise to support bio-based mpath uncovered the mpath
initialization bug that I fixed in the previous patch I sent today, so
it was worth it.

Gabriel Krisman Bertazi (2):
  md: multipath: Encapsulate parameters passed to selectors
  md: multipath: Pass io_start_time to the path selector

Khazhismel Kumykov (1):
  md: Add Historical Service Time Path Selector

 drivers/md/Kconfig  |  11 +
 drivers/md/Makefile |   1 +
 drivers/md/dm-historical-service-time.c | 561 
 drivers/md/dm-mpath.c   |  30 +-
 drivers/md/dm-path-selector.h   |   9 +-
 drivers/md/dm-queue-length.c|   4 +-
 drivers/md/dm-service-time.c|   8 +-
 drivers/md/dm.c |  10 +
 include/linux/device-mapper.h   |   2 +
 9 files changed, 623 insertions(+), 13 deletions(-)
 create mode 100644 drivers/md/dm-historical-service-time.c

-- 
2.26.2


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH v2 1/3] md: multipath: Encapsulate parameters passed to selectors

2020-04-28 Thread Gabriel Krisman Bertazi
Different selector will use different parameters, which means .io_start
and .io_end will get their signatures modified to include more and more
parameters.  This encapsulates the data in a structure so we can
simplify the interface for future users.  For now it only passes
nr_bytes, but HST will require start_time already.

Cc: Khazhismel Kumykov 
Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/dm-mpath.c | 25 -
 drivers/md/dm-path-selector.h |  8 ++--
 drivers/md/dm-queue-length.c  |  4 ++--
 drivers/md/dm-service-time.c  |  8 
 4 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f5e7f8e88767..1ef4fc2e745b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -500,6 +500,9 @@ static int multipath_clone_and_map(struct dm_target *ti, 
struct request *rq,
struct dm_mpath_io *mpio = get_mpio(map_context);
struct request_queue *q;
struct request *clone;
+   struct path_selector_io_data io_data = {
+   .nr_bytes = nr_bytes,
+   };
 
/* Do we need to select a new pgpath? */
pgpath = READ_ONCE(m->current_pgpath);
@@ -549,7 +552,7 @@ static int multipath_clone_and_map(struct dm_target *ti, 
struct request *rq,
if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(>pg->ps,
  >path,
- nr_bytes);
+ _data);
return DM_MAPIO_REMAPPED;
 }
 
@@ -563,11 +566,14 @@ static void multipath_release_clone(struct request *clone,
 */
struct dm_mpath_io *mpio = get_mpio(map_context);
struct pgpath *pgpath = mpio->pgpath;
+   struct path_selector_io_data ps_io_data = {
+   .nr_bytes = mpio->nr_bytes,
+   };
 
if (pgpath && pgpath->pg->ps.type->end_io)
pgpath->pg->ps.type->end_io(>pg->ps,
>path,
-   mpio->nr_bytes);
+   _io_data);
}
 
blk_put_request(clone);
@@ -617,6 +623,9 @@ static int __multipath_map_bio(struct multipath *m, struct 
bio *bio,
   struct dm_mpath_io *mpio)
 {
struct pgpath *pgpath = __map_bio(m, bio);
+   struct path_selector_io_data io_data = {
+   .nr_bytes = mpio->nr_bytes,
+   };
 
if (IS_ERR(pgpath))
return DM_MAPIO_SUBMITTED;
@@ -637,7 +646,7 @@ static int __multipath_map_bio(struct multipath *m, struct 
bio *bio,
if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(>pg->ps,
  >path,
- mpio->nr_bytes);
+ _data);
return DM_MAPIO_REMAPPED;
 }
 
@@ -1618,9 +1627,12 @@ static int multipath_end_io(struct dm_target *ti, struct 
request *clone,
 
if (pgpath) {
struct path_selector *ps = >pg->ps;
+   struct path_selector_io_data io_data = {
+   .nr_bytes = mpio->nr_bytes
+   };
 
if (ps->type->end_io)
-   ps->type->end_io(ps, >path, mpio->nr_bytes);
+   ps->type->end_io(ps, >path, _data);
}
 
return r;
@@ -1662,9 +1674,12 @@ static int multipath_end_io_bio(struct dm_target *ti, 
struct bio *clone,
 done:
if (pgpath) {
struct path_selector *ps = >pg->ps;
+   struct path_selector_io_data io_data = {
+   .nr_bytes = mpio->nr_bytes
+   };
 
if (ps->type->end_io)
-   ps->type->end_io(ps, >path, mpio->nr_bytes);
+   ps->type->end_io(ps, >path, _data);
}
 
return r;
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index b6eb5365b1a4..fb582a943234 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -26,6 +26,10 @@ struct path_selector {
void *context;
 };
 
+struct path_selector_io_data {
+   size_t nr_bytes;
+};
+
 /* Information about a path selector type */
 struct path_selector_type {
char *name;
@@ -72,9 +76,9 @@ struct path_selector_type {
   status_type_t type, char *result, unsigned int maxlen);
 
int (*start_io) (struct path_selector *ps, struct dm_path *path,
-size_t nr_bytes);
+const struct path_selector_io_data *

Re: [dm-devel] [PATCH 0/2] Historical Service Time Path Selector

2020-04-27 Thread Gabriel Krisman Bertazi
Mike Snitzer  writes:

>
> Looks like you've put a lot of time to this and I'd be happy to help
> you get this to land upstream.
>
> But... (you knew there'd be at least one "but" right? ;) I'm not
> liking making this path selector request-based specific.  All other
> selectors up to this point are request-based vs bio-based agnostic.
>
> Would you be open to dropping patch 1/2 and replacing it with
> something like the following patch?
>
> Then you'd pass 'u64 start_time_ns' into the path_selector_type's
> .end_io (and possibly .start_io).

I think it is fine.

Kind of a MD newbie question, but if I understand correctly,
dm_start_time_ns_from_clone is only for bio based multipath, and we just
pass req->io_start_time directly on request based multipath, right?  If
I understand the code correctly, start_io_acct is only called for the
bio level DM.

I will update the patches, do a quick round of tests with BIO based and
send a v2.

Thanks a lot,

-- 
Gabriel Krisman Bertazi


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH 2/2] md: Add Historical Service Time Path Selector

2020-04-17 Thread Gabriel Krisman Bertazi
From: Khazhismel Kumykov 

This new selector keeps an exponential moving average of the service
time for each path (losely defined as delta between start_io and
end_io), and uses this along with the number of inflight requests to
estimate future service time for a path.  Since we don't have a prober
to account for temporally slow paths, re-try "slow" paths every once in
a while (num_paths * historical_service_time). To account for fast paths
transitioning to slow, if a path has not completed any request within
(num_paths * historical_service_time), limit the number of outstanding
requests.  To account for low volume situations where number of
inflights would be zero, the last finish time of each path is factored
in.

Signed-off-by: Khazhismel Kumykov 
Co-developed-by: Gabriel Krisman Bertazi 
Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/Kconfig  |  11 +
 drivers/md/Makefile |   1 +
 drivers/md/dm-historical-service-time.c | 561 
 3 files changed, 573 insertions(+)
 create mode 100644 drivers/md/dm-historical-service-time.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index d6d5ab23c088..6f348a66450c 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -443,6 +443,17 @@ config DM_MULTIPATH_ST
 
  If unsure, say N.
 
+config DM_MULTIPATH_HST
+   tristate "I/O Path Selector based on historical service time"
+   depends on DM_MULTIPATH
+   help
+ This path selector is a dynamic load balancer which selects
+ the path expected to complete the incoming I/O in the shortest
+ time by comparing estimated service time (based on historical
+ service time).
+
+ If unsure, say N.
+
 config DM_DELAY
tristate "I/O delaying target"
depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d91a7edcd2ab..d148eeade973 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_DM_FLAKEY)   += dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)  += dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)  += dm-service-time.o
+obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o
 obj-$(CONFIG_DM_SWITCH)+= dm-switch.o
 obj-$(CONFIG_DM_SNAPSHOT)  += dm-snapshot.o
 obj-$(CONFIG_DM_PERSISTENT_DATA)   += persistent-data/
diff --git a/drivers/md/dm-historical-service-time.c 
b/drivers/md/dm-historical-service-time.c
new file mode 100644
index ..96bab52bba94
--- /dev/null
+++ b/drivers/md/dm-historical-service-time.c
@@ -0,0 +1,561 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Historical Service Time
+ *
+ *  Keeps a time-weighted exponential moving average of the historical
+ *  service time. Estimates future service time based on the historical
+ *  service time and the number of outstanding requests.
+ *
+ *  Marks paths stale if they have not finished within hst *
+ *  num_paths. If a path is stale and unused, we will send a single
+ *  request to probe in case the path has improved. This situation
+ *  generally arises if the path is so much worse than others that it
+ *  will never have the best estimated service time, or if the entire
+ *  multipath device is unused. If a path is stale and in use, limit the
+ *  number of requests it can receive with the assumption that the path
+ *  has become degraded.
+ *
+ *  To avoid repeatedly calculating exponents for time weighting, times
+ *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
+ *  ns, and the weighting is pre-calculated.
+ *
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include 
+#include 
+#include 
+
+
+#define DM_MSG_PREFIX  "multipath historical-service-time"
+#define HST_MIN_IO 1
+#define HST_VERSION "0.1.1"
+
+#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
+#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
+#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
+#define HST_FIXED_95 972
+#define HST_MAX_INFLIGHT HST_FIXED_1
+#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
+#define HST_WEIGHT_COUNT 64ULL
+
+struct selector {
+   struct list_head valid_paths;
+   struct list_head failed_paths;
+   int valid_count;
+   spinlock_t lock;
+
+   unsigned int weights[HST_WEIGHT_COUNT];
+   unsigned int threshold_multiplier;
+};
+
+struct path_info {
+   struct list_head list;
+   struct dm_path *path;
+   unsigned int repeat_count;
+
+   spinlock_t lock;
+
+   u64 historical_service_time; /* Fixed point */
+
+   u64 stale_after;
+   u64 last_finish;
+
+   u64 outstanding;
+};
+
+/**
+ * fixed_power - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By expl

[dm-devel] [PATCH 0/2] Historical Service Time Path Selector

2020-04-17 Thread Gabriel Krisman Bertazi
Hello,

This small series implements a new path selector that leverages
historical path IO time in order to estimate future path performance.
Implementation details can be found on Patch 2.

This selector yields better path distribution, considering the mean
deviation from the calculated optimal utilization, for small IO depths
when compared to the Service Time selector with fio benchmarks.  For
instance, on a multipath setup with 4 paths, where one path is 4 times
slower than the rest, issuing 500MB of randwrites, we have the following
path utilization rates:

  |depth=1|depth=64   |   |
  |   ST  |   HST |   ST  |   HST |  Best |
|-+---+---+---+---+---|
| sda | 0.250 | 0.294 | 0.297 | 0.294 | 0.307 |
| sdb | 0.250 | 0.297 | 0.296 | 0.297 | 0.307 |
| sdc | 0.250 | 0.296 | 0.298 | 0.296 | 0.307 |
| sdd | 0.250 | 0.112 | 0.106 | 0.112 | 0.076 |

For small depths, HST is much quicker in detecting slow paths and has a
better selection than ST.  As the iodepth increases, ST gets close to
HST, which still behaves steadily.

The raw performance data for different depths types of IO can be found
at:

  

This was tested primarily on a Google cloud SAN with real data and usage
patterns and with artificial benchmarks using fio.

Khazhismel Kumykov (2):
  md: Expose struct request to path selector
  md: Add Historical Service Time Path Selector

 drivers/md/Kconfig  |  11 +
 drivers/md/Makefile |   1 +
 drivers/md/dm-historical-service-time.c | 561 
 drivers/md/dm-mpath.c   |  12 +-
 drivers/md/dm-path-selector.h   |   6 +
 5 files changed, 589 insertions(+), 2 deletions(-)
 create mode 100644 drivers/md/dm-historical-service-time.c

-- 
2.26.0


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH 1/2] md: Expose struct request to path selector

2020-04-17 Thread Gabriel Krisman Bertazi
From: Khazhismel Kumykov 

This allows a path selector to access metadata such as request start and
end time.  nr_bytes is retained for end_io, as blk_rq_bytes represents
the number of bytes *left* in a request, and is 0 after a request is
finished.

Signed-off-by: Khazhismel Kumykov 
Co-developed-by: Gabriel Krisman Bertazi 
Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/dm-mpath.c | 12 ++--
 drivers/md/dm-path-selector.h |  6 ++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 2bc18c9c3abc..0cdd3a939d41 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -546,7 +546,12 @@ static int multipath_clone_and_map(struct dm_target *ti, 
struct request *rq,
clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
*__clone = clone;
 
-   if (pgpath->pg->ps.type->start_io)
+   if (pgpath->pg->ps.type->start_io_rq)
+   pgpath->pg->ps.type->start_io_rq(>pg->ps,
+ >path,
+ clone ?: rq,
+ nr_bytes);
+   else if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(>pg->ps,
  >path,
  nr_bytes);
@@ -1614,7 +1619,10 @@ static int multipath_end_io(struct dm_target *ti, struct 
request *clone,
if (pgpath) {
struct path_selector *ps = >pg->ps;
 
-   if (ps->type->end_io)
+   if (ps->type->end_io_rq)
+   ps->type->end_io_rq(ps, >path,
+clone, mpio->nr_bytes);
+   else if (ps->type->end_io)
ps->type->end_io(ps, >path, mpio->nr_bytes);
}
 
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index b6eb5365b1a4..98b6c9d4e21f 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -12,6 +12,7 @@
 #ifndefDM_PATH_SELECTOR_H
 #defineDM_PATH_SELECTOR_H
 
+#include 
 #include 
 
 #include "dm-mpath.h"
@@ -75,6 +76,11 @@ struct path_selector_type {
 size_t nr_bytes);
int (*end_io) (struct path_selector *ps, struct dm_path *path,
   size_t nr_bytes);
+
+   int (*start_io_rq)(struct path_selector *ps, struct dm_path *path,
+  const struct request *rq, size_t nr_bytes);
+   int (*end_io_rq)(struct path_selector *ps, struct dm_path *path,
+const struct request *rq, size_t nr_bytes);
 };
 
 /* Register a path selector */
-- 
2.26.0


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH] iscsi: do not wait for IOs in dm shrinker

2020-03-05 Thread Gabriel Krisman Bertazi
From: Tahsin Erdogan 

If something goes wrong with an iscsi session, the problem is reported
to giscsid via a netlink message. Then, giscsid tries to add a new device
and destroy the old one. During old device destruction, the pending ios
get completed with an error. Without destroying the device the io
operations are stuck forever.

If dm shrinker is invoked with __GFP_IO, shrinker gets blocked waiting for
the pending ios to complete. So, if the giscsid repair path ends up
doing a memory allocation with __GFP_IO enabled, it could end up in a
deadlock because the iscsi io cannot be completed until giscsid can do its
job and giscsid cannot do its job until the io completes.

Even worse, the deadlock can also occur even if giscsid avoids __GFP_IO
in all paths. For instance, if giscsid tries to grab a mutex held by
another thread and that thread invokes the shrinker we again may enter a
deadlock. Here is a scenario stitched from multiple bugs that
demonstrates how the deadlock can occur:

iSCSI-write
holding: rx_queue_mutex
waiting: uevent_sock_mutex

kobject_uevent_env+0x1bd/0x419
kobject_uevent+0xb/0xd
device_add+0x48a/0x678
scsi_add_host_with_dma+0xc5/0x22d
iscsi_host_add+0x53/0x55
iscsi_sw_tcp_session_create+0xa6/0x129
iscsi_if_rx+0x100/0x1247
netlink_unicast+0x213/0x4f0
netlink_sendmsg+0x230/0x3c0

iscsi_fail iscsi_conn_failure
waiting: rx_queue_mutex

schedule_preempt_disabled+0x325/0x734
__mutex_lock_slowpath+0x18b/0x230
mutex_lock+0x22/0x40
iscsi_conn_failure+0x42/0x149
worker_thread+0x24a/0xbc0

EventManager_
holding: uevent_sock_mutex
waiting: dm_bufio_client->lock

dm_bufio_lock+0xe/0x10
shrink+0x34/0xf7
shrink_slab+0x177/0x5d0
do_try_to_free_pages+0x129/0x470
try_to_free_mem_cgroup_pages+0x14f/0x210
memcg_kmem_newpage_charge+0xa6d/0x13b0
__alloc_pages_nodemask+0x4a3/0x1a70
fallback_alloc+0x1b2/0x36c
__kmalloc_node_track_caller+0xb9/0x10d0
__alloc_skb+0x83/0x2f0
kobject_uevent_env+0x26b/0x419
dm_kobject_uevent+0x70/0x79
dev_suspend+0x1a9/0x1e7
ctl_ioctl+0x3e9/0x411
dm_ctl_ioctl+0x13/0x17
do_vfs_ioctl+0xb3/0x460
SyS_ioctl+0x5e/0x90

MemcgReclaimerD"
holding: dm_bufio_client->lock
waiting: stuck io to finish (needs iscsi_fail thread to progress)

schedule at bd603618
io_schedule at bd603ba4
do_io_schedule at bdaf0d94
__wait_on_bit at bd6008a6
out_of_line_wait_on_bit at bd600960
wait_on_bit.constprop.10 at bdaf0f17
__make_buffer_clean at bdaf18ba
__cleanup_old_buffer at bdaf192f
shrink at bdaf19fd
do_shrink_slab at bd6ec000
shrink_slab at bd6ec24a
do_try_to_free_pages at bd6eda09
try_to_free_mem_cgroup_pages at bd6ede7e
mem_cgroup_resize_limit at bd7024c0
mem_cgroup_write at bd703149
cgroup_file_write at bd6d9c6e
sys_write at bd6662ea
system_call_fastpath at bdbc34a2

Co-developed-by: Khazhismel Kumykov 
Signed-off-by: Khazhismel Kumykov 
Signed-off-by: Tahsin Erdogan 
Co-developed-by: Gabriel Krisman Bertazi 
Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/dm-bufio.c | 31 +--
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 2d519c223562..4c4f80e894b6 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1516,18 +1516,16 @@ static void drop_buffers(struct dm_bufio_client *c)
  * We may not be able to evict this buffer if IO pending or the client
  * is still using it.  Caller is expected to know buffer is too old.
  *
- * And if GFP_NOFS is used, we must not do any I/O because we hold
- * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
- * rerouted to different bufio client.
+ * We must not do any I/O because we hold dm_bufio_clients_lock and we
+ * would risk deadlock if the I/O gets rerouted to different bufio
+ * client.
  */
-static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
+static bool __try_evict_buffer(struct dm_buffer *b)
 {
-   if (!(gfp & __GFP_FS)) {
-   if (test_bit(B_READING, >state) ||
-   test_bit(B_WRITING, >state) ||
-   test_bit(B_DIRTY, >state))
-   return false;
-   }
+   if (test_bit(B_READING, >state) ||
+   test_bit(B_WRITING, >state) ||
+   test_bit(B_DIRTY, >state))
+   return false;
 
if (b->hold_count)
return false;
@@ -1549,8 +1547,7 @@ static unsigned long get_retain_buffers(struct

Re: [dm-devel] [PATCH v2] dm mpath: Add timeout mechanism for queue_if_no_path

2020-01-15 Thread Gabriel Krisman Bertazi
Mike Snitzer  writes:

> On Mon, Jan 13 2020 at  5:41P -0500,
> Gabriel Krisman Bertazi  wrote:
>
>> From: Anatol Pomazau 
>> 
>> Add a configurable timeout mechanism to disable queue_if_no_path without
>> assistance from multipathd.  In reality, this reimplements the
>> no_path_retry mechanism from multipathd in kernel space, which is
>> interesting to prevent processes from hanging indefinitely in cases
>> where the daemon might be unable to respond, after a failure or for
>> whatever reason.
>> 
>> Despite replicating the policy configuration on kernel space, it is
>> quite an important case to prevent IOs from hanging forever, waiting for
>> userspace to behave correctly.
>> 
>> v2:
>>   - Use a module parameter instead of configuring per table
>>   - Simplify code
>> 
>> Co-developed-by: Frank Mayhar 
>> Signed-off-by: Frank Mayhar 
>> Co-developed-by: Bharath Ravi 
>> Signed-off-by: Bharath Ravi 
>> Co-developed-by: Khazhismel Kumykov 
>> Signed-off-by: Khazhismel Kumykov 
>> Signed-off-by: Anatol Pomazau 
>> Co-developed-by: Gabriel Krisman Bertazi 
>> Signed-off-by: Gabriel Krisman Bertazi 
>
> All these tags seem rather excessive (especially in that you aren't cc
> most of them on the submission).  Please review/scrub.  Just seems odd
> that so many had a hand in this relatively small patch.

Hey,

I removed some of the Cc's because those emails addresses were bouncing.
Still, I kept the lines for credits.  I got the bounces when sending v1.

> The Signed-off-by for ana...@google.com seems wrong, or did Anatol
> shephard this patch at some point?  Or did you mean Reviewed-by?
> Something else?

Anatol was the main author, as listed in the From: header.  This
seems correct with regard to the ordering rules of
Documentation/process/submitting-patches.rst, in particular the second
example, (Example of a patch submitted by a Co-developed-by: author::).

Am I missing something?

>
> Anyway, if in the end you hold these tags to reflect the development
> evolution of this patch then so be it ;)
>
> I've reviewed the changes and found various things I felt were
> worthwhile to modify.  Summary:
>
> - included missing 
> - added MODULE_PARM_DESC
> - moved new functions to reduce needed forward declarations
> - tweaked queue_if_no_path_timeout_work's DMWARN message
> - added lockdep_assert_held to enable_nopath_timeout
> - renamed static queue_if_no_path_timeout to queue_if_no_path_timeout_secs
> - use READ_ONCE when accessing queue_if_no_path_timeout_secs
>

The changes you made look good to me and your version of the patch
passes my testcase.

-- 
Gabriel Krisman Bertazi


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [PATCH v2] dm mpath: Add timeout mechanism for queue_if_no_path

2020-01-14 Thread Gabriel Krisman Bertazi
From: Anatol Pomazau 

Add a configurable timeout mechanism to disable queue_if_no_path without
assistance from multipathd.  In reality, this reimplements the
no_path_retry mechanism from multipathd in kernel space, which is
interesting to prevent processes from hanging indefinitely in cases
where the daemon might be unable to respond, after a failure or for
whatever reason.

Despite replicating the policy configuration on kernel space, it is
quite an important case to prevent IOs from hanging forever, waiting for
userspace to behave correctly.

v2:
  - Use a module parameter instead of configuring per table
  - Simplify code

Co-developed-by: Frank Mayhar 
Signed-off-by: Frank Mayhar 
Co-developed-by: Bharath Ravi 
Signed-off-by: Bharath Ravi 
Co-developed-by: Khazhismel Kumykov 
Signed-off-by: Khazhismel Kumykov 
Signed-off-by: Anatol Pomazau 
Co-developed-by: Gabriel Krisman Bertazi 
Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/dm-mpath.c | 62 +++
 1 file changed, 62 insertions(+)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index e0c32793c248..52d90900e85b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -29,6 +29,7 @@
 #define DM_MSG_PREFIX "multipath"
 #define DM_PG_INIT_DELAY_MSECS 2000
 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
+#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
 
 /* Path properties */
 struct pgpath {
@@ -91,6 +92,8 @@ struct multipath {
 
struct work_struct process_queued_bios;
struct bio_list queued_bios;
+
+   struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
 };
 
 /*
@@ -101,6 +104,10 @@ struct dm_mpath_io {
size_t nr_bytes;
 };
 
+static unsigned long queue_if_no_path_timeout = 
QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
+module_param_named(queue_if_no_path_timeout_secs,
+  queue_if_no_path_timeout, ulong, 0644);
+
 typedef int (*action_fn) (struct pgpath *pgpath);
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -109,6 +116,10 @@ static void activate_or_offline_path(struct pgpath 
*pgpath);
 static void activate_path_work(struct work_struct *work);
 static void process_queued_bios(struct work_struct *work);
 
+static void queue_if_no_path_timeout_work(struct timer_list *t);
+static void enable_nopath_timeout(struct multipath *m);
+static void disable_nopath_timeout(struct multipath *m);
+
 /*---
  * Multipath state flags.
  *---*/
@@ -195,6 +206,8 @@ static struct multipath *alloc_multipath(struct dm_target 
*ti)
 
m->ti = ti;
ti->private = m;
+
+   timer_setup(>nopath_timer, queue_if_no_path_timeout_work, 0);
}
 
return m;
@@ -1090,6 +1103,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned 
argc, char **argv)
struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
+   unsigned long flags;
 
as.argc = argc;
as.argv = argv;
@@ -1154,6 +1168,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned 
argc, char **argv)
goto bad;
}
 
+   spin_lock_irqsave(>lock, flags);
+   enable_nopath_timeout(m);
+   spin_unlock_irqrestore(>lock, flags);
+
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
@@ -1208,10 +1226,25 @@ static void multipath_dtr(struct dm_target *ti)
 {
struct multipath *m = ti->private;
 
+   disable_nopath_timeout(m);
flush_multipath_work(m);
free_multipath(m);
 }
 
+/*
+ * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
+ * process any queued I/O.
+ */
+static void queue_if_no_path_timeout_work(struct timer_list *t)
+{
+   struct multipath *m = from_timer(m, t, nopath_timer);
+   struct mapped_device *md = dm_table_get_md((m)->ti->table);
+
+   DMWARN("queue_if_no_path timeout on %s", dm_device_name(md));
+
+   queue_if_no_path(m, false, false);
+}
+
 /*
  * Take a path out of use.
  */
@@ -1241,6 +1274,8 @@ static int fail_path(struct pgpath *pgpath)
 
schedule_work(>trigger_event);
 
+   enable_nopath_timeout(m);
+
 out:
spin_unlock_irqrestore(>lock, flags);
 
@@ -1291,6 +1326,9 @@ static int reinstate_path(struct pgpath *pgpath)
process_queued_io_list(m);
}
 
+   if (pgpath->is_active)
+   disable_nopath_timeout(m);
+
return r;
 }
 
@@ -1314,6 +1352,25 @@ static int action_dev(struct multipath *m, struct dm_dev 
*dev,
return r;
 }
 
+/*
+ * Enable the queue_if_no_path timeout if necessary.  Called with m->lock
+ * held.
+ */
+static void enable_nopath_timeout(struct multipath *m)
+{
+   if (queue_if_no_path_timeout > 0 &&
+   atomic_read(>nr_valid_paths) == 0 &&
+

[dm-devel] [PATCH] dm mpath: Add timeout mechanism for queue_if_no_path

2020-01-03 Thread Gabriel Krisman Bertazi
From: Anatol Pomazau 

Add a configurable timeout mechanism to disable queue_if_no_path without
assistance from multipathd.  In reality, this reimplements the
no_path_retry mechanism from multipathd in kernel space, which is
interesting for cases where we cannot rely on a daemon being present all
the time, in case of failure or to reduce the guest footprint of cloud
services.

Despite replicating the policy configuration on kernel space, it is
quite an important case to prevent IOs from hanging forever, waiting for
userspace to behave correctly.

Co-developed-by: Frank Mayhar 
Signed-off-by: Frank Mayhar 
Co-developed-by: Bharath Ravi 
Signed-off-by: Bharath Ravi 
Co-developed-by: Khazhismel Kumykov 
Signed-off-by: Khazhismel Kumykov 
Signed-off-by: Anatol Pomazau 
Co-developed-by: Gabriel Krisman Bertazi 
Signed-off-by: Gabriel Krisman Bertazi 
---
 drivers/md/dm-mpath.c | 96 ++-
 1 file changed, 95 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index e0c32793c248..72ba7ae98458 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -91,6 +91,9 @@ struct multipath {
 
struct work_struct process_queued_bios;
struct bio_list queued_bios;
+
+   unsigned int nopath_timeout;/* Timeout for queue_if_no_path */
+   struct timer_list nopath_timer;
 };
 
 /*
@@ -109,6 +112,11 @@ static void activate_or_offline_path(struct pgpath 
*pgpath);
 static void activate_path_work(struct work_struct *work);
 static void process_queued_bios(struct work_struct *work);
 
+static void handle_nopath_timeout(struct timer_list *t);
+static void enable_nopath_timeout(struct multipath *m);
+static int activate_nopath_timeout(struct multipath *m,
+  unsigned int to, int enable);
+
 /*---
  * Multipath state flags.
  *---*/
@@ -195,6 +203,8 @@ static struct multipath *alloc_multipath(struct dm_target 
*ti)
 
m->ti = ti;
ti->private = m;
+
+   timer_setup(>nopath_timer, handle_nopath_timeout, 0);
}
 
return m;
@@ -1016,6 +1026,7 @@ static int parse_features(struct dm_arg_set *as, struct 
multipath *m)
{0, 8, "invalid number of feature args"},
{1, 50, "pg_init_retries must be between 1 and 50"},
{0, 6, "pg_init_delay_msecs must be between 0 and 6"},
+   {0, 600, "queue_if_no_path_timeout must be 0 or a timeout in 
seconds"},
};
 
r = dm_read_arg_group(_args, as, , >error);
@@ -1070,6 +1081,16 @@ static int parse_features(struct dm_arg_set *as, struct 
multipath *m)
continue;
}
 
+   if (!strcasecmp(arg_name, "queue_if_no_path_timeout") &&
+   (argc >= 1)) {
+   unsigned int to;
+
+   r = dm_read_arg(_args+3, as, , >error);
+   activate_nopath_timeout(m, to, 0);
+   argc--;
+   continue;
+   }
+
ti->error = "Unrecognised multipath feature request";
r = -EINVAL;
} while (argc && !r);
@@ -1090,6 +,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned 
argc, char **argv)
struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
+   unsigned long flags;
 
as.argc = argc;
as.argv = argv;
@@ -1154,6 +1176,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned 
argc, char **argv)
goto bad;
}
 
+   spin_lock_irqsave(>lock, flags);
+   enable_nopath_timeout(m);
+   spin_unlock_irqrestore(>lock, flags);
+
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
@@ -1208,10 +1234,26 @@ static void multipath_dtr(struct dm_target *ti)
 {
struct multipath *m = ti->private;
 
+   del_timer_sync(>nopath_timer);
+
flush_multipath_work(m);
free_multipath(m);
 }
 
+/*
+ * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
+ * process any queued I/O.
+ */
+static void handle_nopath_timeout(struct timer_list *t)
+{
+   struct multipath *m = from_timer(m, t, nopath_timer);
+   struct mapped_device *md = dm_table_get_md((m)->ti->table);
+
+   DMWARN("queue_if_no_path timeout on %s", dm_device_name(md));
+
+   (void)queue_if_no_path(m, false, false);
+}
+
 /*
  * Take a path out of use.
  */
@@ -1241,6 +1283,8 @@ static int fail_path(struct pgpath *pgpath)
 
schedule_work(>trigger_event);
 
+   enable_nopath_timeout(m);
+
 out:
spin_unlock_irqrestore(>lock, flags);
 
@@ -1291,6 +1335,8 @@