[PATCH BUGFIX RFC 2/2] Revert "bfq: calculate shallow depths at init time"

2019-01-18 Thread Paolo Valente
This reverts commit f0635b8a416e3b99dc6fd9ac3ce534764869d0c8.
---
 block/bfq-iosched.c | 117 +---
 1 file changed, 57 insertions(+), 60 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 8cc3032b66de..92214d58510c 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -520,6 +520,54 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
}
 }
 
+/*
+ * See the comments on bfq_limit_depth for the purpose of
+ * the depths set in the function. Return minimum shallow depth we'll use.
+ */
+static unsigned int bfq_update_depths(struct bfq_data *bfqd,
+ struct sbitmap_queue *bt)
+{
+   unsigned int i, j, min_shallow = UINT_MAX;
+   bfqd->sb_shift = bt->sb.shift;
+
+   /*
+* In-word depths if no bfq_queue is being weight-raised:
+* leaving 25% of tags only for sync reads.
+*
+* In next formulas, right-shift the value
+* (1U<<bfqd->sb_shift), instead of computing directly
+* (1U<<(bfqd->sb_shift - something)), to be robust against
+* any possible value of bfqd->sb_shift, without having to
+* limit 'something'.
+*/
+   /* no more than 50% of tags for async I/O */
+   bfqd->word_depths[0][0] = max((1U << bfqd->sb_shift) >> 1, 1U);
+   /*
+* no more than 75% of tags for sync writes (25% extra tags
+* w.r.t. async I/O, to prevent async I/O from starving sync
+* writes)
+*/
+   bfqd->word_depths[0][1] = max(((1U << bfqd->sb_shift) * 3) >> 2, 1U);
+
+   /*
+* In-word depths in case some bfq_queue is being weight-
+* raised: leaving ~63% of tags for sync reads. This is the
+* highest percentage for which, in our tests, application
+* start-up times didn't suffer from any regression due to tag
+* shortage.
+*/
+   /* no more than ~18% of tags for async I/O */
+   bfqd->word_depths[1][0] = max(((1U << bfqd->sb_shift) * 3) >> 4, 1U);
+   /* no more than ~37% of tags for sync writes (~20% extra tags) */
+   bfqd->word_depths[1][1] = max(((1U << bfqd->sb_shift) * 6) >> 4, 1U);
+
+   for (i = 0; i < 2; i++)
+   for (j = 0; j < 2; j++)
+   min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
+
+   return min_shallow;
+}
+
 /*
  * Async I/O can easily starve sync I/O (both sync reads and sync
  * writes), by consuming all tags. Similarly, storms of sync writes,
@@ -529,11 +577,20 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
  */
 static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 {
+   struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct bfq_data *bfqd = data->q->elevator->elevator_data;
+   struct sbitmap_queue *bt;
 
if (op_is_sync(op) && !op_is_write(op))
return;
 
+   bt = &tags->bitmap_tags;
+
+   if (unlikely(bfqd->sb_shift != bt->sb.shift)) {
+   unsigned int min_shallow = bfq_update_depths(bfqd, bt);
+   sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+   }
+
data->shallow_depth =
bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
 
@@ -5295,65 +5352,6 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
 __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
 }
 
-/*
- * See the comments on bfq_limit_depth for the purpose of
- * the depths set in the function. Return minimum shallow depth we'll use.
- */
-static unsigned int bfq_update_depths(struct bfq_data *bfqd,
- struct sbitmap_queue *bt)
-{
-   unsigned int i, j, min_shallow = UINT_MAX;
-   bfqd->sb_shift = bt->sb.shift;
-
-   /*
-* In-word depths if no bfq_queue is being weight-raised:
-* leaving 25% of tags only for sync reads.
-*
-* In next formulas, right-shift the value
-* (1U<<bfqd->sb_shift), instead of computing directly
-* (1U<<(bfqd->sb_shift - something)), to be robust against
-* any possible value of bfqd->sb_shift, without having to
-* limit 'something'.
-*/
-   /* no more than 50% of tags for async I/O */
-   bfqd->word_depths[0][0] = max((1U << bfqd->sb_shift) >> 1, 1U);
-   /*
-* no more than 75% of tags for sync writes (25% extra tags
-* w.r.t. async I/O, to prevent async I/O from starving sync
-* writes)
-*/
-   bfqd->word_depths[0][1] = max(((1U << bfqd->sb_shift) * 3) >> 2, 1U);
-
-   /*
-* In-word depths in case some bfq_queue is being weight-
-* raised: leaving ~63% of tags for sync reads. This is the
-* highest percentage for which, in our tests, application
-* start-up times didn't suffer from any regression due to tag
-* shortage.
-*/
-   /* no more than ~18% of tags for async I/O */
-   bfqd->word_depths[1][0] = max(((1U << bfqd->sb_shift) * 3) >> 4, 1U);

[PATCH BUGFIX RFC 1/2] Revert "bfq-iosched: remove unused variable"

2019-01-18 Thread Paolo Valente
This reverts commit bd7d4ef6a4c9b3611fa487a0065bf042c71ce620.
---
 block/bfq-iosched.c | 15 ---
 block/bfq-iosched.h |  6 ++
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index cd307767a134..8cc3032b66de 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5303,25 +5303,26 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
  struct sbitmap_queue *bt)
 {
unsigned int i, j, min_shallow = UINT_MAX;
+   bfqd->sb_shift = bt->sb.shift;
 
/*
 * In-word depths if no bfq_queue is being weight-raised:
 * leaving 25% of tags only for sync reads.
 *
 * In next formulas, right-shift the value
-* (1U<<bt->sb.shift), instead of computing directly
-* (1U<<(bt->sb.shift - something)), to be robust against
-* any possible value of bt->sb.shift, without having to
+* (1U<<bfqd->sb_shift), instead of computing directly
+* (1U<<(bfqd->sb_shift - something)), to be robust against
+* any possible value of bfqd->sb_shift, without having to
 * limit 'something'.
 */
/* no more than 50% of tags for async I/O */
-   bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
+   bfqd->word_depths[0][0] = max((1U << bfqd->sb_shift) >> 1, 1U);
/*
 * no more than 75% of tags for sync writes (25% extra tags
 * w.r.t. async I/O, to prevent async I/O from starving sync
 * writes)
 */
-   bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
+   bfqd->word_depths[0][1] = max(((1U << bfqd->sb_shift) * 3) >> 2, 1U);
 
/*
 * In-word depths in case some bfq_queue is being weight-
@@ -5331,9 +5332,9 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
 * shortage.
 */
/* no more than ~18% of tags for async I/O */
-   bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
+   bfqd->word_depths[1][0] = max(((1U << bfqd->sb_shift) * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */
-   bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
+   bfqd->word_depths[1][1] = max(((1U << bfqd->sb_shift) * 6) >> 4, 1U);
 
for (i = 0; i < 2; i++)
for (j = 0; j < 2; j++)
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 0b02bf302de0..4de5dc349a1e 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -697,6 +697,12 @@ struct bfq_data {
/* bfqq associated with the task issuing current bio for merging */
struct bfq_queue *bio_bfqq;
 
+   /*
+* Cached sbitmap shift, used to compute depth limits in
+* bfq_update_depths.
+*/
+   unsigned int sb_shift;
+
/*
 * Depth limits used in bfq_limit_depth (see comments on the
 * function)
-- 
2.20.1
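
For reference, here is what the depth formulas restored by these two
reverts evaluate to for a 64-tag sbitmap word (bt->sb.shift == 6,
chosen only as an example):

  word_depths[0][0] = max((1U << 6) >> 1, 1U)        = 32  (50% of tags)
  word_depths[0][1] = max(((1U << 6) * 3) >> 2, 1U)  = 48  (75%)
  word_depths[1][0] = max(((1U << 6) * 3) >> 4, 1U)  = 12  (~18%)
  word_depths[1][1] = max(((1U << 6) * 6) >> 4, 1U)  = 24  (~37%)

so the value passed to sbitmap_queue_min_shallow_depth() is 12, the
minimum of the four.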



Re: [RFC PATCH 0/3] cgroup: fsio throttle controller

2019-01-18 Thread Paolo Valente



> On 18 Jan 2019, at 12:10, Andrea Righi wrote:
> 
> On Fri, Jan 18, 2019 at 12:04:17PM +0100, Paolo Valente wrote:
>> 
>> 
>>> On 18 Jan 2019, at 11:31, Andrea Righi wrote:
>>> 
>>> This is a redesign of my old cgroup-io-throttle controller:
>>> https://lwn.net/Articles/330531/
>>> 
>>> I'm resuming this old patch to point out a problem that I think is still
>>> not solved completely.
>>> 
>>> = Problem =
>>> 
>>> The io.max controller works really well at limiting synchronous I/O
>>> (READs), but a lot of I/O requests are initiated outside the context of
>>> the process that is ultimately responsible for its creation (e.g.,
>>> WRITEs).
>>> 
>>> Throttling at the block layer in some cases is too late and we may end
>>> up slowing down processes that are not responsible for the I/O that
>>> is being processed at that level.
>>> 
>>> = Proposed solution =
>>> 
>>> The main idea of this controller is to split I/O measurement and I/O
>>> throttling: I/O is measured at the block layer for READS, at page cache
>>> (dirty pages) for WRITEs, and processes are limited while they're
>>> generating I/O at the VFS level, based on the measured I/O.
>>> 
>> 
>> Hi Andrea,
>> what about the case where two processes are dirtying the same
>> pages?  Which will be charged?
>> 
>> Thanks,
>> Paolo
> 
> Hi Paolo,
> 
> in this case only the first one will be charged for the I/O activity
> (the one that changes a page from clean to dirty). This is probably not
> totally fair in some cases, but I think it's a good compromise,

Absolutely, I just wanted to better understand this point.

> in the end,
> rewriting the same page over and over while it's already dirty
> doesn't actually generate I/O activity, until the page is flushed back
> to disk.
> 

Right.
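
To fix ideas, the rule could be sketched roughly like this (sketch
only, with a hypothetical helper name, not your actual code):

#include <linux/mm_types.h>
#include <linux/page-flags.h>

/* Hypothetical accounting helper provided by the controller. */
void fsio_charge_current(unsigned long bytes);

/*
 * Charge write I/O to the cgroup of the task that turns a page from
 * clean to dirty; re-dirtying an already-dirty page adds no charge.
 */
static void fsio_account_dirty(struct page *page)
{
	if (!TestSetPageDirty(page))	/* clean -> dirty transition only */
		fsio_charge_current(PAGE_SIZE);
}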

Thanks,
Paolo

> Obviously I'm open to other better ideas and suggestions.
> 
> Thanks!
> -Andrea



Re: [RFC PATCH 0/3] cgroup: fsio throttle controller

2019-01-18 Thread Paolo Valente



> On 18 Jan 2019, at 11:31, Andrea Righi wrote:
> 
> This is a redesign of my old cgroup-io-throttle controller:
> https://lwn.net/Articles/330531/
> 
> I'm resuming this old patch to point out a problem that I think is still
> not solved completely.
> 
> = Problem =
> 
> The io.max controller works really well at limiting synchronous I/O
> (READs), but a lot of I/O requests are initiated outside the context of
> the process that is ultimately responsible for their creation (e.g.,
> WRITEs).
> 
> Throttling at the block layer in some cases is too late and we may end
> up slowing down processes that are not responsible for the I/O that
> is being processed at that level.
> 
> = Proposed solution =
> 
> The main idea of this controller is to split I/O measurement and I/O
> throttling: I/O is measured at the block layer for READS, at page cache
> (dirty pages) for WRITEs, and processes are limited while they're
> generating I/O at the VFS level, based on the measured I/O.
> 

Hi Andrea,
what about the case where two processes are dirtying the same
pages?  Which will be charged?

Thanks,
Paolo

> = Example =
> 
> Here's a trivial example: create 2 cgroups, set an io.max limit of
> 10MB/s, run a write-intensive workload on both and after a while, from a
> root cgroup, run "sync".
> 
> # cat /proc/self/cgroup
> 0::/cg1
> # fio --rw=write --bs=1M --size=32M --numjobs=16 --name=seeker --time_based 
> --runtime=30
> 
> # cat /proc/self/cgroup
> 0::/cg2
> # fio --rw=write --bs=1M --size=32M --numjobs=16 --name=seeker --time_based 
> --runtime=30
> 
> - io.max controller:
> 
> # echo "259:0 rbps=10485760 wbps=10485760" > /sys/fs/cgroup/unified/cg1/io.max
> # echo "259:0 rbps=10485760 wbps=10485760" > /sys/fs/cgroup/unified/cg2/io.max
> 
> # cat /proc/self/cgroup
> 0::/
> # time sync
> 
> real  0m51,241s
> user  0m0,000s
> sys   0m0,113s
> 
> Ideally "sync" should complete almost immediately, because the root
> cgroup is unlimited and it's not doing any I/O at all, but instead it's
> blocked for more than 50 sec with io.max, because the writeback is
> throttled to satisfy the io.max limits.
> 
> - fsio controller:
> 
> # echo "259:0 10 10" > /sys/fs/cgroup/unified/cg1/fsio.max_mbs
> # echo "259:0 10 10" > /sys/fs/cgroup/unified/cg2/fsio.max_mbs
> 
> [you can find details about the syntax in the documentation patch]
> 
> # cat /proc/self/cgroup
> 0::/
> # time sync
> 
> real  0m0,146s
> user  0m0,003s
> sys   0m0,001s
> 
> = Questions =
> 
> Q: Do we need another controller?
> A: Probably no, I think it would be better to integrate this policy (or
>   something similar) in the current blkio controller, this is just to
>   highlight the problem and get some ideas on how to address it.
> 
> Q: What about proportional limits / latency?
> A: It should be trivial to add latency-based limits if we integrate this in 
> the
>   current I/O controller. About proportional limits (weights), they're
>   strictly related to I/O scheduling and since this controller doesn't touch
>   I/O dispatching policies it's not trivial to implement proportional limits
>   (bandwidth limiting is definitely more straightforward).
> 
> Q: Applying delays at the VFS layer doesn't prevent I/O spikes during
>   writeback, right?
> A: Correct, the tradeoff here is to tolerate I/O bursts during writeback to
>   avoid priority inversion problems in the system.
> 
> Andrea Righi (3):
>  fsio-throttle: documentation
>  fsio-throttle: controller infrastructure
>  fsio-throttle: instrumentation
> 
> Documentation/cgroup-v1/fsio-throttle.txt | 142 +
> block/blk-core.c  |  10 +
> include/linux/cgroup_subsys.h |   4 +
> include/linux/fsio-throttle.h |  43 +++
> include/linux/writeback.h |   7 +-
> init/Kconfig  |  11 +
> kernel/cgroup/Makefile|   1 +
> kernel/cgroup/fsio-throttle.c | 501 ++
> mm/filemap.c  |  20 +-
> mm/page-writeback.c   |  14 +-
> 10 files changed, 749 insertions(+), 4 deletions(-)
> 



Re: [PATCH BUGFIX 0/2] bfq: fix unbalanced decrements causing loss of throughput

2019-01-14 Thread Paolo Valente



> On 7 Dec 2018, at 15:40, Jens Axboe wrote:
> 
> On 12/7/18 3:01 AM, Paolo Valente wrote:
>> 
>> 
>>> On 7 Dec 2018, at 03:23, Jens Axboe wrote:
>>> 
>>> On 12/6/18 11:18 AM, Paolo Valente wrote:
>>>> Hi Jens,
>>>> the first patch in this series fixes an error in the decrementing of
>>>> the counter of the number of groups with pending I/O. This wrong
>>>> decrement caused loss of throughput or, less likely, of control on
>>>> I/O. The second patch is a fix of some wrong comments, which somehow
>>>> contributed to making the above bug more difficult to find.
>>> 
>>> Are you fine with this going into 4.21? I can't quite tell what your
>>> intent is. The first patch has a Fixes for something
>> 
>> yep, that fixes a serious error.
>> 
>>> that went into
>>> this series, but then patch 2 is a comment update that would not
>>> normally be something to be applied at this stage.
>>> 
>> 
>> and yes, only comments are changed by the second one
>> 
>> Might it make sense to apply them in two steps, one in 4.20 and the
>> other in 4.21?
> 
> I think so, I'll do that.

Hi Jens,
is the second patch still queued?

Thanks,
Paolo

> 
> -- 
> Jens Axboe



Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io

2019-01-02 Thread Paolo Valente



> On 2 Jan 2019, at 17:03, Tejun Heo wrote:
> 
> Hello, Paolo.
> 
> On Sun, Dec 30, 2018 at 11:25:25AM +0100, Paolo Valente wrote:
>> What's the benefit of throwing away months of work, on which we agreed
>> before starting it, and that solves a problem already acknowledged by
>> interested parties?
> 
> Showing multiple conflicting numbers definitely isn't anything which
> is agreed upon.
> 

Sorry, of course you didn't realize that sharing interface files had
this consequence, otherwise you'd have protested beforehand.

The problem is that this consequence seems unavoidable: if two
policies have different numbers to convey, through a shared interface
file, then they must be allowed to write their different numbers.  To
me, this doesn't sound like a problem.

The only other natural option is no unification, unless you have a
third way.

What do you prefer, or propose?

Thanks,
Paolo

> Thanks.
> 
> -- 
> tejun



Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io

2018-12-30 Thread Paolo Valente



> On 28 Dec 2018, at 00:41, Tejun Heo wrote:
> 
> Hello, Paolo.
> 
> On Sun, Dec 23, 2018 at 12:00:14PM +0100, Paolo Valente wrote:
>> 4.21 is coming ...  and the legacy proportional share interface will
>> be gone with cfq.  This will break legacy code using the
>> proportional-share interface on top of bfq.  This code may just fail
>> when trying to use interface files that don't exist any longer.
> 
> Sounds like inheriting .cfq namespace would be the easiest.  Would
> that work?
> 

For bfq, yes, but what will, e.g., Josef do when he adds his new
proportional-share implementation?  Will he add a new set of names not
used by any legacy piece of code?

What's the benefit of throwing away months of work, on which we agreed
before starting it, and that solves a problem already acknowledged by
interested parties?

Thanks,
Paolo

> Thanks.
> 
> -- 
> tejun



Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io

2018-12-23 Thread Paolo Valente



> On 18 Dec 2018, at 18:22, Paolo Valente wrote:
> 
> 
> 
>> On 18 Dec 2018, at 17:41, Tejun Heo wrote:
>> 
>> Hello, Paolo.
>> 
>> On Tue, Dec 18, 2018 at 08:48:10AM +0100, Paolo Valente wrote:
>>> If Tejun cannot see any solution to his concern, then can we just
>>> switch to this extension, considering that
>>> - for non-shared names the interface is *identical* to the current
>>> one;
>>> - by using this new interface, and getting feedback we could
>>> understand how to better handle Tejun's concern?
>>> A lot of systems do use weights, and people don't even know that these
>>> systems don't work correctly in blk-mq.  And they won't work correctly
>>> in any available configuration from 4.21, if we don't fix this problem.
>> 
>> So, when seen from userland, how it should behave isn't vague or
>> complicated.  For a given device and policy type, there can be only
>> one implementation active.
> 
> Yes, but the problem is the opposite. You may have
> - two different policies, with the same interface parameter, 
> - one active on one device
> - the other one active on another device
> 
> In that case, statistics from one policy necessarily differ from
> statistics from the other policy.
> 
> In this respect, in a system with more than one drive it already
> happens that the same policy is active on different devices.  When
> printing a statistics interface file for the policy, the output will
> be a list of separate statistics, with a bunch of statistics *for
> each* drive (plus a grand total in some cases).
> 
> So, our proposal simply extends this scheme in the most natural way:
> if, now, also two or more policies share the same statistics file,
> then the output will be a list of separate statistics, one for each
> policy.  The statistics for each policy will be tagged with the policy
> name, and will have the same identical form as above.  It seems the
> most natural hierarchical extension of the same scheme.
> 

Maybe my generic description didn't highlight how plain the outputs actually are.

If you print, e.g., io_serviced with the current interface, you get

--

8:0 Read 4291168
8:0 Write 2181577
8:0 Sync 5897755
8:0 Async 574990
8:0 Total 6472745
Total 6472745

--

With the new interface, you get *the same output* if only one policy
is attached to this interface file.  If, instead,
- two policies share the file, because one is active on one device
  and the other on another device,
- these policies are named, e.g., bfq and pol2,
then you get (device numbers and statistics invented):

--

bfq:
8:0 Read 4291168
8:0 Write 2181577
8:0 Sync 5897755
8:0 Async 574990
8:0 Total 6472745
Total 6472745

pol2:
16:0 Read 238768
16:0 Write 323123
16:0 Sync 43243
16:0 Async 432432
16:0 Total 412435
Total 4341244

--

So you see the per-device statistics as before, without the problem
of inventing a new set of names for every new policy that has the same
interface files as an existing policy.

Tejun, let's try to converge on whatever solution you prefer.


4.21 is coming ...  and the legacy proportional share interface will
be gone with cfq.  This will break legacy code using the
proportional-share interface on top of bfq.  This code may just fail
when trying to use interface files that don't exist any longer.

Thanks,
Paolo

> At any rate, if you don't like it, just tell us how you prefer it
> done.  Do you prefer the sharing of statistics file to be simply
> forbidden?  (If this can be done.) I think such an incomplete solution
> would preserve part of the current mess; but, if this allows us to
> exit from this impasse, then it is ok for me.
> 
> *Any* feasible option is ok for me. Just pick one.
> 
>> It doesn't make sense to have two weight
>> mechanisms active on one device, right?
> 
> (Un)fortunately, the problem is not weights.  There won't be two
> weights for two policies expiring a weight parameter.  The problem
> concerns statistics.  See above.
> 
> 
>> So, the interface should only
>> present what makes sense to the user for both configuration knobs and
>> statistics, and that'd be a hard requirement because we don't want to
>> present confusing spurious information to userspace.
>> 
>> There seemed to have been significant misunderstandings as to what the
>> requirements are when this was discussed way back, so idk what the
>> good path forward is at this point.  Just keep the current names?
>> 
> 
> I don't clearly understand how "just picking the current names" is a
> way forward, but if we do not make this extension, in a way

Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io

2018-12-18 Thread Paolo Valente



> On 18 Dec 2018, at 18:22, Paolo Valente wrote:
> 
> 
> 
>> On 18 Dec 2018, at 17:41, Tejun Heo wrote:
>> 
>> Hello, Paolo.
>> 
>> On Tue, Dec 18, 2018 at 08:48:10AM +0100, Paolo Valente wrote:
>>> If Tejun cannot see any solution to his concern, then can we just
>>> switch to this extension, considering that
>>> - for non-shared names the interface is *identical* to the current
>>> one;
>>> - by using this new interface, and getting feedback we could
>>> understand how to better handle Tejun's concern?
>>> A lot of systems do use weights, and people don't even know that these
>>> systems don't work correctly in blk-mq.  And they won't work correctly
>>> in any available configuration from 4.21, if we don't fix this problem.
>> 
>> So, when seen from userland, how it should behave isn't vague or
>> complicated.  For a given device and policy type, there can be only
>> one implementation active.
> 
> Yes, but the problem is the opposite. You may have
> - two different policies, with the same interface parameter, 
> - one active on one device
> - the other one active on another device
> 
> In that case, statistics from one policy necessarily differ from
> statistics from the other policy.
> 
> In this respect, in a system with more than one drive it already
> happens that the same policy is active on different devices.  When
> printing a statistics interface file for the policy, the output will
> be a list of separate statistics, with a bunch of statistics *for
> each* drive (plus a grand total in some cases).
> 
> So, our proposal simply extends this scheme in the most natural way:
> if, now, also two or more policies share the same statistics file,
> then the output will be a list of separate statistics, one for each
> policy.  The statistics for each policy will be tagged with the policy
> name, and will have the same identical form as above.  It seems the
> most natural hierarchical extension of the same scheme.
> 
> At any rate, if you don't like it, just tell us how you prefer it
> done.  Do you prefer the sharing of statistics file to be simply
> forbidden?  (If this can be done.) I think such an incomplete solution
> would preserve part of the current mess; but, if this allows us to
> exit from this impasse, then it is ok for me.
> 
> *Any* feasible option is ok for me. Just pick one.
> 
>> It doesn't make sense to have two weight
>> mechanisms active on one device, right?
> 
> (Un)fortunately, the problem is not weights.  There won't be two
> weights for two policies expiring a weight parameter.  The problems

s/expiring/sharing sorry



Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io

2018-12-18 Thread Paolo Valente



> On 18 Dec 2018, at 17:41, Tejun Heo wrote:
> 
> Hello, Paolo.
> 
> On Tue, Dec 18, 2018 at 08:48:10AM +0100, Paolo Valente wrote:
>> If Tejun cannot see any solution to his concern, then can we just
>> switch to this extension, considering that
>> - for non-shared names the interface is *identical* to the current
>>  one;
>> - by using this new interface, and getting feedback we could
>>  understand how to better handle Tejun's concern?
>> A lot of systems do use weights, and people don't even know that these
>> systems don't work correctly in blk-mq.  And they won't work correctly
>> in any available configuration from 4.21, if we don't fix this problem.
> 
> So, when seen from userland, how it should behave isn't vague or
> complicated.  For a given device and policy type, there can be only
> one implementation active.

Yes, but the problem is the opposite. You may have
- two different policies, with the same interface parameter, 
- one active on one device
- the other one active on another device

In that case, statistics from one policy necessarily differ from
statistics from the other policy.

In this respect, in a system with more than one drive it already
happens that the same policy is active on different devices.  When
printing a statistics interface file for the policy, the output will
be a list of separate statistics, with a bunch of statistics *for
each* drive (plus a grand total in some cases).

So, our proposal simply extends this scheme in the most natural way:
if, now, also two or more policies share the same statistics file,
then the output will be a list of separate statistics, one for each
policy.  The statistics for each policy will be tagged with the policy
name, and will have the same identical form as above.  It seems the
most natural hierarchical extension of the same scheme.

At any rate, if you don't like it, just tell us how you prefer it
done.  Do you prefer the sharing of statistics file to be simply
forbidden?  (If this can be done.) I think such an incomplete solution
would preserve part of the current mess; but, if this allows us to
exit from this impasse, then it is ok for me.

*Any* feasible option is ok for me. Just pick one.

>  It doesn't make sense to have two weight
> mechanisms active on one device, right?

(Un)fortunately, the problem is not weights.  There won't be two
weights for two policies expiring a weight parameter.  The problem
concerns statistics.  See above.


>  So, the interface should only
> present what makes sense to the user for both configuration knobs and
> statistics, and that'd be a hard requirement because we don't want to
> present confusing spurious information to userspace.
> 
> There seemed to have been significant misunderstandings as to what the
> requirements are when this was discussed way back, so idk what the
> good path forward is at this point.  Just keep the current names?
> 

I don't clearly understand how "just picking the current names" is a
way forward, but if we do not make this extension, in one way or
another, then two policies will simply not be allowed to share the same
interface files.  And we will still be at the starting point.

Thanks,
Paolo

> Thanks.
> 
> -- 
> tejun



Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io

2018-12-17 Thread Paolo Valente
[RESENDING BECAUSE BOUNCED]

> On 10 Dec 2018, at 14:45, Angelo Ruocco wrote:
> 
> 2018-11-30 19:53 GMT+01:00, Paolo Valente :
>> 
>> 
>>> On 30 Nov 2018, at 19:42, Tejun Heo wrote:
>>> 
>>> Hello, Paolo.
>>> 
>>> On Fri, Nov 30, 2018 at 07:23:24PM +0100, Paolo Valente wrote:
>>>>> Then we understood that exactly the same happens with throttling, in
>>>>> case the latter is activated on different devices w.r.t. bfq.
>>>>> 
>>>>> In addition, the same may happen, in the near future, with the
>>>>> bandwidth controller Josef is working on.  If the controller can be
>>>>> configured per device, as with throttling, then statistics may differ,
>>>>> for the same interface files, between bfq, throttling and that
>>>>> controller.
>>> 
>>> So, regardless of how all these are implemented, what's presented to
>>> user should be consistent and clear.  There's no other way around it.
>>> Only what's relevant should be visible to userspace.
>>> 
>>>> have you had time to look into this?  Any improvement to this
>>>> interface is ok for us. We are only interested in finally solving this
>>>> interface issue, as, for what concerns us directly, it has been
>>>> preventing legacy code to use bfq for years.
>>> 
>>> Unfortunately, I don't have any implementation proposal, but we can't
>>> show things this way to userspace.
>>> 
>> 
>> Well, this is not very helpful to move forward :)
>> 
>> Let me try to repeat the problem, to try to help you help us unblock
>> the situation.
>> 
>> If we have multiple entities attached to the same interface output
>> file, you don't find it clear that each entity shows the number it
>> wants to show.  But you have no idea either of how that differentiated
>> information should be shown.  Is this the situation, or is the problem
>> somewhere 'above' this level?
>> 
>> If the problem is as I described it, here are some proposal attempts:
>> 1) Do you want file sharing to be allowed only if all entities will
>> output the same number?  (this seems excessive, but maybe it makes
>> sense)
>> 2) Do you want only one number to be shown, equal to the sum of the
>> numbers of each entity?  (in some cases, this may make sense)
>> 3) Do you prefer an average?
>> 4) Do you have any other idea, even if just germinal?
> 
> To further add to what Paolo said and better expose the problem, I'd like to
> say that all those proposals have issues.
> If we only allow "same output" cftypes to be shared then we lose all the
> flexibility of this solution, and we need a way for an entity to know other
> entities' internal variables beforehand, which sounds at least very hard, and
> maybe is not even an acceptable thing to do.
> Putting the average, sum, or some other mathematical function in the file only
> makes sense for certain cftypes, so that also doesn't sound like a good idea. In
> fact I can think of scenarios where only seeing the different values of the
> entities makes sense for a user.
> 
> I understand that the problem is inconsistency: having a file that behaves
> differently depending on the situation, and the only way to prevent this I can
> think of is to *always* show the entity owner of a certain file (or part of 
> the
> output), even when the output would be the same among entities or when the
> file is not currently shared but could be. Can this be an acceptable solution?
> 
> Angelo
> 

Hi Jens, all,
let me push for this interface to be fixed too.  If we don't fix it in
some way, then from 4.21 we will end up with a ridiculous paradox: the
proportional share policy (weights) will of course be available, but
unusable in practice.  In fact, as Lennart--and not only Lennart--can
confirm, no piece of code uses bfq.weight to set weights, or will do
it.

A trivial solution would be to throw away all our work to fix this
issue by extending the interface, and just let bfq use the former cfq
names.  But then the same mess will happen when, e.g., Josef
proposes his proportional-share controller.

Before implementing this solution, we proposed it and waited for it to
be approved, several months ago, so I hope that Tejun's concern can be
addressed somehow.

If Tejun cannot see any solution to his concern, then can we just
switch to this extension, considering that
- for non-shared names the interface is *identical* to the current
  one;
- by using this new interface, and getting feedback we could
  understand how to better handle Tejun's concern?

A lot of sy

Re: [PATCH BUGFIX 0/2] bfq: fix unbalanced decrements causing loss of throughput

2018-12-07 Thread Paolo Valente



> On 7 Dec 2018, at 03:23, Jens Axboe wrote:
> 
> On 12/6/18 11:18 AM, Paolo Valente wrote:
>> Hi Jens,
>> the first patch in this series fixes an error in the decrementing of
>> the counter of the number of groups with pending I/O. This wrong
>> decrement caused loss of throughput or, less likely, of control on
>> I/O. The second patch is a fix of some wrong comments, which somehow
>> contributed to making the above bug more difficult to find.
> 
> Are you fine with this going into 4.21? I can't quite tell what your
> intent is. The first patch has a Fixes for something

yep, that fixes a serious error.

> that went into
> this series, but then patch 2 is a comment update that would not
> normally be something to be applied at this stage.
> 

and yes, only comments are changed by the second one

Might it make sense to apply them in two steps, one in 4.20 and the
other in 4.21?

Thanks,
Paolo

> -- 
> Jens Axboe
> 



[PATCH BUGFIX 2/2] block, bfq: fix comments on __bfq_deactivate_entity

2018-12-06 Thread Paolo Valente
Comments on function __bfq_deactivate_entity contain two imprecise or
wrong statements:
1) The function performs the deactivation of the entity.
2) The function must be invoked only if the entity is on a service tree.

This commit replaces both statements with the correct ones:
1) The function updates sched_data and service trees for the entity,
so as to represent entity as inactive (which is only part of the steps
needed for the deactivation of the entity).
2) The function must be invoked on every entity being deactivated.

Signed-off-by: Paolo Valente 
---
 block/bfq-wf2q.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 63e0f12be7c9..72adbbe975d5 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1154,15 +1154,14 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity,
 }
 
 /**
- * __bfq_deactivate_entity - deactivate an entity from its service tree.
- * @entity: the entity to deactivate.
+ * __bfq_deactivate_entity - update sched_data and service trees for
+ * entity, so as to represent entity as inactive
+ * @entity: the entity being deactivated.
  * @ins_into_idle_tree: if false, the entity will not be put into the
  * idle tree.
  *
- * Deactivates an entity, independently of its previous state.  Must
- * be invoked only if entity is on a service tree. Extracts the entity
- * from that tree, and if necessary and allowed, puts it into the idle
- * tree.
+ * If necessary and allowed, puts entity into the idle tree. NOTE:
+ * entity may be on no tree if in service.
  */
 bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
 {
-- 
2.16.1



[PATCH BUGFIX 1/2] block, bfq: fix decrement of num_active_groups

2018-12-06 Thread Paolo Valente
Since commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios
detection")', if there are process groups with I/O requests waiting for
completion, then BFQ tags the scenario as 'asymmetric'. This detection
is needed for preserving service guarantees (for details, see comments
on the computation of the variable asymmetric_scenario in the
function bfq_better_to_idle).

Unfortunately, commit '2d29c9f89fcd ("block, bfq: improve asymmetric
scenarios detection")' contains an error exactly in the updating of
the number of groups with I/O requests waiting for completion: if a
group has more than one descendant process, then the above number of
groups, which is renamed from num_active_groups to a more appropriate
num_groups_with_pending_reqs by this commit, may happen to be wrongly
decremented multiple times, namely every time one of the descendant
processes gets all its pending I/O requests completed.

A correct, complete solution should work as follows. Consider a group
that is inactive, i.e., that has no descendant process with pending
I/O inside BFQ queues. Then suppose that num_groups_with_pending_reqs
is still accounting for this group, because the group still has some
descendant process with some I/O request still in
flight. num_groups_with_pending_reqs should be decremented when the
in-flight request of the last descendant process is finally completed
(assuming that nothing else has changed for the group in the meantime,
in terms of composition of the group and active/inactive state of
child groups and processes). To accomplish this, an additional
pending-request counter must be added to entities, and must be
updated correctly.

To avoid this additional field and operations, this commit resorts to
the following tradeoff between simplicity and accuracy: for an
inactive group that is still counted in num_groups_with_pending_reqs,
this commit decrements num_groups_with_pending_reqs when the first
descendant process of the group remains with no request waiting for
completion.

This simplified scheme provides a fix to the unbalanced decrements
introduced by 2d29c9f89fcd. Since this error was also caused by lack
of comments on this non-trivial issue, this commit also adds related
comments.
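
As a concrete example of the unbalanced decrement: a group with two
descendant processes, both with I/O pending, contributes 1 to the
counter, but the buggy code decrements the counter once when the first
process sees all of its requests completed and once again when the
second one does, i.e., by 2 instead of 1, leaving the counter
inconsistent with the actual number of groups with pending requests.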

Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")
Reported-by: Steven Barrett 
Tested-by: Steven Barrett 
Tested-by: Lucjan Lucjanov 
Reviewed-by: Federico Motta 
Signed-off-by: Paolo Valente 
---
 block/bfq-iosched.c | 76 +
 block/bfq-iosched.h | 51 +--
 block/bfq-wf2q.c|  5 +++-
 3 files changed, 107 insertions(+), 25 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3a27d31fcda6..97337214bec4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
 bfqd->queue_weights_tree.rb_node->rb_right)
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
   ) ||
-   (bfqd->num_active_groups > 0
+   (bfqd->num_groups_with_pending_reqs > 0
 #endif
   );
 }
@@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
 */
break;
}
-   bfqd->num_active_groups--;
+
+   /*
+* The decrement of num_groups_with_pending_reqs is
+* not performed immediately upon the deactivation of
+* entity, but it is delayed to when it also happens
+* that the first leaf descendant bfqq of entity gets
+* all its pending requests completed. The following
+* instructions perform this delayed decrement, if
+* needed. See the comments on
+* num_groups_with_pending_reqs for details.
+*/
+   if (entity->in_groups_with_pending_reqs) {
+   entity->in_groups_with_pending_reqs = false;
+   bfqd->num_groups_with_pending_reqs--;
+   }
}
 }
 
@@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
 * fact, if there are active groups, then, for condition (i)
 * to become false, it is enough that an active group contains
 * more active processes or sub-groups than some other active
-* group. We address this issue with the following bi-modal
-* behavior, implemented in the function
+* group. More precisely, for condition (i) to hold because of
+* such a group, it is not even necessary that the group is
+* (still) active: it is sufficient that, even if the group
+* has become inactive, some of its descendant processes still
+* have some request already dispatched but still waiting for
+* completion. In fact,

[PATCH BUGFIX 0/2] bfq: fix unbalanced decrements causing loss of throughput

2018-12-06 Thread Paolo Valente
Hi Jens,
the first patch in this series fixes an error in the decrementing of
the counter of the number of groups with pending I/O. This wrong
decrement caused loss of throughput or, less likely, of control on
I/O. The second patch is a fix of some wrong comments, which somehow
contributed to making the above bug more difficult to find.

Thanks,
Paolo

Paolo Valente (2):
  block, bfq: fix decrement of num_active_groups
  block, bfq: fix comments on __bfq_deactivate_entity

 block/bfq-iosched.c | 76 +
 block/bfq-iosched.h | 51 +--
 block/bfq-wf2q.c| 16 ++-
 3 files changed, 112 insertions(+), 31 deletions(-)

--
2.16.1


[PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io

2018-11-19 Thread Paolo Valente
Hi,
here is the V2 of this patch series. Let me rephrase the description
of the series, in view of the fact that CFQ will be gone with legacy
block.

The current implementation of cgroups doesn't allow two or more
entities, e.g., I/O schedulers, to share the same files.  Thus, to
enable people to set group weights with BFQ, I resorted to making BFQ
create its own version of the same interface files used by CFQ (before
going away with legacy block), by prepending a bfq prefix.

Actually, no legacy code uses these different names, or is likely to
do so.  Having these two sets of names is simply a source of
confusion, as pointed out also, e.g., by Lennart Poettering (CCed
here), and acknowledged by Tejun [2].

In [1] we agreed on a solution that solves this problem, by actually
making it possible to share cgroups files.  Both writing to and
reading from a shared file trigger the appropriate operation for each
of the entities that share the file.  In particular, in case of
reading,
- if all entities produce the same output, then this common output is
 shown only once;
- if the outputs differ, then every per-entity output is shown,
 followed by the name of the entity that produced that output.

With this solution, legacy code that, e.g., sets group weights, just
works, regardless of the I/O scheduler actually implementing
proportional share.

But note that this extension is not restricted to only blkio/io.  The
general group interface now enables files to be shared among multiple
entities of any kind.

(I have also added a patch to fix some clerical errors in bfq doc,
which I found while making the latter consistent with the new
interface.)

CHANGES FROM V1:
- Removed patch that introduced a function to only find kernfs nodes,
  without increasing ref counters
- Changed commit messages and BFQ documentation, to comply with the
  fact that there won't be CFQ any longer

Thanks,
Paolo

Angelo Ruocco (5):
  cgroup: link cftypes of the same subsystem with the same name
  cgroup: add owner name to cftypes
  block, bfq: align min and default weights with the old cfq default
  cgroup: make all functions of all cftypes be invoked
  block, throttle: allow sharing cgroup statistic files

Paolo Valente (5):
  cgroup: add hook seq_show_cft with also the owning cftype as parameter
  block, cgroup: pass cftype to functions that need to use it
  block, bfq: use standard file names for the proportional-share policy
  doc, bfq-iosched: fix a few clerical errors
  doc, bfq-iosched: make it consistent with the new cgroup interface

 Documentation/block/bfq-iosched.txt |  34 ++---
 block/bfq-cgroup.c  | 148 +---
 block/bfq-iosched.h |   4 +-
 block/blk-cgroup.c  |  22 +--
 block/blk-throttle.c|  24 ++--
 include/linux/blk-cgroup.h  |  10 +-
 include/linux/cgroup-defs.h |  14 +-
 include/linux/cgroup.h  |  13 ++
 kernel/cgroup/cgroup.c  | 262 +---
 9 files changed, 390 insertions(+), 141 deletions(-)

--
2.16.1


[PATCH V2 06/10] cgroup: make all functions of all cftypes be invoked

2018-11-19 Thread Paolo Valente
From: Angelo Ruocco 

When two or more entities (of any kind) share a file, their respective
cftypes are linked together. The allowed operations on those files
are: open, release, write and show, mapped to the functions defined in
the cftypes.

This commit makes the cgroup core invoke, whenever one of those
operations is requested, the respective function of all the cftypes
linked together.

Signed-off-by: Angelo Ruocco 
Signed-off-by: Paolo Valente 
---
 kernel/cgroup/cgroup.c | 181 -
 1 file changed, 132 insertions(+), 49 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 61eafd69e2fd..9bf6b0b5a0ca 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3452,66 +3452,107 @@ static int cgroup_file_open(struct kernfs_open_file *of)
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
struct cftype *cft = of->kn->priv;
+   struct cftype *n;
+   int ret = 0;
 
+   list_for_each_cft(cft, n) {
+   if (cft->open)
+   ret = cft->open(of);
+   /*
+* If there has been an error with the open function of one of
+* the cftypes associated with the file, we call the release
+* function of all the cftypes associated with cft whose open
+* function succeeded.
+*/
+   if (ret) {
+   struct cftype *c = of->kn->priv;
+   struct cftype *n;
+
+   list_for_each_cft(c, n) {
+   if (cft == c)
+   break;
+   if (c->release)
+   c->release(of);
+   }
+   break;
+   }
+   }
 
-   if (cft->open)
-   return cft->open(of);
-   return 0;
+   return ret;
 }
 
 static void cgroup_file_release(struct kernfs_open_file *of)
 {
struct cftype *cft = of->kn->priv;
+   struct cftype *n;
 
-   if (cft->release)
-   cft->release(of);
+   list_for_each_cft(cft, n)
+   if (cft->release)
+   cft->release(of);
 }
 
+/*
+ * Call all the write functions of the cftypes associated with the file.
+ *
+ * When a write fails, don't keep trying to write into the file via the write
+ * functions of the other cftypes associated with it.
+ */
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 size_t nbytes, loff_t off)
 {
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of->kn->priv;
+   struct cftype *n;
struct cgroup_subsys_state *css;
-   int ret;
+   int ret = 0;
 
-   /*
-* If namespaces are delegation boundaries, disallow writes to
-* files in an non-init namespace root from inside the namespace
-* except for the files explicitly marked delegatable -
-* cgroup.procs and cgroup.subtree_control.
-*/
-   if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
-   !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
-   ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
-   return -EPERM;
+   list_for_each_cft(cft, n) {
+   /*
+* If namespaces are delegation boundaries, disallow writes to
+* files in an non-init namespace root from inside the
+* namespace except for the files explicitly marked
+* delegatable - cgroup.procs and cgroup.subtree_control.
+*/
+   if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
+   !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
+   ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+   return -EPERM;
 
-   if (cft->write)
-   return cft->write(of, buf, nbytes, off);
+   if (cft->write) {
+   ret = cft->write(of, buf, nbytes, off);
 
-   /*
-* kernfs guarantees that a file isn't deleted with operations in
-* flight, which means that the matching css is and stays alive and
-* doesn't need to be pinned.  The RCU locking is not necessary
-* either.  It's just for the convenience of using cgroup_css().
-*/
-   rcu_read_lock();
-   css = cgroup_css(cgrp, cft->ss);
-   rcu_read_unlock();
+   if (ret)
+   break;
+   continue;
+   }
 
-   if (cft->write_u64) {
-   unsigned long long v;
-  

[PATCH V2 01/10] cgroup: add hook seq_show_cft with also the owning cftype as parameter

2018-11-19 Thread Paolo Valente
The current implementation of the seq_show hook in the cftype struct
has only, as parameters, the seq_file to write to and the arguments
passed by the command line. Thus, the only way to retrieve the cftype
that owns an instance of such hook function is by using the accessor
in the seq_file itself.

But in a future scenario where the same file may be shared by multiple
cftypes, this accessor will point only to the first of the cftypes
linked to the seq_file. It will then be impossible to access the
cftype owning the seq_show function within the seq_show itself, unless
such cftype is the first one.

This commit adds an additional seq_show_cft hook that has as a formal
parameter also the cftype that owns the function.

Signed-off-by: Angelo Ruocco 
Signed-off-by: Paolo Valente 
---
 include/linux/cgroup-defs.h |  3 ++-
 kernel/cgroup/cgroup.c  | 15 +--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5e1694fe035b..7841db6e7fb3 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -543,8 +543,9 @@ struct cftype {
 */
s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
 
-   /* generic seq_file read interface */
+   /* generic seq_file read interfaces */
int (*seq_show)(struct seq_file *sf, void *v);
+   int (*seq_show_cft)(struct seq_file *sf, struct cftype *cft, void *v);
 
/* optional ops, implement all or none */
void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6aaf5dd5383b..9d0993dd68fe 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1418,7 +1418,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 {
umode_t mode = 0;
 
-   if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+   if (cft->read_u64 || cft->read_s64 || cft->seq_show ||
+   cft->seq_show_cft)
mode |= S_IRUGO;
 
if (cft->write_u64 || cft->write_s64 || cft->write) {
@@ -3519,17 +3520,19 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 {
struct cftype *cft = seq_cft(m);
struct cgroup_subsys_state *css = seq_css(m);
+   int ret = 0;
 
if (cft->seq_show)
-   return cft->seq_show(m, arg);
-
-   if (cft->read_u64)
+   ret = cft->seq_show(m, arg);
+   else if (cft->seq_show_cft)
+   ret = cft->seq_show_cft(m, cft, arg);
+   else if (cft->read_u64)
seq_printf(m, "%llu\n", cft->read_u64(css, cft));
else if (cft->read_s64)
seq_printf(m, "%lld\n", cft->read_s64(css, cft));
else
-   return -EINVAL;
-   return 0;
+   ret = -EINVAL;
+   return ret;
 }
 
 static struct kernfs_ops cgroup_kf_single_ops = {
-- 
2.16.1



[PATCH V2 02/10] block, cgroup: pass cftype to functions that need to use it

2018-11-19 Thread Paolo Valente
Some seq_show functions need to access the cftype they belong to, for
retrieving the data to show. These functions get their cftype by using
the seq_cft accessor for the seq_file. This solution is no longer
viable in case a seq_file is shared among more than one cftype,
because the accessor always returns (only) the first of the cftypes
sharing the seq_file.

This commit enables these seq_show functions to be passed their
cftype, by replacing their prototype with that of the newly defined
seq_show_cft hook, and by invoking these functions through this new
hook.

Signed-off-by: Angelo Ruocco 
Signed-off-by: Paolo Valente 
---
 block/bfq-cgroup.c | 54 --
 block/blk-cgroup.c | 22 ---
 block/blk-throttle.c   |  8 +++
 include/linux/blk-cgroup.h | 10 +
 4 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index a7a1712632b0..038e418fa64f 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -918,17 +918,17 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file 
*of,
 }
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-static int bfqg_print_stat(struct seq_file *sf, void *v)
+static int bfqg_print_stat(struct seq_file *sf, struct cftype *cft, void *v)
 {
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
- &blkcg_policy_bfq, seq_cft(sf)->private, false);
+ &blkcg_policy_bfq, cft->private, false);
return 0;
 }
 
-static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+static int bfqg_print_rwstat(struct seq_file *sf, struct cftype *cft, void *v)
 {
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
- &blkcg_policy_bfq, seq_cft(sf)->private, true);
+ &blkcg_policy_bfq, cft->private, true);
return 0;
 }
 
@@ -949,19 +949,21 @@ static u64 bfqg_prfill_rwstat_recursive(struct seq_file 
*sf,
return __blkg_prfill_rwstat(sf, pd, &sum);
 }
 
-static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+static int bfqg_print_stat_recursive(struct seq_file *sf, struct cftype *cft,
+void *v)
 {
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
- seq_cft(sf)->private, false);
+ cft->private, false);
return 0;
 }
 
-static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, struct cftype *cft,
+  void *v)
 {
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
- seq_cft(sf)->private, true);
+ cft->private, true);
return 0;
 }
 
@@ -1063,18 +1065,18 @@ struct cftype bfq_blkcg_legacy_files[] = {
{
.name = "bfq.io_service_bytes",
.private = (unsigned long)&blkcg_policy_bfq,
-   .seq_show = blkg_print_stat_bytes,
+   .seq_show_cft = blkg_print_stat_bytes,
},
{
.name = "bfq.io_serviced",
.private = (unsigned long)&blkcg_policy_bfq,
-   .seq_show = blkg_print_stat_ios,
+   .seq_show_cft = blkg_print_stat_ios,
},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
{
.name = "bfq.time",
.private = offsetof(struct bfq_group, stats.time),
-   .seq_show = bfqg_print_stat,
+   .seq_show_cft = bfqg_print_stat,
},
{
.name = "bfq.sectors",
@@ -1083,22 +1085,22 @@ struct cftype bfq_blkcg_legacy_files[] = {
{
.name = "bfq.io_service_time",
.private = offsetof(struct bfq_group, stats.service_time),
-   .seq_show = bfqg_print_rwstat,
+   .seq_show_cft = bfqg_print_rwstat,
},
{
.name = "bfq.io_wait_time",
.private = offsetof(struct bfq_group, stats.wait_time),
-   .seq_show = bfqg_print_rwstat,
+   .seq_show_cft = bfqg_print_rwstat,
},
{
.name = "bfq.io_merged",
.private = offsetof(struct bfq_group, stats.merged),
-   .seq_show = bfqg_print_rwstat,
+   .seq_show_cft = bfqg_print_rwstat,
},
{
.name = "bfq.io_queued",
.private = offsetof(struct bfq_group, stats.queued),
-   .seq_show = bfqg_print_rwstat,
+   .seq_show_cft = bfqg_print_rwstat,
},
 #endif /* CONFIG_DEBUG_BLK_CGROUP */
 
@@ -1106,18 +1108,18 @@ struct cftype bfq_blkcg_legacy_files[] = {
{
   

[PATCH V2 05/10] block, bfq: align min and default weights with the old cfq default

2018-11-19 Thread Paolo Valente
From: Angelo Ruocco 

bfq exposes a cgroup attribute, weight, with the same meaning as that
exposed by cfq.

This commit changes bfq default and min weights to match the ones set
by cfq (before legacy blk was removed).

Signed-off-by: Angelo Ruocco 
Signed-off-by: Paolo Valente 
---
 block/bfq-iosched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 77651d817ecd..249d8128d3ee 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -22,13 +22,13 @@
 #define BFQ_IOPRIO_CLASSES 3
 #define BFQ_CL_IDLE_TIMEOUT (HZ/5)
 
-#define BFQ_MIN_WEIGHT 1
+#define BFQ_MIN_WEIGHT 10
 #define BFQ_MAX_WEIGHT 1000
 #define BFQ_WEIGHT_CONVERSION_COEFF 10
 
 #define BFQ_DEFAULT_QUEUE_IOPRIO   4
 
-#define BFQ_WEIGHT_LEGACY_DFL  100
+#define BFQ_WEIGHT_LEGACY_DFL  500
 #define BFQ_DEFAULT_GRP_IOPRIO 0
 #define BFQ_DEFAULT_GRP_CLASS  IOPRIO_CLASS_BE
 
-- 
2.16.1



[PATCH V2 04/10] cgroup: add owner name to cftypes

2018-11-19 Thread Paolo Valente
From: Angelo Ruocco 

The piece of information "who created a certain cftype" is not stored
anywhere, so a cftype has no way to know who its owner is.

This commit addresses this problem by adding a new field in the cftype
structure that enables the name of its owner to be explicitly set.
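
As an example of use (not part of this patch), the owner_name field can be
combined with the list_for_each_cft() helper added below, assuming the
cftypes have already been linked together through their share_node field;
cft_ring_has_owner() is just an illustrative name:

	/* Return true if any cftype linked to @cft was created by @owner. */
	static bool cft_ring_has_owner(struct cftype *cft, const char *owner)
	{
		struct cftype *n;

		list_for_each_cft(cft, n)
			if (!strcmp(cft->owner_name, owner))
				return true;

		return false;
	}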

Signed-off-by: Angelo Ruocco 
Signed-off-by: Paolo Valente 
---
 include/linux/cgroup-defs.h |  2 ++
 include/linux/cgroup.h  | 13 +
 2 files changed, 15 insertions(+)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index d659763c7221..6e31f478c6e1 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -36,6 +36,7 @@ struct seq_file;
 #define MAX_CGROUP_TYPE_NAMELEN 32
 #define MAX_CGROUP_ROOT_NAMELEN 64
 #define MAX_CFTYPE_NAME 64
+#define MAX_OWNER_NAME 64
 
 /* define the enumeration of all cgroup subsystems */
 #define SUBSYS(_x) _x ## _cgrp_id,
@@ -505,6 +506,7 @@ struct cftype {
 * end of cftype array.
 */
char name[MAX_CFTYPE_NAME];
+   char owner_name[MAX_OWNER_NAME];
unsigned long private;
 
/*
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9d12757a65b0..267153bd898a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -291,6 +291,19 @@ void css_task_iter_end(struct css_task_iter *it);
;   \
else
 
+/**
+ * list_for_each_cft - walk circular list of cftypes linked together
+ * @cft: cftype from where to start
+ * @n: cftype used as a temporary storage
+ *
+ * A cftype pointed to by a file may be part of a circular list of cftypes;
+ * this macro walks the circular list starting from any given cftype. Unlike
+ * the "list_for_each_entry" macro, the first element is included in the
+ * iteration.
+ */
+#define list_for_each_cft(cft, n)  \
+   for (n = NULL; cft != n; n = (n == NULL) ? cft : n, \
+cft = list_next_entry(cft, share_node))
+
 /*
  * Inline functions.
  */
-- 
2.16.1



[PATCH V2 06/10] cgroup: make all functions of all cftypes be invoked

2018-11-19 Thread Paolo Valente
From: Angelo Ruocco 

When two or more entities (of any kind) share a file, their respective
cftypes are linked together. The allowed operations on those files
are: open, release, write and show, mapped to the functions defined in
the cftypes.

This commit makes the cgroup core invoke, whenever one of those
operations is requested, the respective function of all the cftypes
linked together.
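
As a rough sketch of the same idea applied to the show operation (not the
literal patch code; the read_u64/read_s64 cases are omitted here), the walk
could look like this:

	static int cgroup_seqfile_show(struct seq_file *m, void *arg)
	{
		struct cftype *cft = seq_cft(m);
		struct cftype *n;
		int ret = 0;

		/*
		 * Invoke the show hook of every cftype linked to this
		 * file, stopping at the first error.
		 */
		list_for_each_cft(cft, n) {
			if (cft->seq_show)
				ret = cft->seq_show(m, arg);
			else if (cft->seq_show_cft)
				ret = cft->seq_show_cft(m, cft, arg);

			if (ret)
				break;
		}

		return ret;
	}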

Signed-off-by: Angelo Ruocco 
Signed-off-by: Paolo Valente 
---
 kernel/cgroup/cgroup.c | 181 -
 1 file changed, 132 insertions(+), 49 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 61eafd69e2fd..9bf6b0b5a0ca 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3452,66 +3452,107 @@ static int cgroup_cpu_pressure_show(struct seq_file 
*seq, void *v)
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
struct cftype *cft = of->kn->priv;
+   struct cftype *n;
+   int ret = 0;
 
+   list_for_each_cft(cft, n) {
+   if (cft->open)
+   ret = cft->open(of);
+   /*
+* If the open function of one of the cftypes associated with
+* the file fails, call the release function of all the cftypes
+* associated with the file whose open function succeeded.
+*/
+   if (ret) {
+   struct cftype *c = of->kn->priv;
+   struct cftype *n;
+
+   list_for_each_cft(c, n) {
+   if (cft == c)
+   break;
+   if (c->release)
+   c->release(of);
+   }
+   break;
+   }
+   }
 
-   if (cft->open)
-   return cft->open(of);
-   return 0;
+   return ret;
 }
 
 static void cgroup_file_release(struct kernfs_open_file *of)
 {
struct cftype *cft = of->kn->priv;
+   struct cftype *n;
 
-   if (cft->release)
-   cft->release(of);
+   list_for_each_cft(cft, n)
+   if (cft->release)
+   cft->release(of);
 }
 
+/*
+ * Call all the write functions of the cftypes associated with the file.
+ *
+ * When a write fails, don't keep trying to write into the file via the write
+ * functions of the other cftypes associated with it.
+ */
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 size_t nbytes, loff_t off)
 {
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of->kn->priv;
+   struct cftype *n;
struct cgroup_subsys_state *css;
-   int ret;
+   int ret = 0;
 
-   /*
-* If namespaces are delegation boundaries, disallow writes to
-* files in an non-init namespace root from inside the namespace
-* except for the files explicitly marked delegatable -
-* cgroup.procs and cgroup.subtree_control.
-*/
-   if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
-   !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
-   ns != _cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
-   return -EPERM;
+   list_for_each_cft(cft, n) {
+   /*
+* If namespaces are delegation boundaries, disallow writes to
+* files in an non-init namespace root from inside the
+* namespace except for the files explicitly marked
+* delegatable - cgroup.procs and cgroup.subtree_control.
+*/
+   if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
+   !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
+   ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+   return -EPERM;
 
-   if (cft->write)
-   return cft->write(of, buf, nbytes, off);
+   if (cft->write) {
+   ret = cft->write(of, buf, nbytes, off);
 
-   /*
-* kernfs guarantees that a file isn't deleted with operations in
-* flight, which means that the matching css is and stays alive and
-* doesn't need to be pinned.  The RCU locking is not necessary
-* either.  It's just for the convenience of using cgroup_css().
-*/
-   rcu_read_lock();
-   css = cgroup_css(cgrp, cft->ss);
-   rcu_read_unlock();
+   if (ret)
+   break;
+   continue;
+   }
 
-   if (cft->write_u64) {
-   unsigned long long v;
-  

[PATCH V2 03/10] cgroup: link cftypes of the same subsystem with the same name

2018-11-19 Thread Paolo Valente
From: Angelo Ruocco 

Two entities, of any kind, are not able to create a cgroup file with
the same name in the same folder: if an entity tries to create a file
that has the same name as a file created by another entity, the cgroup
core stops it, warns the user about the error, and then proceeds to
delete all the files created by the last entity.

However, in some specific situations, it may be useful for two or more
entities to use a common file, e.g., the I/O schedulers bfq and cfq
had the same "weight" attribute, that changed the behavior of the two
schedulers in a similar way.

This commit prepares the interface that allows two entities to share
files. It adds a new cftype flag, CFTYPE_SHARES_FILE, that allows
cftypes to be linked together if they are part of the same subsystem
and have the same name.

There is a limitation for a cftype that wants to share a file: it can't
have the hooks seq_start/next/stop. The reason is that there is no
consistent way to show portions of a file once multiple cftypes are
attached to it, and thus more than one seq_show() is invoked: there is
neither an unambiguous start point, nor unambiguous "next" and "stop"
operations.
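
For example, with this flag two policies can declare a file with the same
name and have their hooks linked. The first entry below matches the bfq
conversion later in this series (the owner_name field comes from the
companion patch adding it); the second entry, with its other_policy_*
hooks, is purely hypothetical:

	/* declared by the proportional-share policy (bfq) */
	{
		.name = "weight",
		.owner_name = "bfq",
		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_SHARES_FILE,
		.seq_show = bfq_io_show_weight,
		.write_u64 = bfq_io_set_weight_legacy,
	},

	/* declared by another entity, same subsystem and same name */
	{
		.name = "weight",
		.owner_name = "other_policy",
		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_SHARES_FILE,
		.seq_show = other_policy_show_weight,
		.write_u64 = other_policy_set_weight,
	},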

Signed-off-by: Angelo Ruocco 
Signed-off-by: Paolo Valente 
---
 include/linux/cgroup-defs.h |  9 ++
 kernel/cgroup/cgroup.c  | 78 +++--
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 7841db6e7fb3..d659763c7221 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -93,6 +93,8 @@ enum {
CFTYPE_NO_PREFIX= (1 << 3), /* (DON'T USE FOR NEW FILES) no 
subsys prefix */
CFTYPE_WORLD_WRITABLE   = (1 << 4), /* (DON'T USE FOR NEW FILES) 
S_IWUGO */
 
+   CFTYPE_SHARES_FILE  = (1 << 5), /* shares file w/ other cfts */
+
/* internal flags, do not use outside cgroup core proper */
__CFTYPE_ONLY_ON_DFL= (1 << 16),/* only on default hierarchy */
__CFTYPE_NOT_ON_DFL = (1 << 17),/* not on default hierarchy */
@@ -528,6 +530,13 @@ struct cftype {
 */
struct cgroup_subsys *ss;   /* NULL for cgroup core files */
struct list_head node;  /* anchored at ss->cfts */
+
+   /*
+* List of cftypes that are sharing the same file. It allows the hook
+* functions of the cftypes in the list to be called together.
+*/
+   struct list_head share_node;
+
struct kernfs_ops *kf_ops;
 
int (*open)(struct kernfs_open_file *of);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 9d0993dd68fe..61eafd69e2fd 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1549,11 +1549,15 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node 
*kn, bool drain_offline)
return NULL;
 }
 
-static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, struct cftype *cft)
 {
char name[CGROUP_FILE_NAME_MAX];
+   struct kernfs_node *kn = kernfs_find_and_get(cgrp->kn,
+   cgroup_file_name(cgrp, cft, name));
+   struct cftype *cfts = kn->priv;
 
lockdep_assert_held(&cgroup_mutex);
+   kernfs_put(kn);
 
if (cft->file_offset) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
@@ -1566,7 +1570,19 @@ static void cgroup_rm_file(struct cgroup *cgrp, const 
struct cftype *cft)
del_timer_sync(&cfile->notify_timer);
}
 
-   kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
+   /* Delete the file only if it's used by one cftype */
+   if (list_empty(&cft->share_node) || atomic_read(&kn->count) == 1) {
+   kernfs_remove(kn);
+   } else {
+   /*
+* Update the "priv" pointer of the kernfs_node if the cftype
+* that first created the file is removed.
+*/
+   if (cft == cfts)
+   kn->priv = list_next_entry(cft, share_node);
+
+   kernfs_put(kn);
+   }
 }
 
 /**
@@ -3437,6 +3453,7 @@ static int cgroup_file_open(struct kernfs_open_file *of)
 {
struct cftype *cft = of->kn->priv;
 
+
if (cft->open)
return cft->open(of);
return 0;
@@ -3585,6 +3602,22 @@ static int cgroup_add_file(struct cgroup_subsys_state 
*css, struct cgroup *cgrp,
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
key = &cft->lockdep_key;
 #endif
+
+   if (cft->flags & CFTYPE_SHARES_FILE) {
+   /* kn->count keeps track of how many cftypes share kn */
+   kn = kernfs_find_and_get(cgrp->kn,
+cgroup_file_name(cgrp, cft, name));
+   if (kn) {
+   struct cftype *cft

[PATCH V2 10/10] doc, bfq-iosched: make it consistent with the new cgroup interface

2018-11-19 Thread Paolo Valente
BFQ now shares interface files with CFQ, for the proportional-share
policy. Make documentation consistent with that.

Signed-off-by: Paolo Valente 
---
 Documentation/block/bfq-iosched.txt | 28 +++-
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/Documentation/block/bfq-iosched.txt 
b/Documentation/block/bfq-iosched.txt
index 6d7dd5ab8554..3d7ef138ce6a 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -508,12 +508,14 @@ process.
 To get proportional sharing of bandwidth with BFQ for a given device,
 BFQ must of course be the active scheduler for that device.
 
-Within each group directory, the names of the files associated with
-BFQ-specific cgroup parameters and stats begin with the "bfq."
-prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
-BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
-parameter to set the weight of a group with BFQ is blkio.bfq.weight
-or io.bfq.weight.
+BFQ uses the standard interface files of the proportional-share
+policy, previously used by CFQ. If one such file is read/written, then
+the operation associated with the file is performed by BFQ for each
+device where BFQ is the active scheduler. In addition, BFQ is
+configured so as to share interface files with other entities (if one
+of these files does happen to be shared, then its associated operation
+is also performed by any other entity that is using the
+file).
 
 As for cgroups-v1 (blkio controller), the exact set of stat files
 created, and kept up-to-date by bfq, depends on whether
@@ -521,13 +523,13 @@ CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq 
creates all
 the stat files documented in
 Documentation/cgroup-v1/blkio-controller.txt. If, instead,
 CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files
-blkio.bfq.io_service_bytes
-blkio.bfq.io_service_bytes_recursive
-blkio.bfq.io_serviced
-blkio.bfq.io_serviced_recursive
+blkio.io_service_bytes
+blkio.io_service_bytes_recursive
+blkio.io_serviced
+blkio.io_serviced_recursive
 
 The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum
-throughput sustainable with bfq, because updating the blkio.bfq.*
+throughput sustainable with BFQ, because updating the blkio.*
 stats is rather costly, especially for some of the stats enabled by
 CONFIG_DEBUG_BLK_CGROUP.
 
@@ -536,8 +538,8 @@ Parameters to set
 
 For each group, there is only the following parameter to set.
 
-weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
-group inside its parent. Available values: 1..1 (default 100). The
+weight (namely blkio.weight or io.weight): the weight of the group
+inside its parent. Available values: 1..1 (default 100). The
 linear mapping between ioprio and weights, described at the beginning
 of the tunable section, is still valid, but all weights higher than
 IOPRIO_BE_NR*10 are mapped to ioprio 0.
-- 
2.16.1



[PATCH V2 07/10] block, bfq: use standard file names for the proportional-share policy

2018-11-19 Thread Paolo Valente
Some of the files exposed in a cgroup by bfq, for the proportional
share policy, have the same meaning as the files owned by cfq (before
legacy blk was removed).

The old implementation of the cgroup interface didn't allow different
entities to create cgroup files with the same name (in the same
subsystem). So, for bfq, we had to add the prefix "bfq" to the names
of its cgroup files.

This commit renames the cgroup files of the bfq scheduler to match those
exposed by cfq, and makes bfq willing to share these files with any
other future policy.

Signed-off-by: Angelo Ruocco 
Signed-off-by: Paolo Valente 
---
 block/bfq-cgroup.c | 94 +++---
 1 file changed, 69 insertions(+), 25 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 038e418fa64f..0643147b2cbc 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -1055,50 +1055,67 @@ struct blkcg_policy blkcg_policy_bfq = {
 
 struct cftype bfq_blkcg_legacy_files[] = {
{
-   .name = "bfq.weight",
-   .flags = CFTYPE_NOT_ON_ROOT,
+   .name = "weight",
+   .owner_name = "bfq",
+   .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_SHARES_FILE,
.seq_show = bfq_io_show_weight,
.write_u64 = bfq_io_set_weight_legacy,
},
 
/* statistics, covers only the tasks in the bfqg */
{
-   .name = "bfq.io_service_bytes",
+   .name = "io_service_bytes",
+   .owner_name = "bfq",
+   .flags = CFTYPE_SHARES_FILE,
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show_cft = blkg_print_stat_bytes,
},
{
-   .name = "bfq.io_serviced",
+   .name = "io_serviced",
+   .owner_name = "bfq",
+   .flags = CFTYPE_SHARES_FILE,
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show_cft = blkg_print_stat_ios,
},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
{
-   .name = "bfq.time",
+   .name = "time",
+   .owner_name = "bfq",
+   .flags = CFTYPE_SHARES_FILE,
.private = offsetof(struct bfq_group, stats.time),
.seq_show_cft = bfqg_print_stat,
},
{
-   .name = "bfq.sectors",
+   .name = "sectors",
+   .owner_name = "bfq",
+   .flags = CFTYPE_SHARES_FILE,
.seq_show = bfqg_print_stat_sectors,
},
{
-   .name = "bfq.io_service_time",
+   .name = "io_service_time",
+   .owner_name = "bfq",
+   .flags = CFTYPE_SHARES_FILE,
.private = offsetof(struct bfq_group, stats.service_time),
.seq_show_cft = bfqg_print_rwstat,
},
{
-   .name = "bfq.io_wait_time",
+   .name = "io_wait_time",
+   .owner_name = "bfq",
+   .flags = CFTYPE_SHARES_FILE,
.private = offsetof(struct bfq_group, stats.wait_time),
.seq_show_cft = bfqg_print_rwstat,
},
{
-   .name = "bfq.io_merged",
+   .name = "io_merged",
+   .owner_name = "bfq",
+   .flags = CFTYPE_SHARES_FILE,
.private = offsetof(struct bfq_group, stats.merged),
.seq_show_cft = bfqg_print_rwstat,
},
{
-   .name = "bfq.io_queued",
+   .name = "io_queued",
+   .owner_name = "bfq",
+   .flags = CFTYPE_SHARES_FILE,
.private = offsetof(struct bfq_group, stats.queued),
.seq_show_cft = bfqg_print_rwstat,
},
@@ -1106,66 +1123,92 @@ struct cftype bfq_blkcg_legacy_files[] = {
 
/* the same statictics which cover the bfqg and its descendants */
{
-   .name = "bfq.io_service_bytes_recursive",
+   .name = "io_service_bytes_recursive",
+   .owner_name = "bfq",
+   .flags = CFTYPE_SHARES_FILE,
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show_cft = blkg_print_stat_bytes_recursive,
},
{
-   .name = "bfq.io_serviced_recursive",
+   .name = "io_serviced_recursive",
+   .owner_name = "bfq",
+   .flags = CFTYPE_SHARES_FILE,
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show_cft = blkg_print_stat_ios_recursive,
},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
{
-   .name =

[PATCH V2 09/10] doc, bfq-iosched: fix a few clerical errors

2018-11-19 Thread Paolo Valente
This commit fixes a few clerical errors in
Documentation/block/bfq-iosched.txt.

Signed-off-by: Paolo Valente 
---
 Documentation/block/bfq-iosched.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/block/bfq-iosched.txt 
b/Documentation/block/bfq-iosched.txt
index 8d8d8f06cab2..6d7dd5ab8554 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -42,7 +42,7 @@ sustainable throughputs, on the same systems as above:
 
 BFQ works for multi-queue devices too.
 
-The table of contents follow. Impatients can just jump to Section 3.
+The table of contents follows. Impatients can just jump to Section 3.
 
 CONTENTS
 
@@ -51,7 +51,7 @@ CONTENTS
  1-2 Server systems
 2. How does BFQ work?
 3. What are BFQ's tunables and how to properly configure BFQ?
-4. BFQ group scheduling
+4. Group scheduling with BFQ
  4-1 Service guarantees provided
  4-2 Interface
 
@@ -294,7 +294,7 @@ too.
 per-process ioprio and weight
 -
 
-Unless the cgroups interface is used (see "4. BFQ group scheduling"),
+Unless the cgroups interface is used (see "4. Group scheduling with BFQ"),
 weights can be assigned to processes only indirectly, through I/O
 priorities, and according to the relation:
 weight = (IOPRIO_BE_NR - ioprio) * 10.
-- 
2.16.1



[PATCH V2 08/10] block, throttle: allow sharing cgroup statistic files

2018-11-19 Thread Paolo Valente
From: Angelo Ruocco 

Some of the cgroup files defined in the throttle policy have the same
meaning as those defined in the proportional share policy.

This commit uses the new file sharing interface in cgroup to share
these files.

Signed-off-by: Angelo Ruocco 
Signed-off-by: Paolo Valente 
---
 block/blk-throttle.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6bfdaac53b6f..95825448c031 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1491,22 +1491,30 @@ static struct cftype throtl_legacy_files[] = {
.write = tg_set_conf_uint,
},
{
-   .name = "throttle.io_service_bytes",
+   .name = "io_service_bytes",
+   .owner_name = "throttle",
+   .flags = CFTYPE_SHARES_FILE,
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show_cft = blkg_print_stat_bytes,
},
{
-   .name = "throttle.io_service_bytes_recursive",
+   .name = "io_service_bytes_recursive",
+   .owner_name = "throttle",
+   .flags = CFTYPE_SHARES_FILE,
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show_cft = blkg_print_stat_bytes_recursive,
},
{
-   .name = "throttle.io_serviced",
+   .name = "io_serviced",
+   .owner_name = "throttle",
+   .flags = CFTYPE_SHARES_FILE,
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show_cft = blkg_print_stat_ios,
},
{
-   .name = "throttle.io_serviced_recursive",
+   .name = "io_serviced_recursive",
+   .owner_name = "throttle",
+   .flags = CFTYPE_SHARES_FILE,
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show_cft = blkg_print_stat_ios_recursive,
},
-- 
2.16.1



Re: [PATCH] block: BFQ default for single queue devices

2018-10-05 Thread Paolo Valente



> On 5 Oct 2018, at 00:42, Bart Van Assche  wrote:
> 
> On Thu, 2018-10-04 at 22:39 +0200, Paolo Valente wrote:
>> No, kernel build is, for evident reasons, one of the workloads I cared
>> most about.  Actually, I tried to focus on all my main
>> kernel-development tasks, such as also git checkout, git merge, git
>> grep, ...
>> 
>> According to my test results, with BFQ these tasks are at least as
>> fast as, or, in most system configurations, much faster than with the
>> other schedulers.  Of course, at the same time the system also remains
>> responsive with BFQ.
>> 
>> You can repeat these tests using one of my first scripts in the S
>> suite: kern_dev_tasks_vs_rw.sh (usually, the older the tests, the more
>> hypertrophied the names I gave :) ).
>> 
>> I stopped sharing also my kernel-build results years ago, because I
>> went on obtaining the same, identical good results for years, and I'm
>> aware that I tend to show and say too much stuff.
> 
> On my test setup building the kernel is slightly slower when using the BFQ
> scheduler compared to using scheduler "none" (kernel 4.18.12, NVMe SSD,
> single CPU with 6 cores, hyperthreading disabled). I am aware that the
> proposal at the start of this thread was to make BFQ the default for devices
> with a single hardware queue and not for devices like NVMe SSDs that support
> multiple hardware queues.
> 

I miss your point: as you yourself note, the proposal is limited to
single-queue devices, exactly because BFQ is not ready for
multiple-queue devices yet.

> What I think is missing is measurement results for BFQ on a system with
> multiple CPU sockets and against a fast storage medium.

It is not missing.  As I happened to report in previous threads, we
made a script to measure that too [1], using fio and null block.

I have reported the results we obtained, for three classes of
processors, in the in-kernel BFQ documentation [2].

In particular, BFQ reached 400KIOPS with the fastest CPU mentioned in
that document (Intel i7-4850HQ).

So, since the speed of that single-socket commodity CPU is most likely
lower than the total speed of a multi-socket system, we have that, on
such a system and with BFQ, you should be conservatively ok with
single-queue devices in the range 300-500 KIOPS.

[1] https://github.com/Algodev-github/IOSpeed
[2] https://www.kernel.org/doc/Documentation/block/bfq-iosched.txt

>  

> Eliminating
> the host lock from the SCSI core yielded a significant performance
> improvement for such storage devices. Since the BFQ scheduler locks and
> unlocks bfqd->lock for every dispatch operation it is very likely that BFQ
> will slow down I/O for fast storage devices, even if their driver only
> creates a single hardware queue.
> 

One of the main motivations behind NVMe, and blk-mq itself, is that it
is hard to reach the above IOPS, and more, with a single I/O queue as
bottleneck.

So, I wouldn't expect that systems
- equipped with single-queue drives reaching more than 500 KIOPS
- using SATA or some other non-NVMe as protocol
- so fast to push these drives to their maximum speeds
constitute more than a negligible percentage of devices.

So, by sticking to mq-deadline, we would sacrifice 99% of systems, to
make sure, basically, that those very few systems on steroids reach
maximum throughput with random I/O (while however still suffering from
responsiveness problems).  I think it makes much more sense to have as
default what is best for 99% of the single-queue systems, with those
super systems properly reconfigured by their users.  For sure, other
defaults are to be changed too, to get the most out of those systems.

Thanks,
Paolo


> Bart.



Re: [PATCH BUGFIX 0/3] three fixes for the function bfq_requests_merged

2018-05-31 Thread Paolo Valente



> On 31 May 2018, at 16:48, Jens Axboe  wrote:
> 
> On 5/31/18 7:23 AM, Paolo Valente wrote:
>> Hi Jens,
>> this series fixes three bugs in bfq_requests_merged. In more detail:
>> 
>> - two linked bugs, with the first (critical: wrong lock) hidden by the
>>  second (less critical: wrong check that made the body of the
>>  function not be executed at all)
>> - a rather minor bug: superfluous code
> 
> Patch #2 doesn't apply on for-4.18/block.

Yep, it gave some problems to me too, because we happened to move this
patch around by email.

> I hand applied it, all
> 3 are queued up now.
> 

Thanks for the patience and for queueing these fixes.

Paolo

> -- 
> Jens Axboe
> 



[PATCH BUGFIX/IMPROVEMENTS 3/4] block, bfq: increase weight-raising duration for interactive apps

2018-05-31 Thread Paolo Valente
From: Davide Sapienza 

The maximum possible duration of the weight-raising period for
interactive applications is limited to 13 seconds, as this is the time
needed to load the largest application that we considered when tuning
weight raising. Unfortunately, in such an evaluation, we did not
consider the case of very slow virtual machines.

For example, on a QEMU/KVM virtual machine
- running in a slow PC;
- with a virtual disk stacked on a slow low-end 5400rpm HDD;
- serving a heavy I/O workload, such as the sequential reading of
several files;
mplayer takes 23 seconds to start, if constantly weight-raised.

To address this issue, this commit conservatively sets the upper limit
for weight-raising duration to 25 seconds.

Signed-off-by: Davide Sapienza 
Signed-off-by: Paolo Valente 
---
 block/bfq-iosched.c | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 262c929e24ee..31ce089d54eb 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -930,22 +930,26 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
do_div(dur, bfqd->peak_rate);
 
/*
-* Limit duration between 3 and 13 seconds. Tests show that
-* higher values than 13 seconds often yield the opposite of
-* the desired result, i.e., worsen responsiveness by letting
-* non-interactive and non-soft-real-time applications
-* preserve weight raising for a too long time interval.
+* Limit duration between 3 and 25 seconds. The upper limit
+* has been conservatively set after the following worst case:
+* on a QEMU/KVM virtual machine
+* - running in a slow PC
+* - with a virtual disk stacked on a slow low-end 5400rpm HDD
+* - serving a heavy I/O workload, such as the sequential reading
+*   of several files
+* mplayer took 23 seconds to start, if constantly weight-raised.
+*
+* As for higher values than that accommodating the above bad
+* scenario, tests show that higher values would often yield
+* the opposite of the desired result, i.e., would worsen
+* responsiveness by allowing non-interactive applications to
+* preserve weight raising for too long.
 *
 * On the other end, lower values than 3 seconds make it
 * difficult for most interactive tasks to complete their jobs
 * before weight-raising finishes.
 */
-   if (dur > msecs_to_jiffies(13000))
-   dur = msecs_to_jiffies(13000);
-   else if (dur < msecs_to_jiffies(3000))
-   dur = msecs_to_jiffies(3000);
-
-   return dur;
+   return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000));
 }
 
 /* switch back from soft real-time to interactive weight raising */
-- 
2.16.1
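
To make the effect of the new clamp concrete, here is a small user-space
sketch (an illustration only, not kernel code: jiffies are approximated as
plain milliseconds and clamp_val() is re-implemented locally):

#include <stdio.h>

/* Local stand-in for the kernel's clamp_val(); values are milliseconds. */
#define clamp_val(v, lo, hi) ((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))

int main(void)
{
        unsigned long dur_ms = 23000;   /* duration needed on the slow VM described above */

        /* old upper limit: weight raising stops at 13 s, before mplayer has started */
        printf("old cap: %lu ms\n", clamp_val(dur_ms, 3000UL, 13000UL));
        /* new upper limit: the full 23 s survives the clamp */
        printf("new cap: %lu ms\n", clamp_val(dur_ms, 3000UL, 25000UL));
        return 0;
}

The 3-second lower bound is unchanged.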



[PATCH BUGFIX/IMPROVEMENTS 2/4] block, bfq: remove slow-system class

2018-05-31 Thread Paolo Valente
BFQ computes the duration of weight raising for interactive
applications automatically, using some reference parameters. In
particular, BFQ uses the best durations (see comments in the code for
how these durations have been assessed) for two classes of systems:
slow and fast ones. Examples of slow systems are old phones or systems
using micro HDDs. Fast systems are all the remaining ones. Using these
parameters, BFQ computes the actual duration of the weight raising,
for the system at hand, as a function of the relative speed of the
system w.r.t. the speed of a reference system, belonging to the same
class of systems as the system at hand.

This slow vs fast differentiation proved to be useful in the past, but
has little meaning with current hardware. Even worse, it causes
problems in virtual systems, where the speed of the system can vary
frequently, and so widely as to simply confuse the class-detection
mechanism and, as we have verified experimentally, to cause BFQ to
compute nonsensical weight-raising durations.

This commit addresses this issue by removing the slow class and the
class-detection mechanism.

Signed-off-by: Paolo Valente 
---
 block/bfq-iosched.c | 137 
 block/bfq-iosched.h |  14 ++
 2 files changed, 46 insertions(+), 105 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index f3703e7431aa..262c929e24ee 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -251,55 +251,43 @@ static struct kmem_cache *bfq_pool;
  * When configured for computing the duration of the weight-raising
  * for interactive queues automatically (see the comments at the
  * beginning of this file), BFQ does it using the following formula:
- * duration = (R / r) * T,
- * where r is the peak rate of the device, and R
- * and T are two reference parameters. In particular,
- * R is the peak rate of the reference device (see below), and
- * T is a reference time: given the systems that are likely
- * to be installed on the reference device according to its speed
- * class, T is about the maximum time needed, under BFQ and
- * while reading two files in parallel, to load typical large
- * applications on these systems (see the comments on
- * max_service_from_wr below, for more details on how T is
- * obtained).  In practice, the slower/faster the device at hand is,
- * the more/less it takes to load applications with respect to the
+ * duration = (ref_rate / r) * ref_wr_duration,
+ * where r is the peak rate of the device, and ref_rate and
+ * ref_wr_duration are two reference parameters.  In particular,
+ * ref_rate is the peak rate of the reference storage device (see
+ * below), and ref_wr_duration is about the maximum time needed, with
+ * BFQ and while reading two files in parallel, to load typical large
+ * applications on the reference device (see the comments on
+ * max_service_from_wr below, for more details on how ref_wr_duration
+ * is obtained).  In practice, the slower/faster the device at hand
+ * is, the more/less it takes to load applications with respect to the
  * reference device.  Accordingly, the longer/shorter BFQ grants
  * weight raising to interactive applications.
  *
- * BFQ uses four different reference pairs (R, T), depending on:
- * . whether the device is rotational or non-rotational;
- * . whether the device is slow, such as old or portable HDDs, as well as
- *   SD cards, or fast, such as newer HDDs and SSDs.
+ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration),
+ * depending on whether the device is rotational or non-rotational.
  *
- * The device's speed class is dynamically (re)detected in
- * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ * In the following definitions, ref_rate[0] and ref_wr_duration[0]
+ * are the reference values for a rotational device, whereas
+ * ref_rate[1] and ref_wr_duration[1] are the reference values for a
+ * non-rotational device. The reference rates are not the actual peak
+ * rates of the devices used as a reference, but slightly lower
+ * values. The reason for using slightly lower values is that the
+ * peak-rate estimator tends to yield slightly lower values than the
+ * actual peak rate (it can yield the actual peak rate only if there
+ * is only one process doing I/O, and the process does sequential
+ * I/O).
  *
- * In the following definitions, R_slow[0]/R_fast[0] and
- * T_slow[0]/T_fast[0] are the reference values for a slow/fast
- * rotational device, whereas R_slow[1]/R_fast[1] and
- * T_slow[1]/T_fast[1] are the reference values for a slow/fast
- * non-rotational device. Finally, device_speed_thresh are the
- * thresholds used to switch between speed classes. The reference
- * rates are not the actual peak rates of the devices used as a
- * reference, but slightly lower values. The reason for using these
- * slightly lower values is that the peak-rate estimator tends to
- * yield slightly lower values
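
To make the formula in the comment above concrete, here is a minimal
user-space sketch of duration = (ref_rate / r) * ref_wr_duration (an
illustration only; the reference values below are placeholders, not the
ones actually used in bfq-iosched.c):

#include <stdio.h>

/* Placeholder reference values, for illustration only.
 * Index 0 = rotational device, 1 = non-rotational device. */
static const double ref_rate[2]        = { 100e6, 500e6 }; /* bytes/s, assumed */
static const double ref_wr_duration[2] = { 10.0, 3.0 };    /* seconds, assumed */

/* duration = (ref_rate / r) * ref_wr_duration, for the device's class */
static double wr_duration(double peak_rate, int nonrot)
{
        return ref_rate[nonrot] / peak_rate * ref_wr_duration[nonrot];
}

int main(void)
{
        /* a rotational disk half as fast as the reference gets twice the duration */
        printf("%.1f s\n", wr_duration(50e6, 0));   /* prints 20.0 s */
        /* a non-rotational drive twice as fast gets half of it */
        printf("%.1f s\n", wr_duration(1000e6, 1)); /* prints 1.5 s */
        return 0;
}

In bfq the result is then clamped between 3 and 25 seconds, as in the 3/4
patch above.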


Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge

2018-05-14 Thread Paolo Valente


> On 14 May 2018, at 19:31, Jens Axboe <ax...@kernel.dk> wrote:
> 
> On 5/14/18 11:16 AM, Paolo Valente wrote:
>> 
>> 
>>> On 10 May 2018, at 18:14, Bart Van Assche <bart.vanass...@wdc.com> wrote:
>>> 
>>> On Fri, 2018-05-04 at 19:17 +0200, Paolo Valente wrote:
>>>> When invoked for an I/O request rq, [ ... ]
>>> 
>>> Tested-by: Bart Van Assche <bart.vanass...@wdc.com>
>>> 
>>> 
>>> 
>> 
>> Any decision for this fix, Jens?
> 
> Guess I didn't reply, but I did commit this on Thursday.
> 

Great, thank you!

Paolo

> -- 
> Jens Axboe
> 




Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge

2018-05-14 Thread Paolo Valente


> On 10 May 2018, at 18:14, Bart Van Assche <bart.vanass...@wdc.com> wrote:
> 
> On Fri, 2018-05-04 at 19:17 +0200, Paolo Valente wrote:
>> When invoked for an I/O request rq, [ ... ]
> 
> Tested-by: Bart Van Assche <bart.vanass...@wdc.com>
> 
> 
> 

Any decision for this fix, Jens?

Thanks,
Paolo


Re: bug in tag handling in blk-mq?

2018-05-08 Thread Paolo Valente


> On 9 May 2018, at 06:11, Mike Galbraith wrote:
> 
> On Tue, 2018-05-08 at 19:09 -0600, Jens Axboe wrote:
>> 
>> Alright, I managed to reproduce it. What I think is happening is that
>> BFQ is limiting the inflight case to something less than the wake
>> batch for sbitmap, which can lead to stalls. I don't have time to test
>> this tonight, but perhaps you can give it a go when you are back at it.
>> If not, I'll try tomorrow morning.
>> 
>> If this is the issue, I can turn it into a real patch. This is just to
>> confirm that the issue goes away with the below.
> 
> Confirmed.  Impressive high speed bug stomping.
> 

Great! It's a real relief that this ghost is gone.

Thank you both,
Paolo

>> diff --git a/lib/sbitmap.c b/lib/sbitmap.c
>> index e6a9c06ec70c..94ced15b6428 100644
>> --- a/lib/sbitmap.c
>> +++ b/lib/sbitmap.c
>> @@ -272,6 +272,7 @@ EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
>> 
>> static unsigned int sbq_calc_wake_batch(unsigned int depth)
>> {
>> +#if 0
>>  unsigned int wake_batch;
>> 
>>  /*
>> @@ -284,6 +285,9 @@ static unsigned int sbq_calc_wake_batch(unsigned int 
>> depth)
>>  wake_batch = max(1U, depth / SBQ_WAIT_QUEUES);
>> 
>>  return wake_batch;
>> +#else
>> +return 1;
>> +#endif
>> }
>> 
>> int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
>> 
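
To see why a too-small shallow depth can stall tag allocation, here is a
small user-space sketch of the interaction described above (an illustration
only; the depth of 64 is an arbitrary assumption, while the wake-batch
formula mirrors the quoted sbq_calc_wake_batch()):

#include <stdio.h>

#define SBQ_WAIT_QUEUES 8   /* number of sbitmap wait queues, as in lib/sbitmap.h */

/* Waiters are woken only after wake_batch tags have been freed. */
static unsigned int sbq_calc_wake_batch(unsigned int depth)
{
        unsigned int wake_batch = depth / SBQ_WAIT_QUEUES;

        return wake_batch ? wake_batch : 1;
}

int main(void)
{
        unsigned int depth = 64;         /* assumed tag-map depth */
        unsigned int shallow_depth = 1;  /* assumed scheduler-imposed cap, below wake_batch */
        unsigned int wake_batch = sbq_calc_wake_batch(depth);

        if (shallow_depth < wake_batch)
                printf("at most %u tag(s) can ever be in flight, but %u frees are\n"
                       "needed before sleeping allocators are woken -> possible hang\n",
                       shallow_depth, wake_batch);
        return 0;
}

Forcing sbq_calc_wake_batch() to return 1, as the quoted debug patch does,
closes exactly this gap, which is consistent with the hang disappearing.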



Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge

2018-05-07 Thread Paolo Valente


> On 7 May 2018, at 12:01, Mike Galbraith <efa...@gmx.de> wrote:
> 
> On Mon, 2018-05-07 at 11:27 +0200, Paolo Valente wrote:
>> 
>> 
>> Where is the bug?
> 
> Hm, seems potent pain-killers and C don't mix all that well.
> 

I'll try to keep it in mind, and I do wish you a quick recovery.

Thanks,
Paolo


Re: bug in tag handling in blk-mq?

2018-05-07 Thread Paolo Valente


> On 7 May 2018, at 18:39, Jens Axboe <ax...@kernel.dk> wrote:
> 
> On 5/7/18 8:03 AM, Paolo Valente wrote:
>> Hi Jens, Christoph, all,
>> Mike Galbraith has been experiencing hangs, on blk_mq_get_tag, only
>> with bfq [1].  Symptoms seem to clearly point to a problem in I/O-tag
>> handling, triggered by bfq because it limits the number of tags for
>> async and sync write requests (in bfq_limit_depth).
>> 
>> Fortunately, I just happened to find a way to apparently confirm it.
>> With the following one-liner for block/bfq-iosched.c:
>> 
>> @@ -554,8 +554,7 @@ static void bfq_limit_depth(unsigned int op, struct 
>> blk_mq_alloc_data *data)
>>if (unlikely(bfqd->sb_shift != bt->sb.shift))
>>bfq_update_depths(bfqd, bt);
>> 
>> -   data->shallow_depth =
>> -   bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
>> +   data->shallow_depth = 1;
>> 
>>bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
>>__func__, bfqd->wr_busy_queues, op_is_sync(op),
>> 
>> Mike's machine now crashes soon and systematically, while nothing bad
>> happens on my machines, even with heavy workloads (apart from an
>> expected throughput drop).
>> 
>> This change simply reduces to 1 the maximum possible value for the sum
>> of the number of async requests and of sync write requests.
>> 
>> This email is basically a request for help to knowledgeable people.  To
>> start, here are my first doubts/questions:
>> 1) Just to be certain, I guess it is not normal that blk-mq hangs if
>> async requests and sync write requests can be at most one, right?
>> 2) Do you have any hint to where I could look for, to chase this bug?
>> Of course, the bug may be in bfq, i.e, it may be a somehow unrelated
>> bfq bug that causes this hang in blk-mq, indirectly.  But it is hard
>> for me to understand how.
> 
> CC Omar, since he implemented the shallow part. But we'll need some
> traces to show where we are hung, probably also the value of the
> /sys/debug/kernel/block// directory. For the crash mentioned, a
> trace as well. Otherwise we'll be wasting a lot of time on this.
> 
> Is there a reproducer?
> 

Ok Mike, I guess it's your turn now, for at least a stack trace.

Thanks,
Paolo

> -- 
> Jens Axboe



bug in tag handling in blk-mq?

2018-05-07 Thread Paolo Valente
Hi Jens, Christoph, all,
Mike Galbraith has been experiencing hangs, on blk_mq_get_tag, only
with bfq [1].  Symptoms seem to clearly point to a problem in I/O-tag
handling, triggered by bfq because it limits the number of tags for
async and sync write requests (in bfq_limit_depth).

Fortunately, I just happened to find a way to apparently confirm it.
With the following one-liner for block/bfq-iosched.c:

@@ -554,8 +554,7 @@ static void bfq_limit_depth(unsigned int op, struct 
blk_mq_alloc_data *data)
if (unlikely(bfqd->sb_shift != bt->sb.shift))
bfq_update_depths(bfqd, bt);
 
-   data->shallow_depth =
-   bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+   data->shallow_depth = 1;
 
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
__func__, bfqd->wr_busy_queues, op_is_sync(op),

Mike's machine now crashes soon and systematically, while nothing bad
happens on my machines, even with heavy workloads (apart from an
expected throughput drop).

This change simply reduces to 1 the maximum possible value for the sum
of the number of async requests and of sync write requests.

This email is basically a request for help from knowledgeable people.  To
start, here are my first doubts/questions:
1) Just to be certain, I guess it is not normal for blk-mq to hang if
async requests and sync write requests can be at most one, right?
2) Do you have any hint as to where I could look to chase this bug?
Of course, the bug may be in bfq, i.e., it may be a somehow unrelated
bfq bug that causes this hang in blk-mq indirectly.  But it is hard
for me to understand how.

Looking forward to some help.

Thanks,
Paolo

[1] https://www.spinics.net/lists/stable/msg215036.html


Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge

2018-05-07 Thread Paolo Valente


> On 7 May 2018, at 05:23, Mike Galbraith <efa...@gmx.de> wrote:
> 
> On Mon, 2018-05-07 at 04:43 +0200, Mike Galbraith wrote:
>> On Sun, 2018-05-06 at 09:42 +0200, Paolo Valente wrote:
>>> 
>>> I've attached a compressed patch (to avoid possible corruption from my
>>> mailer).  I'm little confident, but no pain, no gain, right?
>>> 
>>> If possible, apply this patch on top of the fix I proposed in this
>>> thread, just to eliminate possible further noise. Finally, the
>>> patch content follows.
>>> 
>>> Hoping for a stroke of luck,
>> 
>> FWIW, box didn't survive the first full build of the morning.
> 
> Nor the second.
> 

Great, finally the first good news!

Help from blk-mq experts would be fundamental here.  To increase the
chances of getting it, I'm going to open a new thread on this.  In that
thread I'll ask you to provide an oops or something similar; I hope it
is now easier for you to get one.

Thanks,
Paolo

>   -Mike



Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge

2018-05-07 Thread Paolo Valente


> On 7 May 2018, at 07:56, Mike Galbraith <efa...@gmx.de> wrote:
> 
> On Sun, 2018-05-06 at 09:42 +0200, Paolo Valente wrote:
>> 
>> diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
>> index 118f319af7c0..6662efe29b69 100644
>> --- a/block/bfq-mq-iosched.c
>> +++ b/block/bfq-mq-iosched.c
>> @@ -525,8 +525,13 @@ static void bfq_limit_depth(unsigned int op, struct 
>> blk_mq_alloc_data *data)
>>if (unlikely(bfqd->sb_shift != bt->sb.shift))
>>bfq_update_depths(bfqd, bt);
>> 
>> +#if 0
>>data->shallow_depth =
>>bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
>^
> 
> Q: why doesn't the top of this function look like so?
> 
> ---
> block/bfq-iosched.c |2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
> 
> --- a/block/bfq-iosched.c
> +++ b/block/bfq-iosched.c
> @@ -539,7 +539,7 @@ static void bfq_limit_depth(unsigned int
>   struct bfq_data *bfqd = data->q->elevator->elevator_data;
>   struct sbitmap_queue *bt;
> 
> - if (op_is_sync(op) && !op_is_write(op))
> + if (!op_is_write(op))
>   return;
> 
>   if (data->flags & BLK_MQ_REQ_RESERVED) {
> 
> It looks a bit odd that these elements exist...
> 
> +   /*
> +* no more than 75% of tags for sync writes (25% extra tags
> +* w.r.t. async I/O, to prevent async I/O from starving sync
> +* writes)
> +*/
> +   bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
> 
> +   /* no more than ~37% of tags for sync writes (~20% extra tags) */
> +   bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
> 
> ...yet we index via and log a guaranteed zero.
> 

I'm not sure I got your point, so, to help you help me quickly, I'll
repeat what I expect the code you highlight to do:

- sync reads must have no limitation, and the lines
if (op_is_sync(op) && !op_is_write(op))
return;
make sure they don't

- sync writes must be limited, and the code you pasted above computes
those limits

- for sync writes, for which op_is_sync(op) is true (but the condition
"op_is_sync(op) && !op_is_write(op)" is false), the line:
bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
becomes
bfqd->word_depths[!!bfqd->wr_busy_queues][1];
and yields the right limit for sync writes, depending on
bfqd->wr_busy_queues (a small illustrative sketch of this lookup follows
below).
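
A tiny user-space sketch of this 2x2 lookup (an illustration only; the two
sync-write percentages come from the comments quoted above, while the async
limits are left symbolic):

#include <stdio.h>

/* Row    = !!bfqd->wr_busy_queues  (is any queue being weight-raised?)
 * Column = op_is_sync(op), which is 0 for async I/O and 1 for sync writes
 * here, since sync reads take the early return and are never depth-limited. */
static const char *limit_desc[2][2] = {
        { "async limit (no wr)", "sync-write limit, <= 75% of tags (no wr)" },
        { "async limit (wr)",    "sync-write limit, <= ~37% of tags (wr)"   },
};

int main(void)
{
        int wr_busy_queues;  /* stand-in for bfqd->wr_busy_queues */
        int op_is_sync;      /* op_is_sync(op) for an op that reaches the lookup */

        /* stepping by 3 shows that !! collapses any non-zero queue count to 1 */
        for (wr_busy_queues = 0; wr_busy_queues <= 3; wr_busy_queues += 3)
                for (op_is_sync = 0; op_is_sync <= 1; op_is_sync++)
                        printf("word_depths[%d][%d] -> %s\n",
                               !!wr_busy_queues, op_is_sync,
                               limit_desc[!!wr_busy_queues][op_is_sync]);
        return 0;
}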

Where is the bug?

Thanks,
Paolo


>   -Mike
> 
> 



Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge

2018-05-06 Thread Paolo Valente


> On 5 May 2018, at 16:56, Mike Galbraith <efa...@gmx.de> wrote:
> 
> On Sat, 2018-05-05 at 12:39 +0200, Paolo Valente wrote:
>> 
>> BTW, if you didn't run out of patience with this permanent issue yet,
>> I was thinking of two o three changes to retry to trigger your failure
>> reliably.
> 
> Sure, fire away, I'll happily give the annoying little bugger
> opportunities to show its tender belly.

I've attached a compressed patch (to avoid possible corruption from my
mailer).  I'm not very confident, but no pain, no gain, right?

If possible, apply this patch on top of the fix I proposed in this
thread, just to eliminate possible further noise. Finally, the
patch content follows.

Hoping for a stroke of luck,
Paolo

diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
index 118f319af7c0..6662efe29b69 100644
--- a/block/bfq-mq-iosched.c
+++ b/block/bfq-mq-iosched.c
@@ -525,8 +525,13 @@ static void bfq_limit_depth(unsigned int op, struct 
blk_mq_alloc_data *data)
if (unlikely(bfqd->sb_shift != bt->sb.shift))
bfq_update_depths(bfqd, bt);
 
+#if 0
data->shallow_depth =
bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+#else
+   data->shallow_depth = 1;
+#endif
+
 
bfq_log(bfqd, "wr_busy %d sync %d depth %u",
bfqd->wr_busy_queues, op_is_sync(op),


> 
>   -Mike
> 



Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge

2018-05-05 Thread Paolo Valente


> On 5 May 2018, at 10:19, Mike Galbraith wrote:
> 
> On Fri, 2018-05-04 at 21:46 +0200, Mike Galbraith wrote:
>> Tentatively, I suspect you've just fixed the nasty stalls I reported a
>> while back.
> 
> Oh well, so much for optimism.  It took a lot, but just hung.

Yep, it would have taken a lot of luck, since your hang is related to
operations different from those touched by this fix.  Maybe the time
before failure stretched because your system also suffered from the
illness cured by this fix.

BTW, if you haven't run out of patience with this persistent issue yet,
I was thinking of two or three changes to try to trigger your failure
reliably.  In case of success, I could restart racking my brains from
there.

Thanks,
Paolo


[PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge

2018-05-04 Thread Paolo Valente
When invoked for an I/O request rq, the prepare_request hook of bfq
increments reference counters in the destination bfq_queue for rq. In
this respect, after this hook has been invoked, rq may still be
transformed into a request with no icq attached, i.e., for bfq, a
request not associated with any bfq_queue. No further hook is invoked
to signal this transformation to bfq (in general, to the destination
elevator for rq). This leads bfq into an inconsistent state, because
bfq has no chance to correctly lower these counters back. This
inconsistency may in its turn cause incorrect scheduling and hangs. It
certainly causes memory leaks, by making it impossible for bfq to free
the involved bfq_queue.

On the bright side, no such transformation can happen any longer after rq
has been inserted into bfq, or merged with another, already inserted,
request. Exploiting this fact, this commit addresses the above issue
by delaying the preparation of an I/O request to when the request is
inserted or merged.

This change also gives a performance bonus: a lock-contention point
gets removed. To prepare a request, bfq needs to hold its scheduler
lock. After postponing request preparation to insertion or merging, no
lock needs to be grabbed any longer in the prepare_request hook, while
the lock already taken to perform insertion or merging is used to
prepare the request as well.

Signed-off-by: Paolo Valente <paolo.vale...@linaro.org>
---
 block/bfq-iosched.c | 86 +++--
 1 file changed, 57 insertions(+), 29 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 771ae9730ac6..ea02162df6c7 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1858,6 +1858,8 @@ static int bfq_request_merge(struct request_queue *q, 
struct request **req,
return ELEVATOR_NO_MERGE;
 }
 
+static struct bfq_queue *bfq_init_rq(struct request *rq);
+
 static void bfq_request_merged(struct request_queue *q, struct request *req,
   enum elv_merge type)
 {
@@ -1866,7 +1868,7 @@ static void bfq_request_merged(struct request_queue *q, 
struct request *req,
blk_rq_pos(req) <
blk_rq_pos(container_of(rb_prev(&req->rb_node),
struct request, rb_node))) {
-   struct bfq_queue *bfqq = RQ_BFQQ(req);
+   struct bfq_queue *bfqq = bfq_init_rq(req);
struct bfq_data *bfqd = bfqq->bfqd;
struct request *prev, *next_rq;
 
@@ -1894,7 +1896,8 @@ static void bfq_request_merged(struct request_queue *q, 
struct request *req,
 static void bfq_requests_merged(struct request_queue *q, struct request *rq,
struct request *next)
 {
-   struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
+   struct bfq_queue *bfqq = bfq_init_rq(rq),
+   *next_bfqq = bfq_init_rq(next);
 
if (!RB_EMPTY_NODE(&rq->rb_node))
goto end;
@@ -4540,14 +4543,12 @@ static inline void bfq_update_insert_stats(struct 
request_queue *q,
   unsigned int cmd_flags) {}
 #endif
 
-static void bfq_prepare_request(struct request *rq, struct bio *bio);
-
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
   bool at_head)
 {
struct request_queue *q = hctx->queue;
struct bfq_data *bfqd = q->elevator->elevator_data;
-   struct bfq_queue *bfqq = RQ_BFQQ(rq);
+   struct bfq_queue *bfqq;
bool idle_timer_disabled = false;
unsigned int cmd_flags;
 
@@ -4562,24 +4563,13 @@ static void bfq_insert_request(struct blk_mq_hw_ctx 
*hctx, struct request *rq,
blk_mq_sched_request_inserted(rq);
 
spin_lock_irq(&bfqd->lock);
+   bfqq = bfq_init_rq(rq);
if (at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
list_add(&rq->queuelist, &bfqd->dispatch);
else
list_add_tail(&rq->queuelist, &bfqd->dispatch);
-   } else {
-   if (WARN_ON_ONCE(!bfqq)) {
-   /*
-* This should never happen. Most likely rq is
-* a requeued regular request, being
-* re-inserted without being first
-* re-prepared. Do a prepare, to avoid
-* failure.
-*/
-   bfq_prepare_request(rq, rq->bio);
-   bfqq = RQ_BFQQ(rq);
-   }
-
+   } else { /* bfqq is assumed to be non null here */
idle_timer_disabled = __bfq_insert_request(bfqd, rq);
/*
 * Update bfqq, because, if a queue merge has occurred
@@ -4922,11 +4912,48 @@ static struct bfq_queue 
*bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
 }
 
 /*
- * Allocate bfq data structures associa


Re: testing io.low limit for blk-throttle

2018-05-03 Thread Paolo Valente


> On 26 April 2018, at 20:32, Tejun Heo <t...@kernel.org> wrote:
> 
> Hello,
> 
> On Tue, Apr 24, 2018 at 02:12:51PM +0200, Paolo Valente wrote:
>> +Tejun (I guess he might be interested in the results below)
> 
> Our experiments didn't work out too well either.  At this point, it
> isn't clear whether io.low will ever leave experimental state.  We're
> trying to find a working solution.
> 

Thanks for this update, Tejun.  I'm still working (very slowly) on a
survey of the current state of affairs in terms of bandwidth and
latency guarantees in the block layer.  The synthesis of the results
I've collected so far is, more or less:

"The problem of reaching a high throughput and, at the same time,
guaranteeing bandwidth and latency is still unsolved, apart from
simple cases, such as homogeneous, constant workloads"

I'm mentioning this up front because I don't want to risk underestimating
anybody's work.  So, if anyone has examples of how, e.g., to
distribute I/O bandwidth as desired among heterogeneous workloads (for
instance, random vs sequential workloads) that might fluctuate over
time, without losing total throughput, please tell me, and I'll test
them.

Thanks,
Paolo

> Thanks.
> 
> -- 
> tejun



Re: testing io.low limit for blk-throttle

2018-04-26 Thread Paolo Valente


> On 27 April 2018, at 05:27, Joseph Qi <jiangqi...@gmail.com> wrote:
> 
> Hi Paolo,
> 
> On 18/4/27 01:27, Paolo Valente wrote:
>> 
>> 
>>> On 25 April 2018, at 14:13, Joseph Qi <jiangqi...@gmail.com> wrote:
>>> 
>>> Hi Paolo,
>>> 
>> 
>> Hi Joseph
>> 
>>> ...
>>> Could you run blktrace as well when testing your case? There are several
>>> throtl traces to help analyze whether it is caused by frequently
>>> upgrade/downgrade.
>> 
>> Certainly.  You can find a trace attached.  Unfortunately, I'm not
>> familiar with the internals of blk-throttle and low limit, so, if you
>> want me to analyze the trace, give me some hints on what I have to
>> look for.  Otherwise, I'll be happy to learn from your analysis.
>> 
> 
> I've taken a glance at your blktrace attached. It is only upgrade at first and
> then downgrade (just adjust limit, not to LIMIT_LOW) frequently.
> But I don't know why it always thinks throttle group is not idle.
> 
> For example:
> fio-2336  [004] d...   428.458249:   8,16   m   N throtl avg_idle=90, 
> idle_threshold=1000, bad_bio=10, total_bio=84, is_idle=0, scale=9
> fio-2336  [004] d...   428.458251:   8,16   m   N throtl downgrade, scale 4
> 
> In throtl_tg_is_idle():
> is_idle = ... ||
>   (tg->latency_target && tg->bio_cnt &&
>tg->bad_bio_cnt * 5 < tg->bio_cnt);
> 
> It should be idle and allow run more bandwidth. But here the result shows not
> idle (is_idle=0). I have to do more investigation to figure it out why. 
> 

Hi Joseph,
actually this doesn't surprise me much: for this scenario I expected
exactly that blk-throttle would consider the random-I/O group, for most
of the time,
1) non-idle,
2) above the 100usec target latency, and
3) below the low limit.

In fact,
1) The group can evidently issue I/O at a much higher rate than that
received, so, immediately after its last pending I/O has been served,
the group issues new I/O; in the end, it is non-idle most of the
time
2) To try to enforce the 10MB/s limit, blk-throttle necessarily makes
the group oscillate around 10MB/s, which means that the group is
frequently below limit (this would not have held only if the group had
actually received much more than 10MB/s, but it is not so)
3) For each of the 4k random I/Os of the group, the time needed by the
drive to serve that I/O is already around 40-50usec.  So, since the
group is of course not constantly in service, it is very likely that,
because of throttling, the latency of most of the group's I/Os goes
beyond 100usec (see the back-of-the-envelope check below).
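
A back-of-the-envelope check of point 3), using only the numbers mentioned
in this thread (the 10MB/s low limit, 4k random I/Os, a 40-50usec drive
service time, the 100usec latency target); an illustration only, not a
claim about the exact blk-throttle accounting:

#include <stdio.h>

int main(void)
{
        double limit_bps  = 10.0 * 1024 * 1024;  /* the group's 10MB/s low limit */
        double io_size    = 4096.0;              /* 4k random I/O */
        double service_us = 50.0;                /* upper end of the quoted 40-50usec */
        double target_us  = 100.0;               /* the group's latency target */

        double iops       = limit_bps / io_size;     /* ~2560 IOPS allowed by the limit */
        double spacing_us = 1e6 / iops;              /* ~390usec between consecutive I/Os */
        double wait_us    = spacing_us - service_us; /* time spent waiting on the throttle */

        printf("allowed IOPS: %.0f, per-I/O spacing: %.0f usec\n", iops, spacing_us);
        printf("=> ~%.0f usec of throttling wait on top of ~%.0f usec of service,\n"
               "   so the observed latency easily exceeds the %.0f usec target\n",
               wait_us, service_us, target_us);
        return 0;
}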

But, as is often the case for me, I might have simply misunderstood
the blk-throttle parameters, and I might just be wrong here.

Thanks,
Paolo

> You can also filter these logs using:
> grep throtl trace | grep -E 'upgrade|downgrade|is_idle'
> 
> Thanks,
> Joseph



Re: testing io.low limit for blk-throttle

2018-04-24 Thread Paolo Valente


> On 23 Apr 2018, at 11:01, Joseph Qi wrote:
> 
> 
> 
> On 18/4/23 15:35, Paolo Valente wrote:
>> 
>> 
>>> On 23 Apr 2018, at 08:05, Joseph Qi wrote:
>>> 
>>> Hi Paolo,
>> 
>> Hi Joseph,
>> thanks for chiming in.
>> 
>>> What's your idle and latency config?
>> 
>> I didn't set them at all, as the only (explicit) requirement in my
>> basic test is that one of the groups is guaranteed a minimum bps.
>> 
>> 
>>> IMO, io.low will allow the others to run more bandwidth if a cgroup's average
>>> idle time is high or its latency is low.
>> 
>> What you say here makes me think that I simply misunderstood the
>> purpose of io.low.  So, here is my problem/question: "I only need to
>> guarantee at least a minimum bandwidth, in bps, to a group.  Is the
>> io.low limit the way to go?"
>> 
>> I know that I can use just io.max (unless I misunderstood the goal of
>> io.max too :( ), but my extra purpose would be to not waste bandwidth
>> when some group is idle.  Yet, as of now, io.low is not working even
>> for the first, simpler goal, i.e., guaranteeing a minimum bandwidth to
>> one group when all groups are active.
>> 
>> Am I getting something wrong?
>> 
>> Otherwise, if there are some special values for idle and latency
>> parameters that would make throttle work for my test, I'll be of
>> course happy to try them.
>> 
> I think you can try an idle time of 1000us for all cgroups, and a latency
> target of 100us for the cgroup with low limit 100MB/s and 2000us for the cgroups
> with low limit 10MB/s. That means the cgroup with the lower latency target will
> be preferred.
> BTW, from my experience the parameters are not easy to set because
> they are strongly correlated to the cgroup IO behavior.
> 

+Tejun (I guess he might be interested in the results below)

Hi Joseph,
thanks for chiming in. Your suggestion did work!

At first, I thought I had also understood the use of latency from the
outcome of your suggestion: "want low limit really guaranteed for a
group?  set target latency to a low value for it." But then, as a
crosscheck, I repeated the same exact test, but reversing target
latencies: I gave 2000 to the interfered (the group with 100MB/s
limit) and 100 to the interferers.  And the interfered still got more
than 100MB/s!  So I exaggerated: 2 to the interfered.
Same outcome :(

I tried many other combinations to try to figure this out, but the
results seemed more or less random w.r.t. the latency values.  I
didn't even start to test different values for idle.

So, the only sound lesson that I seem to have learned is: if I want
low limits to be enforced, I have to set target latency and idle
explicitly.  The actual values of latencies matter little, or not at
all. At least this holds for my simple tests.
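
For concreteness, the configuration we have been discussing boils down
to io.low entries like the following, one for the interfered group and
one for each interferer (just a sketch: 8:16 is a made-up device
number, and I am assuming the rbps/idle/latency keys of the io.low
interface, with rbps in bytes/s and idle/latency in microseconds):

    8:16 rbps=104857600 idle=1000 latency=100
    8:16 rbps=10485760 idle=1000 latency=2000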

At any rate, thanks to your help, Joseph, I could move to the most
interesting part for me: how effective is blk-throttle with low
limits?  I could well be wrong again, but my results do not seem that
good.  With the simplest type of non-toy example I considered, I
recorded throughput losses, apparently caused mainly by blk-throttle,
and ranging from 64% to 75%.

Here is a worst-case example.  For each step, I'm reporting below the
command by which you can reproduce that step with the
thr-lat-with-interference benchmark of the S suite [1].  I just split
bandwidth equally among five groups, on my SSD.  The device showed a
peak rate of ~515MB/s in this test, so I set rbps to 100MB/s for each
group (and tried various values, and combinations of values, for the
target latency, without any effect on the results).  To begin, I made
every group do sequential reads.  Everything worked perfectly fine.

But then I made one group do random I/O [2], and trouble began.  Even
though the group doing random I/O was given a target latency of 100usec
(or lower), while the others had a target latency of 2000usec, the poor
random-I/O group got only 4.7 MB/s!  (A single process doing 4k sync
random I/O reaches 25MB/s on my SSD.)

I guess things broke because the low limits no longer matched the
lower speed that the device reached with the new, mixed workload: the
device reached 376MB/s, while the sum of the low limits was 500MB/s.
BTW, the 'fault' for this loss of throughput did not lie only with the device
and the workload: if I switched throttling off, then the device still
reached its peak rate, although granting only 1.3MB/s to the
random-I/O group.

So, to comply with the 376MB/s, I lowered the low limits to 74MB/s per
group (to avoid a too tight 75MB/s) [3].  A little better: the
random-I/O group got 7.2 MB/s.  But the total throughput went down
further, to 289MB/s, and became again lower than the sum of the low
limi

Re: usercopy whitelist woe in scsi_sense_cache

2018-04-21 Thread Paolo Valente


> On 20 Apr 2018, at 22:23, Kees Cook <keesc...@chromium.org> wrote:
> 
> On Thu, Apr 19, 2018 at 2:32 AM, Paolo Valente <paolo.vale...@linaro.org> 
> wrote:
>> I'm missing something here.  When the request gets completed in the
>> first place, the hook bfq_finish_requeue_request gets called, and that
>> hook clears both ->elv.priv elements (as the request has a non-null
>> elv.icq).  So, when bfq gets the same request again, those elements
>> must be NULL.  What am I getting wrong?
>> 
>> I have some more concern on this point, but I'll stick to this for the
>> moment, to not create more confusion.
> 
> I don't know the "how", I only found the "what". :)

Got it, although I think you did much more than that ;)

Anyway, my reply was exactly to Jens' detailed description of the
how.  And my concern is that there seems to be an inconsistency in
that description.  In addition, Jens is proposing a patch based on
that description.  But, if this inconsistency is not resolved, that
patch may eliminate the symptom at hand, but it may not fix the real
cause, or may even contribute to burying it deeper.

> If you want, grab
> the reproducer VM linked to earlier in this thread; it'll hit the
> problem within about 30 seconds of running the reproducer.
> 

Yep.  Actually, I've been investigating this kind of failure, in
different incarnations, for months now.  In this respect, other
examples are the srp-test failures reported by Bart, e.g., here [1].
According to my analysis, the cause of the problem is somewhere in
blk-mq, outside bfq.  Unfortunately, I didn't manage to find exactly
where it is, mainly because of my limited expertise on blk-mq
internals.  So I have asked Jens, Mike and other knowledgeable
people for any kind of help and suggestions.  Probably those help
requests somehow got lost in those threads, but your results, Kees,
and the analysis that followed from Jens now seem to be carrying us
to the solution of this not-so-recent issue.  Time will tell.


Thanks,
Paolo

[1] https://www.spinics.net/lists/linux-block/msg22760.html


> -Kees
> 
> -- 
> Kees Cook
> Pixel Security



Re: usercopy whitelist woe in scsi_sense_cache

2018-04-19 Thread Paolo Valente


> On 18 Apr 2018, at 16:30, Jens Axboe wrote:
> 
> On 4/18/18 3:08 AM, Paolo Valente wrote:
>> 
>> 
>>> On 18 Apr 2018, at 00:57, Jens Axboe wrote:
>>> 
>>> On 4/17/18 3:48 PM, Jens Axboe wrote:
>>>> On 4/17/18 3:47 PM, Kees Cook wrote:
>>>>> On Tue, Apr 17, 2018 at 2:39 PM, Jens Axboe  wrote:
>>>>>> On 4/17/18 3:25 PM, Kees Cook wrote:
>>>>>>> On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook  
>>>>>>> wrote:
>>>>>>>> I see elv.priv[1] assignments made in a few places -- is it possible
>>>>>>>> there is some kind of uninitialized-but-not-NULL state that can leak
>>>>>>>> in there?
>>>>>>> 
>>>>>>> Got it. This fixes it for me:
>>>>>>> 
>>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>>>>> index 0dc9e341c2a7..859df3160303 100644
>>>>>>> --- a/block/blk-mq.c
>>>>>>> +++ b/block/blk-mq.c
>>>>>>> @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct
>>>>>>> request_queue *q,
>>>>>>> 
>>>>>>>   rq = blk_mq_rq_ctx_init(data, tag, op);
>>>>>>>   if (!op_is_flush(op)) {
>>>>>>> -   rq->elv.icq = NULL;
>>>>>>> +   memset(>elv, 0, sizeof(rq->elv));
>>>>>>>   if (e && e->type->ops.mq.prepare_request) {
>>>>>>>   if (e->type->icq_cache && rq_ioc(bio))
>>>>>>>   blk_mq_sched_assign_ioc(rq, bio);
>>>>>>> @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq)
>>>>>>>   e->type->ops.mq.finish_request(rq);
>>>>>>>   if (rq->elv.icq) {
>>>>>>>   put_io_context(rq->elv.icq->ioc);
>>>>>>> -   rq->elv.icq = NULL;
>>>>>>> +   memset(>elv, 0, sizeof(rq->elv));
>>>>>>>   }
>>>>>>>   }
>>>>>> 
>>>>>> This looks like a BFQ problem, this should not be necessary. Paolo,
>>>>>> you're calling your own prepare request handler from the insert
>>>>>> as well, and your prepare request does nothing if rq->elv.icq == NULL.
>>>>> 
>>>>> I sent the patch anyway, since it's kind of a robustness improvement,
>>>>> I'd hope. If you fix BFQ also, please add:
>>>> 
>>>> It's also a memset() in the hot path, would prefer to avoid that...
>>>> The issue here is really the convoluted bfq usage of insert/prepare,
>>>> I'm sure Paolo can take it from here.
>>> 
>> 
>> Hi,
>> I'm very sorry for tuning in very late, but, at the same time, very
>> glad to find the problem probably already solved ;) (in this respect, I 
>> swear,
>> my delay was not intentional)
>> 
>>> Does this fix it?
>>> 
>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
>>> index f0ecd98509d8..d883469a1582 100644
>>> --- a/block/bfq-iosched.c
>>> +++ b/block/bfq-iosched.c
>>> @@ -4934,8 +4934,11 @@ static void bfq_prepare_request(struct request *rq, 
>>> struct bio *bio)
>>> bool new_queue = false;
>>> bool bfqq_already_existing = false, split = false;
>>> 
>>> -   if (!rq->elv.icq)
>>> +   if (!rq->elv.icq) {
>>> +   rq->elv.priv[0] = rq->elv.priv[1] = NULL;
>>> return;
>>> +   }
>>> +
>> 
>> This does solve the problem at hand.  But it also raises a question,
>> related to a possible subtle bug.
>> 
>> For BFQ, !rq->elv.icq basically means "this request is not for me, as
>> I am an icq-based scheduler".  But, IIUC the main points in this
>> thread, this assumption is false.  If it is actually false, then
>> I hope that all requests with !rq->elv.icq that are sent to BFQ do
>> satisfy the condition (at_head || blk_rq_is_passthrough(rq)).  In fact,
>> requests that do not satisfy that condition are those that BFQ must put
>> in a bfq_queue.  So, even if this patch makes the crash disappear, we
>> drive BFQ completel

Re: [PATCH] blkcg: not hold blkcg lock when deactivating policy.

2018-04-18 Thread Paolo Valente


> On 18 Apr 2018, at 11:18, jiang.bi...@zte.com.cn wrote:
> 
> Hi,
>>> On 17 Apr 2018, at 09:10, Jiang Biao <jiang.bi...@zte.com.cn> wrote:
>>> 
>>> As described in the comment of blkcg_activate_policy(),
>>> *Update of each blkg is protected by both queue and blkcg locks so
>>> that holding either lock and testing blkcg_policy_enabled() is
>>> always enough for dereferencing policy data.*
>>> with queue lock held, there is no need to hold blkcg lock in
>>> blkcg_deactivate_policy(). Similar case is in
>>> blkcg_activate_policy(), which has removed holding of blkcg lock in
>>> commit 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23.
>>> 
>> 
>> Hi,
>> by chance, did you check whether this may cause problems with bfq,
>> given that the latter is not protected by the queue lock as cfq is?
> I checked the bfq code; bfq seems to never use the blkcg lock directly, and
> the update of blkg in the common code is protected by both the queue and
> blkcg locks, so IMHO this patch would not introduce any new problem
> with bfq, even though bfq is not protected by the queue lock.

Thank you very much for checking. Then:

Acked-by: Paolo Valente <paolo.vale...@linaro.org>

Thanks,
Paolo

> On the other hand, the locks (queue lock/blkcg lock) used to protect
> the update of blkg seem a bit too heavyweight; especially the queue lock,
> which is used so widely that it may cause races with other contexts. I wonder
> if there is any way to ease this, e.g. by adding a new lock of blkg's own. :)
> 
> Jiang,
> Regards



Re: usercopy whitelist woe in scsi_sense_cache

2018-04-18 Thread Paolo Valente


> On 18 Apr 2018, at 00:57, Jens Axboe wrote:
> 
> On 4/17/18 3:48 PM, Jens Axboe wrote:
>> On 4/17/18 3:47 PM, Kees Cook wrote:
>>> On Tue, Apr 17, 2018 at 2:39 PM, Jens Axboe  wrote:
 On 4/17/18 3:25 PM, Kees Cook wrote:
> On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook  wrote:
>> I see elv.priv[1] assignments made in a few places -- is it possible
>> there is some kind of uninitialized-but-not-NULL state that can leak
>> in there?
> 
> Got it. This fixes it for me:
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0dc9e341c2a7..859df3160303 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct
> request_queue *q,
> 
>rq = blk_mq_rq_ctx_init(data, tag, op);
>if (!op_is_flush(op)) {
> -   rq->elv.icq = NULL;
> +   memset(>elv, 0, sizeof(rq->elv));
>if (e && e->type->ops.mq.prepare_request) {
>if (e->type->icq_cache && rq_ioc(bio))
>blk_mq_sched_assign_ioc(rq, bio);
> @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq)
>e->type->ops.mq.finish_request(rq);
>if (rq->elv.icq) {
>put_io_context(rq->elv.icq->ioc);
> -   rq->elv.icq = NULL;
> +   memset(>elv, 0, sizeof(rq->elv));
>}
>}
 
 This looks like a BFQ problem, this should not be necessary. Paolo,
 you're calling your own prepare request handler from the insert
 as well, and your prepare request does nothing if rq->elv.icq == NULL.
>>> 
>>> I sent the patch anyway, since it's kind of a robustness improvement,
>>> I'd hope. If you fix BFQ also, please add:
>> 
>> It's also a memset() in the hot path, would prefer to avoid that...
>> The issue here is really the convoluted bfq usage of insert/prepare,
>> I'm sure Paolo can take it from here.
> 

Hi,
I'm very sorry for tuning in very late, but, at the same time, very
glad to find the problem probably already solved ;) (in this respect, I swear,
my delay was not intentional)

> Does this fix it?
> 
> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
> index f0ecd98509d8..d883469a1582 100644
> --- a/block/bfq-iosched.c
> +++ b/block/bfq-iosched.c
> @@ -4934,8 +4934,11 @@ static void bfq_prepare_request(struct request *rq, 
> struct bio *bio)
>   bool new_queue = false;
>   bool bfqq_already_existing = false, split = false;
> 
> - if (!rq->elv.icq)
> + if (!rq->elv.icq) {
> + rq->elv.priv[0] = rq->elv.priv[1] = NULL;
>   return;
> + }
> +

This does solve the problem at hand.  But it also raises a question,
related to a possible subtle bug.

For BFQ, !rq->elv.icq basically means "this request is not for me, as
I am an icq-based scheduler".  But, IIUC the main points in this
thread, this assumption is false.  If it is actually false, then
I hope that all requests with !rq->elv.icq that are sent to BFQ do
satisfy the condition (at_head || blk_rq_is_passthrough(rq)).  In fact,
requests that do not satisfy that condition are those that BFQ must put
in a bfq_queue.  So, even if this patch makes the crash disappear, we
drive BFQ completely crazy (and we may expect other strange failures)
if we send BFQ a request with !(at_head || blk_rq_is_passthrough(rq))
and !rq->elv.icq.  BFQ has to put that rq into a bfq_queue, but simply
cannot.
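
To make the above concrete, the invariant I am assuming can be sketched
like this (stand-in types, not the actual bfq code):

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct icq_sketch { int unused; };

    struct rq_sketch {
            struct icq_sketch *icq;  /* stands in for rq->elv.icq */
            bool passthrough;        /* stands in for blk_rq_is_passthrough(rq) */
    };

    static void insert_request_sketch(struct rq_sketch *rq, bool at_head)
    {
            if (at_head || rq->passthrough) {
                    /* goes straight to the dispatch list, no bfq_queue needed */
                    return;
            }
            /*
             * Any other request must be put into a bfq_queue, which bfq
             * derives from the request's icq: with a NULL icq this is
             * simply impossible, hence the worry above.
             */
            assert(rq->icq != NULL);
            /* ... queue rq into the bfq_queue associated with rq->icq ... */
    }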

Jens, or anyone else, could you please shed some light on this, and explain
how things actually are?

Thanks,
Paolo

>   bic = icq_to_bic(rq->elv.icq);
> 
>   spin_lock_irq(>lock);
> 
> -- 
> Jens Axboe



Re: [PATCH] blk-mq: Clear out elevator private data

2018-04-18 Thread Paolo Valente


> On 17 Apr 2018, at 23:42, Kees Cook wrote:
> 
> Some elevators may not correctly check rq->rq_flags & RQF_ELVPRIV, and
> may attempt to read rq->elv fields. When requests got reused, this
> caused BFQ to think it already had a bfqq (rq->elv.priv[1]) allocated.

Hi Kees,
where does BFQ get confused and operate on a request not destined for
it?  I'm asking because I have taken care to always avoid such a
mistake.
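
For reference, the kind of check the changelog above refers to would
look roughly like the following (the type and flag below are stand-ins,
not the actual blk-mq definitions):

    #define RQF_ELVPRIV_SKETCH (1u << 0)  /* placeholder for the real RQF_ELVPRIV */

    struct rq_sketch {
            unsigned int rq_flags;
            void *elv_priv[2];            /* stands in for rq->elv.priv[] */
    };

    static void elevator_hook_sketch(struct rq_sketch *rq)
    {
            /* touch the elevator-private fields only if they were set up */
            if (!(rq->rq_flags & RQF_ELVPRIV_SKETCH))
                    return;

            /* rq->elv_priv[0] and rq->elv_priv[1] are meaningful here */
    }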

Thanks,
Paolo

> This could lead to odd behaviors like having the sense buffer address
> slowly start incrementing. This eventually tripped HARDENED_USERCOPY
> and KASAN.
> 
> This patch wipes all of rq->elv instead of just rq->elv.icq. While
> it shouldn't technically be needed, this ends up being a robustness
> improvement that should lead to at least finding bugs in elevators faster.
> 
> Reported-by: Oleksandr Natalenko 
> Fixes: bd166ef183c26 ("blk-mq-sched: add framework for MQ capable IO 
> schedulers")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Kees Cook 
> ---
> In theory, BFQ needs to also check the RQF_ELVPRIV flag, but I'll leave that
> to Paolo to figure out. Also, my Fixes line is kind of a best-guess. This
> is where icq was originally wiped, so it seemed as good a commit as any.
> ---
> block/blk-mq.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0dc9e341c2a7..859df3160303 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct 
> request_queue *q,
> 
>   rq = blk_mq_rq_ctx_init(data, tag, op);
>   if (!op_is_flush(op)) {
> - rq->elv.icq = NULL;
> + memset(>elv, 0, sizeof(rq->elv));
>   if (e && e->type->ops.mq.prepare_request) {
>   if (e->type->icq_cache && rq_ioc(bio))
>   blk_mq_sched_assign_ioc(rq, bio);
> @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq)
>   e->type->ops.mq.finish_request(rq);
>   if (rq->elv.icq) {
>   put_io_context(rq->elv.icq->ioc);
> - rq->elv.icq = NULL;
> + memset(>elv, 0, sizeof(rq->elv));
>   }
>   }
> 
> -- 
> 2.7.4
> 
> 
> -- 
> Kees Cook
> Pixel Security



Re: [PATCH] blkcg: not hold blkcg lock when deactivating policy.

2018-04-17 Thread Paolo Valente


> On 17 Apr 2018, at 09:10, Jiang Biao wrote:
> 
> As described in the comment of blkcg_activate_policy(),
> *Update of each blkg is protected by both queue and blkcg locks so
> that holding either lock and testing blkcg_policy_enabled() is
> always enough for dereferencing policy data.*
> with queue lock held, there is no need to hold blkcg lock in
> blkcg_deactivate_policy(). Similar case is in
> blkcg_activate_policy(), which has removed holding of blkcg lock in
> commit 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23.
> 

Hi,
by chance, did you check whether this may cause problems with bfq,
given that the latter is not protected by the queue lock as cfq is?
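
(For context, the rule quoted in the changelog above, seen from the
reader side, amounts to roughly the following fragment, with q, pol and
blkg as in the patch; the writer side, which takes both locks, is
omitted:)

    /* holding either q->queue_lock or blkg->blkcg->lock is enough here */
    spin_lock_irq(q->queue_lock);
    if (blkcg_policy_enabled(q, pol))
            pd = blkg->pd[pol->plid];   /* safe to dereference */
    spin_unlock_irq(q->queue_lock);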

Thanks,
Paolo

> Signed-off-by: Jiang Biao 
> Signed-off-by: Wen Yang 
> CC: Tejun Heo 
> CC: Jens Axboe 
> ---
> block/blk-cgroup.c | 5 -
> 1 file changed, 5 deletions(-)
> 
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index c2033a2..2b7f8d0 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -1367,17 +1367,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
>   __clear_bit(pol->plid, q->blkcg_pols);
> 
>   list_for_each_entry(blkg, >blkg_list, q_node) {
> - /* grab blkcg lock too while removing @pd from @blkg */
> - spin_lock(>blkcg->lock);
> -
>   if (blkg->pd[pol->plid]) {
>   if (pol->pd_offline_fn)
>   pol->pd_offline_fn(blkg->pd[pol->plid]);
>   pol->pd_free_fn(blkg->pd[pol->plid]);
>   blkg->pd[pol->plid] = NULL;
>   }
> -
> - spin_unlock(>blkcg->lock);
>   }
> 
>   spin_unlock_irq(q->queue_lock);
> -- 
> 2.7.4
> 



Re: General protection fault with use_blk_mq=1.

2018-03-28 Thread Paolo Valente


> On 29 Mar 2018, at 05:22, Jens Axboe wrote:
> 
> On 3/28/18 9:13 PM, Zephaniah E. Loss-Cutler-Hull wrote:
>> On 03/28/2018 06:02 PM, Jens Axboe wrote:
>>> On 3/28/18 5:03 PM, Zephaniah E. Loss-Cutler-Hull wrote:
>>>> I am not subscribed to any of the lists on the To list here, please CC
>>>> me on any replies.
>>>> 
>>>> I am encountering a fairly consistent crash anywhere from 15 minutes to
>>>> 12 hours after boot with scsi_mod.use_blk_mq=1 dm_mod.use_blk_mq=1
>>>> The crash looks like:
>>>> 
>> 
>>>> 
>>>> Looking through the code, I'd guess that this is dying inside
>>>> blkg_rwstat_add, which calls percpu_counter_add_batch, which is what RIP
>>>> is pointing at.
>>> 
>>> Leaving the whole thing here for Paolo - it's crashing off insertion of
>>> a request coming out of SG_IO. Don't think we've seen this BFQ failure
>>> case before.
>>> 
>>> You can mitigate this by switching the scsi-mq devices to mq-deadline
>>> instead.
>>> 
>> 
>> I'm thinking that I should also be able to mitigate it by disabling
>> CONFIG_DEBUG_BLK_CGROUP.
>> 
>> That should remove that entire chunk of code.
>> 
>> Of course, that won't help if this is actually a symptom of a bigger
>> problem.
> 
> Yes, it's not a given that it will fully mask the issue at hand. But
> turning off BFQ has a much higher chance of working for you.
> 
> This time actually CC'ing Paolo.
> 

Hi Zephaniah,
if you are actually interested in the benefits of BFQ (low latency,
high responsiveness, fairness, ...) then it may be worth trying what
you yourself suggest: disabling CONFIG_DEBUG_BLK_CGROUP, also because
this option activates the heavy computation of debug cgroup statistics,
which you probably don't use.
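
(The mechanism is the usual compile-time switch, sketched below with
illustrative names rather than the exact bfq symbols: with the option
unset, the debug-statistics helpers become empty inline functions and
the whole accounting path drops out of the build.)

    #ifdef CONFIG_DEBUG_BLK_CGROUP
    void debug_stats_update_sketch(void);                   /* heavy per-cgroup accounting */
    #else
    static inline void debug_stats_update_sketch(void) { }  /* compiled away */
    #endif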

In addition, the outcome of your attempt without
CONFIG_DEBUG_BLK_CGROUP would give us useful bisection information:
- if no failure occurs, then the issue is likely to be confined to
that debugging code (which, on the bright side, is likely to be of
only occasional interest, to a handful of developers)
- if the issue still shows up, then we may have new hints on this odd
failure

Finally, consider that this issue has been reported to disappear as of
4.16 [1], and, as a plus, that the service quality of BFQ got a
further boost exactly in 4.16.

Looking forward to your feedback, in case you try BFQ without
CONFIG_DEBUG_BLK_CGROUP,
Paolo

[1] https://www.spinics.net/lists/linux-block/msg21422.html

> 
> -- 
> Jens Axboe



Re: General protection fault with use_blk_mq=1.

2018-03-28 Thread Paolo Valente


> On 29 Mar 2018, at 03:02, Jens Axboe wrote:
> 
> On 3/28/18 5:03 PM, Zephaniah E. Loss-Cutler-Hull wrote:
>> I am not subscribed to any of the lists on the To list here, please CC
>> me on any replies.
>> 
>> I am encountering a fairly consistent crash anywhere from 15 minutes to
>> 12 hours after boot with scsi_mod.use_blk_mq=1 dm_mod.use_blk_mq=1
>> The crash looks like:
>> 
>> [ 5466.075993] general protection fault:  [#1] PREEMPT SMP PTI
>> [ 5466.075997] Modules linked in: esp4 xfrm4_mode_tunnel fuse usblp
>> uvcvideo pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O)
>> ip6table_filter ip6_tables xt_tcpudp nf_conntrack_ipv4 nf_defrag_ipv4
>> xt_conntrack nf_conntrack iptable_filter ip_tables x_tables intel_rapl
>> joydev serio_raw wmi_bmof iwldvm iwlwifi shpchp kvm_intel kvm irqbypass
>> autofs4 algif_skcipher nls_iso8859_1 nls_cp437 crc32_pclmul
>> ghash_clmulni_intel
>> [ 5466.076022] CPU: 3 PID: 10573 Comm: pool Tainted: G   O
>> 4.15.13-f1-dirty #148
>> [ 5466.076024] Hardware name: Hewlett-Packard HP EliteBook Folio
>> 9470m/18DF, BIOS 68IBD Ver. F.44 05/22/2013
>> [ 5466.076029] RIP: 0010:percpu_counter_add_batch+0x2b/0xb0
>> [ 5466.076031] RSP: 0018:a556c47afb58 EFLAGS: 00010002
>> [ 5466.076033] RAX: 95cda87ce018 RBX: 95cda87cdb68 RCX:
>> 
>> [ 5466.076034] RDX: 3fff RSI: 896495c4 RDI:
>> 895b2bed
>> [ 5466.076036] RBP: 3fff R08:  R09:
>> 95cb7d5f8148
>> [ 5466.076037] R10: 0200 R11:  R12:
>> 0001
>> [ 5466.076038] R13: 95cda87ce088 R14: 95cda6ebd100 R15:
>> a556c47afc58
>> [ 5466.076040] FS:  7f25f5305700() GS:95cdbeac()
>> knlGS:
>> [ 5466.076042] CS:  0010 DS:  ES:  CR0: 80050033
>> [ 5466.076043] CR2: 7f25e807e0a8 CR3: 0003ed5a6001 CR4:
>> 001606e0
>> [ 5466.076044] Call Trace:
>> [ 5466.076050]  bfqg_stats_update_io_add+0x58/0x100
>> [ 5466.076055]  bfq_insert_requests+0xec/0xd80
>> [ 5466.076059]  ? blk_rq_append_bio+0x8f/0xa0
>> [ 5466.076061]  ? blk_rq_map_user_iov+0xc3/0x1d0
>> [ 5466.076065]  blk_mq_sched_insert_request+0xa3/0x130
>> [ 5466.076068]  blk_execute_rq+0x3a/0x50
>> [ 5466.076070]  sg_io+0x197/0x3e0
>> [ 5466.076073]  ? dput+0xca/0x210
>> [ 5466.076077]  ? mntput_no_expire+0x11/0x1a0
>> [ 5466.076079]  scsi_cmd_ioctl+0x289/0x400
>> [ 5466.076082]  ? filename_lookup+0xe1/0x170
>> [ 5466.076085]  sd_ioctl+0xc7/0x1a0
>> [ 5466.076088]  blkdev_ioctl+0x4d4/0x8c0
>> [ 5466.076091]  block_ioctl+0x39/0x40
>> [ 5466.076094]  do_vfs_ioctl+0x92/0x5e0
>> [ 5466.076097]  ? __fget+0x73/0xc0
>> [ 5466.076099]  SyS_ioctl+0x74/0x80
>> [ 5466.076102]  do_syscall_64+0x60/0x110
>> [ 5466.076106]  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
>> [ 5466.076109] RIP: 0033:0x7f25f75fef47
>> [ 5466.076110] RSP: 002b:7f25f53049a8 EFLAGS: 0246 ORIG_RAX:
>> 0010
>> [ 5466.076112] RAX: ffda RBX: 000c RCX:
>> 7f25f75fef47
>> [ 5466.076114] RDX: 7f25f53049b0 RSI: 2285 RDI:
>> 000c
>> [ 5466.076115] RBP: 0010 R08: 7f25e8007818 R09:
>> 0200
>> [ 5466.076116] R10: 0001 R11: 0246 R12:
>> 
>> [ 5466.076118] R13:  R14: 7f25f8a6b5e0 R15:
>> 7f25e80173e0
>> [ 5466.076120] Code: 41 55 49 89 fd bf 01 00 00 00 41 54 49 89 f4 55 89
>> d5 53 e8 18 e1 bb ff 48 c7 c7 c4 95 64 89 e8 dc e9 fb ff 49 8b 45 20 48
>> 63 d5 <65> 8b 18 48 63 db 4c 01 e3 48 39 d3 7d 0a f7 dd 48 63 ed 48 39
>> [ 5466.076147] RIP: percpu_counter_add_batch+0x2b/0xb0 RSP: a556c47afb58
>> [ 5466.076149] ---[ end trace 8d7eb80aafef4494 ]---
>> [ 5466.670153] note: pool[10573] exited with preempt_count 2
>> 
>> (I only have the one instance right this minute as a result of not
>> having remote syslog setup before now.)
>> 
>> This is clearly deep in the blk_mq code, and it goes away when I remove
>> the use_blk_mq kernel command line parameters.
>> 
>> My next obvious step is to try and disable the load of the vbox modules.
>> 
>> I can include the full dmesg output if it would be helpful.
>> 
>> The system is an older HP Ultrabook, and the root partition is, sda1 (a
>> SSD) -> a LUKS encrypted partition -> LVM -> BTRFS.
>> 
>> The kernel is a stock 4.15.11, however I only recently added the blk_mq
>> options, so while I can state that I have seen this on multiple kernels
>> in the 4.15.x series, I have not tested earlier kernels in this
>> configuration.
>> 
>> Looking through the code, I'd guess that this is dying inside
>> blkg_rwstat_add, which calls percpu_counter_add_batch, which is what RIP
>> is pointing at.
> 
> Leaving the whole thing here for Paolo - it's crashing off insertion of
> a request coming out of SG_IO. Don't think we've seen this BFQ failure
> case before.
> 

Actually, we have.  Found and reported by Ming 

[PATCH BUGFIX] block, bfq: lower-bound the estimated peak rate to 1

2018-03-26 Thread Paolo Valente
If a storage device handled by BFQ happens to be slower than 7.5 KB/s
for a certain amount of time (in the order of a second), then the
estimated peak rate of the device, maintained in BFQ, becomes equal to
0. The reason is the limited precision with which the rate is
represented (details on the range of representable values in the
comments introduced by this commit). This leads to a division-by-zero
error where the estimated peak rate is used as divisor. Such a type of
failure has been reported in [1].

This commit addresses this issue by:
1. Lower-bounding the estimated peak rate to 1
2. Adding and improving comments on the range of rates representable

[1] https://www.spinics.net/lists/kernel/msg2739205.html

Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru>
Signed-off-by: Paolo Valente <paolo.vale...@linaro.org>
---
 block/bfq-iosched.c | 25 -
 block/bfq-iosched.h |  2 +-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index aeca22d91101..f0ecd98509d8 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -201,7 +201,20 @@ static struct kmem_cache *bfq_pool;
 /* Target observation time interval for a peak-rate update (ns) */
 #define BFQ_RATE_REF_INTERVAL  NSEC_PER_SEC
 
-/* Shift used for peak rate fixed precision calculations. */
+/*
+ * Shift used for peak-rate fixed precision calculations.
+ * With
+ * - the current shift: 16 positions
+ * - the current type used to store rate: u32
+ * - the current unit of measure for rate: [sectors/usec], or, more precisely,
+ *   [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift,
+ * the range of rates that can be stored is
+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec =
+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec =
+ * [15, 65G] sectors/sec
+ * Which, assuming a sector size of 512B, corresponds to a range of
+ * [7.5K, 33T] B/sec
+ */
 #define BFQ_RATE_SHIFT 16
 
 /*
@@ -2637,6 +2650,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, 
struct request *rq)
rate /= divisor; /* smoothing constant alpha = 1/divisor */
 
bfqd->peak_rate += rate;
+
+   /*
+* For a very slow device, bfqd->peak_rate can reach 0 (see
+* the minimum representable values reported in the comments
+* on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid
+* divisions by zero where bfqd->peak_rate is used as a
+* divisor.
+*/
+   bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate);
+
update_thr_responsiveness_params(bfqd);
 
 reset_computation:
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 350c39ae2896..ae2f3dadec44 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -499,7 +499,7 @@ struct bfq_data {
u64 delta_from_first;
/*
 * Current estimate of the device peak rate, measured in
-* [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
+* [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by
 * BFQ_RATE_SHIFT is performed to increase precision in
 * fixed-point calculations.
 */
-- 
2.16.1
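
As a sanity check on the range given in the new BFQ_RATE_SHIFT comment, here is
a small standalone C sketch (an illustration, not part of the patch) that redoes
the arithmetic under the same assumptions (u32 storage, shift of 16, 512 B
sectors):

#include <stdio.h>
#include <stdint.h>

#define BFQ_RATE_SHIFT	16
#define SECTOR_SIZE	512		/* bytes, as assumed in the comment */

int main(void)
{
	/* smallest non-zero fixed-point value: one step = 2^-16 sectors/usec */
	double min_rate = 1.0 / (1U << BFQ_RATE_SHIFT);
	/* largest value a u32 can hold: ~2^16 sectors/usec */
	double max_rate = (double)UINT32_MAX / (1U << BFQ_RATE_SHIFT);

	/* sectors/usec -> bytes/sec: multiply by 10^6 usec/sec and 512 B/sector */
	printf("min rate ~= %.1f B/s (comment: 7.5K B/sec)\n",
	       min_rate * 1e6 * SECTOR_SIZE);
	printf("max rate ~= %.1f TB/s (comment: 33T B/sec)\n",
	       max_rate * 1e6 * SECTOR_SIZE / 1e12);
	return 0;
}

A device that stays below the ~7.5 KB/s lower bound for long enough is exactly
the case that rounds the stored rate down to 0, which is what the
max_t(u32, 1, ...) in the patch above guards against.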



Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1

2018-03-26 Thread Paolo Valente


> On 26 Mar 2018, at 12:28, Konstantin Khlebnikov
> <khlebni...@yandex-team.ru> wrote:
> 
> 
> 
> On 26.03.2018 11:01, Paolo Valente wrote:
>>> On 21 Mar 2018, at 00:49, Paolo Valente
>>> <paolo.vale...@linaro.org> wrote:
>>> 
>>> 
>>> 
>>>> On 20 Mar 2018, at 15:41, Konstantin Khlebnikov
>>>> <khlebni...@yandex-team.ru> wrote:
>>>> 
>>>> On 20.03.2018 06:00, Paolo Valente wrote:
>>>>>> On 19 Mar 2018, at 14:28, Konstantin Khlebnikov
>>>>>> <khlebni...@yandex-team.ru> wrote:
>>>>>> 
>>>>>> On 19.03.2018 09:03, Paolo Valente wrote:
>>>>>>>> On 05 Mar 2018, at 04:48, Konstantin Khlebnikov
>>>>>>>> <khlebni...@yandex-team.ru> wrote:
>>>>>>>> 
>>>>>>>> Rate should never overflow or become zero because it is used as 
>>>>>>>> divider.
>>>>>>>> This patch accumulates it with saturation.
>>>>>>>> 
>>>>>>>> Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru>
>>>>>>>> ---
>>>>>>>> block/bfq-iosched.c |8 +---
>>>>>>>> 1 file changed, 5 insertions(+), 3 deletions(-)
>>>>>>>> 
>>>>>>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
>>>>>>>> index aeca22d91101..a236c8d541b5 100644
>>>>>>>> --- a/block/bfq-iosched.c
>>>>>>>> +++ b/block/bfq-iosched.c
>>>>>>>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct 
>>>>>>>> bfq_data *bfqd,
>>>>>>>> 
>>>>>>>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct 
>>>>>>>> request *rq)
>>>>>>>> {
>>>>>>>> -  u32 rate, weight, divisor;
>>>>>>>> +  u32 weight, divisor;
>>>>>>>> +  u64 rate;
>>>>>>>> 
>>>>>>>>/*
>>>>>>>> * For the convergence property to hold (see comments on
>>>>>>>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct 
>>>>>>>> bfq_data *bfqd, struct request *rq)
>>>>>>>> */
>>>>>>>>bfqd->peak_rate *= divisor-1;
>>>>>>>>bfqd->peak_rate /= divisor;
>>>>>>>> -  rate /= divisor; /* smoothing constant alpha = 1/divisor */
>>>>>>>> +  do_div(rate, divisor);  /* smoothing constant alpha = 1/divisor 
>>>>>>>> */
>>>>>>>> 
>>>>>>>> -  bfqd->peak_rate += rate;
>>>>>>>> +  /* rate should never overflow or become zero */
>>>>>>> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate 
>>>>>>> doesn't risk to be zero even if the variable 'rate' is zero here.
>>>>>>> So I guess the reason why you consider the possibility that 
>>>>>>> bfqd->peak_rate becomes zero is because of an overflow when summing 
>>>>>>> 'rate'. But, according to my calculations, this should be impossible 
>>>>>>> with devices with sensible speeds.
>>>>>>> These are the reasons why I decided I could make it with a 32-bit 
>>>>>>> variable, without any additional clamping. Did I make any mistake in my 
>>>>>>> evaluation?
>>>>>> 
>>>>>> According to Murphy's law this is inevitable..
>>>>>> 
>>>>> Yep.  Actually Murphy has been even clement this time, by making the
>>>>> failure occur to a kernel expert :)
>>>>>> I've seen a couple of division by zero crashes in bfq_wr_duration.
>>>>>> Unfortunately logs weren't recorded.
>>>>>> 
>>>>>>> Anyway, even if I made some mistake about the maximum possible value of 
>>>>>>> the device rate, and the latter may be too high for bfqd->peak_rate to 
>>>>>>> contain it, then I guess the right solution would not be to clamp the 
>>>>>>> actual rate to U32_MAX, but to mov

Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1

2018-03-26 Thread Paolo Valente


> On 21 Mar 2018, at 00:49, Paolo Valente
> <paolo.vale...@linaro.org> wrote:
> 
> 
> 
>> On 20 Mar 2018, at 15:41, Konstantin Khlebnikov
>> <khlebni...@yandex-team.ru> wrote:
>> 
>> On 20.03.2018 06:00, Paolo Valente wrote:
>>>> On 19 Mar 2018, at 14:28, Konstantin Khlebnikov
>>>> <khlebni...@yandex-team.ru> wrote:
>>>> 
>>>> On 19.03.2018 09:03, Paolo Valente wrote:
>>>>>> On 05 Mar 2018, at 04:48, Konstantin Khlebnikov
>>>>>> <khlebni...@yandex-team.ru> wrote:
>>>>>> 
>>>>>> Rate should never overflow or become zero because it is used as divider.
>>>>>> This patch accumulates it with saturation.
>>>>>> 
>>>>>> Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru>
>>>>>> ---
>>>>>> block/bfq-iosched.c |8 +---
>>>>>> 1 file changed, 5 insertions(+), 3 deletions(-)
>>>>>> 
>>>>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
>>>>>> index aeca22d91101..a236c8d541b5 100644
>>>>>> --- a/block/bfq-iosched.c
>>>>>> +++ b/block/bfq-iosched.c
>>>>>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct 
>>>>>> bfq_data *bfqd,
>>>>>> 
>>>>>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request 
>>>>>> *rq)
>>>>>> {
>>>>>> -u32 rate, weight, divisor;
>>>>>> +u32 weight, divisor;
>>>>>> +u64 rate;
>>>>>> 
>>>>>>  /*
>>>>>>   * For the convergence property to hold (see comments on
>>>>>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data 
>>>>>> *bfqd, struct request *rq)
>>>>>>   */
>>>>>>  bfqd->peak_rate *= divisor-1;
>>>>>>  bfqd->peak_rate /= divisor;
>>>>>> -rate /= divisor; /* smoothing constant alpha = 1/divisor */
>>>>>> +do_div(rate, divisor);  /* smoothing constant alpha = 1/divisor 
>>>>>> */
>>>>>> 
>>>>>> -bfqd->peak_rate += rate;
>>>>>> +/* rate should never overflow or become zero */
>>>>> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate 
>>>>> doesn't risk to be zero even if the variable 'rate' is zero here.
>>>>> So I guess the reason why you consider the possibility that 
>>>>> bfqd->peak_rate becomes zero is because of an overflow when summing 
>>>>> 'rate'. But, according to my calculations, this should be impossible with 
>>>>> devices with sensible speeds.
>>>>> These are the reasons why I decided I could make it with a 32-bit 
>>>>> variable, without any additional clamping. Did I make any mistake in my 
>>>>> evaluation?
>>>> 
>>>> According to Murphy's law this is inevitable..
>>>> 
>>> Yep.  Actually Murphy has been even clement this time, by making the
>>> failure occur to a kernel expert :)
>>>> I've seen a couple of division by zero crashes in bfq_wr_duration.
>>>> Unfortunately logs weren't recorded.
>>>> 
>>>>> Anyway, even if I made some mistake about the maximum possible value of 
>>>>> the device rate, and the latter may be too high for bfqd->peak_rate to 
>>>>> contain it, then I guess the right solution would not be to clamp the 
>>>>> actual rate to U32_MAX, but to move bfqd->peak_rate to 64 bits. Or am I 
>>>>> missing something else?
>>>>>>> +   bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, 
>>>>>>> U32_MAX);
>>>> 
>>>> 32-bit should be enough and better for division.
>>>> My patch makes sure it never overflows/underflows.
>>>> That's cheaper than full 64-bit/64-bit division.
>>>> Anyway 64-bit speed could overflow too. =)
>>>> 
>>> I see your point.  Still, if the mistake is not in sizing, then you
>>> bumped into some odd bug.  In this respect, I don't like much the idea
>>> of sweeping the dust under the carpet.  So, let me ask you for a
>>> little bit more help.  With

Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1

2018-03-20 Thread Paolo Valente


> On 20 Mar 2018, at 15:41, Konstantin Khlebnikov
>  wrote:
> 
> On 20.03.2018 06:00, Paolo Valente wrote:
>>> On 19 Mar 2018, at 14:28, Konstantin Khlebnikov
>>>  wrote:
>>> 
>>> On 19.03.2018 09:03, Paolo Valente wrote:
>>>>> On 05 Mar 2018, at 04:48, Konstantin Khlebnikov
>>>>>  wrote:
>>>>> 
>>>>> Rate should never overflow or become zero because it is used as divider.
>>>>> This patch accumulates it with saturation.
>>>>> 
>>>>> Signed-off-by: Konstantin Khlebnikov 
>>>>> ---
>>>>> block/bfq-iosched.c |8 +---
>>>>> 1 file changed, 5 insertions(+), 3 deletions(-)
>>>>> 
>>>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
>>>>> index aeca22d91101..a236c8d541b5 100644
>>>>> --- a/block/bfq-iosched.c
>>>>> +++ b/block/bfq-iosched.c
>>>>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct 
>>>>> bfq_data *bfqd,
>>>>> 
>>>>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request 
>>>>> *rq)
>>>>> {
>>>>> - u32 rate, weight, divisor;
>>>>> + u32 weight, divisor;
>>>>> + u64 rate;
>>>>> 
>>>>>   /*
>>>>>* For the convergence property to hold (see comments on
>>>>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data 
>>>>> *bfqd, struct request *rq)
>>>>>*/
>>>>>   bfqd->peak_rate *= divisor-1;
>>>>>   bfqd->peak_rate /= divisor;
>>>>> - rate /= divisor; /* smoothing constant alpha = 1/divisor */
>>>>> + do_div(rate, divisor);  /* smoothing constant alpha = 1/divisor */
>>>>> 
>>>>> - bfqd->peak_rate += rate;
>>>>> + /* rate should never overflow or become zero */
>>>> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate 
>>>> doesn't risk to be zero even if the variable 'rate' is zero here.
>>>> So I guess the reason why you consider the possibility that 
>>>> bfqd->peak_rate becomes zero is because of an overflow when summing 
>>>> 'rate'. But, according to my calculations, this should be impossible with 
>>>> devices with sensible speeds.
>>>> These are the reasons why I decided I could make it with a 32-bit 
>>>> variable, without any additional clamping. Did I make any mistake in my 
>>>> evaluation?
>>> 
>>> According to Murphy's law this is inevitable..
>>> 
>> Yep.  Actually Murphy has been even clement this time, by making the
>> failure occur to a kernel expert :)
>>> I've seen a couple of division by zero crashes in bfq_wr_duration.
>>> Unfortunately logs weren't recorded.
>>> 
>>>> Anyway, even if I made some mistake about the maximum possible value of 
>>>> the device rate, and the latter may be too high for bfqd->peak_rate to 
>>>> contain it, then I guess the right solution would not be to clamp the 
>>>> actual rate to U32_MAX, but to move bfqd->peak_rate to 64 bits. Or am I 
>>>> missing something else?
>>>>>> +bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, 
>>>>>> U32_MAX);
>>> 
>>> 32-bit should be enough and better for division.
>>> My patch makes sure it never overflows/underflows.
>>> That's cheaper than full 64-bit/64-bit division.
>>> Anyway 64-bit speed could overflow too. =)
>>> 
>> I see your point.  Still, if the mistake is not in sizing, then you
>> bumped into some odd bug.  In this respect, I don't like much the idea
>> of sweeping the dust under the carpet.  So, let me ask you for a
>> little bit more help.  With your patch applied, and thus with no risk
>> of crashes, what about adding, right before your clamp_t, something
>> like:
>> if (!bfqd->peak_rate)
>>  pr_crit(> bfqd->peak_rate>);
>> Once the failure shows up (Murphy permitting), we might have hints to
>> the bug causing it.
>> Apart from that, I have no problem with patches that make bfq more
>> robust, even in a sort of black-box way.
> 
> This rate estimator already works as a black-box with smoothing and
> low-pass filter inside.

Actually, it is just what you say: a low-pass filter, which works by
deflating
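
To make the two guards being debated concrete, here is a userspace C sketch (an
illustration only; neither patch contains this code) contrasting Konstantin's
saturating accumulation into [1, U32_MAX] with the plain lower bound to 1 used
in the [PATCH BUGFIX] at the top of this page. The divisor and the toy inputs
are arbitrary:

#include <stdio.h>
#include <stdint.h>

#define DIVISOR 10	/* smoothing constant alpha = 1/DIVISOR */

/* saturating variant: accumulate in 64 bits, clamp into [1, U32_MAX] */
static uint32_t update_saturating(uint32_t peak_rate, uint64_t rate)
{
	uint64_t sum;

	peak_rate *= DIVISOR - 1;
	peak_rate /= DIVISOR;
	rate /= DIVISOR;			/* do_div() in the kernel */
	sum = (uint64_t)peak_rate + rate;
	if (sum == 0)
		sum = 1;
	if (sum > UINT32_MAX)
		sum = UINT32_MAX;
	return (uint32_t)sum;
}

/* lower-bound variant: keep u32 arithmetic, only forbid the value 0 */
static uint32_t update_lower_bound(uint32_t peak_rate, uint32_t rate)
{
	peak_rate *= DIVISOR - 1;
	peak_rate /= DIVISOR;
	rate /= DIVISOR;
	peak_rate += rate;
	return peak_rate ? peak_rate : 1;	/* max_t(u32, 1, peak_rate) */
}

int main(void)
{
	/* a device slower than one fixed-point step keeps feeding rate == 0,
	 * which would drive an unguarded estimate down to 0 */
	uint32_t p = 3;
	int i;

	for (i = 0; i < 5; i++)
		p = update_lower_bound(p, 0);
	printf("lower-bounded estimate: %u\n", p);	/* 1, never 0 */
	printf("saturating estimate: %u\n", update_saturating(0, 0));	/* 1 */
	return 0;
}

Both variants keep the divisor used in bfq_wr_duration() away from 0; the
saturating one additionally caps an (in principle) overflowing sum at U32_MAX
instead of letting it wrap.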

Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1

2018-03-19 Thread Paolo Valente


> On 19 Mar 2018, at 14:28, Konstantin Khlebnikov
> <khlebni...@yandex-team.ru> wrote:
> 
> On 19.03.2018 09:03, Paolo Valente wrote:
>>> On 05 Mar 2018, at 04:48, Konstantin Khlebnikov
>>> <khlebni...@yandex-team.ru> wrote:
>>> 
>>> Rate should never overflow or become zero because it is used as divider.
>>> This patch accumulates it with saturation.
>>> 
>>> Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru>
>>> ---
>>> block/bfq-iosched.c |8 +---
>>> 1 file changed, 5 insertions(+), 3 deletions(-)
>>> 
>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
>>> index aeca22d91101..a236c8d541b5 100644
>>> --- a/block/bfq-iosched.c
>>> +++ b/block/bfq-iosched.c
>>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct 
>>> bfq_data *bfqd,
>>> 
>>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
>>> {
>>> -   u32 rate, weight, divisor;
>>> +   u32 weight, divisor;
>>> +   u64 rate;
>>> 
>>> /*
>>>  * For the convergence property to hold (see comments on
>>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data 
>>> *bfqd, struct request *rq)
>>>  */
>>> bfqd->peak_rate *= divisor-1;
>>> bfqd->peak_rate /= divisor;
>>> -   rate /= divisor; /* smoothing constant alpha = 1/divisor */
>>> +   do_div(rate, divisor);  /* smoothing constant alpha = 1/divisor */
>>> 
>>> -   bfqd->peak_rate += rate;
>>> +   /* rate should never overflow or become zero */
>> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate doesn't 
>> risk to be zero even if the variable 'rate' is zero here.
>> So I guess the reason why you consider the possibility that bfqd->peak_rate 
>> becomes zero is because of an overflow when summing 'rate'. But, according 
>> to my calculations, this should be impossible with devices with sensible 
>> speeds.
>> These are the reasons why I decided I could make it with a 32-bit variable, 
>> without any additional clamping. Did I make any mistake in my evaluation?
> 
> According to Murphy's law this is inevitable..
> 

Yep.  Actually Murphy has been even clement this time, by making the
failure occur to a kernel expert :)

> I've seen a couple of division by zero crashes in bfq_wr_duration.
> Unfortunately logs weren't recorded.
> 
>> Anyway, even if I made some mistake about the maximum possible value of the 
>> device rate, and the latter may be too high for bfqd->peak_rate to contain 
>> it, then I guess the right solution would not be to clamp the actual rate to 
>> U32_MAX, but to move bfqd->peak_rate to 64 bits. Or am I missing something 
>> else?
> >>> + bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, U32_MAX);
> 
> 32-bit should be enough and better for division.
> My patch makes sure it never overflows/underflows.
> That's cheaper than full 64-bit/64-bit division.
> Anyway 64-bit speed could overflow too. =)
> 

I see your point.  Still, if the mistake is not in sizing, then you
bumped into some odd bug.  In this respect, I don't like much the idea
of sweeping the dust under the carpet.  So, let me ask you for a
little bit more help.  With your patch applied, and thus with no risk
of crashes, what about adding, right before your clamp_t, something
like:

if (!bfqd->peak_rate)
pr_crit(<dump of bfqd->peak_rate>);

Once the failure shows up (Murphy permitting), we might have hints to
the bug causing it.

Apart from that, I have no problem with patches that make bfq more
robust, even in a sort of black-box way.

Thanks a lot,
Paolo

> 
>>> update_thr_responsiveness_params(bfqd);
>>> 
>>> reset_computation:


