[PATCH BUGFIX RFC 2/2] Revert "bfq: calculate shallow depths at init time"
This reverts commit f0635b8a416e3b99dc6fd9ac3ce534764869d0c8. --- block/bfq-iosched.c | 117 +--- 1 file changed, 57 insertions(+), 60 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 8cc3032b66de..92214d58510c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -520,6 +520,54 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +/* + * See the comments on bfq_limit_depth for the purpose of + * the depths set in the function. Return minimum shallow depth we'll use. + */ +static unsigned int bfq_update_depths(struct bfq_data *bfqd, + struct sbitmap_queue *bt) +{ + unsigned int i, j, min_shallow = UINT_MAX; + bfqd->sb_shift = bt->sb.shift; + + /* +* In-word depths if no bfq_queue is being weight-raised: +* leaving 25% of tags only for sync reads. +* +* In next formulas, right-shift the value +* (1U << (bfqd->sb_shift - something)), to be robust against +* any possible value of bfqd->sb_shift, without having to +* limit 'something'. +*/ + /* no more than 50% of tags for async I/O */ + bfqd->word_depths[0][0] = max((1U << bfqd->sb_shift) >> 1, 1U); + /* +* no more than 75% of tags for sync writes (25% extra tags +* w.r.t. async I/O, to prevent async I/O from starving sync +* writes) +*/ + bfqd->word_depths[0][1] = max(((1U << bfqd->sb_shift) * 3) >> 2, 1U); + + /* +* In-word depths in case some bfq_queue is being weight- +* raised: leaving ~63% of tags for sync reads. This is the +* highest percentage for which, in our tests, application +* start-up times didn't suffer from any regression due to tag +* shortage. +*/ + /* no more than ~18% of tags for async I/O */ + bfqd->word_depths[1][0] = max(((1U << bfqd->sb_shift) * 3) >> 4, 1U); + /* no more than ~37% of tags for sync writes (~20% extra tags) */ + bfqd->word_depths[1][1] = max(((1U << bfqd->sb_shift) * 6) >> 4, 1U); + + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + min_shallow = min(min_shallow, bfqd->word_depths[i][j]); + + return min_shallow; +} + /* * Async I/O can easily starve sync I/O (both sync reads and sync * writes), by consuming all tags. 
Similarly, storms of sync writes, @@ -529,11 +577,20 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, */ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct sbitmap_queue *bt; if (op_is_sync(op) && !op_is_write(op)) return; + bt = &tags->bitmap_tags; + + if (unlikely(bfqd->sb_shift != bt->sb.shift)) { + unsigned int min_shallow = bfq_update_depths(bfqd, bt); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); + } + data->shallow_depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; @@ -5295,65 +5352,6 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) __bfq_put_async_bfqq(bfqd, &bfqd->async_idle_bfqq); } -/* - * See the comments on bfq_limit_depth for the purpose of - * the depths set in the function. Return minimum shallow depth we'll use. - */ -static unsigned int bfq_update_depths(struct bfq_data *bfqd, - struct sbitmap_queue *bt) -{ - unsigned int i, j, min_shallow = UINT_MAX; - bfqd->sb_shift = bt->sb.shift; - - /* -* In-word depths if no bfq_queue is being weight-raised: -* leaving 25% of tags only for sync reads. -* -* In next formulas, right-shift the value -* (1U << (bfqd->sb_shift - something)), to be robust against -* any possible value of bfqd->sb_shift, without having to -* limit 'something'. -*/ - /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max((1U << bfqd->sb_shift) >> 1, 1U); - /* -* no more than 75% of tags for sync writes (25% extra tags -* w.r.t. async I/O, to prevent async I/O from starving sync -* writes) -*/ - bfqd->word_depths[0][1] = max(((1U << bfqd->sb_shift) * 3) >> 2, 1U); - - /* -* In-word depths in case some bfq_queue is being weight- -* raised: leaving ~63% of tags for sync reads. This is the -* highest percentage for which, in our tests, application -* start-up times didn't suffer from any regression due to tag -* shortage. 
-*/ - /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max(((1U << bfqd->sb_shift) * 3) >> 4,
[PATCH BUGFIX RFC 1/2] Revert "bfq-iosched: remove unused variable"
This reverts commit bd7d4ef6a4c9b3611fa487a0065bf042c71ce620. --- block/bfq-iosched.c | 15 --- block/bfq-iosched.h | 6 ++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cd307767a134..8cc3032b66de 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5303,25 +5303,26 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { unsigned int i, j, min_shallow = UINT_MAX; + bfqd->sb_shift = bt->sb.shift; /* * In-word depths if no bfq_queue is being weight-raised: * leaving 25% of tags only for sync reads. * * In next formulas, right-shift the value -* (1U << (bt->sb.shift - something)), to be robust against -* any possible value of bt->sb.shift, without having to +* (1U << (bfqd->sb_shift - something)), to be robust against +* any possible value of bfqd->sb_shift, without having to * limit 'something'. */ /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); + bfqd->word_depths[0][0] = max((1U << bfqd->sb_shift) >> 1, 1U); /* * no more than 75% of tags for sync writes (25% extra tags * w.r.t. async I/O, to prevent async I/O from starving sync * writes) */ - bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); + bfqd->word_depths[0][1] = max(((1U << bfqd->sb_shift) * 3) >> 2, 1U); /* * In-word depths in case some bfq_queue is being weight- @@ -5331,9 +5332,9 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * shortage. 
*/ /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); + bfqd->word_depths[1][0] = max(((1U << bfqd->sb_shift) * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); + bfqd->word_depths[1][1] = max(((1U << bfqd->sb_shift) * 6) >> 4, 1U); for (i = 0; i < 2; i++) for (j = 0; j < 2; j++) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 0b02bf302de0..4de5dc349a1e 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -697,6 +697,12 @@ struct bfq_data { /* bfqq associated with the task issuing current bio for merging */ struct bfq_queue *bio_bfqq; + /* +* Cached sbitmap shift, used to compute depth limits in +* bfq_update_depths. +*/ + unsigned int sb_shift; + /* * Depth limits used in bfq_limit_depth (see comments on the * function) -- 2.20.1
Re: [RFC PATCH 0/3] cgroup: fsio throttle controller
> Il giorno 18 gen 2019, alle ore 12:10, Andrea Righi > ha scritto: > > On Fri, Jan 18, 2019 at 12:04:17PM +0100, Paolo Valente wrote: >> >> >>> Il giorno 18 gen 2019, alle ore 11:31, Andrea Righi >>> ha scritto: >>> >>> This is a redesign of my old cgroup-io-throttle controller: >>> https://lwn.net/Articles/330531/ >>> >>> I'm resuming this old patch to point out a problem that I think is still >>> not solved completely. >>> >>> = Problem = >>> >>> The io.max controller works really well at limiting synchronous I/O >>> (READs), but a lot of I/O requests are initiated outside the context of >>> the process that is ultimately responsible for its creation (e.g., >>> WRITEs). >>> >>> Throttling at the block layer in some cases is too late and we may end >>> up slowing down processes that are not responsible for the I/O that >>> is being processed at that level. >>> >>> = Proposed solution = >>> >>> The main idea of this controller is to split I/O measurement and I/O >>> throttling: I/O is measured at the block layer for READS, at page cache >>> (dirty pages) for WRITEs, and processes are limited while they're >>> generating I/O at the VFS level, based on the measured I/O. >>> >> >> Hi Andrea, >> what the about the case where two processes are dirtying the same >> pages? Which will be charged? >> >> Thanks, >> Paolo > > Hi Paolo, > > in this case only the first one will be charged for the I/O activity > (the one that changes a page from clean to dirty). This is probably not > totally fair in some cases, but I think it's a good compromise, Absolutely, I just wanted to better understand this point. > at the > end rewriting the same page over and over while it's already dirty > doesn't actually generate I/O activity, until the page is flushed back > to disk. > Right. Thanks, Paolo > Obviously I'm open to other better ideas and suggestions. > > Thanks! > -Andrea
Re: [RFC PATCH 0/3] cgroup: fsio throttle controller
> Il giorno 18 gen 2019, alle ore 11:31, Andrea Righi > ha scritto: > > This is a redesign of my old cgroup-io-throttle controller: > https://lwn.net/Articles/330531/ > > I'm resuming this old patch to point out a problem that I think is still > not solved completely. > > = Problem = > > The io.max controller works really well at limiting synchronous I/O > (READs), but a lot of I/O requests are initiated outside the context of > the process that is ultimately responsible for its creation (e.g., > WRITEs). > > Throttling at the block layer in some cases is too late and we may end > up slowing down processes that are not responsible for the I/O that > is being processed at that level. > > = Proposed solution = > > The main idea of this controller is to split I/O measurement and I/O > throttling: I/O is measured at the block layer for READS, at page cache > (dirty pages) for WRITEs, and processes are limited while they're > generating I/O at the VFS level, based on the measured I/O. > Hi Andrea, what the about the case where two processes are dirtying the same pages? Which will be charged? Thanks, Paolo > = Example = > > Here's a trivial example: create 2 cgroups, set an io.max limit of > 10MB/s, run a write-intensive workload on both and after a while, from a > root cgroup, run "sync". 
> > # cat /proc/self/cgroup > 0::/cg1 > # fio --rw=write --bs=1M --size=32M --numjobs=16 --name=seeker --time_based > --runtime=30 > > # cat /proc/self/cgroup > 0::/cg2 > # fio --rw=write --bs=1M --size=32M --numjobs=16 --name=seeker --time_based > --runtime=30 > > - io.max controller: > > # echo "259:0 rbps=10485760 wbps=10485760" > /sys/fs/cgroup/unified/cg1/io.max > # echo "259:0 rbps=10485760 wbps=10485760" > /sys/fs/cgroup/unified/cg2/io.max > > # cat /proc/self/cgroup > 0::/ > # time sync > > real 0m51,241s > user 0m0,000s > sys 0m0,113s > > Ideally "sync" should complete almost immediately, because the root > cgroup is unlimited and it's not doing any I/O at all, but instead it's > blocked for more than 50 sec with io.max, because the writeback is > throttled to satisfy the io.max limits. > > - fsio controller: > > # echo "259:0 10 10" > /sys/fs/cgroup/unified/cg1/fsio.max_mbs > # echo "259:0 10 10" > /sys/fs/cgroup/unified/cg2/fsio.max_mbs > > [you can find details about the syntax in the documentation patch] > > # cat /proc/self/cgroup > 0::/ > # time sync > > real 0m0,146s > user 0m0,003s > sys 0m0,001s > > = Questions = > > Q: Do we need another controller? > A: Probably no, I think it would be better to integrate this policy (or > something similar) in the current blkio controller, this is just to > highlight the problem and get some ideas on how to address it. > > Q: What about proportional limits / latency? > A: It should be trivial to add latency-based limits if we integrate this in > the > current I/O controller. About proportional limits (weights), they're > strictly related to I/O scheduling and since this controller doesn't touch > I/O dispatching policies it's not trivial to implement proportional limits > (bandwidth limiting is definitely more straightforward). > > Q: Applying delays at the VFS layer doesn't prevent I/O spikes during > writeback, right? 
> A: Correct, the tradeoff here is to tolerate I/O bursts during writeback to > avoid priority inversion problems in the system. > > Andrea Righi (3): > fsio-throttle: documentation > fsio-throttle: controller infrastructure > fsio-throttle: instrumentation > > Documentation/cgroup-v1/fsio-throttle.txt | 142 + > block/blk-core.c | 10 + > include/linux/cgroup_subsys.h | 4 + > include/linux/fsio-throttle.h | 43 +++ > include/linux/writeback.h | 7 +- > init/Kconfig | 11 + > kernel/cgroup/Makefile| 1 + > kernel/cgroup/fsio-throttle.c | 501 ++ > mm/filemap.c | 20 +- > mm/page-writeback.c | 14 +- > 10 files changed, 749 insertions(+), 4 deletions(-) >
Re: [PATCH BUGFIX 0/2] bfq: fix unbalanced decrements causing loss of throughput
> Il giorno 7 dic 2018, alle ore 15:40, Jens Axboe ha scritto: > > On 12/7/18 3:01 AM, Paolo Valente wrote: >> >> >>> Il giorno 7 dic 2018, alle ore 03:23, Jens Axboe ha >>> scritto: >>> >>> On 12/6/18 11:18 AM, Paolo Valente wrote: >>>> Hi Jens, >>>> the first patch in this series fixes an error in the decrementing of >>>> the counter of the number of groups with pending I/O. This wrong >>>> decrement caused loss of throughput or, less likely, of control on >>>> I/O. The second patch is a fix of some wrong comments, which somehow >>>> contributed to making the above bug more difficult to find. >>> >>> Are you fine with this going into 4.21? I can't quite tell what your >>> intent is. The first patch has a Fixes for something >> >> yep, that fixes a serious error. >> >>> that went into >>> this series, but then patch 2 is a comment update that would not >>> normally be something to be applied at this stage. >>> >> >> and yes, only comments changed by the second one >> >> May it make sense to apply them in two steps, one in the 4.20 and the other >> one in the 4.21? > > I think so, I'll do that. Hi Jens, is the second patch still queued? Thanks, Paolo > > -- > Jens Axboe
Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io
> Il giorno 2 gen 2019, alle ore 17:03, Tejun Heo ha scritto: > > Hello, Paolo. > > On Sun, Dec 30, 2018 at 11:25:25AM +0100, Paolo Valente wrote: >> What's the benefit of throwing away months of work, on which we agreed >> before starting it, and that solves a problem already acknowledged by >> interested parties? > > Showing multiple conflicting numbers definitely isn't anything which > is agreed upon. > Sorry, of course you didn't realize that sharing interface files had this consequence, otherwise you'd have protested beforehand. The problem is that this consequence seems unavoidable: if two policies have different numbers to convey, through a shared interface file, then they must be allowed to write their different numbers. To me, this doesn't sound like a problem. The only other natural option is no unification, unless you have a third way. What do you prefer, or propose? Thanks, Paolo > Thanks. > > -- > tejun
Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io
> Il giorno 28 dic 2018, alle ore 00:41, Tejun Heo ha scritto: > > Hello, Paolo. > > On Sun, Dec 23, 2018 at 12:00:14PM +0100, Paolo Valente wrote: >> 4.21 is coming ... and the legacy proportional share interface will >> be gone with cfq. This will break legacy code using the >> proportional-share interface on top of bfq. This code may just fail >> when trying to use interface files that don't exist any longer. > > Sounds like inheriting .cfq namespace would be the easiest. Would > that work? > For bfq, yes, but what will, e.g., Josef do when he adds his new proportional-share implementation? Will he add a new set of names not used by any legacy piece of code? What's the benefit of throwing away months of work, on which we agreed before starting it, and that solves a problem already acknowledged by interested parties? Thanks, Paolo > Thanks. > > -- > tejun
Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io
> Il giorno 18 dic 2018, alle ore 18:22, Paolo Valente > ha scritto: > > > >> Il giorno 18 dic 2018, alle ore 17:41, Tejun Heo ha >> scritto: >> >> Hello, Paolo. >> >> On Tue, Dec 18, 2018 at 08:48:10AM +0100, Paolo Valente wrote: >>> If Tejun cannot see any solution to his concern, then can we just >>> switch to this extension, considering that >>> - for non-shared names the interface is *identical* to the current >>> one; >>> - by using this new interface, and getting feedback we could >>> understand how to better handle Tejun's concern? >>> A lot of systems do use weights, and people don't even know that these >>> systems don't work correctly in blk-mq. And they won't work correctly >>> in any available configuration from 4.21, if we don't fix this problem. >> >> So, when seen from userland, how it should behave isn't vague or >> complicated. For a given device and policy type, there can be only >> one implementation active. > > Yes, but the problem is the opposite. You may have > - two different policies, with the same interface parameter, > - one active on one device > - the other one active on another device > > In that case, statistics from one policy necessarily differ from > statistics from the other policy. > > In this respect, in a system with more than one drive it already > happens that the same policy is active on different devices. When > printing a statistics interface file for the policy, the output will > be a list of separate statistics, with a bunch of statistics *for > each* drive (plus a grand total in some cases). > > So, our proposal simply extends this scheme in the most natural way: > if, now, also two or more policies share the same statistics file, > then the output will be a list of separate statistics, one for each > policy. The statistics for each policy will be tagged with the policy > name, and will have the same identical form as above. It seems the > most natural hierarchical extension of the same scheme. 
> Maybe my generic description didn't highlight how plain are. If you print, e.g., io_serviced with the current interface, you get -- 8:0 Read 4291168 8:0 Write 2181577 8:0 Sync 5897755 8:0 Async 574990 8:0 Total 6472745 Total 6472745 -- With the new, interface, you get *the same output*, if only one policy is attached to this interface file. In, instead - two policies share the the file, because one is active on a device and one on another device - these policies are named, e.g., bfq and pol2 then you get (device number and statistics invented): -- bfq: 8:0 Read 4291168 8:0 Write 2181577 8:0 Sync 5897755 8:0 Async 574990 8:0 Total 6472745 Total 6472745 pol2: 16:0 Read 238768 16:0 Write 323123 16:0 Sync 43243 16:0 Async 432432 16:0 Total 412435 Total 4341244 -- So you see the per-device statistics as before, without the problem of inventing a new set of names for every new policy that has the same interface files of an existing policy. Tejun, let's try to converge, to whatever solution you prefer. 4.21 is coming ... and the legacy proportional share interface will be gone with cfq. This will break legacy code using the proportional-share interface on top of bfq. This code may just fail when trying to use interface files that don't exist any longer. Thanks, Paolo > At any rate, if you don't like it, just tell us how you prefer it > done. Do you prefer the sharing of statistics file to be simply > forbidden? (If this can be done.) I think such an incomplete solution > would preserve part of the current mess; but, if this allows us to > exit from this impasse, then it is ok for me. > > *Any* feasible option is ok for me. Just pick one. > >> It doesn't make sense to have two weight >> mechanisms active on one device, right? > > (Un)fortunately, the problem are not weights. There won't be two > weights for two policies expiring a weight parameter. The problems > concerns statistics. See above. 
> > >> So, the interface should only >> present what makes sense to the user for both configuration knobs and >> statistics, and that'd be a hard requirement because we don't want to >> present confusing spurious information to userspace. >> >> There seemd to have been significant misunderstandings as to what the >> requirements are when this was discussed way back, so idk what the >> good path forward is at this point. Just keep the current names? >> > > I don't clearly understand how "just picking the current names" is a > way forward, but if we do not make this extension, in a way
Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io
> Il giorno 18 dic 2018, alle ore 18:22, Paolo Valente > ha scritto: > > > >> Il giorno 18 dic 2018, alle ore 17:41, Tejun Heo ha >> scritto: >> >> Hello, Paolo. >> >> On Tue, Dec 18, 2018 at 08:48:10AM +0100, Paolo Valente wrote: >>> If Tejun cannot see any solution to his concern, then can we just >>> switch to this extension, considering that >>> - for non-shared names the interface is *identical* to the current >>> one; >>> - by using this new interface, and getting feedback we could >>> understand how to better handle Tejun's concern? >>> A lot of systems do use weights, and people don't even know that these >>> systems don't work correctly in blk-mq. And they won't work correctly >>> in any available configuration from 4.21, if we don't fix this problem. >> >> So, when seen from userland, how it should behave isn't vague or >> complicated. For a given device and policy type, there can be only >> one implementation active. > > Yes, but the problem is the opposite. You may have > - two different policies, with the same interface parameter, > - one active on one device > - the other one active on another device > > In that case, statistics from one policy necessarily differ from > statistics from the other policy. > > In this respect, in a system with more than one drive it already > happens that the same policy is active on different devices. When > printing a statistics interface file for the policy, the output will > be a list of separate statistics, with a bunch of statistics *for > each* drive (plus a grand total in some cases). > > So, our proposal simply extends this scheme in the most natural way: > if, now, also two or more policies share the same statistics file, > then the output will be a list of separate statistics, one for each > policy. The statistics for each policy will be tagged with the policy > name, and will have the same identical form as above. It seems the > most natural hierarchical extension of the same scheme. 
> > At any rate, if you don't like it, just tell us how you prefer it > done. Do you prefer the sharing of statistics file to be simply > forbidden? (If this can be done.) I think such an incomplete solution > would preserve part of the current mess; but, if this allows us to > exit from this impasse, then it is ok for me. > > *Any* feasible option is ok for me. Just pick one. > >> It doesn't make sense to have two weight >> mechanisms active on one device, right? > > (Un)fortunately, the problem are not weights. There won't be two > weights for two policies expiring a weight parameter. The problems s/expiring/sharing sorry
Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io
> Il giorno 18 dic 2018, alle ore 17:41, Tejun Heo ha scritto: > > Hello, Paolo. > > On Tue, Dec 18, 2018 at 08:48:10AM +0100, Paolo Valente wrote: >> If Tejun cannot see any solution to his concern, then can we just >> switch to this extension, considering that >> - for non-shared names the interface is *identical* to the current >> one; >> - by using this new interface, and getting feedback we could >> understand how to better handle Tejun's concern? >> A lot of systems do use weights, and people don't even know that these >> systems don't work correctly in blk-mq. And they won't work correctly >> in any available configuration from 4.21, if we don't fix this problem. > > So, when seen from userland, how it should behave isn't vague or > complicated. For a given device and policy type, there can be only > one implementation active. Yes, but the problem is the opposite. You may have - two different policies, with the same interface parameter, - one active on one device - the other one active on another device In that case, statistics from one policy necessarily differ from statistics from the other policy. In this respect, in a system with more than one drive it already happens that the same policy is active on different devices. When printing a statistics interface file for the policy, the output will be a list of separate statistics, with a bunch of statistics *for each* drive (plus a grand total in some cases). So, our proposal simply extends this scheme in the most natural way: if, now, also two or more policies share the same statistics file, then the output will be a list of separate statistics, one for each policy. The statistics for each policy will be tagged with the policy name, and will have the same identical form as above. It seems the most natural hierarchical extension of the same scheme. At any rate, if you don't like it, just tell us how you prefer it done. Do you prefer the sharing of statistics file to be simply forbidden? (If this can be done.) 
I think such an incomplete solution would preserve part of the current mess; but, if this allows us to exit from this impasse, then it is ok for me. *Any* feasible option is ok for me. Just pick one. > It doesn't make sense to have two weight > mechanisms active on one device, right? (Un)fortunately, the problem are not weights. There won't be two weights for two policies expiring a weight parameter. The problems concerns statistics. See above. > So, the interface should only > present what makes sense to the user for both configuration knobs and > statistics, and that'd be a hard requirement because we don't want to > present confusing spurious information to userspace. > > There seemd to have been significant misunderstandings as to what the > requirements are when this was discussed way back, so idk what the > good path forward is at this point. Just keep the current names? > I don't clearly understand how "just picking the current names" is a way forward, but if we do not make this extension, in a way or the other, then two policies will simply not be allowed to share the same interface files. And we will be still at the starting point. Thanks, Paolo > Thanks. > > -- > tejun
Re: [PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io
[RESENDING BECAUSE BOUNCED] > Il giorno 10 dic 2018, alle ore 14:45, Angelo Ruocco > ha scritto: > > 2018-11-30 19:53 GMT+01:00, Paolo Valente : >> >> >>> Il giorno 30 nov 2018, alle ore 19:42, Tejun Heo ha >>> scritto: >>> >>> Hello, Paolo. >>> >>> On Fri, Nov 30, 2018 at 07:23:24PM +0100, Paolo Valente wrote: >>>>> Then we understood that exactly the same happens with throttling, in >>>>> case the latter is activated on different devices w.r.t. bfq. >>>>> >>>>> In addition, the same may happen, in the near future, with the >>>>> bandwidth controller Josef is working on. If the controller can be >>>>> configured per device, as with throttling, then statistics may differ, >>>>> for the same interface files, between bfq, throttling and that >>>>> controller. >>> >>> So, regardless of how all these are implemented, what's presented to >>> user should be consistent and clear. There's no other way around it. >>> Only what's relevant should be visible to userspace. >>> >>>> have you had time to look into this? Any improvement to this >>>> interface is ok for us. We are only interested in finally solving this >>>> interface issue, as, for what concerns us directly, it has been >>>> preventing legacy code to use bfq for years. >>> >>> Unfortunately, I don't have any implementation proposal, but we can't >>> show things this way to userspace. >>> >> >> Well, this is not very helpful to move forward :) >> >> Let me try to repeat the problem, to try to help you help us unblock >> the situation. >> >> If we have multiple entities attached to the same interface output >> file, you don't find it clear that each entity shows the number it >> wants to show. But you have no idea either of how that differentiated >> information should be shown. Is this the situation, or is the problem >> somewhere 'above' this level? 
>> >> If the problem is as I described it, here are some proposal attempts: >> 1) Do you want file sharing to be allowed only if all entities will >> output the same number? (this seems excessive, but maybe it makes >> sense) >> 2) Do you want only one number to be shown, equal to the sum of the >> numbers of each entity? (in some cases, this may make sense) >> 3) Do you prefer an average? >> 4) Do you have any other idea, even if just germinal? > > To further add to what Paolo said and better expose the problem, I'd like to > say that all those proposals have issues. > If we only allow "same output" cftypes to be shared then we lose all the > flexibility of this solution, and we need a way for an entity to know other > entities internal variables beforehand, which sounds at least very hard, and > maybe is not even an acceptable thing to do. > To put the average, sum or some other mathematical function in the file only > makes sense for certain cftypes, so also doesn't sound like a good idea. In > fact I can think of scenarios where only seeing the different values of the > entities makes sense for a user. > > I understand that the problem is inconsistency: having a file that behaves > differently depending on the situation, and the only way to prevent this I can > think of is to *always* show the entity owner of a certain file (or part of > the > output), even when the output would be the same among entities or when the > file is not currently shared but could be. Can this be an acceptable solution? > > Angelo > Hi Jens, all, let me push for this interface to be fixed too. If we don't fix it in some way, then from 4.21 we well end up with a ridiculous paradox: the proportional share policy (weights) will of course be available, but unusable in practice. In fact, as Lennart--and not only Lennart--can confirm, no piece of code uses bfq.weight to set weights, or will do it. 
A trivial solution would be to throw away all our work to fix this issue by extending the interface, and just let bfq use the former cfq names. But then the same mess will happen as, e.g., Josef will propose his proportional-share controller. Before making this solution, we proposed it and waited for it to be approved several months ago, so I hope that Tejun concern can be addressed somehow. If Tejun cannot see any solution to his concern, then can we just switch to this extension, considering that - for non-shared names the interface is *identical* to the current one; - by using this new interface, and getting feedback we could understand how to better handle Tejun's concern? A lot of sy
Re: [PATCH BUGFIX 0/2] bfq: fix unbalanced decrements causing loss of throughput
> Il giorno 7 dic 2018, alle ore 03:23, Jens Axboe ha scritto: > > On 12/6/18 11:18 AM, Paolo Valente wrote: >> Hi Jens, >> the first patch in this series fixes an error in the decrementing of >> the counter of the number of groups with pending I/O. This wrong >> decrement caused loss of throughput or, less likely, of control on >> I/O. The second patch is a fix of some wrong comments, which somehow >> contributed to making the above bug more difficult to find. > > Are you fine with this going into 4.21? I can't quite tell what your > intent is. The first patch has a Fixes for something yep, that fixes a serious error. > that went into > this series, but then patch 2 is a comment update that would not > normally be something to be applied at this stage. > and yes, only comments changed by the second one May it make sense to apply them in two steps, one in the 4.20 and the other one in the 4.21? Thanks, Paolo > -- > Jens Axboe >
[PATCH BUGFIX 2/2] block, bfq: fix comments on __bfq_deactivate_entity
Comments on function __bfq_deactivate_entity contains two imprecise or wrong statements: 1) The function performs the deactivation of the entity. 2) The function must be invoked only if the entity is on a service tree. This commits replaces both statements with the correct ones: 1) The functions updates sched_data and service trees for the entity, so as to represent entity as inactive (which is only part of the steps needed for the deactivation of the entity). 2) The function must be invoked on every entity being deactivated. Signed-off-by: Paolo Valente --- block/bfq-wf2q.c | 11 +-- 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 63e0f12be7c9..72adbbe975d5 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1154,15 +1154,14 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, } /** - * __bfq_deactivate_entity - deactivate an entity from its service tree. - * @entity: the entity to deactivate. + * __bfq_deactivate_entity - update sched_data and service trees for + * entity, so as to represent entity as inactive + * @entity: the entity being deactivated. * @ins_into_idle_tree: if false, the entity will not be put into the * idle tree. * - * Deactivates an entity, independently of its previous state. Must - * be invoked only if entity is on a service tree. Extracts the entity - * from that tree, and if necessary and allowed, puts it into the idle - * tree. + * If necessary and allowed, puts entity into the idle tree. NOTE: + * entity may be on no tree if in service. */ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) { -- 2.16.1
[PATCH BUGFIX 1/2] block, bfq: fix decrement of num_active_groups
Since commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")', if there are process groups with I/O requests waiting for completion, then BFQ tags the scenario as 'asymmetric'. This detection is needed for preserving service guarantees (for details, see comments on the computation of the variable asymmetric_scenario in the function bfq_better_to_idle). Unfortunately, commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")' contains an error exactly in the updating of the number of groups with I/O requests waiting for completion: if a group has more than one descendant process, then the above number of groups, which is renamed from num_active_groups to a more appropriate num_groups_with_pending_reqs by this commit, may happen to be wrongly decremented multiple times, namely every time one of the descendant processes gets all its pending I/O requests completed. A correct, complete solution should work as follows. Consider a group that is inactive, i.e., that has no descendant process with pending I/O inside BFQ queues. Then suppose that num_groups_with_pending_reqs is still accounting for this group, because the group still has some descendant process with some I/O request still in flight. num_groups_with_pending_reqs should be decremented when the in-flight request of the last descendant process is finally completed (assuming that nothing else has changed for the group in the meantime, in terms of composition of the group and active/inactive state of child groups and processes). To accomplish this, an additional pending-request counter must be added to entities, and must be updated correctly. 
To avoid this additional field and operations, this commit resorts to the following tradeoff between simplicity and accuracy: for an inactive group that is still counted in num_groups_with_pending_reqs, this commit decrements num_groups_with_pending_reqs when the first descendant process of the group remains with no request waiting for completion. This simplified scheme provides a fix to the unbalanced decrements introduced by 2d29c9f89fcd. Since this error was also caused by lack of comments on this non-trivial issue, this commit also adds related comments. Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection") Reported-by: Steven Barrett Tested-by: Steven Barrett Tested-by: Lucjan Lucjanov Reviewed-by: Federico Motta Signed-off-by: Paolo Valente --- block/bfq-iosched.c | 76 + block/bfq-iosched.h | 51 +-- block/bfq-wf2q.c| 5 +++- 3 files changed, 107 insertions(+), 25 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3a27d31fcda6..97337214bec4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd) bfqd->queue_weights_tree.rb_node->rb_right) #ifdef CONFIG_BFQ_GROUP_IOSCHED ) || - (bfqd->num_active_groups > 0 + (bfqd->num_groups_with_pending_reqs > 0 #endif ); } @@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, */ break; } - bfqd->num_active_groups--; + + /* +* The decrement of num_groups_with_pending_reqs is +* not performed immediately upon the deactivation of +* entity, but it is delayed to when it also happens +* that the first leaf descendant bfqq of entity gets +* all its pending requests completed. The following +* instructions perform this delayed decrement, if +* needed. See the comments on +* num_groups_with_pending_reqs for details. 
+*/ + if (entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = false; + bfqd->num_groups_with_pending_reqs--; + } } } @@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * fact, if there are active groups, then, for condition (i) * to become false, it is enough that an active group contains * more active processes or sub-groups than some other active -* group. We address this issue with the following bi-modal -* behavior, implemented in the function +* group. More precisely, for condition (i) to hold because of +* such a group, it is not even necessary that the group is +* (still) active: it is sufficient that, even if the group +* has become inactive, some of its descendant processes still +* have some request already dispatched but still waiting for +* completion. In fact,
[PATCH BUGFIX 1/2] block, bfq: fix decrement of num_active_groups
Since commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")', if there are process groups with I/O requests waiting for completion, then BFQ tags the scenario as 'asymmetric'. This detection is needed for preserving service guarantees (for details, see comments on the computation of the variable asymmetric_scenario in the function bfq_better_to_idle). Unfortunately, commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")' contains an error exactly in the updating of the number of groups with I/O requests waiting for completion: if a group has more than one descendant process, then the above number of groups, which is renamed from num_active_groups to a more appropriate num_groups_with_pending_reqs by this commit, may happen to be wrongly decremented multiple times, namely every time one of the descendant processes gets all its pending I/O requests completed. A correct, complete solution should work as follows. Consider a group that is inactive, i.e., that has no descendant process with pending I/O inside BFQ queues. Then suppose that num_groups_with_pending_reqs is still accounting for this group, because the group still has some descendant process with some I/O request still in flight. num_groups_with_pending_reqs should be decremented when the in-flight request of the last descendant process is finally completed (assuming that nothing else has changed for the group in the meantime, in terms of composition of the group and active/inactive state of child groups and processes). To accomplish this, an additional pending-request counter must be added to entities, and must be updated correctly. 
To avoid this additional field and operations, this commit resorts to the following tradeoff between simplicity and accuracy: for an inactive group that is still counted in num_groups_with_pending_reqs, this commit decrements num_groups_with_pending_reqs when the first descendant process of the group remains with no request waiting for completion. This simplified scheme provides a fix to the unbalanced decrements introduced by 2d29c9f89fcd. Since this error was also caused by lack of comments on this non-trivial issue, this commit also adds related comments. Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection") Reported-by: Steven Barrett Tested-by: Steven Barrett Tested-by: Lucjan Lucjanov Reviewed-by: Federico Motta Signed-off-by: Paolo Valente --- block/bfq-iosched.c | 76 + block/bfq-iosched.h | 51 +-- block/bfq-wf2q.c| 5 +++- 3 files changed, 107 insertions(+), 25 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3a27d31fcda6..97337214bec4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd) bfqd->queue_weights_tree.rb_node->rb_right) #ifdef CONFIG_BFQ_GROUP_IOSCHED ) || - (bfqd->num_active_groups > 0 + (bfqd->num_groups_with_pending_reqs > 0 #endif ); } @@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, */ break; } - bfqd->num_active_groups--; + + /* +* The decrement of num_groups_with_pending_reqs is +* not performed immediately upon the deactivation of +* entity, but it is delayed to when it also happens +* that the first leaf descendant bfqq of entity gets +* all its pending requests completed. The following +* instructions perform this delayed decrement, if +* needed. See the comments on +* num_groups_with_pending_reqs for details. 
+*/ + if (entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = false; + bfqd->num_groups_with_pending_reqs--; + } } } @@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * fact, if there are active groups, then, for condition (i) * to become false, it is enough that an active group contains * more active processes or sub-groups than some other active -* group. We address this issue with the following bi-modal -* behavior, implemented in the function +* group. More precisely, for condition (i) to hold because of +* such a group, it is not even necessary that the group is +* (still) active: it is sufficient that, even if the group +* has become inactive, some of its descendant processes still +* have some request already dispatched but still waiting for +* completion. In fact,
[PATCH BUGFIX 0/2] bfq: fix unbalanced decrements causing loss of throughput
Hi Jens, the first patch in this series fixes an error in the decrementing of the counter of the number of groups with pending I/O. This wrong decrement caused loss of throughput or, less likely, of control on I/O. The second patch is a fix of some wrong comments, which somehow contributed to making the above bug more difficult to find. Thanks, Paolo Paolo Valente (2): block, bfq: fix decrement of num_active_groups block, bfq: fix comments on __bfq_deactivate_entity block/bfq-iosched.c | 76 + block/bfq-iosched.h | 51 +-- block/bfq-wf2q.c| 16 ++- 3 files changed, 112 insertions(+), 31 deletions(-) -- 2.16.1
[PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io
Hi, here is the V2 of this patch series. Let me rephrase the description of the series, in view of the fact that CFQ will be gone with legacy block. The current implementation of cgroups doesn't allow two or more entities, e.g., I/O schedulers, to share the same files. Thus, to enable people to set group weights with BFQ, I resorted to making BFQ create its own version of the same interface files used by CFQ (before going away with legacy block), by prepending a bfq prefix. Actually, no legacy code uses these different names, or is likely to do so. Having these two sets of names is simply a source of confusion, as pointed out also, e.g., by Lennart Poettering (CCed here), and acknowledged by Tejun [2]. In [1] we agreed on a solution that solves this problem, by actually making it possible to share cgroups files. Both writing to and reading from a shared file trigger the appropriate operation for each of the entities that share the file. In particular, in case of reading, - if all entities produce the same output, then this common output is shown only once; - if the outputs differ, then every per-entity output is shown, followed by the name of the entity that produced that output. With this solution, legacy code that, e.g., sets group weights, just works, regardless of the I/O scheduler actually implementing proportional share. But note that this extension is not restricted to only blkio/io. The general group interface now enables files to be shared among multiple entities of any kind. (I have also added a patch to fix some clerical errors in bfq doc, which I found while making the latter consistent with the new interface.) 
CHANGES FROM V1: - Removed patch that introduced a function to only find kernfs nodes, without increasing ref counters - Changed commit messages and BFQ documentation, to comply with the fact that there won't be CFQ any longer Thanks, Paolo Angelo Ruocco (5): cgroup: link cftypes of the same subsystem with the same name cgroup: add owner name to cftypes block, bfq: align min and default weights with the old cfq default cgroup: make all functions of all cftypes be invoked block, throttle: allow sharing cgroup statistic files Paolo Valente (5): cgroup: add hook seq_show_cft with also the owning cftype as parameter block, cgroup: pass cftype to functions that need to use it block, bfq: use standard file names for the proportional-share policy doc, bfq-iosched: fix a few clerical errors doc, bfq-iosched: make it consistent with the new cgroup interface Documentation/block/bfq-iosched.txt | 34 ++--- block/bfq-cgroup.c | 148 +--- block/bfq-iosched.h | 4 +- block/blk-cgroup.c | 22 +-- block/blk-throttle.c| 24 ++-- include/linux/blk-cgroup.h | 10 +- include/linux/cgroup-defs.h | 14 +- include/linux/cgroup.h | 13 ++ kernel/cgroup/cgroup.c | 262 +--- 9 files changed, 390 insertions(+), 141 deletions(-) -- 2.16.1
[PATCH V2 00/10] unify the interface of the proportional-share policy in blkio/io
Hi, here is the V2 of this patch series. Let me rephrase the description of the series, in view of the fact that CFQ will be gone with legacy block. The current implementation of cgroups doesn't allow two or more entities, e.g., I/O schedulers, to share the same files. Thus, to enable people to set group weights with BFQ, I resorted to making BFQ create its own version of the same interface files used by CFQ (before going away with legacy block), by prepending a bfq prefix. Actually, no legacy code uses these different names, or is likely to do so. Having these two sets of names is simply a source of confusion, as pointed out also, e.g., by Lennart Poettering (CCed here), and acknowledged by Tejun [2]. In [1] we agreed on a solution that solves this problem, by actually making it possible to share cgroups files. Both writing to and reading from a shared file trigger the appropriate operation for each of the entities that share the file. In particular, in case of reading, - if all entities produce the same output, then this common output is shown only once; - if the outputs differ, then every per-entity output is shown, followed by the name of the entity that produced that output. With this solution, legacy code that, e.g., sets group weights, just works, regardless of the I/O scheduler actually implementing proportional share. But note that this extension is not restricted to only blkio/io. The general group interface now enables files to be shared among multiple entities of any kind. (I have also added a patch to fix some clerical errors in bfq doc, which I found while making the latter consistent with the new interface.) 
CHANGES FROM V1: - Removed patch that introduced a function to only find kernfs nodes, without increasing ref counters - Changed commit messages and BFQ documentation, to comply with the fact that there won't be CFQ any longer Thanks, Paolo Angelo Ruocco (5): cgroup: link cftypes of the same subsystem with the same name cgroup: add owner name to cftypes block, bfq: align min and default weights with the old cfq default cgroup: make all functions of all cftypes be invoked block, throttle: allow sharing cgroup statistic files Paolo Valente (5): cgroup: add hook seq_show_cft with also the owning cftype as parameter block, cgroup: pass cftype to functions that need to use it block, bfq: use standard file names for the proportional-share policy doc, bfq-iosched: fix a few clerical errors doc, bfq-iosched: make it consistent with the new cgroup interface Documentation/block/bfq-iosched.txt | 34 ++--- block/bfq-cgroup.c | 148 +--- block/bfq-iosched.h | 4 +- block/blk-cgroup.c | 22 +-- block/blk-throttle.c| 24 ++-- include/linux/blk-cgroup.h | 10 +- include/linux/cgroup-defs.h | 14 +- include/linux/cgroup.h | 13 ++ kernel/cgroup/cgroup.c | 262 +--- 9 files changed, 390 insertions(+), 141 deletions(-) -- 2.16.1
[PATCH V2 06/10] cgroup: make all functions of all cftypes be invoked
From: Angelo Ruocco When two or more entities (of any kind) share a file, their respective cftypes are linked together. The allowed operations on those files are: open, release, write and show, mapped to the functions defined in the cftypes. This commit makes the cgroup core invoke, whenever one of those operations is requested, the respective function of all the cftypes linked together. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- kernel/cgroup/cgroup.c | 181 - 1 file changed, 132 insertions(+), 49 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 61eafd69e2fd..9bf6b0b5a0ca 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3452,66 +3452,107 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; + struct cftype *n; + int ret = 0; + list_for_each_cft(cft, n) { + if (cft->open) + ret = cft->open(of); + /* +* If there has been an error with the open function of one of +* the cft associated with the file, we call the release +* function of all the cftype associated to cft whose open +* function succeded. +*/ + if (ret) { + struct cftype *c = of->kn->priv; + struct cftype *n; + + list_for_each_cft(c, n) { + if (cft == c) + break; + if (c->release) + c->release(of); + } + break; + } + } - if (cft->open) - return cft->open(of); - return 0; + return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; + struct cftype *n; - if (cft->release) - cft->release(of); + list_for_each_cft(cft, n) + if (cft->release) + cft->release(of); } +/* + * Call all the write functions of the cftypes associated with the file. + * + * When a write fails, don't keep trying to write into the file via the write + * functions of the other cftypes associated with it. 
+ */ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; + struct cftype *n; struct cgroup_subsys_state *css; - int ret; + int ret = 0; - /* -* If namespaces are delegation boundaries, disallow writes to -* files in an non-init namespace root from inside the namespace -* except for the files explicitly marked delegatable - -* cgroup.procs and cgroup.subtree_control. -*/ - if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && - !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ns != _cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) - return -EPERM; + list_for_each_cft(cft, n) { + /* +* If namespaces are delegation boundaries, disallow writes to +* files in an non-init namespace root from inside the +* namespace except for the files explicitly marked +* delegatable - cgroup.procs and cgroup.subtree_control. +*/ + if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && + !(cft->flags & CFTYPE_NS_DELEGATABLE) && + ns != _cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + return -EPERM; - if (cft->write) - return cft->write(of, buf, nbytes, off); + if (cft->write) { + ret = cft->write(of, buf, nbytes, off); - /* -* kernfs guarantees that a file isn't deleted with operations in -* flight, which means that the matching css is and stays alive and -* doesn't need to be pinned. The RCU locking is not necessary -* either. It's just for the convenience of using cgroup_css(). -*/ - rcu_read_lock(); - css = cgroup_css(cgrp, cft->ss); - rcu_read_unlock(); + if (ret) + break; + continue; + } - if (cft->write_u64) { - unsigned long long v; -
[PATCH V2 01/10] cgroup: add hook seq_show_cft with also the owning cftype as parameter
The current implementation of the seq_show hook in the cftype struct has only, as parameters, the seq_file to write to and the arguments passed by the command line. Thus, the only way to retrieve the cftype that owns an instance of such hook function is by using the accessor in the seq_file itself. But in a future scenario where the same file may be shared by multiple cftypes, this accessor will point only to the first of the cftypes linked to the seq_file. It will then be impossible to access the cftype owning the seq_show function within the seq_show itself, unless such cftype is the first one. This commit adds an additional seq_show_cft hook that has as a formal parameter also the cftype that owns the function. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- include/linux/cgroup-defs.h | 3 ++- kernel/cgroup/cgroup.c | 15 +-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5e1694fe035b..7841db6e7fb3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -543,8 +543,9 @@ struct cftype { */ s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); - /* generic seq_file read interface */ + /* generic seq_file read interfaces*/ int (*seq_show)(struct seq_file *sf, void *v); + int (*seq_show_cft)(struct seq_file *sf, struct cftype *cft, void *v); /* optional ops, implement all or none */ void *(*seq_start)(struct seq_file *sf, loff_t *ppos); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6aaf5dd5383b..9d0993dd68fe 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1418,7 +1418,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; - if (cft->read_u64 || cft->read_s64 || cft->seq_show) + if (cft->read_u64 || cft->read_s64 || cft->seq_show || + cft->seq_show_cft) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write) { @@ -3519,17 +3520,19 @@ static int 
cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cftype *cft = seq_cft(m); struct cgroup_subsys_state *css = seq_css(m); + int ret = 0; if (cft->seq_show) - return cft->seq_show(m, arg); - - if (cft->read_u64) + ret = cft->seq_show(m, arg); + else if (cft->seq_show_cft) + ret = cft->seq_show_cft(m, cft, arg); + else if (cft->read_u64) seq_printf(m, "%llu\n", cft->read_u64(css, cft)); else if (cft->read_s64) seq_printf(m, "%lld\n", cft->read_s64(css, cft)); else - return -EINVAL; - return 0; + ret = -EINVAL; + return ret; } static struct kernfs_ops cgroup_kf_single_ops = { -- 2.16.1
[PATCH V2 02/10] block, cgroup: pass cftype to functions that need to use it
Some seq_show functions need to access the cftype they belong to, for retrieving the data to show. These functions get their cftype by using the seq_cft accessor for the seq_file. This solution is no longer viable in case a seq_file is shared among more than one cftype, because the accessor always returns (only) the first of the cftypes sharing the seq_file. This commit enables these seq_show functions to be passed their cftype, by replacing their prototype with that of the newly defined seq_show_cft hook, and by invoking these functions through this new hook. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- block/bfq-cgroup.c | 54 -- block/blk-cgroup.c | 22 --- block/blk-throttle.c | 8 +++ include/linux/blk-cgroup.h | 10 + 4 files changed, 52 insertions(+), 42 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index a7a1712632b0..038e418fa64f 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -918,17 +918,17 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, } #ifdef CONFIG_DEBUG_BLK_CGROUP -static int bfqg_print_stat(struct seq_file *sf, void *v) +static int bfqg_print_stat(struct seq_file *sf, struct cftype *cft, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - _policy_bfq, seq_cft(sf)->private, false); + _policy_bfq, cft->private, false); return 0; } -static int bfqg_print_rwstat(struct seq_file *sf, void *v) +static int bfqg_print_rwstat(struct seq_file *sf, struct cftype *cft, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - _policy_bfq, seq_cft(sf)->private, true); + _policy_bfq, cft->private, true); return 0; } @@ -949,19 +949,21 @@ static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, ); } -static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) +static int bfqg_print_stat_recursive(struct seq_file *sf, struct cftype *cft, +void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 
bfqg_prfill_stat_recursive, _policy_bfq, - seq_cft(sf)->private, false); + cft->private, false); return 0; } -static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +static int bfqg_print_rwstat_recursive(struct seq_file *sf, struct cftype *cft, + void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_rwstat_recursive, _policy_bfq, - seq_cft(sf)->private, true); + cft->private, true); return 0; } @@ -1063,18 +1065,18 @@ struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.io_service_bytes", .private = (unsigned long)_policy_bfq, - .seq_show = blkg_print_stat_bytes, + .seq_show_cft = blkg_print_stat_bytes, }, { .name = "bfq.io_serviced", .private = (unsigned long)_policy_bfq, - .seq_show = blkg_print_stat_ios, + .seq_show_cft = blkg_print_stat_ios, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, + .seq_show_cft = bfqg_print_stat, }, { .name = "bfq.sectors", @@ -1083,22 +1085,22 @@ struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.io_service_time", .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat, + .seq_show_cft = bfqg_print_rwstat, }, { .name = "bfq.io_wait_time", .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat, + .seq_show_cft = bfqg_print_rwstat, }, { .name = "bfq.io_merged", .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat, + .seq_show_cft = bfqg_print_rwstat, }, { .name = "bfq.io_queued", .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, + .seq_show_cft = bfqg_print_rwstat, }, #endif /* CONFIG_DEBUG_BLK_CGROUP */ @@ -1106,18 +1108,18 @@ struct cftype bfq_blkcg_legacy_files[] = { {
[PATCH V2 05/10] block, bfq: align min and default weights with the old cfq default
From: Angelo Ruocco bfq exposes a cgroup attribute, weight, with the same meaning as that exposed by cfq. This commit changes bfq default and min weights to match the ones set by cfq (before legacy blk was removed). Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- block/bfq-iosched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 77651d817ecd..249d8128d3ee 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -22,13 +22,13 @@ #define BFQ_IOPRIO_CLASSES 3 #define BFQ_CL_IDLE_TIMEOUT(HZ/5) -#define BFQ_MIN_WEIGHT 1 +#define BFQ_MIN_WEIGHT 10 #define BFQ_MAX_WEIGHT 1000 #define BFQ_WEIGHT_CONVERSION_COEFF10 #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_WEIGHT_LEGACY_DFL 100 +#define BFQ_WEIGHT_LEGACY_DFL 500 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -- 2.16.1
[PATCH V2 04/10] cgroup: add owner name to cftypes
From: Angelo Ruocco The piece of information "who created a certain cftype" is not stored anywhere, thus a cftype is not able to know who is its owner. This commit addresses this problem by adding a new field in the cftype structure that enables the name of its owner to be explicitly set. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- include/linux/cgroup-defs.h | 2 ++ include/linux/cgroup.h | 13 + 2 files changed, 15 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index d659763c7221..6e31f478c6e1 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -36,6 +36,7 @@ struct seq_file; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 #define MAX_CFTYPE_NAME64 +#define MAX_OWNER_NAME 64 /* define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _cgrp_id, @@ -505,6 +506,7 @@ struct cftype { * end of cftype array. */ char name[MAX_CFTYPE_NAME]; + char owner_name[MAX_OWNER_NAME]; unsigned long private; /* diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9d12757a65b0..267153bd898a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -291,6 +291,19 @@ void css_task_iter_end(struct css_task_iter *it); ; \ else +/** + * list_for_each_cft - walk circular list of cftypes linked together + * @cft: cftype from where to start + * @n: cftype used as a temporary storage + * + * A cftype pointed by a file may be part of a circular list of cftypes, this + * macro walks the circular list starting from any given cftype. Unlike the + * "list_for_each_entry" macro, the first element is included in the iteration. + */ +#define list_for_each_cft(cft, n) \ + for (n = NULL; cft != n; n = (n == NULL) ? cft : n, \ +cft = list_next_entry(cft, share_node)) + /* * Inline functions. */ -- 2.16.1
[PATCH V2 06/10] cgroup: make all functions of all cftypes be invoked
From: Angelo Ruocco When two or more entities (of any kind) share a file, their respective cftypes are linked together. The allowed operations on those files are: open, release, write and show, mapped to the functions defined in the cftypes. This commit makes the cgroup core invoke, whenever one of those operations is requested, the respective function of all the cftypes linked together. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- kernel/cgroup/cgroup.c | 181 - 1 file changed, 132 insertions(+), 49 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 61eafd69e2fd..9bf6b0b5a0ca 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3452,66 +3452,107 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; + struct cftype *n; + int ret = 0; + list_for_each_cft(cft, n) { + if (cft->open) + ret = cft->open(of); + /* +* If there has been an error with the open function of one of +* the cft associated with the file, we call the release +* function of all the cftype associated to cft whose open +* function succeded. +*/ + if (ret) { + struct cftype *c = of->kn->priv; + struct cftype *n; + + list_for_each_cft(c, n) { + if (cft == c) + break; + if (c->release) + c->release(of); + } + break; + } + } - if (cft->open) - return cft->open(of); - return 0; + return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; + struct cftype *n; - if (cft->release) - cft->release(of); + list_for_each_cft(cft, n) + if (cft->release) + cft->release(of); } +/* + * Call all the write functions of the cftypes associated with the file. + * + * When a write fails, don't keep trying to write into the file via the write + * functions of the other cftypes associated with it. 
+ */ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; + struct cftype *n; struct cgroup_subsys_state *css; - int ret; + int ret = 0; - /* -* If namespaces are delegation boundaries, disallow writes to -* files in an non-init namespace root from inside the namespace -* except for the files explicitly marked delegatable - -* cgroup.procs and cgroup.subtree_control. -*/ - if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && - !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ns != _cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) - return -EPERM; + list_for_each_cft(cft, n) { + /* +* If namespaces are delegation boundaries, disallow writes to +* files in an non-init namespace root from inside the +* namespace except for the files explicitly marked +* delegatable - cgroup.procs and cgroup.subtree_control. +*/ + if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && + !(cft->flags & CFTYPE_NS_DELEGATABLE) && + ns != _cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + return -EPERM; - if (cft->write) - return cft->write(of, buf, nbytes, off); + if (cft->write) { + ret = cft->write(of, buf, nbytes, off); - /* -* kernfs guarantees that a file isn't deleted with operations in -* flight, which means that the matching css is and stays alive and -* doesn't need to be pinned. The RCU locking is not necessary -* either. It's just for the convenience of using cgroup_css(). -*/ - rcu_read_lock(); - css = cgroup_css(cgrp, cft->ss); - rcu_read_unlock(); + if (ret) + break; + continue; + } - if (cft->write_u64) { - unsigned long long v; -
[PATCH V2 01/10] cgroup: add hook seq_show_cft with also the owning cftype as parameter
The current implementation of the seq_show hook in the cftype struct has only, as parameters, the seq_file to write to and the arguments passed by the command line. Thus, the only way to retrieve the cftype that owns an instance of such hook function is by using the accessor in the seq_file itself. But in a future scenario where the same file may be shared by multiple cftypes, this accessor will point only to the first of the cftypes linked to the seq_file. It will then be impossible to access the cftype owning the seq_show function within the seq_show itself, unless such cftype is the first one. This commit adds an additional seq_show_cft hook that has as a formal parameter also the cftype that owns the function. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- include/linux/cgroup-defs.h | 3 ++- kernel/cgroup/cgroup.c | 15 +-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5e1694fe035b..7841db6e7fb3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -543,8 +543,9 @@ struct cftype { */ s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); - /* generic seq_file read interface */ + /* generic seq_file read interfaces*/ int (*seq_show)(struct seq_file *sf, void *v); + int (*seq_show_cft)(struct seq_file *sf, struct cftype *cft, void *v); /* optional ops, implement all or none */ void *(*seq_start)(struct seq_file *sf, loff_t *ppos); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6aaf5dd5383b..9d0993dd68fe 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1418,7 +1418,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; - if (cft->read_u64 || cft->read_s64 || cft->seq_show) + if (cft->read_u64 || cft->read_s64 || cft->seq_show || + cft->seq_show_cft) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write) { @@ -3519,17 +3520,19 @@ static int 
cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cftype *cft = seq_cft(m); struct cgroup_subsys_state *css = seq_css(m); + int ret = 0; if (cft->seq_show) - return cft->seq_show(m, arg); - - if (cft->read_u64) + ret = cft->seq_show(m, arg); + else if (cft->seq_show_cft) + ret = cft->seq_show_cft(m, cft, arg); + else if (cft->read_u64) seq_printf(m, "%llu\n", cft->read_u64(css, cft)); else if (cft->read_s64) seq_printf(m, "%lld\n", cft->read_s64(css, cft)); else - return -EINVAL; - return 0; + ret = -EINVAL; + return ret; } static struct kernfs_ops cgroup_kf_single_ops = { -- 2.16.1
[PATCH V2 02/10] block, cgroup: pass cftype to functions that need to use it
Some seq_show functions need to access the cftype they belong to, for retrieving the data to show. These functions get their cftype by using the seq_cft accessor for the seq_file. This solution is no longer viable in case a seq_file is shared among more than one cftype, because the accessor always returns (only) the first of the cftypes sharing the seq_file. This commit enables these seq_show functions to be passed their cftype, by replacing their prototype with that of the newly defined seq_show_cft hook, and by invoking these functions through this new hook. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- block/bfq-cgroup.c | 54 -- block/blk-cgroup.c | 22 --- block/blk-throttle.c | 8 +++ include/linux/blk-cgroup.h | 10 + 4 files changed, 52 insertions(+), 42 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index a7a1712632b0..038e418fa64f 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -918,17 +918,17 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, } #ifdef CONFIG_DEBUG_BLK_CGROUP -static int bfqg_print_stat(struct seq_file *sf, void *v) +static int bfqg_print_stat(struct seq_file *sf, struct cftype *cft, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - _policy_bfq, seq_cft(sf)->private, false); + _policy_bfq, cft->private, false); return 0; } -static int bfqg_print_rwstat(struct seq_file *sf, void *v) +static int bfqg_print_rwstat(struct seq_file *sf, struct cftype *cft, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - _policy_bfq, seq_cft(sf)->private, true); + _policy_bfq, cft->private, true); return 0; } @@ -949,19 +949,21 @@ static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, ); } -static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) +static int bfqg_print_stat_recursive(struct seq_file *sf, struct cftype *cft, +void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 
bfqg_prfill_stat_recursive, _policy_bfq, - seq_cft(sf)->private, false); + cft->private, false); return 0; } -static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +static int bfqg_print_rwstat_recursive(struct seq_file *sf, struct cftype *cft, + void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_rwstat_recursive, _policy_bfq, - seq_cft(sf)->private, true); + cft->private, true); return 0; } @@ -1063,18 +1065,18 @@ struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.io_service_bytes", .private = (unsigned long)_policy_bfq, - .seq_show = blkg_print_stat_bytes, + .seq_show_cft = blkg_print_stat_bytes, }, { .name = "bfq.io_serviced", .private = (unsigned long)_policy_bfq, - .seq_show = blkg_print_stat_ios, + .seq_show_cft = blkg_print_stat_ios, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, + .seq_show_cft = bfqg_print_stat, }, { .name = "bfq.sectors", @@ -1083,22 +1085,22 @@ struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.io_service_time", .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat, + .seq_show_cft = bfqg_print_rwstat, }, { .name = "bfq.io_wait_time", .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat, + .seq_show_cft = bfqg_print_rwstat, }, { .name = "bfq.io_merged", .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat, + .seq_show_cft = bfqg_print_rwstat, }, { .name = "bfq.io_queued", .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, + .seq_show_cft = bfqg_print_rwstat, }, #endif /* CONFIG_DEBUG_BLK_CGROUP */ @@ -1106,18 +1108,18 @@ struct cftype bfq_blkcg_legacy_files[] = { {
[PATCH V2 05/10] block, bfq: align min and default weights with the old cfq default
From: Angelo Ruocco bfq exposes a cgroup attribute, weight, with the same meaning as that exposed by cfq. This commit changes bfq default and min weights to match the ones set by cfq (before legacy blk was removed). Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- block/bfq-iosched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 77651d817ecd..249d8128d3ee 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -22,13 +22,13 @@ #define BFQ_IOPRIO_CLASSES 3 #define BFQ_CL_IDLE_TIMEOUT(HZ/5) -#define BFQ_MIN_WEIGHT 1 +#define BFQ_MIN_WEIGHT 10 #define BFQ_MAX_WEIGHT 1000 #define BFQ_WEIGHT_CONVERSION_COEFF10 #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_WEIGHT_LEGACY_DFL 100 +#define BFQ_WEIGHT_LEGACY_DFL 500 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -- 2.16.1
[PATCH V2 04/10] cgroup: add owner name to cftypes
From: Angelo Ruocco The piece of information "who created a certain cftype" is not stored anywhere, thus a cftype is not able to know who is its owner. This commit addresses this problem by adding a new field in the cftype structure that enables the name of its owner to be explicitly set. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- include/linux/cgroup-defs.h | 2 ++ include/linux/cgroup.h | 13 + 2 files changed, 15 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index d659763c7221..6e31f478c6e1 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -36,6 +36,7 @@ struct seq_file; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 #define MAX_CFTYPE_NAME64 +#define MAX_OWNER_NAME 64 /* define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _cgrp_id, @@ -505,6 +506,7 @@ struct cftype { * end of cftype array. */ char name[MAX_CFTYPE_NAME]; + char owner_name[MAX_OWNER_NAME]; unsigned long private; /* diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9d12757a65b0..267153bd898a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -291,6 +291,19 @@ void css_task_iter_end(struct css_task_iter *it); ; \ else +/** + * list_for_each_cft - walk circular list of cftypes linked together + * @cft: cftype from where to start + * @n: cftype used as a temporary storage + * + * A cftype pointed by a file may be part of a circular list of cftypes, this + * macro walks the circular list starting from any given cftype. Unlike the + * "list_for_each_entry" macro, the first element is included in the iteration. + */ +#define list_for_each_cft(cft, n) \ + for (n = NULL; cft != n; n = (n == NULL) ? cft : n, \ +cft = list_next_entry(cft, share_node)) + /* * Inline functions. */ -- 2.16.1
[PATCH V2 03/10] cgroup: link cftypes of the same subsystem with the same name
From: Angelo Ruocco Two entities, of any kind, are not able to create a cgroup file with the same name in the same folder: if an entity tries to create a file that has the same name as a file created by another entity, the cgroup core stops it, warns the user about the error, and then proceeds to delete all the files created by the last entity. However, in some specific situations, it may be useful for two or more entities to use a common file, e.g., the I/O schedulers bfq and cfq had the same "weight" attribute, that changed the behavior of the two schedulers in a similar way. This commit prepares the interface that allows two entities to share files. It adds a flag CFTYPE_SHARES_FILE for cftypes, a flag that allows cftypes to be linked together if they are part of the same subsystem and have the same name. There is a limitation for a cftype that wants to share a file: it can't have the hooks seq_start/next/stop. The reason is that there is no consistent way to show portions of a file once multiple cftypes are attached to it, and thus more than one seq_show() is invoked: there is neither a univocal start point, nor univocal "next" and "stop" operations. 
Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- include/linux/cgroup-defs.h | 9 ++ kernel/cgroup/cgroup.c | 78 +++-- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7841db6e7fb3..d659763c7221 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -93,6 +93,8 @@ enum { CFTYPE_NO_PREFIX= (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ + CFTYPE_SHARES_FILE = (1 << 5), /* shares file w/ other cfts */ + /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL= (1 << 16),/* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17),/* not on default hierarchy */ @@ -528,6 +530,13 @@ struct cftype { */ struct cgroup_subsys *ss; /* NULL for cgroup core files */ struct list_head node; /* anchored at ss->cfts */ + + /* +* List of cftypes that are sharing the same file. It allows the hook +* functions of the cftypes in the list to be called together. 
+*/ + struct list_head share_node; + struct kernfs_ops *kf_ops; int (*open)(struct kernfs_open_file *of); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9d0993dd68fe..61eafd69e2fd 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1549,11 +1549,15 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) return NULL; } -static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +static void cgroup_rm_file(struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; + struct kernfs_node *kn = kernfs_find_and_get(cgrp->kn, + cgroup_file_name(cgrp, cft, name)); + struct cftype *cfts = kn->priv; lockdep_assert_held(_mutex); + kernfs_put(kn); if (cft->file_offset) { struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); @@ -1566,7 +1570,19 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) del_timer_sync(>notify_timer); } - kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); + /* Delete the file only if it's used by one cftype */ + if (list_empty(>share_node) || atomic_read(>count) == 1) { + kernfs_remove(kn); + } else { + /* +* Update the "priv" pointer of the kernfs_node if the cftype +* that first created the file is removed. +*/ + if (cft == cfts) + kn->priv = list_next_entry(cft, share_node); + + kernfs_put(kn); + } } /** @@ -3437,6 +3453,7 @@ static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; + if (cft->open) return cft->open(of); return 0; @@ -3585,6 +3602,22 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, #ifdef CONFIG_DEBUG_LOCK_ALLOC key = >lockdep_key; #endif + + if (cft->flags & CFTYPE_SHARES_FILE) { + /* kn->count keeps track of how many cftypes share kn */ + kn = kernfs_find_and_get(cgrp->kn, +cgroup_file_name(cgrp, cft, name)); + if (kn) { + struct cftype *cft
[PATCH V2 10/10] doc, bfq-iosched: make it consistent with the new cgroup interface
BFQ now shares interface files with CFQ, for the proportional-share policy. Make documentation consistent with that. Signed-off-by: Paolo Valente --- Documentation/block/bfq-iosched.txt | 28 +++- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 6d7dd5ab8554..3d7ef138ce6a 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -508,12 +508,14 @@ process. To get proportional sharing of bandwidth with BFQ for a given device, BFQ must of course be the active scheduler for that device. -Within each group directory, the names of the files associated with -BFQ-specific cgroup parameters and stats begin with the "bfq." -prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for -BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group -parameter to set the weight of a group with BFQ is blkio.bfq.weight -or io.bfq.weight. +BFQ uses the standard interface files of the proportional-share +policy, previously used by CFQ. If one such file is read/written, then +the operation associated with the file is performed by BFQ for each +device where BFQ is the active scheduler. In addition, BFQ is +configured so as to share interface files with other entities (if some +of these files does happen to be shared, then its associated operation +is performed also by any of the other entities that is using the +file). As for cgroups-v1 (blkio controller), the exact set of stat files created, and kept up-to-date by bfq, depends on whether @@ -521,13 +523,13 @@ CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all the stat files documented in Documentation/cgroup-v1/blkio-controller.txt. 
If, instead, CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files -blkio.bfq.io_service_bytes -blkio.bfq.io_service_bytes_recursive -blkio.bfq.io_serviced -blkio.bfq.io_serviced_recursive +blkio.io_service_bytes +blkio.io_service_bytes_recursive +blkio.io_serviced +blkio.io_serviced_recursive The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum -throughput sustainable with bfq, because updating the blkio.bfq.* +throughput sustainable with BFQ, because updating the blkio.* stats is rather costly, especially for some of the stats enabled by CONFIG_DEBUG_BLK_CGROUP. @@ -536,8 +538,8 @@ Parameters to set For each group, there is only the following parameter to set. -weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the -group inside its parent. Available values: 1..1000 (default 100). The +weight (namely blkio.weight or io.weight): the weight of the group +inside its parent. Available values: 1..1000 (default 100). The linear mapping between ioprio and weights, described at the beginning of the tunable section, is still valid, but all weights higher than IOPRIO_BE_NR*10 are mapped to ioprio 0. -- 2.16.1
[PATCH V2 07/10] block, bfq: use standard file names for the proportional-share policy
Some of the files exposed in a cgroup by bfq, for the proportional share policy, have the same meaning as the files owned by cfq (before legacy blk was removed). The old implementation of the cgroup interface didn't allow different entities to create cgroup files with the same name (in the same subsystem). So, for bfq, we had to add the prefix "bfq" to the names of its cgroup files. This commit renames the cgroup files of the bfq scheduler as those exposed by cfq, and makes bfq willing to share these files with any other future policy. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- block/bfq-cgroup.c | 94 +++--- 1 file changed, 69 insertions(+), 25 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 038e418fa64f..0643147b2cbc 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -1055,50 +1055,67 @@ struct blkcg_policy blkcg_policy_bfq = { struct cftype bfq_blkcg_legacy_files[] = { { - .name = "bfq.weight", - .flags = CFTYPE_NOT_ON_ROOT, + .name = "weight", + .owner_name = "bfq", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_SHARES_FILE, .seq_show = bfq_io_show_weight, .write_u64 = bfq_io_set_weight_legacy, }, /* statistics, covers only the tasks in the bfqg */ { - .name = "bfq.io_service_bytes", + .name = "io_service_bytes", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_bfq, .seq_show_cft = blkg_print_stat_bytes, }, { - .name = "bfq.io_serviced", + .name = "io_serviced", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_bfq, .seq_show_cft = blkg_print_stat_ios, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { - .name = "bfq.time", + .name = "time", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = offsetof(struct bfq_group, stats.time), .seq_show_cft = bfqg_print_stat, }, { - .name = "bfq.sectors", + .name = "sectors", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .seq_show = bfqg_print_stat_sectors, }, { - .name = "bfq.io_service_time", + 
.name = "io_service_time", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = offsetof(struct bfq_group, stats.service_time), .seq_show_cft = bfqg_print_rwstat, }, { - .name = "bfq.io_wait_time", + .name = "io_wait_time", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = offsetof(struct bfq_group, stats.wait_time), .seq_show_cft = bfqg_print_rwstat, }, { - .name = "bfq.io_merged", + .name = "io_merged", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = offsetof(struct bfq_group, stats.merged), .seq_show_cft = bfqg_print_rwstat, }, { - .name = "bfq.io_queued", + .name = "io_queued", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = offsetof(struct bfq_group, stats.queued), .seq_show_cft = bfqg_print_rwstat, }, @@ -1106,66 +1123,92 @@ struct cftype bfq_blkcg_legacy_files[] = { /* the same statictics which cover the bfqg and its descendants */ { - .name = "bfq.io_service_bytes_recursive", + .name = "io_service_bytes_recursive", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_bfq, .seq_show_cft = blkg_print_stat_bytes_recursive, }, { - .name = "bfq.io_serviced_recursive", + .name = "io_serviced_recursive", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_bfq, .seq_show_cft = blkg_print_stat_ios_recursive, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { - .name =
[PATCH V2 03/10] cgroup: link cftypes of the same subsystem with the same name
From: Angelo Ruocco Two entities, of any kind, are not able to create a cgroup file with the same name in the same folder: if an entity tries to create a file that has the same name as a file created by another entity, the cgroup core stops it, warns the user about the error, and then proceeds to delete all the files created by the last entity. However, in some specific situations, it may be useful for two or more entities to use a common file, e.g., the I/O schedulers bfq and cfq had the same "weight" attribute, that changed the behavior of the two schedulers in a similar way. This commit prepares the interface that allows two entities to share files. It adds a flag CFTYPE_SHARES_FILE for cftypes, a flag that allows cftypes to be linked together if they are part of the same subsystem and have the same name. There is a limitation for a cftype that wants to share a file: it can't have the hooks seq_start/next/stop. The reason is that there is no consistent way to show portions of a file once multiple cftypes are attached to it, and thus more than one seq_show() is invoked: there is neither a univocal start point, nor univocal "next" and "stop" operations. 
Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- include/linux/cgroup-defs.h | 9 ++ kernel/cgroup/cgroup.c | 78 +++-- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7841db6e7fb3..d659763c7221 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -93,6 +93,8 @@ enum { CFTYPE_NO_PREFIX= (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ + CFTYPE_SHARES_FILE = (1 << 5), /* shares file w/ other cfts */ + /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL= (1 << 16),/* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17),/* not on default hierarchy */ @@ -528,6 +530,13 @@ struct cftype { */ struct cgroup_subsys *ss; /* NULL for cgroup core files */ struct list_head node; /* anchored at ss->cfts */ + + /* +* List of cftypes that are sharing the same file. It allows the hook +* functions of the cftypes in the list to be called together. 
+*/ + struct list_head share_node; + struct kernfs_ops *kf_ops; int (*open)(struct kernfs_open_file *of); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9d0993dd68fe..61eafd69e2fd 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1549,11 +1549,15 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) return NULL; } -static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +static void cgroup_rm_file(struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; + struct kernfs_node *kn = kernfs_find_and_get(cgrp->kn, + cgroup_file_name(cgrp, cft, name)); + struct cftype *cfts = kn->priv; lockdep_assert_held(_mutex); + kernfs_put(kn); if (cft->file_offset) { struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); @@ -1566,7 +1570,19 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) del_timer_sync(>notify_timer); } - kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); + /* Delete the file only if it's used by one cftype */ + if (list_empty(>share_node) || atomic_read(>count) == 1) { + kernfs_remove(kn); + } else { + /* +* Update the "priv" pointer of the kernfs_node if the cftype +* that first created the file is removed. +*/ + if (cft == cfts) + kn->priv = list_next_entry(cft, share_node); + + kernfs_put(kn); + } } /** @@ -3437,6 +3453,7 @@ static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; + if (cft->open) return cft->open(of); return 0; @@ -3585,6 +3602,22 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, #ifdef CONFIG_DEBUG_LOCK_ALLOC key = >lockdep_key; #endif + + if (cft->flags & CFTYPE_SHARES_FILE) { + /* kn->count keeps track of how many cftypes share kn */ + kn = kernfs_find_and_get(cgrp->kn, +cgroup_file_name(cgrp, cft, name)); + if (kn) { + struct cftype *cft
[PATCH V2 10/10] doc, bfq-iosched: make it consistent with the new cgroup interface
BFQ now shares interface files with CFQ, for the proportional-share policy. Make documentation consistent with that. Signed-off-by: Paolo Valente --- Documentation/block/bfq-iosched.txt | 28 +++- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 6d7dd5ab8554..3d7ef138ce6a 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -508,12 +508,14 @@ process. To get proportional sharing of bandwidth with BFQ for a given device, BFQ must of course be the active scheduler for that device. -Within each group directory, the names of the files associated with -BFQ-specific cgroup parameters and stats begin with the "bfq." -prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for -BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group -parameter to set the weight of a group with BFQ is blkio.bfq.weight -or io.bfq.weight. +BFQ uses the standard interface files of the proportional-share +policy, previously used by CFQ. If one such file is read/written, then +the operation associated with the file is performed by BFQ for each +device where BFQ is the active scheduler. In addition, BFQ is +configured so as to share interface files with other entities (if some +of these files does happen to be shared, then its associated operation +is performed also by any of the other entities that is using the +file). As for cgroups-v1 (blkio controller), the exact set of stat files created, and kept up-to-date by bfq, depends on whether @@ -521,13 +523,13 @@ CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all the stat files documented in Documentation/cgroup-v1/blkio-controller.txt. 
If, instead, CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files -blkio.bfq.io_service_bytes -blkio.bfq.io_service_bytes_recursive -blkio.bfq.io_serviced -blkio.bfq.io_serviced_recursive +blkio.io_service_bytes +blkio.io_service_bytes_recursive +blkio.io_serviced +blkio.io_serviced_recursive The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum -throughput sustainable with bfq, because updating the blkio.bfq.* +throughput sustainable with BFQ, because updating the blkio.* stats is rather costly, especially for some of the stats enabled by CONFIG_DEBUG_BLK_CGROUP. @@ -536,8 +538,8 @@ Parameters to set For each group, there is only the following parameter to set. -weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the -group inside its parent. Available values: 1..1000 (default 100). The +weight (namely blkio.weight or io.weight): the weight of the group +inside its parent. Available values: 1..1000 (default 100). The linear mapping between ioprio and weights, described at the beginning of the tunable section, is still valid, but all weights higher than IOPRIO_BE_NR*10 are mapped to ioprio 0. -- 2.16.1
[PATCH V2 07/10] block, bfq: use standard file names for the proportional-share policy
Some of the files exposed in a cgroup by bfq, for the proportional share policy, have the same meaning as the files owned by cfq (before legacy blk was removed). The old implementation of the cgroup interface didn't allow different entities to create cgroup files with the same name (in the same subsystem). So, for bfq, we had to add the prefix "bfq" to the names of its cgroup files. This commit renames the cgroup files of the bfq scheduler as those exposed by cfq, and makes bfq willing to share these files with any other future policy. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- block/bfq-cgroup.c | 94 +++--- 1 file changed, 69 insertions(+), 25 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 038e418fa64f..0643147b2cbc 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -1055,50 +1055,67 @@ struct blkcg_policy blkcg_policy_bfq = { struct cftype bfq_blkcg_legacy_files[] = { { - .name = "bfq.weight", - .flags = CFTYPE_NOT_ON_ROOT, + .name = "weight", + .owner_name = "bfq", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_SHARES_FILE, .seq_show = bfq_io_show_weight, .write_u64 = bfq_io_set_weight_legacy, }, /* statistics, covers only the tasks in the bfqg */ { - .name = "bfq.io_service_bytes", + .name = "io_service_bytes", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_bfq, .seq_show_cft = blkg_print_stat_bytes, }, { - .name = "bfq.io_serviced", + .name = "io_serviced", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_bfq, .seq_show_cft = blkg_print_stat_ios, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { - .name = "bfq.time", + .name = "time", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = offsetof(struct bfq_group, stats.time), .seq_show_cft = bfqg_print_stat, }, { - .name = "bfq.sectors", + .name = "sectors", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .seq_show = bfqg_print_stat_sectors, }, { - .name = "bfq.io_service_time", + 
.name = "io_service_time", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = offsetof(struct bfq_group, stats.service_time), .seq_show_cft = bfqg_print_rwstat, }, { - .name = "bfq.io_wait_time", + .name = "io_wait_time", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = offsetof(struct bfq_group, stats.wait_time), .seq_show_cft = bfqg_print_rwstat, }, { - .name = "bfq.io_merged", + .name = "io_merged", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = offsetof(struct bfq_group, stats.merged), .seq_show_cft = bfqg_print_rwstat, }, { - .name = "bfq.io_queued", + .name = "io_queued", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = offsetof(struct bfq_group, stats.queued), .seq_show_cft = bfqg_print_rwstat, }, @@ -1106,66 +1123,92 @@ struct cftype bfq_blkcg_legacy_files[] = { /* the same statictics which cover the bfqg and its descendants */ { - .name = "bfq.io_service_bytes_recursive", + .name = "io_service_bytes_recursive", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_bfq, .seq_show_cft = blkg_print_stat_bytes_recursive, }, { - .name = "bfq.io_serviced_recursive", + .name = "io_serviced_recursive", + .owner_name = "bfq", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_bfq, .seq_show_cft = blkg_print_stat_ios_recursive, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { - .name =
[PATCH V2 09/10] doc, bfq-iosched: fix a few clerical errors
This commit fixes a few clerical errors in Documentation/block/bfq-iosched.txt. Signed-off-by: Paolo Valente --- Documentation/block/bfq-iosched.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 8d8d8f06cab2..6d7dd5ab8554 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -42,7 +42,7 @@ sustainable throughputs, on the same systems as above: BFQ works for multi-queue devices too. -The table of contents follow. Impatients can just jump to Section 3. +The table of contents follows. Impatients can just jump to Section 3. CONTENTS @@ -51,7 +51,7 @@ CONTENTS 1-2 Server systems 2. How does BFQ work? 3. What are BFQ's tunables and how to properly configure BFQ? -4. BFQ group scheduling +4. Group scheduling with BFQ 4-1 Service guarantees provided 4-2 Interface @@ -294,7 +294,7 @@ too. per-process ioprio and weight - -Unless the cgroups interface is used (see "4. BFQ group scheduling"), +Unless the cgroups interface is used (see "4. Group scheduling with BFQ"), weights can be assigned to processes only indirectly, through I/O priorities, and according to the relation: weight = (IOPRIO_BE_NR - ioprio) * 10. -- 2.16.1
[PATCH V2 08/10] block, throttle: allow sharing cgroup statistic files
From: Angelo Ruocco Some of the cgroup files defined in the throttle policy have the same meaning as those defined in the proportional share policy. This commit uses the new file sharing interface in cgroup to share these files. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- block/blk-throttle.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 6bfdaac53b6f..95825448c031 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1491,22 +1491,30 @@ static struct cftype throtl_legacy_files[] = { .write = tg_set_conf_uint, }, { - .name = "throttle.io_service_bytes", + .name = "io_service_bytes", + .owner_name = "throttle", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_throtl, .seq_show_cft = blkg_print_stat_bytes, }, { - .name = "throttle.io_service_bytes_recursive", + .name = "io_service_bytes_recursive", + .owner_name = "throttle", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_throtl, .seq_show_cft = blkg_print_stat_bytes_recursive, }, { - .name = "throttle.io_serviced", + .name = "io_serviced", + .owner_name = "throttle", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_throtl, .seq_show_cft = blkg_print_stat_ios, }, { - .name = "throttle.io_serviced_recursive", + .name = "io_serviced_recursive", + .owner_name = "throttle", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_throtl, .seq_show_cft = blkg_print_stat_ios_recursive, }, -- 2.16.1
[PATCH V2 09/10] doc, bfq-iosched: fix a few clerical errors
This commit fixes a few clerical errors in Documentation/block/bfq-iosched.txt. Signed-off-by: Paolo Valente --- Documentation/block/bfq-iosched.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 8d8d8f06cab2..6d7dd5ab8554 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -42,7 +42,7 @@ sustainable throughputs, on the same systems as above: BFQ works for multi-queue devices too. -The table of contents follow. Impatients can just jump to Section 3. +The table of contents follows. Impatients can just jump to Section 3. CONTENTS @@ -51,7 +51,7 @@ CONTENTS 1-2 Server systems 2. How does BFQ work? 3. What are BFQ's tunables and how to properly configure BFQ? -4. BFQ group scheduling +4. Group scheduling with BFQ 4-1 Service guarantees provided 4-2 Interface @@ -294,7 +294,7 @@ too. per-process ioprio and weight - -Unless the cgroups interface is used (see "4. BFQ group scheduling"), +Unless the cgroups interface is used (see "4. Group scheduling with BFQ"), weights can be assigned to processes only indirectly, through I/O priorities, and according to the relation: weight = (IOPRIO_BE_NR - ioprio) * 10. -- 2.16.1
[PATCH V2 08/10] block, throttle: allow sharing cgroup statistic files
From: Angelo Ruocco Some of the cgroup files defined in the throttle policy have the same meaning as those defined in the proportional share policy. This commit uses the new file sharing interface in cgroup to share these files. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente --- block/blk-throttle.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 6bfdaac53b6f..95825448c031 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1491,22 +1491,30 @@ static struct cftype throtl_legacy_files[] = { .write = tg_set_conf_uint, }, { - .name = "throttle.io_service_bytes", + .name = "io_service_bytes", + .owner_name = "throttle", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_throtl, .seq_show_cft = blkg_print_stat_bytes, }, { - .name = "throttle.io_service_bytes_recursive", + .name = "io_service_bytes_recursive", + .owner_name = "throttle", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_throtl, .seq_show_cft = blkg_print_stat_bytes_recursive, }, { - .name = "throttle.io_serviced", + .name = "io_serviced", + .owner_name = "throttle", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_throtl, .seq_show_cft = blkg_print_stat_ios, }, { - .name = "throttle.io_serviced_recursive", + .name = "io_serviced_recursive", + .owner_name = "throttle", + .flags = CFTYPE_SHARES_FILE, .private = (unsigned long)_policy_throtl, .seq_show_cft = blkg_print_stat_ios_recursive, }, -- 2.16.1
Re: [PATCH] block: BFQ default for single queue devices
> Il giorno 05 ott 2018, alle ore 00:42, Bart Van Assche > ha scritto: > > On Thu, 2018-10-04 at 22:39 +0200, Paolo Valente wrote: >> No, kernel build is, for evident reasons, one of the workloads I cared >> most about. Actually, I tried to focus on all my main >> kernel-development tasks, such as also git checkout, git merge, git >> grep, ... >> >> According to my test results, with BFQ these tasks are at least as >> fast as, or, in most system configurations, much faster than with the >> other schedulers. Of course, at the same time the system also remains >> responsive with BFQ. >> >> You can repeat these tests using one of my first scripts in the S >> suite: kern_dev_tasks_vs_rw.sh (usually, the older the tests, the more >> hypertrophied the names I gave :) ). >> >> I stopped sharing also my kernel-build results years ago, because I >> went on obtaining the same, identical good results for years, and I'm >> aware that I tend to show and say too much stuff. > > On my test setup building the kernel is slightly slower when using the BFQ > scheduler compared to using scheduler "none" (kernel 4.18.12, NVMe SSD, > single CPU with 6 cores, hyperthreading disabled). I am aware that the > proposal at the start of this thread was to make BFQ the default for devices > with a single hardware queue and not for devices like NVMe SSDs that support > multiple hardware queues. > I miss your point: as you yourself note, the proposal is limited to single-queue devices, exactly because BFQ is not ready for multiple-queue devices yet. > What I think is missing is measurement results for BFQ on a system with > multiple CPU sockets and against a fast storage medium. It is not missing. As I happened to report in previous threads, we made a script to measure that too [1], using fio and null block. I have reported the results we obtained, for three classes of processors, in the in-kernel BFQ documentation [2]. 
In particular, BFQ reached 400KIOPS with the fastest CPU mentioned in that document (Intel i7-4850HQ). So, since the speed of that single-socket commodity CPU is most likely lower than the total speed of a multi-socket system, we have that, on such a system and with BFQ, you should be conservatively ok with single-queue devices in the range 300-500 KIOPS. [1] https://github.com/Algodev-github/IOSpeed [2] https://www.kernel.org/doc/Documentation/block/bfq-iosched.txt > > Eliminating > the host lock from the SCSI core yielded a significant performance > improvement for such storage devices. Since the BFQ scheduler locks and > unlocks bfqd->lock for every dispatch operation it is very likely that BFQ > will slow down I/O for fast storage devices, even if their driver only > creates a single hardware queue. > One of the main motivations behind NVMe, and blk-mq itself, is that it is hard to reach the above IOPS, and more, with a single I/O queue as bottleneck. So, I wouldn't expect that systems - equipped with single-queue drives reaching more than 500 KIOPS - using SATA or some other non-NVMe as protocol - so fast to push these drives to their maximum speeds constitute more than a negligible percentage of devices. So, by sticking to mq-deadline, we would sacrifice 99% of systems, to make sure, basically, that those very few systems on steroids reach maximum throughput with random I/O (while however still suffering from responsiveness problems). I think it makes much more sense to have as default what is best for 99% of the single-queue systems, with those super systems properly reconfigured by their users. For sure, other defaults are to be changed too, to get the most out of those systems. Thanks, Paolo > Bart.
Re: [PATCH] block: BFQ default for single queue devices
> Il giorno 05 ott 2018, alle ore 00:42, Bart Van Assche > ha scritto: > > On Thu, 2018-10-04 at 22:39 +0200, Paolo Valente wrote: >> No, kernel build is, for evident reasons, one of the workloads I cared >> most about. Actually, I tried to focus on all my main >> kernel-development tasks, such as also git checkout, git merge, git >> grep, ... >> >> According to my test results, with BFQ these tasks are at least as >> fast as, or, in most system configurations, much faster than with the >> other schedulers. Of course, at the same time the system also remains >> responsive with BFQ. >> >> You can repeat these tests using one of my first scripts in the S >> suite: kern_dev_tasks_vs_rw.sh (usually, the older the tests, the more >> hypertrophied the names I gave :) ). >> >> I stopped sharing also my kernel-build results years ago, because I >> went on obtaining the same, identical good results for years, and I'm >> aware that I tend to show and say too much stuff. > > On my test setup building the kernel is slightly slower when using the BFQ > scheduler compared to using scheduler "none" (kernel 4.18.12, NVMe SSD, > single CPU with 6 cores, hyperthreading disabled). I am aware that the > proposal at the start of this thread was to make BFQ the default for devices > with a single hardware queue and not for devices like NVMe SSDs that support > multiple hardware queues. > I miss your point: as you yourself note, the proposal is limited to single-queue devices, exactly because BFQ is not ready for multiple-queue devices yet. > What I think is missing is measurement results for BFQ on a system with > multiple CPU sockets and against a fast storage medium. It is not missing. As I happened to report in previous threads, we made a script to measure that too [1], using fio and null block. I have reported the results we obtained, for three classes of processors, in the in-kernel BFQ documentation [2]. 
In particular, BFQ reached 400KIOPS with the fastest CPU mentioned in that document (Intel i7-4850HQ). So, since the speed of that single-socket commodity CPU is most likely lower than the total speed of a multi-socket system, we have that, on such a system and with BFQ, you should be conservatively ok with single-queue devices in the range 300-500 KIOPS. [1] https://github.com/Algodev-github/IOSpeed [2] https://www.kernel.org/doc/Documentation/block/bfq-iosched.txt > > Eliminating > the host lock from the SCSI core yielded a significant performance > improvement for such storage devices. Since the BFQ scheduler locks and > unlocks bfqd->lock for every dispatch operation it is very likely that BFQ > will slow down I/O for fast storage devices, even if their driver only > creates a single hardware queue. > One of the main motivations behind NVMe, and blk-mq itself, is that it is hard to reach the above IOPS, and more, with a single I/O queue as bottleneck. So, I wouldn't expect that systems - equipped with single-queue drives reaching more than 500 KIOPS - using SATA or some other non-NVMe as protocol - so fast to push these drives to their maximum speeds constitute more than a negligible percentage of devices. So, by sticking to mq-deadline, we would sacrifice 99% of systems, to make sure, basically, that those very few systems on steroids reach maximum throughput with random I/O (while however still suffering from responsiveness problems). I think it makes much more sense to have as default what is best for 99% of the single-queue systems, with those super systems properly reconfigured by their users. For sure, other defaults are to be changed too, to get the most out of those systems. Thanks, Paolo > Bart.
Re: [PATCH BUGFIX 0/3] three fixes for the function bfq_requests_merged
> Il giorno 31 mag 2018, alle ore 16:48, Jens Axboe ha > scritto: > > On 5/31/18 7:23 AM, Paolo Valente wrote: >> Hi Jens, >> this series fixes three bugs in bfq_requests_merged. In more detail: >> >> - two linked bugs, with the first (critical: wrong lock) hidden by the >> second (less critical: wrong check that made the body of the >> function not be executed at all) >> - a rather minor bug: superfluous code > > Patch #2 doesn't apply on for-4.18/block. Yep, it gave some problems to me too, because we happened to move this patch around by email. > I hand applied it, all > 3 are queued up now. > Thanks for the patience and for queueing these fixes. Paolo > -- > Jens Axboe >
Re: [PATCH BUGFIX 0/3] three fixes for the function bfq_requests_merged
> Il giorno 31 mag 2018, alle ore 16:48, Jens Axboe ha > scritto: > > On 5/31/18 7:23 AM, Paolo Valente wrote: >> Hi Jens, >> this series fixes three bugs in bfq_requests_merged. In more detail: >> >> - two linked bugs, with the first (critical: wrong lock) hidden by the >> second (less critical: wrong check that made the body of the >> function not be executed at all) >> - a rather minor bug: superfluous code > > Patch #2 doesn't apply on for-4.18/block. Yep, it gave some problems to me too, because we happened to move this patch around by email. > I hand applied it, all > 3 are queued up now. > Thanks for the patience and for queueing these fixes. Paolo > -- > Jens Axboe >
[PATCH BUGFIX/IMPROVEMENTS 3/4] block, bfq: increase weight-raising duration for interactive apps
From: Davide Sapienza The maximum possible duration of the weight-raising period for interactive applications is limited to 13 seconds, as this is the time needed to load the largest application that we considered when tuning weight raising. Unfortunately, in such an evaluation, we did not consider the case of very slow virtual machines. For example, on a QEMU/KVM virtual machine - running in a slow PC; - with a virtual disk stacked on a slow low-end 5400rpm HDD; - serving a heavy I/O workload, such as the sequential reading of several files; mplayer takes 23 seconds to start, if constantly weight-raised. To address this issue, this commit conservatively sets the upper limit for weight-raising duration to 25 seconds. Signed-off-by: Davide Sapienza Signed-off-by: Paolo Valente --- block/bfq-iosched.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 262c929e24ee..31ce089d54eb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -930,22 +930,26 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) do_div(dur, bfqd->peak_rate); /* -* Limit duration between 3 and 13 seconds. Tests show that -* higher values than 13 seconds often yield the opposite of -* the desired result, i.e., worsen responsiveness by letting -* non-interactive and non-soft-real-time applications -* preserve weight raising for a too long time interval. +* Limit duration between 3 and 25 seconds. The upper limit +* has been conservatively set after the following worst case: +* on a QEMU/KVM virtual machine +* - running in a slow PC +* - with a virtual disk stacked on a slow low-end 5400rpm HDD +* - serving a heavy I/O workload, such as the sequential reading +* of several files +* mplayer took 23 seconds to start, if constantly weight-raised. 
+* +* As for higher values than that accommodating the above bad +* scenario, tests show that higher values would often yield +* the opposite of the desired result, i.e., would worsen +* responsiveness by allowing non-interactive applications to +* preserve weight raising for too long. * * On the other end, lower values than 3 seconds make it * difficult for most interactive tasks to complete their jobs * before weight-raising finishes. */ - if (dur > msecs_to_jiffies(13000)) - dur = msecs_to_jiffies(13000); - else if (dur < msecs_to_jiffies(3000)) - dur = msecs_to_jiffies(3000); - - return dur; + return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); } /* switch back from soft real-time to interactive weight raising */ -- 2.16.1
[PATCH BUGFIX/IMPROVEMENTS 2/4] block, bfq: remove slow-system class
BFQ computes the duration of weight raising for interactive applications automatically, using some reference parameters. In particular, BFQ uses the best durations (see comments in the code for how these durations have been assessed) for two classes of systems: slow and fast ones. Examples of slow systems are old phones or systems using micro HDDs. Fast systems are all the remaining ones. Using these parameters, BFQ computes the actual duration of the weight raising, for the system at hand, as a function of the relative speed of the system w.r.t. the speed of a reference system, belonging to the same class of systems as the system at hand. This slow vs fast differentiation proved to be useful in the past, but happens to have little meaning with current hardware. Even worse, it does cause problems in virtual systems, where the speed of the system can vary frequently, and so widely to just confuse the class-detection mechanism, and, as we have verified experimentally, to cause BFQ to compute non-sensical weight-raising durations. This commit addresses this issue by removing the slow class and the class-detection mechanism. Signed-off-by: Paolo Valente --- block/bfq-iosched.c | 137 block/bfq-iosched.h | 14 ++ 2 files changed, 46 insertions(+), 105 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index f3703e7431aa..262c929e24ee 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -251,55 +251,43 @@ static struct kmem_cache *bfq_pool; * When configured for computing the duration of the weight-raising * for interactive queues automatically (see the comments at the * beginning of this file), BFQ does it using the following formula: - * duration = (R / r) * T, - * where r is the peak rate of the device, and R - * and T are two reference parameters. 
In particular, - * R is the peak rate of the reference device (see below), and - * T is a reference time: given the systems that are likely - * to be installed on the reference device according to its speed - * class, T is about the maximum time needed, under BFQ and - * while reading two files in parallel, to load typical large - * applications on these systems (see the comments on - * max_service_from_wr below, for more details on how T is - * obtained). In practice, the slower/faster the device at hand is, - * the more/less it takes to load applications with respect to the + * duration = (ref_rate / r) * ref_wr_duration, + * where r is the peak rate of the device, and ref_rate and + * ref_wr_duration are two reference parameters. In particular, + * ref_rate is the peak rate of the reference storage device (see + * below), and ref_wr_duration is about the maximum time needed, with + * BFQ and while reading two files in parallel, to load typical large + * applications on the reference device (see the comments on + * max_service_from_wr below, for more details on how ref_wr_duration + * is obtained). In practice, the slower/faster the device at hand + * is, the more/less it takes to load applications with respect to the * reference device. Accordingly, the longer/shorter BFQ grants * weight raising to interactive applications. * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; - * . whether the device is slow, such as old or portable HDDs, as well as - * SD cards, or fast, such as newer HDDs and SSDs. + * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), + * depending on whether the device is rotational or non-rotational. * - * The device's speed class is dynamically (re)detected in - * bfq_update_peak_rate() every time the estimated peak rate is updated. 
+ * In the following definitions, ref_rate[0] and ref_wr_duration[0] + * are the reference values for a rotational device, whereas + * ref_rate[1] and ref_wr_duration[1] are the reference values for a + * non-rotational device. The reference rates are not the actual peak + * rates of the devices used as a reference, but slightly lower + * values. The reason for using slightly lower values is that the + * peak-rate estimator tends to yield slightly lower values than the + * actual peak rate (it can yield the actual peak rate only if there + * is only one process doing I/O, and the process does sequential + * I/O). * - * In the following definitions, R_slow[0]/R_fast[0] and - * T_slow[0]/T_fast[0] are the reference values for a slow/fast - * rotational device, whereas R_slow[1]/R_fast[1] and - * T_slow[1]/T_fast[1] are the reference values for a slow/fast - * non-rotational device. Finally, device_speed_thresh are the - * thresholds used to switch between speed classes. The reference - * rates are not the actual peak rates of the devices used as a - * reference, but slightly lower values. The reason for using these - * slightly lower values is that the peak-rate estimator tends to - * yield slightly lower values
[PATCH BUGFIX/IMPROVEMENTS 3/4] block, bfq: increase weight-raising duration for interactive apps
From: Davide Sapienza The maximum possible duration of the weight-raising period for interactive applications is limited to 13 seconds, as this is the time needed to load the largest application that we considered when tuning weight raising. Unfortunately, in such an evaluation, we did not consider the case of very slow virtual machines. For example, on a QEMU/KVM virtual machine - running in a slow PC; - with a virtual disk stacked on a slow low-end 5400rpm HDD; - serving a heavy I/O workload, such as the sequential reading of several files; mplayer takes 23 seconds to start, if constantly weight-raised. To address this issue, this commit conservatively sets the upper limit for weight-raising duration to 25 seconds. Signed-off-by: Davide Sapienza Signed-off-by: Paolo Valente --- block/bfq-iosched.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 262c929e24ee..31ce089d54eb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -930,22 +930,26 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) do_div(dur, bfqd->peak_rate); /* -* Limit duration between 3 and 13 seconds. Tests show that -* higher values than 13 seconds often yield the opposite of -* the desired result, i.e., worsen responsiveness by letting -* non-interactive and non-soft-real-time applications -* preserve weight raising for a too long time interval. +* Limit duration between 3 and 25 seconds. The upper limit +* has been conservatively set after the following worst case: +* on a QEMU/KVM virtual machine +* - running in a slow PC +* - with a virtual disk stacked on a slow low-end 5400rpm HDD +* - serving a heavy I/O workload, such as the sequential reading +* of several files +* mplayer took 23 seconds to start, if constantly weight-raised. 
+* +* As for higher values than that accommodating the above bad +* scenario, tests show that higher values would often yield +* the opposite of the desired result, i.e., would worsen +* responsiveness by allowing non-interactive applications to +* preserve weight raising for too long. * * On the other end, lower values than 3 seconds make it * difficult for most interactive tasks to complete their jobs * before weight-raising finishes. */ - if (dur > msecs_to_jiffies(13000)) - dur = msecs_to_jiffies(13000); - else if (dur < msecs_to_jiffies(3000)) - dur = msecs_to_jiffies(3000); - - return dur; + return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); } /* switch back from soft real-time to interactive weight raising */ -- 2.16.1
[PATCH BUGFIX/IMPROVEMENTS 2/4] block, bfq: remove slow-system class
BFQ computes the duration of weight raising for interactive applications automatically, using some reference parameters. In particular, BFQ uses the best durations (see comments in the code for how these durations have been assessed) for two classes of systems: slow and fast ones. Examples of slow systems are old phones or systems using micro HDDs. Fast systems are all the remaining ones. Using these parameters, BFQ computes the actual duration of the weight raising, for the system at hand, as a function of the relative speed of the system w.r.t. the speed of a reference system, belonging to the same class of systems as the system at hand. This slow vs fast differentiation proved to be useful in the past, but happens to have little meaning with current hardware. Even worse, it does cause problems in virtual systems, where the speed of the system can vary frequently, and so widely to just confuse the class-detection mechanism, and, as we have verified experimentally, to cause BFQ to compute non-sensical weight-raising durations. This commit addresses this issue by removing the slow class and the class-detection mechanism. Signed-off-by: Paolo Valente --- block/bfq-iosched.c | 137 block/bfq-iosched.h | 14 ++ 2 files changed, 46 insertions(+), 105 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index f3703e7431aa..262c929e24ee 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -251,55 +251,43 @@ static struct kmem_cache *bfq_pool; * When configured for computing the duration of the weight-raising * for interactive queues automatically (see the comments at the * beginning of this file), BFQ does it using the following formula: - * duration = (R / r) * T, - * where r is the peak rate of the device, and R - * and T are two reference parameters. 
In particular, - * R is the peak rate of the reference device (see below), and - * T is a reference time: given the systems that are likely - * to be installed on the reference device according to its speed - * class, T is about the maximum time needed, under BFQ and - * while reading two files in parallel, to load typical large - * applications on these systems (see the comments on - * max_service_from_wr below, for more details on how T is - * obtained). In practice, the slower/faster the device at hand is, - * the more/less it takes to load applications with respect to the + * duration = (ref_rate / r) * ref_wr_duration, + * where r is the peak rate of the device, and ref_rate and + * ref_wr_duration are two reference parameters. In particular, + * ref_rate is the peak rate of the reference storage device (see + * below), and ref_wr_duration is about the maximum time needed, with + * BFQ and while reading two files in parallel, to load typical large + * applications on the reference device (see the comments on + * max_service_from_wr below, for more details on how ref_wr_duration + * is obtained). In practice, the slower/faster the device at hand + * is, the more/less it takes to load applications with respect to the * reference device. Accordingly, the longer/shorter BFQ grants * weight raising to interactive applications. * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; - * . whether the device is slow, such as old or portable HDDs, as well as - * SD cards, or fast, such as newer HDDs and SSDs. + * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), + * depending on whether the device is rotational or non-rotational. * - * The device's speed class is dynamically (re)detected in - * bfq_update_peak_rate() every time the estimated peak rate is updated. 
+ * In the following definitions, ref_rate[0] and ref_wr_duration[0] + * are the reference values for a rotational device, whereas + * ref_rate[1] and ref_wr_duration[1] are the reference values for a + * non-rotational device. The reference rates are not the actual peak + * rates of the devices used as a reference, but slightly lower + * values. The reason for using slightly lower values is that the + * peak-rate estimator tends to yield slightly lower values than the + * actual peak rate (it can yield the actual peak rate only if there + * is only one process doing I/O, and the process does sequential + * I/O). * - * In the following definitions, R_slow[0]/R_fast[0] and - * T_slow[0]/T_fast[0] are the reference values for a slow/fast - * rotational device, whereas R_slow[1]/R_fast[1] and - * T_slow[1]/T_fast[1] are the reference values for a slow/fast - * non-rotational device. Finally, device_speed_thresh are the - * thresholds used to switch between speed classes. The reference - * rates are not the actual peak rates of the devices used as a - * reference, but slightly lower values. The reason for using these - * slightly lower values is that the peak-rate estimator tends to - * yield slightly lower values
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 14 mag 2018, alle ore 19:31, Jens Axboe <ax...@kernel.dk> ha > scritto: > > On 5/14/18 11:16 AM, Paolo Valente wrote: >> >> >>> Il giorno 10 mag 2018, alle ore 18:14, Bart Van Assche >>> <bart.vanass...@wdc.com> ha scritto: >>> >>> On Fri, 2018-05-04 at 19:17 +0200, Paolo Valente wrote: >>>> When invoked for an I/O request rq, [ ... ] >>> >>> Tested-by: Bart Van Assche <bart.vanass...@wdc.com> >>> >>> >>> >> >> Any decision for this fix, Jens? > > Guess I didn't reply, but I did commit this on Thursday. > Great, thank you! Paolo > -- > Jens Axboe >
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 14 mag 2018, alle ore 19:31, Jens Axboe ha > scritto: > > On 5/14/18 11:16 AM, Paolo Valente wrote: >> >> >>> Il giorno 10 mag 2018, alle ore 18:14, Bart Van Assche >>> ha scritto: >>> >>> On Fri, 2018-05-04 at 19:17 +0200, Paolo Valente wrote: >>>> When invoked for an I/O request rq, [ ... ] >>> >>> Tested-by: Bart Van Assche >>> >>> >>> >> >> Any decision for this fix, Jens? > > Guess I didn't reply, but I did commit this on Thursday. > Great, thank you! Paolo > -- > Jens Axboe >
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 10 mag 2018, alle ore 18:14, Bart Van Assche > <bart.vanass...@wdc.com> ha scritto: > > On Fri, 2018-05-04 at 19:17 +0200, Paolo Valente wrote: >> When invoked for an I/O request rq, [ ... ] > > Tested-by: Bart Van Assche <bart.vanass...@wdc.com> > > > Any decision for this fix, Jens? Thanks, Paolo
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 10 mag 2018, alle ore 18:14, Bart Van Assche > ha scritto: > > On Fri, 2018-05-04 at 19:17 +0200, Paolo Valente wrote: >> When invoked for an I/O request rq, [ ... ] > > Tested-by: Bart Van Assche > > > Any decision for this fix, Jens? Thanks, Paolo
Re: bug in tag handling in blk-mq?
> Il giorno 09 mag 2018, alle ore 06:11, Mike Galbraith ha > scritto: > > On Tue, 2018-05-08 at 19:09 -0600, Jens Axboe wrote: >> >> Alright, I managed to reproduce it. What I think is happening is that >> BFQ is limiting the inflight case to something less than the wake >> batch for sbitmap, which can lead to stalls. I don't have time to test >> this tonight, but perhaps you can give it a go when you are back at it. >> If not, I'll try tomorrow morning. >> >> If this is the issue, I can turn it into a real patch. This is just to >> confirm that the issue goes away with the below. > > Confirmed. Impressive high speed bug stomping. > Great! It's a real relief that this ghost is gone. Thank you both, Paolo >> diff --git a/lib/sbitmap.c b/lib/sbitmap.c >> index e6a9c06ec70c..94ced15b6428 100644 >> --- a/lib/sbitmap.c >> +++ b/lib/sbitmap.c >> @@ -272,6 +272,7 @@ EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); >> >> static unsigned int sbq_calc_wake_batch(unsigned int depth) >> { >> +#if 0 >> unsigned int wake_batch; >> >> /* >> @@ -284,6 +285,9 @@ static unsigned int sbq_calc_wake_batch(unsigned int >> depth) >> wake_batch = max(1U, depth / SBQ_WAIT_QUEUES); >> >> return wake_batch; >> +#else >> +return 1; >> +#endif >> } >> >> int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, >>
Re: bug in tag handling in blk-mq?
> Il giorno 09 mag 2018, alle ore 06:11, Mike Galbraith ha > scritto: > > On Tue, 2018-05-08 at 19:09 -0600, Jens Axboe wrote: >> >> Alright, I managed to reproduce it. What I think is happening is that >> BFQ is limiting the inflight case to something less than the wake >> batch for sbitmap, which can lead to stalls. I don't have time to test >> this tonight, but perhaps you can give it a go when you are back at it. >> If not, I'll try tomorrow morning. >> >> If this is the issue, I can turn it into a real patch. This is just to >> confirm that the issue goes away with the below. > > Confirmed. Impressive high speed bug stomping. > Great! It's a real relief that this ghost is gone. Thank you both, Paolo >> diff --git a/lib/sbitmap.c b/lib/sbitmap.c >> index e6a9c06ec70c..94ced15b6428 100644 >> --- a/lib/sbitmap.c >> +++ b/lib/sbitmap.c >> @@ -272,6 +272,7 @@ EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); >> >> static unsigned int sbq_calc_wake_batch(unsigned int depth) >> { >> +#if 0 >> unsigned int wake_batch; >> >> /* >> @@ -284,6 +285,9 @@ static unsigned int sbq_calc_wake_batch(unsigned int >> depth) >> wake_batch = max(1U, depth / SBQ_WAIT_QUEUES); >> >> return wake_batch; >> +#else >> +return 1; >> +#endif >> } >> >> int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, >>
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 07 mag 2018, alle ore 12:01, Mike Galbraith <efa...@gmx.de> ha > scritto: > > On Mon, 2018-05-07 at 11:27 +0200, Paolo Valente wrote: >> >> >> Where is the bug? > > Hm, seems potent pain-killers and C don't mix all that well. > I'll try to keep it in mind, and I do wish you to get well soon. Thanks, Paolo
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 07 mag 2018, alle ore 12:01, Mike Galbraith ha > scritto: > > On Mon, 2018-05-07 at 11:27 +0200, Paolo Valente wrote: >> >> >> Where is the bug? > > Hm, seems potent pain-killers and C don't mix all that well. > I'll try to keep it in mind, and I do wish you to get well soon. Thanks, Paolo
Re: bug in tag handling in blk-mq?
> Il giorno 07 mag 2018, alle ore 18:39, Jens Axboe <ax...@kernel.dk> ha > scritto: > > On 5/7/18 8:03 AM, Paolo Valente wrote: >> Hi Jens, Christoph, all, >> Mike Galbraith has been experiencing hangs, on blk_mq_get_tag, only >> with bfq [1]. Symptoms seem to clearly point to a problem in I/O-tag >> handling, triggered by bfq because it limits the number of tags for >> async and sync write requests (in bfq_limit_depth). >> >> Fortunately, I just happened to find a way to apparently confirm it. >> With the following one-liner for block/bfq-iosched.c: >> >> @@ -554,8 +554,7 @@ static void bfq_limit_depth(unsigned int op, struct >> blk_mq_alloc_data *data) >>if (unlikely(bfqd->sb_shift != bt->sb.shift)) >>bfq_update_depths(bfqd, bt); >> >> - data->shallow_depth = >> - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; >> + data->shallow_depth = 1; >> >>bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", >>__func__, bfqd->wr_busy_queues, op_is_sync(op), >> >> Mike's machine now crashes soon and systematically, while nothing bad >> happens on my machines, even with heavy workloads (apart from an >> expected throughput drop). >> >> This change simply reduces to 1 the maximum possible value for the sum >> of the number of async requests and of sync write requests. >> >> This email is basically a request for help to knowledgeable people. To >> start, here are my first doubts/questions: >> 1) Just to be certain, I guess it is not normal that blk-mq hangs if >> async requests and sync write requests can be at most one, right? >> 2) Do you have any hint to where I could look for, to chase this bug? >> Of course, the bug may be in bfq, i.e, it may be a somehow unrelated >> bfq bug that causes this hang in blk-mq, indirectly. But it is hard >> for me to understand how. > > CC Omar, since he implemented the shallow part. But we'll need some > traces to show where we are hung, probably also the value of the > /sys/debug/kernel/block// directory. 
For the crash mentioned, a > trace as well. Otherwise we'll be wasting a lot of time on this. > > Is there a reproducer? > Ok Mike, I guess it's your turn now, for at least a stack trace. Thanks, Paolo > -- > Jens Axboe
Re: bug in tag handling in blk-mq?
> Il giorno 07 mag 2018, alle ore 18:39, Jens Axboe ha > scritto: > > On 5/7/18 8:03 AM, Paolo Valente wrote: >> Hi Jens, Christoph, all, >> Mike Galbraith has been experiencing hangs, on blk_mq_get_tag, only >> with bfq [1]. Symptoms seem to clearly point to a problem in I/O-tag >> handling, triggered by bfq because it limits the number of tags for >> async and sync write requests (in bfq_limit_depth). >> >> Fortunately, I just happened to find a way to apparently confirm it. >> With the following one-liner for block/bfq-iosched.c: >> >> @@ -554,8 +554,7 @@ static void bfq_limit_depth(unsigned int op, struct >> blk_mq_alloc_data *data) >>if (unlikely(bfqd->sb_shift != bt->sb.shift)) >>bfq_update_depths(bfqd, bt); >> >> - data->shallow_depth = >> - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; >> + data->shallow_depth = 1; >> >>bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", >>__func__, bfqd->wr_busy_queues, op_is_sync(op), >> >> Mike's machine now crashes soon and systematically, while nothing bad >> happens on my machines, even with heavy workloads (apart from an >> expected throughput drop). >> >> This change simply reduces to 1 the maximum possible value for the sum >> of the number of async requests and of sync write requests. >> >> This email is basically a request for help to knowledgeable people. To >> start, here are my first doubts/questions: >> 1) Just to be certain, I guess it is not normal that blk-mq hangs if >> async requests and sync write requests can be at most one, right? >> 2) Do you have any hint to where I could look for, to chase this bug? >> Of course, the bug may be in bfq, i.e, it may be a somehow unrelated >> bfq bug that causes this hang in blk-mq, indirectly. But it is hard >> for me to understand how. > > CC Omar, since he implemented the shallow part. But we'll need some > traces to show where we are hung, probably also the value of the > /sys/debug/kernel/block// directory. For the crash mentioned, a > trace as well. 
Otherwise we'll be wasting a lot of time on this. > > Is there a reproducer? > Ok Mike, I guess it's your turn now, for at least a stack trace. Thanks, Paolo > -- > Jens Axboe
bug in tag handling in blk-mq?
Hi Jens, Christoph, all, Mike Galbraith has been experiencing hangs, on blk_mq_get_tag, only with bfq [1]. Symptoms seem to clearly point to a problem in I/O-tag handling, triggered by bfq because it limits the number of tags for async and sync write requests (in bfq_limit_depth). Fortunately, I just happened to find a way to apparently confirm it. With the following one-liner for block/bfq-iosched.c: @@ -554,8 +554,7 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) if (unlikely(bfqd->sb_shift != bt->sb.shift)) bfq_update_depths(bfqd, bt); - data->shallow_depth = - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + data->shallow_depth = 1; bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", __func__, bfqd->wr_busy_queues, op_is_sync(op), Mike's machine now crashes soon and systematically, while nothing bad happens on my machines, even with heavy workloads (apart from an expected throughput drop). This change simply reduces to 1 the maximum possible value for the sum of the number of async requests and of sync write requests. This email is basically a request for help to knowledgeable people. To start, here are my first doubts/questions: 1) Just to be certain, I guess it is not normal that blk-mq hangs if async requests and sync write requests can be at most one, right? 2) Do you have any hint to where I could look for, to chase this bug? Of course, the bug may be in bfq, i.e, it may be a somehow unrelated bfq bug that causes this hang in blk-mq, indirectly. But it is hard for me to understand how. Looking forward to some help. Thanks, Paolo [1] https://www.spinics.net/lists/stable/msg215036.html
bug in tag handling in blk-mq?
Hi Jens, Christoph, all, Mike Galbraith has been experiencing hangs, on blk_mq_get_tag, only with bfq [1]. Symptoms seem to clearly point to a problem in I/O-tag handling, triggered by bfq because it limits the number of tags for async and sync write requests (in bfq_limit_depth). Fortunately, I just happened to find a way to apparently confirm it. With the following one-liner for block/bfq-iosched.c: @@ -554,8 +554,7 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) if (unlikely(bfqd->sb_shift != bt->sb.shift)) bfq_update_depths(bfqd, bt); - data->shallow_depth = - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + data->shallow_depth = 1; bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", __func__, bfqd->wr_busy_queues, op_is_sync(op), Mike's machine now crashes soon and systematically, while nothing bad happens on my machines, even with heavy workloads (apart from an expected throughput drop). This change simply reduces to 1 the maximum possible value for the sum of the number of async requests and of sync write requests. This email is basically a request for help to knowledgeable people. To start, here are my first doubts/questions: 1) Just to be certain, I guess it is not normal that blk-mq hangs if async requests and sync write requests can be at most one, right? 2) Do you have any hint to where I could look for, to chase this bug? Of course, the bug may be in bfq, i.e, it may be a somehow unrelated bfq bug that causes this hang in blk-mq, indirectly. But it is hard for me to understand how. Looking forward to some help. Thanks, Paolo [1] https://www.spinics.net/lists/stable/msg215036.html
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 07 mag 2018, alle ore 05:23, Mike Galbraith <efa...@gmx.de> ha > scritto: > > On Mon, 2018-05-07 at 04:43 +0200, Mike Galbraith wrote: >> On Sun, 2018-05-06 at 09:42 +0200, Paolo Valente wrote: >>> >>> I've attached a compressed patch (to avoid possible corruption from my >>> mailer). I'm little confident, but no pain, no gain, right? >>> >>> If possible, apply this patch on top of the fix I proposed in this >>> thread, just to eliminate possible further noise. Finally, the >>> patch content follows. >>> >>> Hoping for a stroke of luck, >> >> FWIW, box didn't survive the first full build of the morning. > > Nor the second. > Great, finally the first good news! Help from blk-mq experts would be fundamental here. To increase the chances to get it, I'm going to open a new thread on this. In that thread I'll ask you to provide an OOPS or something, I hope it is now easier for you to get it. Thanks, Paolo > -Mike
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 07 mag 2018, alle ore 05:23, Mike Galbraith ha > scritto: > > On Mon, 2018-05-07 at 04:43 +0200, Mike Galbraith wrote: >> On Sun, 2018-05-06 at 09:42 +0200, Paolo Valente wrote: >>> >>> I've attached a compressed patch (to avoid possible corruption from my >>> mailer). I'm little confident, but no pain, no gain, right? >>> >>> If possible, apply this patch on top of the fix I proposed in this >>> thread, just to eliminate possible further noise. Finally, the >>> patch content follows. >>> >>> Hoping for a stroke of luck, >> >> FWIW, box didn't survive the first full build of the morning. > > Nor the second. > Great, finally the first good news! Help from blk-mq experts would be fundamental here. To increase the chances to get it, I'm going to open a new thread on this. In that thread I'll ask you to provide an OOPS or something, I hope it is now easier for you to get it. Thanks, Paolo > -Mike
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 07 mag 2018, alle ore 07:56, Mike Galbraith <efa...@gmx.de> ha > scritto: > > On Sun, 2018-05-06 at 09:42 +0200, Paolo Valente wrote: >> >> diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c >> index 118f319af7c0..6662efe29b69 100644 >> --- a/block/bfq-mq-iosched.c >> +++ b/block/bfq-mq-iosched.c >> @@ -525,8 +525,13 @@ static void bfq_limit_depth(unsigned int op, struct >> blk_mq_alloc_data *data) >>if (unlikely(bfqd->sb_shift != bt->sb.shift)) >>bfq_update_depths(bfqd, bt); >> >> +#if 0 >>data->shallow_depth = >>bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; >^ > > Q: why doesn't the top of this function look like so? > > --- > block/bfq-iosched.c |2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > --- a/block/bfq-iosched.c > +++ b/block/bfq-iosched.c > @@ -539,7 +539,7 @@ static void bfq_limit_depth(unsigned int > struct bfq_data *bfqd = data->q->elevator->elevator_data; > struct sbitmap_queue *bt; > > - if (op_is_sync(op) && !op_is_write(op)) > + if (!op_is_write(op)) > return; > > if (data->flags & BLK_MQ_REQ_RESERVED) { > > It looks a bit odd that these elements exist... > > + /* > +* no more than 75% of tags for sync writes (25% extra tags > +* w.r.t. async I/O, to prevent async I/O from starving sync > +* writes) > +*/ > + bfqd->word_depths[0][1] = max(((1U<sb_shift) * 3)>>2, 1U); > > + /* no more than ~37% of tags for sync writes (~20% extra tags) */ > + bfqd->word_depths[1][1] = max(((1U<sb_shift) * 6)>>4, 1U); > > ...yet we index via and log a guaranteed zero. 
> I'm not sure I got your point, so, to help you help me quickly, I'll repeat what I expect the code you highlight to do: - sync reads must have no limitation, and the lines if (op_is_sync(op) && !op_is_write(op)) return; make sure they don't - sync writes must be limited, and the code you pasted above computes those limits - for sync writes, for which op_is_sync(op) is true (but the condition "op_is_sync(op) && !op_is_write(op)" is false), the line: bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; becomes bfqd->word_depths[!!bfqd->wr_busy_queues][1]; and yields the right limit for sync writes, depending on bfqd->wr_busy_queues. Where is the bug? Thanks, Paolo > -Mike > >
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 07 mag 2018, alle ore 07:56, Mike Galbraith ha > scritto: > > On Sun, 2018-05-06 at 09:42 +0200, Paolo Valente wrote: >> >> diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c >> index 118f319af7c0..6662efe29b69 100644 >> --- a/block/bfq-mq-iosched.c >> +++ b/block/bfq-mq-iosched.c >> @@ -525,8 +525,13 @@ static void bfq_limit_depth(unsigned int op, struct >> blk_mq_alloc_data *data) >>if (unlikely(bfqd->sb_shift != bt->sb.shift)) >>bfq_update_depths(bfqd, bt); >> >> +#if 0 >>data->shallow_depth = >>bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; >^ > > Q: why doesn't the top of this function look like so? > > --- > block/bfq-iosched.c |2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > --- a/block/bfq-iosched.c > +++ b/block/bfq-iosched.c > @@ -539,7 +539,7 @@ static void bfq_limit_depth(unsigned int > struct bfq_data *bfqd = data->q->elevator->elevator_data; > struct sbitmap_queue *bt; > > - if (op_is_sync(op) && !op_is_write(op)) > + if (!op_is_write(op)) > return; > > if (data->flags & BLK_MQ_REQ_RESERVED) { > > It looks a bit odd that these elements exist... > > + /* > +* no more than 75% of tags for sync writes (25% extra tags > +* w.r.t. async I/O, to prevent async I/O from starving sync > +* writes) > +*/ > + bfqd->word_depths[0][1] = max(((1U<sb_shift) * 3)>>2, 1U); > > + /* no more than ~37% of tags for sync writes (~20% extra tags) */ > + bfqd->word_depths[1][1] = max(((1U<sb_shift) * 6)>>4, 1U); > > ...yet we index via and log a guaranteed zero. 
> I'm not sure I got your point, so, to help you help me quickly, I'll repeat what I expect the code you highlight to do: - sync reads must have no limitation, and the lines if (op_is_sync(op) && !op_is_write(op)) return; make sure they don't - sync writes must be limited, and the code you pasted above computes those limits - for sync writes, for which op_is_sync(op) is true (but the condition "op_is_sync(op) && !op_is_write(op)" is false), the line: bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; becomes bfqd->word_depths[!!bfqd->wr_busy_queues][1]; and yields the right limit for sync writes, depending on bfqd->wr_busy_queues. Where is the bug? Thanks, Paolo > -Mike > >
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 05 mag 2018, alle ore 16:56, Mike Galbraith <efa...@gmx.de> ha > scritto: > > On Sat, 2018-05-05 at 12:39 +0200, Paolo Valente wrote: >> >> BTW, if you didn't run out of patience with this permanent issue yet, >> I was thinking of two o three changes to retry to trigger your failure >> reliably. > > Sure, fire away, I'll happily give the annoying little bugger > opportunities to show its tender belly. I've attached a compressed patch (to avoid possible corruption from my mailer). I'm little confident, but no pain, no gain, right? If possible, apply this patch on top of the fix I proposed in this thread, just to eliminate possible further noise. Finally, the patch content follows. Hoping for a stroke of luck, Paolo diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 118f319af7c0..6662efe29b69 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -525,8 +525,13 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) if (unlikely(bfqd->sb_shift != bt->sb.shift)) bfq_update_depths(bfqd, bt); +#if 0 data->shallow_depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; +#else + data->shallow_depth = 1; +#endif + bfq_log(bfqd, "wr_busy %d sync %d depth %u", bfqd->wr_busy_queues, op_is_sync(op), > > -Mike >
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 05 mag 2018, alle ore 16:56, Mike Galbraith ha > scritto: > > On Sat, 2018-05-05 at 12:39 +0200, Paolo Valente wrote: >> >> BTW, if you didn't run out of patience with this permanent issue yet, >> I was thinking of two o three changes to retry to trigger your failure >> reliably. > > Sure, fire away, I'll happily give the annoying little bugger > opportunities to show its tender belly. I've attached a compressed patch (to avoid possible corruption from my mailer). I'm little confident, but no pain, no gain, right? If possible, apply this patch on top of the fix I proposed in this thread, just to eliminate possible further noise. Finally, the patch content follows. Hoping for a stroke of luck, Paolo diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 118f319af7c0..6662efe29b69 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -525,8 +525,13 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) if (unlikely(bfqd->sb_shift != bt->sb.shift)) bfq_update_depths(bfqd, bt); +#if 0 data->shallow_depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; +#else + data->shallow_depth = 1; +#endif + bfq_log(bfqd, "wr_busy %d sync %d depth %u", bfqd->wr_busy_queues, op_is_sync(op), > > -Mike >
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 05 mag 2018, alle ore 10:19, Mike Galbraith ha > scritto: > > On Fri, 2018-05-04 at 21:46 +0200, Mike Galbraith wrote: >> Tentatively, I suspect you've just fixed the nasty stalls I reported a >> while back. > > Oh well, so much for optimism. It took a lot, but just hung. Yep, it'd have been a lot of luck, being your hang related to different operations from those touched by this fix. Maybe time-before-failure stretched because your system suffered from the illness cured by this fix too. BTW, if you didn't run out of patience with this permanent issue yet, I was thinking of two or three changes to retry to trigger your failure reliably. In case of success, I could restart racking my brains from there. Thanks, Paolo
Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
> Il giorno 05 mag 2018, alle ore 10:19, Mike Galbraith ha > scritto: > > On Fri, 2018-05-04 at 21:46 +0200, Mike Galbraith wrote: >> Tentatively, I suspect you've just fixed the nasty stalls I reported a >> while back. > > Oh well, so much for optimism. It took a lot, but just hung. Yep, it'd have been a lot of luck, being your hang related to different operations from those touched by this fix. Maybe time-before-failure stretched because your system suffered from the illness cured by this fix too. BTW, if you didn't run out of patience with this permanent issue yet, I was thinking of two or three changes to retry to trigger your failure reliably. In case of success, I could restart racking my brains from there. Thanks, Paolo
[PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
When invoked for an I/O request rq, the prepare_request hook of bfq increments reference counters in the destination bfq_queue for rq. In this respect, after this hook has been invoked, rq may still be transformed into a request with no icq attached, i.e., for bfq, a request not associated with any bfq_queue. No further hook is invoked to signal this transformation to bfq (in general, to the destination elevator for rq). This leads bfq into an inconsistent state, because bfq has no chance to correctly lower these counters back. This inconsistency may in its turn cause incorrect scheduling and hangs. It certainly causes memory leaks, by making it impossible for bfq to free the involved bfq_queue. On the bright side, no transformation can still happen for rq after rq has been inserted into bfq, or merged with another, already inserted, request. Exploiting this fact, this commit addresses the above issue by delaying the preparation of an I/O request to when the request is inserted or merged. This change also gives a performance bonus: a lock-contention point gets removed. To prepare a request, bfq needs to hold its scheduler lock. After postponing request preparation to insertion or merging, no lock needs to be grabbed any longer in the prepare_request hook, while the lock already taken to perform insertion or merging is used to prepare the request as well. 
Signed-off-by: Paolo Valente <paolo.vale...@linaro.org> --- block/bfq-iosched.c | 86 +++-- 1 file changed, 57 insertions(+), 29 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 771ae9730ac6..ea02162df6c7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1858,6 +1858,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, return ELEVATOR_NO_MERGE; } +static struct bfq_queue *bfq_init_rq(struct request *rq); + static void bfq_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { @@ -1866,7 +1868,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, blk_rq_pos(req) < blk_rq_pos(container_of(rb_prev(>rb_node), struct request, rb_node))) { - struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_queue *bfqq = bfq_init_rq(req); struct bfq_data *bfqd = bfqq->bfqd; struct request *prev, *next_rq; @@ -1894,7 +1896,8 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct request *next) { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + struct bfq_queue *bfqq = bfq_init_rq(rq), + *next_bfqq = bfq_init_rq(next); if (!RB_EMPTY_NODE(>rb_node)) goto end; @@ -4540,14 +4543,12 @@ static inline void bfq_update_insert_stats(struct request_queue *q, unsigned int cmd_flags) {} #endif -static void bfq_prepare_request(struct request *rq, struct bio *bio); - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_queue *bfqq; bool idle_timer_disabled = false; unsigned int cmd_flags; @@ -4562,24 +4563,13 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_mq_sched_request_inserted(rq); spin_lock_irq(>lock); + bfqq = bfq_init_rq(rq); 
if (at_head || blk_rq_is_passthrough(rq)) { if (at_head) list_add(>queuelist, >dispatch); else list_add_tail(>queuelist, >dispatch); - } else { - if (WARN_ON_ONCE(!bfqq)) { - /* -* This should never happen. Most likely rq is -* a requeued regular request, being -* re-inserted without being first -* re-prepared. Do a prepare, to avoid -* failure. -*/ - bfq_prepare_request(rq, rq->bio); - bfqq = RQ_BFQQ(rq); - } - + } else { /* bfqq is assumed to be non null here */ idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4922,11 +4912,48 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, } /* - * Allocate bfq data structures associa
[PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge
When invoked for an I/O request rq, the prepare_request hook of bfq increments reference counters in the destination bfq_queue for rq. In this respect, after this hook has been invoked, rq may still be transformed into a request with no icq attached, i.e., for bfq, a request not associated with any bfq_queue. No further hook is invoked to signal this transformation to bfq (in general, to the destination elevator for rq). This leads bfq into an inconsistent state, because bfq has no chance to correctly lower these counters back. This inconsistency may in its turn cause incorrect scheduling and hangs. It certainly causes memory leaks, by making it impossible for bfq to free the involved bfq_queue. On the bright side, no transformation can still happen for rq after rq has been inserted into bfq, or merged with another, already inserted, request. Exploiting this fact, this commit addresses the above issue by delaying the preparation of an I/O request to when the request is inserted or merged. This change also gives a performance bonus: a lock-contention point gets removed. To prepare a request, bfq needs to hold its scheduler lock. After postponing request preparation to insertion or merging, no lock needs to be grabbed any longer in the prepare_request hook, while the lock already taken to perform insertion or merging is used to prepare the request as well. 
Signed-off-by: Paolo Valente --- block/bfq-iosched.c | 86 +++-- 1 file changed, 57 insertions(+), 29 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 771ae9730ac6..ea02162df6c7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1858,6 +1858,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, return ELEVATOR_NO_MERGE; } +static struct bfq_queue *bfq_init_rq(struct request *rq); + static void bfq_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { @@ -1866,7 +1868,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, blk_rq_pos(req) < blk_rq_pos(container_of(rb_prev(>rb_node), struct request, rb_node))) { - struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_queue *bfqq = bfq_init_rq(req); struct bfq_data *bfqd = bfqq->bfqd; struct request *prev, *next_rq; @@ -1894,7 +1896,8 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct request *next) { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + struct bfq_queue *bfqq = bfq_init_rq(rq), + *next_bfqq = bfq_init_rq(next); if (!RB_EMPTY_NODE(>rb_node)) goto end; @@ -4540,14 +4543,12 @@ static inline void bfq_update_insert_stats(struct request_queue *q, unsigned int cmd_flags) {} #endif -static void bfq_prepare_request(struct request *rq, struct bio *bio); - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_queue *bfqq; bool idle_timer_disabled = false; unsigned int cmd_flags; @@ -4562,24 +4563,13 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_mq_sched_request_inserted(rq); spin_lock_irq(>lock); + bfqq = bfq_init_rq(rq); if (at_head || 
blk_rq_is_passthrough(rq)) { if (at_head) list_add(>queuelist, >dispatch); else list_add_tail(>queuelist, >dispatch); - } else { - if (WARN_ON_ONCE(!bfqq)) { - /* -* This should never happen. Most likely rq is -* a requeued regular request, being -* re-inserted without being first -* re-prepared. Do a prepare, to avoid -* failure. -*/ - bfq_prepare_request(rq, rq->bio); - bfqq = RQ_BFQQ(rq); - } - + } else { /* bfqq is assumed to be non null here */ idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4922,11 +4912,48 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, } /* - * Allocate bfq data structures associated with this request. + * Only r
Re: testing io.low limit for blk-throttle
> Il giorno 26 apr 2018, alle ore 20:32, Tejun Heo <t...@kernel.org> ha scritto: > > Hello, > > On Tue, Apr 24, 2018 at 02:12:51PM +0200, Paolo Valente wrote: >> +Tejun (I guess he might be interested in the results below) > > Our experiments didn't work out too well either. At this point, it > isn't clear whether io.low will ever leave experimental state. We're > trying to find a working solution. > Thanks for this update, Tejun. I'm still working (very slowly) on a survey of the current state of affairs in terms of bandwidth and latency guarantees in the block layer. The synthesis of the results I've collected so far is, more or less: "The problem of reaching a high throughput and, at the same time, guaranteeing bandwidth and latency is still unsolved, apart from simple cases, such as homogenous, constant workloads" I'm anticipating this, because I don't want to risk to underestimate anybody's work. So, if anyone has examples of how, e.g., to distribute I/O bandwidth as desired among heterogenous workloads (for instance, random vs sequential workloads) that might fluctuate over time, without losing total throughput, please tell me, and I'll test them. Thanks, Paolo > Thanks. > > -- > tejun
Re: testing io.low limit for blk-throttle
> Il giorno 26 apr 2018, alle ore 20:32, Tejun Heo ha scritto: > > Hello, > > On Tue, Apr 24, 2018 at 02:12:51PM +0200, Paolo Valente wrote: >> +Tejun (I guess he might be interested in the results below) > > Our experiments didn't work out too well either. At this point, it > isn't clear whether io.low will ever leave experimental state. We're > trying to find a working solution. > Thanks for this update, Tejun. I'm still working (very slowly) on a survey of the current state of affairs in terms of bandwidth and latency guarantees in the block layer. The synthesis of the results I've collected so far is, more or less: "The problem of reaching a high throughput and, at the same time, guaranteeing bandwidth and latency is still unsolved, apart from simple cases, such as homogenous, constant workloads" I'm anticipating this, because I don't want to risk to underestimate anybody's work. So, if anyone has examples of how, e.g., to distribute I/O bandwidth as desired among heterogenous workloads (for instance, random vs sequential workloads) that might fluctuate over time, without losing total throughput, please tell me, and I'll test them. Thanks, Paolo > Thanks. > > -- > tejun
Re: testing io.low limit for blk-throttle
> Il giorno 27 apr 2018, alle ore 05:27, Joseph Qi <jiangqi...@gmail.com> ha > scritto: > > Hi Paolo, > > On 18/4/27 01:27, Paolo Valente wrote: >> >> >>> Il giorno 25 apr 2018, alle ore 14:13, Joseph Qi <jiangqi...@gmail.com> ha >>> scritto: >>> >>> Hi Paolo, >>> >> >> Hi Joseph >> >>> ... >>> Could you run blktrace as well when testing your case? There are several >>> throtl traces to help analyze whether it is caused by frequently >>> upgrade/downgrade. >> >> Certainly. You can find a trace attached. Unfortunately, I'm not >> familiar with the internals of blk-throttle and low limit, so, if you >> want me to analyze the trace, give me some hints on what I have to >> look for. Otherwise, I'll be happy to learn from your analysis. >> > > I've taken a glance at your blktrace attached. It is only upgrade at first and > then downgrade (just adjust limit, not to LIMIT_LOW) frequently. > But I don't know why it always thinks throttle group is not idle. > > For example: > fio-2336 [004] d... 428.458249: 8,16 m N throtl avg_idle=90, > idle_threshold=1000, bad_bio=10, total_bio=84, is_idle=0, scale=9 > fio-2336 [004] d... 428.458251: 8,16 m N throtl downgrade, scale 4 > > In throtl_tg_is_idle(): > is_idle = ... || > (tg->latency_target && tg->bio_cnt && >tg->bad_bio_cnt * 5 < tg->bio_cnt); > > It should be idle and allow run more bandwidth. But here the result shows not > idle (is_idle=0). I have to do more investigation to figure it out why. 
> Hi Joseph, actually this doesn't surprise me much, for this scenario I expected exactly that blk-throttle would have considered the random-I/O group, for most of the time, 1) non idle, 2) above the 100usec target latency, and 3) below low limit, In fact, 1) The group can evidently issue I/O at a much higher rate than that received, so, immediately after its last pending I/O has been served, the group issues new I/O; in the end, it is non idle most of the time 2) To try to enforce the 10MB/s limit, blk-throttle necessarily makes the group oscillate around 10MB/s, which means that the group is frequently below limit (this would not have held only if the group had actually received much more than 10MB/s, but it is not so) 3) For each of the 4k random I/Os of the group, the time needed by the drive to serve that I/O is already around 40-50usec. So, since the group is of course not constantly in service, it is very easy that, because of throttling, the latency of most I/Os of the group goes beyond 100usec. But, as it is often the case for me, I might have simply misunderstood blk-throttle parameters, and I might be just wrong here. Thanks, Paolo > You can also filter these logs using: > grep throtl trace | grep -E 'upgrade|downgrade|is_idle' > > Thanks, > Joseph
Re: testing io.low limit for blk-throttle
> Il giorno 27 apr 2018, alle ore 05:27, Joseph Qi ha > scritto: > > Hi Paolo, > > On 18/4/27 01:27, Paolo Valente wrote: >> >> >>> Il giorno 25 apr 2018, alle ore 14:13, Joseph Qi ha >>> scritto: >>> >>> Hi Paolo, >>> >> >> Hi Joseph >> >>> ... >>> Could you run blktrace as well when testing your case? There are several >>> throtl traces to help analyze whether it is caused by frequently >>> upgrade/downgrade. >> >> Certainly. You can find a trace attached. Unfortunately, I'm not >> familiar with the internals of blk-throttle and low limit, so, if you >> want me to analyze the trace, give me some hints on what I have to >> look for. Otherwise, I'll be happy to learn from your analysis. >> > > I've taken a glance at your blktrace attached. It is only upgrade at first and > then downgrade (just adjust limit, not to LIMIT_LOW) frequently. > But I don't know why it always thinks throttle group is not idle. > > For example: > fio-2336 [004] d... 428.458249: 8,16 m N throtl avg_idle=90, > idle_threshold=1000, bad_bio=10, total_bio=84, is_idle=0, scale=9 > fio-2336 [004] d... 428.458251: 8,16 m N throtl downgrade, scale 4 > > In throtl_tg_is_idle(): > is_idle = ... || > (tg->latency_target && tg->bio_cnt && >tg->bad_bio_cnt * 5 < tg->bio_cnt); > > It should be idle and allow run more bandwidth. But here the result shows not > idle (is_idle=0). I have to do more investigation to figure it out why. 
> Hi Joseph, actually this doesn't surprise me much, for this scenario I expected exactly that blk-throttle would have considered the random-I/O group, for most of the time, 1) non idle, 2) above the 100usec target latency, and 3) below low limit, In fact, 1) The group can evidently issue I/O at a much higher rate than that received, so, immediately after its last pending I/O has been served, the group issues new I/O; in the end, it is non idle most of the time 2) To try to enforce the 10MB/s limit, blk-throttle necessarily makes the group oscillate around 10MB/s, which means that the group is frequently below limit (this would not have held only if the group had actually received much more than 10MB/s, but it is not so) 3) For each of the 4k random I/Os of the group, the time needed by the drive to serve that I/O is already around 40-50usec. So, since the group is of course not constantly in service, it is very easy that, because of throttling, the latency of most I/Os of the group goes beyond 100usec. But, as it is often the case for me, I might have simply misunderstood blk-throttle parameters, and I might be just wrong here. Thanks, Paolo > You can also filter these logs using: > grep throtl trace | grep -E 'upgrade|downgrade|is_idle' > > Thanks, > Joseph
Re: testing io.low limit for blk-throttle
> Il giorno 23 apr 2018, alle ore 11:01, Joseph Qi <jiangqi...@gmail.com> ha > scritto: > > > > On 18/4/23 15:35, Paolo Valente wrote: >> >> >>> Il giorno 23 apr 2018, alle ore 08:05, Joseph Qi <jiangqi...@gmail.com> ha >>> scritto: >>> >>> Hi Paolo, >> >> Hi Joseph, >> thanks for chiming in. >> >>> What's your idle and latency config? >> >> I didn't set them at all, as the only (explicit) requirement in my >> basic test is that one of the group is guaranteed a minimum bps. >> >> >>> IMO, io.low will allow others run more bandwidth if cgroup's average >>> idle time is high or latency is low. >> >> What you say here makes me think that I simply misunderstood the >> purpose of io.low. So, here is my problem/question: "I only need to >> guarantee at least a minimum bandwidth, in bps, to a group. Is the >> io.low limit the way to go?" >> >> I know that I can use just io.max (unless I misunderstood the goal of >> io.max too :( ), but my extra purpose would be to not waste bandwidth >> when some group is idle. Yet, as for now, io.low is not working even >> for the first, simpler goal, i.e., guaranteeing a minimum bandwidth to >> one group when all groups are active. >> >> Am I getting something wrong? >> >> Otherwise, if there are some special values for idle and latency >> parameters that would make throttle work for my test, I'll be of >> course happy to try them. >> > I think you can try idle time with 1000us for all cgroups, and latency > target 100us for cgroup with low limit 100MB/s and 2000us for cgroups > with low limit 10MB/s. That means cgroup with low latency target will > be preferred. > BTW, from my expeierence the parameters are not easy to set because > they are strongly correlated to the cgroup IO behavior. > +Tejun (I guess he might be interested in the results below) Hi Joseph, thanks for chiming in. Your suggestion did work! 
At first, I thought I had also understood the use of latency from the outcome of your suggestion: "want low limit really guaranteed for a group? set target latency to a low value for it." But then, as a crosscheck, I repeated the same exact test, but reversing target latencies: I gave 2000 to the interfered (the group with 100MB/s limit) and 100 to the interferers. And the interfered still got more than 100MB/s! So I exaggerated: 2 to the interfered. Same outcome :( I tried really many other combinations, to try to figure this out, but results seemed more or less random w.r.t. to latency values. I didn't even start to test different values for idle. So, the only sound lesson that I seem to have learned is: if I want low limits to be enforced, I have to set target latency and idle explicitly. The actual values of latencies matter little, or not at all. At least this holds for my simple tests. At any rate, thanks to your help, Joseph, I could move to the most interesting part for me: how effective is blk-throttle with low limits? I could well be wrong again, but my results do not seem that good. With the simplest type of non-toy example I considered, I recorded throughput losses, apparently caused mainly by blk-throttle, and ranging from 64% to 75%. Here is a worst-case example. For each step, I'm reporting below the command by which you can reproduce that step with the thr-lat-with-interference benchmark of the S suite [1]. I just split bandwidth equally among five groups, on my SSD. The device showed a peak rate of ~515MB/s in this test, so I set rpbs to 100MB/s for each group (and tried various values, and combinations of values, for the target latency, without any effect on the results). To begin, I made every group do sequential reads. Everything worked perfectly fine. But then I made one group do random I/O [2], and troubles began. 
Even if the group doing random I/O was given a target latency of 100usec (or lower), while the other had a target latency of 2000usec, the poor random-I/O group got only 4.7 MB/s! (A single process doing 4k sync random I/O reaches 25MB/s on my SSD.) I guess things broke because low limits did not comply any longer with the lower speed that device reached with the new, mixed workload: the device reached 376MB/s, while the sum of the low limits was 500MB/s. BTW the 'fault' for this loss of throughput was not only of the device and the workload: if I switched throttling off, then the device still reached its peak rate, although granting only 1.3MB/s to the random-I/O group. So, to comply with the 376MB/s, I lowered the low limits to 74MB/s per group (to avoid a too tight 75MB/s) [3]. A little better: the random-I/O group got 7.2 MB/s. But the total throughput went down further, to
Re: testing io.low limit for blk-throttle
> Il giorno 23 apr 2018, alle ore 11:01, Joseph Qi ha > scritto: > > > > On 18/4/23 15:35, Paolo Valente wrote: >> >> >>> Il giorno 23 apr 2018, alle ore 08:05, Joseph Qi ha >>> scritto: >>> >>> Hi Paolo, >> >> Hi Joseph, >> thanks for chiming in. >> >>> What's your idle and latency config? >> >> I didn't set them at all, as the only (explicit) requirement in my >> basic test is that one of the group is guaranteed a minimum bps. >> >> >>> IMO, io.low will allow others run more bandwidth if cgroup's average >>> idle time is high or latency is low. >> >> What you say here makes me think that I simply misunderstood the >> purpose of io.low. So, here is my problem/question: "I only need to >> guarantee at least a minimum bandwidth, in bps, to a group. Is the >> io.low limit the way to go?" >> >> I know that I can use just io.max (unless I misunderstood the goal of >> io.max too :( ), but my extra purpose would be to not waste bandwidth >> when some group is idle. Yet, as for now, io.low is not working even >> for the first, simpler goal, i.e., guaranteeing a minimum bandwidth to >> one group when all groups are active. >> >> Am I getting something wrong? >> >> Otherwise, if there are some special values for idle and latency >> parameters that would make throttle work for my test, I'll be of >> course happy to try them. >> > I think you can try idle time with 1000us for all cgroups, and latency > target 100us for cgroup with low limit 100MB/s and 2000us for cgroups > with low limit 10MB/s. That means cgroup with low latency target will > be preferred. > BTW, from my expeierence the parameters are not easy to set because > they are strongly correlated to the cgroup IO behavior. > +Tejun (I guess he might be interested in the results below) Hi Joseph, thanks for chiming in. Your suggestion did work! At first, I thought I had also understood the use of latency from the outcome of your suggestion: "want low limit really guaranteed for a group? 
set target latency to a low value for it." But then, as a crosscheck, I repeated the same exact test, but reversing target latencies: I gave 2000 to the interfered (the group with 100MB/s limit) and 100 to the interferers. And the interfered still got more than 100MB/s! So I exaggerated: 2 to the interfered. Same outcome :( I tried really many other combinations, to try to figure this out, but results seemed more or less random w.r.t. to latency values. I didn't even start to test different values for idle. So, the only sound lesson that I seem to have learned is: if I want low limits to be enforced, I have to set target latency and idle explicitly. The actual values of latencies matter little, or not at all. At least this holds for my simple tests. At any rate, thanks to your help, Joseph, I could move to the most interesting part for me: how effective is blk-throttle with low limits? I could well be wrong again, but my results do not seem that good. With the simplest type of non-toy example I considered, I recorded throughput losses, apparently caused mainly by blk-throttle, and ranging from 64% to 75%. Here is a worst-case example. For each step, I'm reporting below the command by which you can reproduce that step with the thr-lat-with-interference benchmark of the S suite [1]. I just split bandwidth equally among five groups, on my SSD. The device showed a peak rate of ~515MB/s in this test, so I set rpbs to 100MB/s for each group (and tried various values, and combinations of values, for the target latency, without any effect on the results). To begin, I made every group do sequential reads. Everything worked perfectly fine. But then I made one group do random I/O [2], and troubles began. Even if the group doing random I/O was given a target latency of 100usec (or lower), while the other had a target latency of 2000usec, the poor random-I/O group got only 4.7 MB/s! (A single process doing 4k sync random I/O reaches 25MB/s on my SSD.) 
I guess things broke because low limits did not comply any longer with the lower speed that device reached with the new, mixed workload: the device reached 376MB/s, while the sum of the low limits was 500MB/s. BTW the 'fault' for this loss of throughput was not only of the device and the workload: if I switched throttling off, then the device still reached its peak rate, although granting only 1.3MB/s to the random-I/O group. So, to comply with the 376MB/s, I lowered the low limits to 74MB/s per group (to avoid a too tight 75MB/s) [3]. A little better: the random-I/O group got 7.2 MB/s. But the total throughput went down further, to 289MB/s, and became again lower than the sum of the low limi
Re: usercopy whitelist woe in scsi_sense_cache
> Il giorno 20 apr 2018, alle ore 22:23, Kees Cook <keesc...@chromium.org> ha > scritto: > > On Thu, Apr 19, 2018 at 2:32 AM, Paolo Valente <paolo.vale...@linaro.org> > wrote: >> I'm missing something here. When the request gets completed in the >> first place, the hook bfq_finish_requeue_request gets called, and that >> hook clears both ->elv.priv elements (as the request has a non-null >> elv.icq). So, when bfq gets the same request again, those elements >> must be NULL. What am I getting wrong? >> >> I have some more concern on this point, but I'll stick to this for the >> moment, to not create more confusion. > > I don't know the "how", I only found the "what". :) Got it, although I think you did much more than that ;) Anyway, my reply was exactly to a (Jens') detailed description of the how. And my concern is that there seems to be an inconsistency in that description. In addition, Jens is proposing a patch basing on that description. But, if this inconsistency is not solved, that patch may eliminate the symptom at hand, but it may not fix the real cause, or may even contribute to bury it deeper. > If you want, grab > the reproducer VM linked to earlier in this thread; it'll hit the > problem within about 30 seconds of running the reproducer. > Yep. Actually, I've been investigating this kind of failure, in different incarnations, for months now. In this respect, other examples are the srp-test failures reported by Bart, e.g., here [1]. According to my analysis, the cause of the problem is somewhere in blk-mq, outside bfq. Unfortunately, I didn't make it to find where it exactly is, mainly because of my limited expertise on blk-mq internals. So I have asked for any kind of help and suggestions to Jens, Mike and any other knowledgeable guy. Probably those help requests got somehow lost on those threads, but your results, Kees, and the analysis that followed from Jens seems now to be carrying us to the solution of the not-so-recent issue. Time will tell. 
Thanks, Paolo [1] https://www.spinics.net/lists/linux-block/msg22760.html > -Kees > > -- > Kees Cook > Pixel Security
Re: usercopy whitelist woe in scsi_sense_cache
> Il giorno 20 apr 2018, alle ore 22:23, Kees Cook ha > scritto: > > On Thu, Apr 19, 2018 at 2:32 AM, Paolo Valente > wrote: >> I'm missing something here. When the request gets completed in the >> first place, the hook bfq_finish_requeue_request gets called, and that >> hook clears both ->elv.priv elements (as the request has a non-null >> elv.icq). So, when bfq gets the same request again, those elements >> must be NULL. What am I getting wrong? >> >> I have some more concern on this point, but I'll stick to this for the >> moment, to not create more confusion. > > I don't know the "how", I only found the "what". :) Got it, although I think you did much more than that ;) Anyway, my reply was exactly to a (Jens') detailed description of the how. And my concern is that there seems to be an inconsistency in that description. In addition, Jens is proposing a patch basing on that description. But, if this inconsistency is not solved, that patch may eliminate the symptom at hand, but it may not fix the real cause, or may even contribute to bury it deeper. > If you want, grab > the reproducer VM linked to earlier in this thread; it'll hit the > problem within about 30 seconds of running the reproducer. > Yep. Actually, I've been investigating this kind of failure, in different incarnations, for months now. In this respect, other examples are the srp-test failures reported by Bart, e.g., here [1]. According to my analysis, the cause of the problem is somewhere in blk-mq, outside bfq. Unfortunately, I didn't make it to find where it exactly is, mainly because of my limited expertise on blk-mq internals. So I have asked for any kind of help and suggestions to Jens, Mike and any other knowledgeable guy. Probably those help requests got somehow lost on those threads, but your results, Kees, and the analysis that followed from Jens seems now to be carrying us to the solution of the not-so-recent issue. Time will tell. 
Thanks, Paolo [1] https://www.spinics.net/lists/linux-block/msg22760.html > -Kees > > -- > Kees Cook > Pixel Security
Re: usercopy whitelist woe in scsi_sense_cache
> Il giorno 18 apr 2018, alle ore 16:30, Jens Axboe <ax...@kernel.dk> ha > scritto: > > On 4/18/18 3:08 AM, Paolo Valente wrote: >> >> >>> Il giorno 18 apr 2018, alle ore 00:57, Jens Axboe <ax...@kernel.dk> ha >>> scritto: >>> >>> On 4/17/18 3:48 PM, Jens Axboe wrote: >>>> On 4/17/18 3:47 PM, Kees Cook wrote: >>>>> On Tue, Apr 17, 2018 at 2:39 PM, Jens Axboe <ax...@kernel.dk> wrote: >>>>>> On 4/17/18 3:25 PM, Kees Cook wrote: >>>>>>> On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook <keesc...@chromium.org> >>>>>>> wrote: >>>>>>>> I see elv.priv[1] assignments made in a few places -- is it possible >>>>>>>> there is some kind of uninitialized-but-not-NULL state that can leak >>>>>>>> in there? >>>>>>> >>>>>>> Got it. This fixes it for me: >>>>>>> >>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c >>>>>>> index 0dc9e341c2a7..859df3160303 100644 >>>>>>> --- a/block/blk-mq.c >>>>>>> +++ b/block/blk-mq.c >>>>>>> @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct >>>>>>> request_queue *q, >>>>>>> >>>>>>> rq = blk_mq_rq_ctx_init(data, tag, op); >>>>>>> if (!op_is_flush(op)) { >>>>>>> - rq->elv.icq = NULL; >>>>>>> + memset(>elv, 0, sizeof(rq->elv)); >>>>>>> if (e && e->type->ops.mq.prepare_request) { >>>>>>> if (e->type->icq_cache && rq_ioc(bio)) >>>>>>> blk_mq_sched_assign_ioc(rq, bio); >>>>>>> @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq) >>>>>>> e->type->ops.mq.finish_request(rq); >>>>>>> if (rq->elv.icq) { >>>>>>> put_io_context(rq->elv.icq->ioc); >>>>>>> - rq->elv.icq = NULL; >>>>>>> + memset(>elv, 0, sizeof(rq->elv)); >>>>>>> } >>>>>>> } >>>>>> >>>>>> This looks like a BFQ problem, this should not be necessary. Paolo, >>>>>> you're calling your own prepare request handler from the insert >>>>>> as well, and your prepare request does nothing if rq->elv.icq == NULL. >>>>> >>>>> I sent the patch anyway, since it's kind of a robustness improvement, >>>>> I'd hope. 
If you fix BFQ also, please add: >>>> >>>> It's also a memset() in the hot path, would prefer to avoid that... >>>> The issue here is really the convoluted bfq usage of insert/prepare, >>>> I'm sure Paolo can take it from here. >>> >> >> Hi, >> I'm very sorry for tuning in very late, but, at the same time, very >> glad to find the problem probably already solved ;) (in this respect, I >> swear, >> my delay was not intentional) >> >>> Does this fix it? >>> >>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c >>> index f0ecd98509d8..d883469a1582 100644 >>> --- a/block/bfq-iosched.c >>> +++ b/block/bfq-iosched.c >>> @@ -4934,8 +4934,11 @@ static void bfq_prepare_request(struct request *rq, >>> struct bio *bio) >>> bool new_queue = false; >>> bool bfqq_already_existing = false, split = false; >>> >>> - if (!rq->elv.icq) >>> + if (!rq->elv.icq) { >>> + rq->elv.priv[0] = rq->elv.priv[1] = NULL; >>> return; >>> + } >>> + >> >> This does solve the problem at hand. But it also arouses a question, >> related to a possible subtle bug. >> >> For BFQ, !rq->elv.icq basically means "this request is not for me, as >> I am an icq-based scheduler". But, IIUC the main points in this >> thread, then this assumption is false. If it is actually false, then >> I hope that all requests with !rq->elv.icq that are sent to BFQ do >> verify the condition (at_head || blk_rq_is_passthrough(rq)). In fact, >> requests that do not verify that condition are those that BFQ must put >
Re: usercopy whitelist woe in scsi_sense_cache
> Il giorno 18 apr 2018, alle ore 16:30, Jens Axboe ha > scritto: > > On 4/18/18 3:08 AM, Paolo Valente wrote: >> >> >>> Il giorno 18 apr 2018, alle ore 00:57, Jens Axboe ha >>> scritto: >>> >>> On 4/17/18 3:48 PM, Jens Axboe wrote: >>>> On 4/17/18 3:47 PM, Kees Cook wrote: >>>>> On Tue, Apr 17, 2018 at 2:39 PM, Jens Axboe wrote: >>>>>> On 4/17/18 3:25 PM, Kees Cook wrote: >>>>>>> On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook >>>>>>> wrote: >>>>>>>> I see elv.priv[1] assignments made in a few places -- is it possible >>>>>>>> there is some kind of uninitialized-but-not-NULL state that can leak >>>>>>>> in there? >>>>>>> >>>>>>> Got it. This fixes it for me: >>>>>>> >>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c >>>>>>> index 0dc9e341c2a7..859df3160303 100644 >>>>>>> --- a/block/blk-mq.c >>>>>>> +++ b/block/blk-mq.c >>>>>>> @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct >>>>>>> request_queue *q, >>>>>>> >>>>>>> rq = blk_mq_rq_ctx_init(data, tag, op); >>>>>>> if (!op_is_flush(op)) { >>>>>>> - rq->elv.icq = NULL; >>>>>>> + memset(>elv, 0, sizeof(rq->elv)); >>>>>>> if (e && e->type->ops.mq.prepare_request) { >>>>>>> if (e->type->icq_cache && rq_ioc(bio)) >>>>>>> blk_mq_sched_assign_ioc(rq, bio); >>>>>>> @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq) >>>>>>> e->type->ops.mq.finish_request(rq); >>>>>>> if (rq->elv.icq) { >>>>>>> put_io_context(rq->elv.icq->ioc); >>>>>>> - rq->elv.icq = NULL; >>>>>>> + memset(>elv, 0, sizeof(rq->elv)); >>>>>>> } >>>>>>> } >>>>>> >>>>>> This looks like a BFQ problem, this should not be necessary. Paolo, >>>>>> you're calling your own prepare request handler from the insert >>>>>> as well, and your prepare request does nothing if rq->elv.icq == NULL. >>>>> >>>>> I sent the patch anyway, since it's kind of a robustness improvement, >>>>> I'd hope. If you fix BFQ also, please add: >>>> >>>> It's also a memset() in the hot path, would prefer to avoid that... 
>>>> The issue here is really the convoluted bfq usage of insert/prepare, >>>> I'm sure Paolo can take it from here. >>> >> >> Hi, >> I'm very sorry for tuning in very late, but, at the same time, very >> glad to find the problem probably already solved ;) (in this respect, I >> swear, >> my delay was not intentional) >> >>> Does this fix it? >>> >>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c >>> index f0ecd98509d8..d883469a1582 100644 >>> --- a/block/bfq-iosched.c >>> +++ b/block/bfq-iosched.c >>> @@ -4934,8 +4934,11 @@ static void bfq_prepare_request(struct request *rq, >>> struct bio *bio) >>> bool new_queue = false; >>> bool bfqq_already_existing = false, split = false; >>> >>> - if (!rq->elv.icq) >>> + if (!rq->elv.icq) { >>> + rq->elv.priv[0] = rq->elv.priv[1] = NULL; >>> return; >>> + } >>> + >> >> This does solve the problem at hand. But it also arouses a question, >> related to a possible subtle bug. >> >> For BFQ, !rq->elv.icq basically means "this request is not for me, as >> I am an icq-based scheduler". But, IIUC the main points in this >> thread, then this assumption is false. If it is actually false, then >> I hope that all requests with !rq->elv.icq that are sent to BFQ do >> verify the condition (at_head || blk_rq_is_passthrough(rq)). In fact, >> requests that do not verify that condition are those that BFQ must put >> in a bfq_queue. So, even if this patch makes the crash disappear, we >> drive BFQ completel
Re: [PATCH] blkcg: not hold blkcg lock when deactivating policy.
> Il giorno 18 apr 2018, alle ore 11:18, jiang.bi...@zte.com.cn ha scritto: > > Hi, >>> Il giorno 17 apr 2018, alle ore 09:10, Jiang Biao <jiang.bi...@zte.com.cn> >>> ha scritto: >>> >>> As described in the comment of blkcg_activate_policy(), >>> *Update of each blkg is protected by both queue and blkcg locks so >>> that holding either lock and testing blkcg_policy_enabled() is >>> always enough for dereferencing policy data.* >>> with queue lock held, there is no need to hold blkcg lock in >>> blkcg_deactivate_policy(). Similar case is in >>> blkcg_activate_policy(), which has removed holding of blkcg lock in >>> commit 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23. >>> >> >> Hi, >> by chance, did you check whether this may cause problems with bfq, >> being the latter not protected by the queue lock as cfq? > Checked the bfq code, bfq seems never used blkcg lock directly, and > update of blkg in the common code is protected by both queue and > blkcg locks, so IMHO this patch would not introduce any new problem > with bfq, even though bfq is not protected by queue lock. Thank you very much for checking. Then: Acked-by: Paolo Valente <paolo.vale...@linaro.org> Thanks, Paolo > On the other hand, the locks (queue lock/blkcg lock) used to protect > the update of blkg seems a bit too heavyweight, especially the queue lock > which is used too widely may cause races with other contexts. I wonder > if there is any way to ease the case? e.g. add a new lock for blkg's own.:) > > Jiang, > Regards
Re: [PATCH] blkcg: not hold blkcg lock when deactivating policy.
> Il giorno 18 apr 2018, alle ore 11:18, jiang.bi...@zte.com.cn ha scritto: > > Hi, >>> Il giorno 17 apr 2018, alle ore 09:10, Jiang Biao >>> ha scritto: >>> >>> As described in the comment of blkcg_activate_policy(), >>> *Update of each blkg is protected by both queue and blkcg locks so >>> that holding either lock and testing blkcg_policy_enabled() is >>> always enough for dereferencing policy data.* >>> with queue lock held, there is no need to hold blkcg lock in >>> blkcg_deactivate_policy(). Similar case is in >>> blkcg_activate_policy(), which has removed holding of blkcg lock in >>> commit 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23. >>> >> >> Hi, >> by chance, did you check whether this may cause problems with bfq, >> being the latter not protected by the queue lock as cfq? > Checked the bfq code, bfq seems never used blkcg lock directly, and > update of blkg in the common code is protected by both queue and > blkcg locks, so IMHO this patch would not introduce any new problem > with bfq, even though bfq is not protected by queue lock. Thank you very much for checking. Then: Acked-by: Paolo Valente Thanks, Paolo > On the other hand, the locks (queue lock/blkcg lock) used to protect > the update of blkg seems a bit too heavyweight, especially the queue lock > which is used too widely may cause races with other contexts. I wonder > if there is any way to ease the case? e.g. add a new lock for blkg's own.:) > > Jiang, > Regards
Re: usercopy whitelist woe in scsi_sense_cache
> Il giorno 18 apr 2018, alle ore 00:57, Jens Axboeha > scritto: > > On 4/17/18 3:48 PM, Jens Axboe wrote: >> On 4/17/18 3:47 PM, Kees Cook wrote: >>> On Tue, Apr 17, 2018 at 2:39 PM, Jens Axboe wrote: On 4/17/18 3:25 PM, Kees Cook wrote: > On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook wrote: >> I see elv.priv[1] assignments made in a few places -- is it possible >> there is some kind of uninitialized-but-not-NULL state that can leak >> in there? > > Got it. This fixes it for me: > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 0dc9e341c2a7..859df3160303 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct > request_queue *q, > >rq = blk_mq_rq_ctx_init(data, tag, op); >if (!op_is_flush(op)) { > - rq->elv.icq = NULL; > + memset(>elv, 0, sizeof(rq->elv)); >if (e && e->type->ops.mq.prepare_request) { >if (e->type->icq_cache && rq_ioc(bio)) >blk_mq_sched_assign_ioc(rq, bio); > @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq) >e->type->ops.mq.finish_request(rq); >if (rq->elv.icq) { >put_io_context(rq->elv.icq->ioc); > - rq->elv.icq = NULL; > + memset(>elv, 0, sizeof(rq->elv)); >} >} This looks like a BFQ problem, this should not be necessary. Paolo, you're calling your own prepare request handler from the insert as well, and your prepare request does nothing if rq->elv.icq == NULL. >>> >>> I sent the patch anyway, since it's kind of a robustness improvement, >>> I'd hope. If you fix BFQ also, please add: >> >> It's also a memset() in the hot path, would prefer to avoid that... >> The issue here is really the convoluted bfq usage of insert/prepare, >> I'm sure Paolo can take it from here. > Hi, I'm very sorry for tuning in very late, but, at the same time, very glad to find the problem probably already solved ;) (in this respect, I swear, my delay was not intentional) > Does this fix it? 
> > diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c > index f0ecd98509d8..d883469a1582 100644 > --- a/block/bfq-iosched.c > +++ b/block/bfq-iosched.c > @@ -4934,8 +4934,11 @@ static void bfq_prepare_request(struct request *rq, > struct bio *bio) > bool new_queue = false; > bool bfqq_already_existing = false, split = false; > > - if (!rq->elv.icq) > + if (!rq->elv.icq) { > + rq->elv.priv[0] = rq->elv.priv[1] = NULL; > return; > + } > + This does solve the problem at hand. But it also arouses a question, related to a possible subtle bug. For BFQ, !rq->elv.icq basically means "this request is not for me, as I am an icq-based scheduler". But, IIUC the main points in this thread, then this assumption is false. If it is actually false, then I hope that all requests with !rq->elv.icq that are sent to BFQ do verify the condition (at_head || blk_rq_is_passthrough(rq)). In fact, requests that do not verify that condition are those that BFQ must put in a bfq_queue. So, even if this patch makes the crash disappear, we drive BFQ completely crazy (and we may expect other strange failures) if we send BFQ a request with !((at_head || blk_rq_is_passthrough(rq)) and !rq->elv.icq. BFQ has to put that rq into a bfq_queue, but simply cannot. Jens, or any other, could you please shed a light on this, and explain how things are exactly? Thanks, Paolo > bic = icq_to_bic(rq->elv.icq); > > spin_lock_irq(>lock); > > -- > Jens Axboe
Re: usercopy whitelist woe in scsi_sense_cache
> Il giorno 18 apr 2018, alle ore 00:57, Jens Axboe ha > scritto: > > On 4/17/18 3:48 PM, Jens Axboe wrote: >> On 4/17/18 3:47 PM, Kees Cook wrote: >>> On Tue, Apr 17, 2018 at 2:39 PM, Jens Axboe wrote: On 4/17/18 3:25 PM, Kees Cook wrote: > On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook wrote: >> I see elv.priv[1] assignments made in a few places -- is it possible >> there is some kind of uninitialized-but-not-NULL state that can leak >> in there? > > Got it. This fixes it for me: > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 0dc9e341c2a7..859df3160303 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct > request_queue *q, > >rq = blk_mq_rq_ctx_init(data, tag, op); >if (!op_is_flush(op)) { > - rq->elv.icq = NULL; > + memset(>elv, 0, sizeof(rq->elv)); >if (e && e->type->ops.mq.prepare_request) { >if (e->type->icq_cache && rq_ioc(bio)) >blk_mq_sched_assign_ioc(rq, bio); > @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq) >e->type->ops.mq.finish_request(rq); >if (rq->elv.icq) { >put_io_context(rq->elv.icq->ioc); > - rq->elv.icq = NULL; > + memset(>elv, 0, sizeof(rq->elv)); >} >} This looks like a BFQ problem, this should not be necessary. Paolo, you're calling your own prepare request handler from the insert as well, and your prepare request does nothing if rq->elv.icq == NULL. >>> >>> I sent the patch anyway, since it's kind of a robustness improvement, >>> I'd hope. If you fix BFQ also, please add: >> >> It's also a memset() in the hot path, would prefer to avoid that... >> The issue here is really the convoluted bfq usage of insert/prepare, >> I'm sure Paolo can take it from here. > Hi, I'm very sorry for tuning in very late, but, at the same time, very glad to find the problem probably already solved ;) (in this respect, I swear, my delay was not intentional) > Does this fix it? 
> > diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c > index f0ecd98509d8..d883469a1582 100644 > --- a/block/bfq-iosched.c > +++ b/block/bfq-iosched.c > @@ -4934,8 +4934,11 @@ static void bfq_prepare_request(struct request *rq, > struct bio *bio) > bool new_queue = false; > bool bfqq_already_existing = false, split = false; > > - if (!rq->elv.icq) > + if (!rq->elv.icq) { > + rq->elv.priv[0] = rq->elv.priv[1] = NULL; > return; > + } > + This does solve the problem at hand. But it also arouses a question, related to a possible subtle bug. For BFQ, !rq->elv.icq basically means "this request is not for me, as I am an icq-based scheduler". But, IIUC the main points in this thread, then this assumption is false. If it is actually false, then I hope that all requests with !rq->elv.icq that are sent to BFQ do verify the condition (at_head || blk_rq_is_passthrough(rq)). In fact, requests that do not verify that condition are those that BFQ must put in a bfq_queue. So, even if this patch makes the crash disappear, we drive BFQ completely crazy (and we may expect other strange failures) if we send BFQ a request with !(at_head || blk_rq_is_passthrough(rq)) and !rq->elv.icq. BFQ has to put that rq into a bfq_queue, but simply cannot. Jens, or any other, could you please shed a light on this, and explain how things are exactly? Thanks, Paolo > bic = icq_to_bic(rq->elv.icq); > > spin_lock_irq(&bfqd->lock); > > -- > Jens Axboe
Re: [PATCH] blk-mq: Clear out elevator private data
> Il giorno 17 apr 2018, alle ore 23:42, Kees Cook ha > scritto: > > Some elevators may not correctly check rq->rq_flags & RQF_ELVPRIV, and > may attempt to read rq->elv fields. When requests got reused, this > caused BFQ to think it already had a bfqq (rq->elv.priv[1]) allocated. Hi Kees, where does BFQ get confused and operate on a request not destined to it? I'm asking because I paid attention to always avoid such a mistake. Thanks, Paolo > This could lead to odd behaviors like having the sense buffer address > slowly start incrementing. This eventually tripped HARDENED_USERCOPY > and KASAN. > > This patch wipes all of rq->elv instead of just rq->elv.icq. While > it shouldn't technically be needed, this ends up being a robustness > improvement that should lead to at least finding bugs in elevators faster. > > Reported-by: Oleksandr Natalenko > Fixes: bd166ef183c26 ("blk-mq-sched: add framework for MQ capable IO > schedulers") > Cc: sta...@vger.kernel.org > Signed-off-by: Kees Cook > --- > In theory, BFQ needs to also check the RQF_ELVPRIV flag, but I'll leave that > to Paolo to figure out. Also, my Fixes line is kind of a best-guess. This > is where icq was originally wiped, so it seemed as good a commit as any. 
> --- > block/blk-mq.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 0dc9e341c2a7..859df3160303 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct > request_queue *q, > > rq = blk_mq_rq_ctx_init(data, tag, op); > if (!op_is_flush(op)) { > - rq->elv.icq = NULL; > + memset(&rq->elv, 0, sizeof(rq->elv)); > if (e && e->type->ops.mq.prepare_request) { > if (e->type->icq_cache && rq_ioc(bio)) > blk_mq_sched_assign_ioc(rq, bio); > @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq) > e->type->ops.mq.finish_request(rq); > if (rq->elv.icq) { > put_io_context(rq->elv.icq->ioc); > - rq->elv.icq = NULL; > + memset(&rq->elv, 0, sizeof(rq->elv)); > } > } > > -- > 2.7.4 > > > -- > Kees Cook > Pixel Security
Re: [PATCH] blk-mq: Clear out elevator private data
> Il giorno 17 apr 2018, alle ore 23:42, Kees Cook ha > scritto: > > Some elevators may not correctly check rq->rq_flags & RQF_ELVPRIV, and > may attempt to read rq->elv fields. When requests got reused, this > caused BFQ to think it already had a bfqq (rq->elv.priv[1]) allocated. Hi Kees, where does BFQ gets confused and operates on a request not destined to it? I'm asking because I paid attention to always avoid such a mistake. Thanks, Paolo > This could lead to odd behaviors like having the sense buffer address > slowly start incrementing. This eventually tripped HARDENED_USERCOPY > and KASAN. > > This patch wipes all of rq->elv instead of just rq->elv.icq. While > it shouldn't technically be needed, this ends up being a robustness > improvement that should lead to at least finding bugs in elevators faster. > > Reported-by: Oleksandr Natalenko > Fixes: bd166ef183c26 ("blk-mq-sched: add framework for MQ capable IO > schedulers") > Cc: sta...@vger.kernel.org > Signed-off-by: Kees Cook > --- > In theory, BFQ needs to also check the RQF_ELVPRIV flag, but I'll leave that > to Paolo to figure out. Also, my Fixes line is kind of a best-guess. This > is where icq was originally wiped, so it seemed as good a commit as any. 
> --- > block/blk-mq.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 0dc9e341c2a7..859df3160303 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct > request_queue *q, > > rq = blk_mq_rq_ctx_init(data, tag, op); > if (!op_is_flush(op)) { > - rq->elv.icq = NULL; > + memset(>elv, 0, sizeof(rq->elv)); > if (e && e->type->ops.mq.prepare_request) { > if (e->type->icq_cache && rq_ioc(bio)) > blk_mq_sched_assign_ioc(rq, bio); > @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq) > e->type->ops.mq.finish_request(rq); > if (rq->elv.icq) { > put_io_context(rq->elv.icq->ioc); > - rq->elv.icq = NULL; > + memset(>elv, 0, sizeof(rq->elv)); > } > } > > -- > 2.7.4 > > > -- > Kees Cook > Pixel Security
Re: [PATCH] blkcg: not hold blkcg lock when deactivating policy.
> Il giorno 17 apr 2018, alle ore 09:10, Jiang Biao ha > scritto: > > As described in the comment of blkcg_activate_policy(), > *Update of each blkg is protected by both queue and blkcg locks so > that holding either lock and testing blkcg_policy_enabled() is > always enough for dereferencing policy data.* > with queue lock held, there is no need to hold blkcg lock in > blkcg_deactivate_policy(). Similar case is in > blkcg_activate_policy(), which has removed holding of blkcg lock in > commit 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23. > Hi, by chance, did you check whether this may cause problems with bfq, being the latter not protected by the queue lock as cfq? Thanks, Paolo > Signed-off-by: Jiang Biao > Signed-off-by: Wen Yang > CC: Tejun Heo > CC: Jens Axboe > --- > block/blk-cgroup.c | 5 - > 1 file changed, 5 deletions(-) > > diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c > index c2033a2..2b7f8d0 100644 > --- a/block/blk-cgroup.c > +++ b/block/blk-cgroup.c > @@ -1367,17 +1367,12 @@ void blkcg_deactivate_policy(struct request_queue *q, > __clear_bit(pol->plid, q->blkcg_pols); > > list_for_each_entry(blkg, &q->blkg_list, q_node) { > - /* grab blkcg lock too while removing @pd from @blkg */ > - spin_lock(&blkg->blkcg->lock); > - > if (blkg->pd[pol->plid]) { > if (pol->pd_offline_fn) > pol->pd_offline_fn(blkg->pd[pol->plid]); > pol->pd_free_fn(blkg->pd[pol->plid]); > blkg->pd[pol->plid] = NULL; > } > - > - spin_unlock(&blkg->blkcg->lock); > } > > spin_unlock_irq(q->queue_lock); > -- > 2.7.4 >
Re: [PATCH] blkcg: not hold blkcg lock when deactivating policy.
> Il giorno 17 apr 2018, alle ore 09:10, Jiang Biao ha > scritto: > > As described in the comment of blkcg_activate_policy(), > *Update of each blkg is protected by both queue and blkcg locks so > that holding either lock and testing blkcg_policy_enabled() is > always enough for dereferencing policy data.* > with queue lock held, there is no need to hold blkcg lock in > blkcg_deactivate_policy(). Similar case is in > blkcg_activate_policy(), which has removed holding of blkcg lock in > commit 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23. > Hi, by chance, did you check whether this may cause problems with bfq, being the latter not protected by the queue lock as cfq? Thanks, Paolo > Signed-off-by: Jiang Biao > Signed-off-by: Wen Yang > CC: Tejun Heo > CC: Jens Axboe > --- > block/blk-cgroup.c | 5 - > 1 file changed, 5 deletions(-) > > diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c > index c2033a2..2b7f8d0 100644 > --- a/block/blk-cgroup.c > +++ b/block/blk-cgroup.c > @@ -1367,17 +1367,12 @@ void blkcg_deactivate_policy(struct request_queue *q, > __clear_bit(pol->plid, q->blkcg_pols); > > list_for_each_entry(blkg, >blkg_list, q_node) { > - /* grab blkcg lock too while removing @pd from @blkg */ > - spin_lock(>blkcg->lock); > - > if (blkg->pd[pol->plid]) { > if (pol->pd_offline_fn) > pol->pd_offline_fn(blkg->pd[pol->plid]); > pol->pd_free_fn(blkg->pd[pol->plid]); > blkg->pd[pol->plid] = NULL; > } > - > - spin_unlock(>blkcg->lock); > } > > spin_unlock_irq(q->queue_lock); > -- > 2.7.4 >
Re: General protection fault with use_blk_mq=1.
> Il giorno 29 mar 2018, alle ore 05:22, Jens Axboeha > scritto: > > On 3/28/18 9:13 PM, Zephaniah E. Loss-Cutler-Hull wrote: >> On 03/28/2018 06:02 PM, Jens Axboe wrote: >>> On 3/28/18 5:03 PM, Zephaniah E. Loss-Cutler-Hull wrote: I am not subscribed to any of the lists on the To list here, please CC me on any replies. I am encountering a fairly consistent crash anywhere from 15 minutes to 12 hours after boot with scsi_mod.use_blk_mq=1 dm_mod.use_blk_mq=1> The crash looks like: >> Looking through the code, I'd guess that this is dying inside blkg_rwstat_add, which calls percpu_counter_add_batch, which is what RIP is pointing at. >>> >>> Leaving the whole thing here for Paolo - it's crashing off insertion of >>> a request coming out of SG_IO. Don't think we've seen this BFQ failure >>> case before. >>> >>> You can mitigate this by switching the scsi-mq devices to mq-deadline >>> instead. >>> >> >> I'm thinking that I should also be able to mitigate it by disabling >> CONFIG_DEBUG_BLK_CGROUP. >> >> That should remove that entire chunk of code. >> >> Of course, that won't help if this is actually a symptom of a bigger >> problem. > > Yes, it's not a given that it will fully mask the issue at hand. But > turning off BFQ has a much higher chance of working for you. > > This time actually CC'ing Paolo. > Hi Zephaniah, if you are actually interested in the benefits of BFQ (low latency, high responsiveness, fairness, ...) then it may be worth to try what you yourself suggest: disabling CONFIG_DEBUG_BLK_CGROUP. Also because this option activates the heavy computation of debug cgroup statistics, which probably you don't use. 
In addition, the outcome of your attempt without CONFIG_DEBUG_BLK_CGROUP would give us useful bisection information: - if no failure occurs, then the issue is likely to be confined in that debugging code (which, on the bright side, is likely to be of occasional interest, for only a handful of developers) - if the issue still shows up, then we may have new hints on this odd failure Finally, consider that this issue has been reported to disappear from 4.16 [1], and, as a plus, that the service quality of BFQ had a further boost exactly from 4.16. Looking forward to your feedback, in case you try BFQ without CONFIG_DEBUG_BLK_CGROUP, Paolo [1] https://www.spinics.net/lists/linux-block/msg21422.html > > -- > Jens Axboe
Re: General protection fault with use_blk_mq=1.
> Il giorno 29 mar 2018, alle ore 05:22, Jens Axboe ha > scritto: > > On 3/28/18 9:13 PM, Zephaniah E. Loss-Cutler-Hull wrote: >> On 03/28/2018 06:02 PM, Jens Axboe wrote: >>> On 3/28/18 5:03 PM, Zephaniah E. Loss-Cutler-Hull wrote: I am not subscribed to any of the lists on the To list here, please CC me on any replies. I am encountering a fairly consistent crash anywhere from 15 minutes to 12 hours after boot with scsi_mod.use_blk_mq=1 dm_mod.use_blk_mq=1> The crash looks like: >> Looking through the code, I'd guess that this is dying inside blkg_rwstat_add, which calls percpu_counter_add_batch, which is what RIP is pointing at. >>> >>> Leaving the whole thing here for Paolo - it's crashing off insertion of >>> a request coming out of SG_IO. Don't think we've seen this BFQ failure >>> case before. >>> >>> You can mitigate this by switching the scsi-mq devices to mq-deadline >>> instead. >>> >> >> I'm thinking that I should also be able to mitigate it by disabling >> CONFIG_DEBUG_BLK_CGROUP. >> >> That should remove that entire chunk of code. >> >> Of course, that won't help if this is actually a symptom of a bigger >> problem. > > Yes, it's not a given that it will fully mask the issue at hand. But > turning off BFQ has a much higher chance of working for you. > > This time actually CC'ing Paolo. > Hi Zephaniah, if you are actually interested in the benefits of BFQ (low latency, high responsiveness, fairness, ...) then it may be worth to try what you yourself suggest: disabling CONFIG_DEBUG_BLK_CGROUP. Also because this option activates the heavy computation of debug cgroup statistics, which probably you don't use. 
In addition, the outcome of your attempt without CONFIG_DEBUG_BLK_CGROUP would give us useful bisection information: - if no failure occurs, then the issue is likely to be confined in that debugging code (which, on the bright side, is likely to be of occasional interest, for only a handful of developers) - if the issue still shows up, then we may have new hints on this odd failure Finally, consider that this issue has been reported to disappear from 4.16 [1], and, as a plus, that the service quality of BFQ had a further boost exactly from 4.16. Looking forward to your feedback, in case you try BFQ without CONFIG_DEBUG_BLK_CGROUP, Paolo [1] https://www.spinics.net/lists/linux-block/msg21422.html > > -- > Jens Axboe
Re: General protection fault with use_blk_mq=1.
> Il giorno 29 mar 2018, alle ore 03:02, Jens Axboeha > scritto: > > On 3/28/18 5:03 PM, Zephaniah E. Loss-Cutler-Hull wrote: >> I am not subscribed to any of the lists on the To list here, please CC >> me on any replies. >> >> I am encountering a fairly consistent crash anywhere from 15 minutes to >> 12 hours after boot with scsi_mod.use_blk_mq=1 dm_mod.use_blk_mq=1> >> The crash looks like: >> >> [ 5466.075993] general protection fault: [#1] PREEMPT SMP PTI >> [ 5466.075997] Modules linked in: esp4 xfrm4_mode_tunnel fuse usblp >> uvcvideo pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) >> ip6table_filter ip6_tables xt_tcpudp nf_conntrack_ipv4 nf_defrag_ipv4 >> xt_conntrack nf_conntrack iptable_filter ip_tables x_tables intel_rapl >> joydev serio_raw wmi_bmof iwldvm iwlwifi shpchp kvm_intel kvm irqbypass >> autofs4 algif_skcipher nls_iso8859_1 nls_cp437 crc32_pclmul >> ghash_clmulni_intel >> [ 5466.076022] CPU: 3 PID: 10573 Comm: pool Tainted: G O >> 4.15.13-f1-dirty #148 >> [ 5466.076024] Hardware name: Hewlett-Packard HP EliteBook Folio >> 9470m/18DF, BIOS 68IBD Ver. F.44 05/22/2013 >> [ 5466.076029] RIP: 0010:percpu_counter_add_batch+0x2b/0xb0 >> [ 5466.076031] RSP: 0018:a556c47afb58 EFLAGS: 00010002 >> [ 5466.076033] RAX: 95cda87ce018 RBX: 95cda87cdb68 RCX: >> >> [ 5466.076034] RDX: 3fff RSI: 896495c4 RDI: >> 895b2bed >> [ 5466.076036] RBP: 3fff R08: R09: >> 95cb7d5f8148 >> [ 5466.076037] R10: 0200 R11: R12: >> 0001 >> [ 5466.076038] R13: 95cda87ce088 R14: 95cda6ebd100 R15: >> a556c47afc58 >> [ 5466.076040] FS: 7f25f5305700() GS:95cdbeac() >> knlGS: >> [ 5466.076042] CS: 0010 DS: ES: CR0: 80050033 >> [ 5466.076043] CR2: 7f25e807e0a8 CR3: 0003ed5a6001 CR4: >> 001606e0 >> [ 5466.076044] Call Trace: >> [ 5466.076050] bfqg_stats_update_io_add+0x58/0x100 >> [ 5466.076055] bfq_insert_requests+0xec/0xd80 >> [ 5466.076059] ? blk_rq_append_bio+0x8f/0xa0 >> [ 5466.076061] ? 
blk_rq_map_user_iov+0xc3/0x1d0 >> [ 5466.076065] blk_mq_sched_insert_request+0xa3/0x130 >> [ 5466.076068] blk_execute_rq+0x3a/0x50 >> [ 5466.076070] sg_io+0x197/0x3e0 >> [ 5466.076073] ? dput+0xca/0x210 >> [ 5466.076077] ? mntput_no_expire+0x11/0x1a0 >> [ 5466.076079] scsi_cmd_ioctl+0x289/0x400 >> [ 5466.076082] ? filename_lookup+0xe1/0x170 >> [ 5466.076085] sd_ioctl+0xc7/0x1a0 >> [ 5466.076088] blkdev_ioctl+0x4d4/0x8c0 >> [ 5466.076091] block_ioctl+0x39/0x40 >> [ 5466.076094] do_vfs_ioctl+0x92/0x5e0 >> [ 5466.076097] ? __fget+0x73/0xc0 >> [ 5466.076099] SyS_ioctl+0x74/0x80 >> [ 5466.076102] do_syscall_64+0x60/0x110 >> [ 5466.076106] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 >> [ 5466.076109] RIP: 0033:0x7f25f75fef47 >> [ 5466.076110] RSP: 002b:7f25f53049a8 EFLAGS: 0246 ORIG_RAX: >> 0010 >> [ 5466.076112] RAX: ffda RBX: 000c RCX: >> 7f25f75fef47 >> [ 5466.076114] RDX: 7f25f53049b0 RSI: 2285 RDI: >> 000c >> [ 5466.076115] RBP: 0010 R08: 7f25e8007818 R09: >> 0200 >> [ 5466.076116] R10: 0001 R11: 0246 R12: >> >> [ 5466.076118] R13: R14: 7f25f8a6b5e0 R15: >> 7f25e80173e0 >> [ 5466.076120] Code: 41 55 49 89 fd bf 01 00 00 00 41 54 49 89 f4 55 89 >> d5 53 e8 18 e1 bb ff 48 c7 c7 c4 95 64 89 e8 dc e9 fb ff 49 8b 45 20 48 >> 63 d5 <65> 8b 18 48 63 db 4c 01 e3 48 39 d3 7d 0a f7 dd 48 63 ed 48 39 >> [ 5466.076147] RIP: percpu_counter_add_batch+0x2b/0xb0 RSP: a556c47afb58 >> [ 5466.076149] ---[ end trace 8d7eb80aafef4494 ]--- >> [ 5466.670153] note: pool[10573] exited with preempt_count 2 >> >> (I only have the one instance right this minute as a result of not >> having remote syslog setup before now.) >> >> This is clearly deep in the blk_mq code, and it goes away when I remove >> the use_blk_mq kernel command line parameters. >> >> My next obvious step is to try and disable the load of the vbox modules. >> >> I can include the full dmesg output if it would be helpful. 
>> >> The system is an older HP Ultrabook, and the root partition is, sda1 (a >> SSD) -> a LUKS encrypted partition -> LVM -> BTRFS. >> >> The kernel is a stock 4.15.11, however I only recently added the blk_mq >> options, so while I can state that I have seen this on multiple kernels >> in the 4.15.x series, I have not tested earlier kernels in this >> configuration. >> >> Looking through the code, I'd guess that this is dying inside >> blkg_rwstat_add, which calls percpu_counter_add_batch, which is what RIP >> is pointing at. > > Leaving the whole thing here for Paolo - it's crashing off insertion of > a request coming out of SG_IO. Don't think we've seen this BFQ failure > case before. > Actually, we have. Found and
Re: General protection fault with use_blk_mq=1.
> Il giorno 29 mar 2018, alle ore 03:02, Jens Axboe ha > scritto: > > On 3/28/18 5:03 PM, Zephaniah E. Loss-Cutler-Hull wrote: >> I am not subscribed to any of the lists on the To list here, please CC >> me on any replies. >> >> I am encountering a fairly consistent crash anywhere from 15 minutes to >> 12 hours after boot with scsi_mod.use_blk_mq=1 dm_mod.use_blk_mq=1> >> The crash looks like: >> >> [ 5466.075993] general protection fault: [#1] PREEMPT SMP PTI >> [ 5466.075997] Modules linked in: esp4 xfrm4_mode_tunnel fuse usblp >> uvcvideo pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) >> ip6table_filter ip6_tables xt_tcpudp nf_conntrack_ipv4 nf_defrag_ipv4 >> xt_conntrack nf_conntrack iptable_filter ip_tables x_tables intel_rapl >> joydev serio_raw wmi_bmof iwldvm iwlwifi shpchp kvm_intel kvm irqbypass >> autofs4 algif_skcipher nls_iso8859_1 nls_cp437 crc32_pclmul >> ghash_clmulni_intel >> [ 5466.076022] CPU: 3 PID: 10573 Comm: pool Tainted: G O >> 4.15.13-f1-dirty #148 >> [ 5466.076024] Hardware name: Hewlett-Packard HP EliteBook Folio >> 9470m/18DF, BIOS 68IBD Ver. F.44 05/22/2013 >> [ 5466.076029] RIP: 0010:percpu_counter_add_batch+0x2b/0xb0 >> [ 5466.076031] RSP: 0018:a556c47afb58 EFLAGS: 00010002 >> [ 5466.076033] RAX: 95cda87ce018 RBX: 95cda87cdb68 RCX: >> >> [ 5466.076034] RDX: 3fff RSI: 896495c4 RDI: >> 895b2bed >> [ 5466.076036] RBP: 3fff R08: R09: >> 95cb7d5f8148 >> [ 5466.076037] R10: 0200 R11: R12: >> 0001 >> [ 5466.076038] R13: 95cda87ce088 R14: 95cda6ebd100 R15: >> a556c47afc58 >> [ 5466.076040] FS: 7f25f5305700() GS:95cdbeac() >> knlGS: >> [ 5466.076042] CS: 0010 DS: ES: CR0: 80050033 >> [ 5466.076043] CR2: 7f25e807e0a8 CR3: 0003ed5a6001 CR4: >> 001606e0 >> [ 5466.076044] Call Trace: >> [ 5466.076050] bfqg_stats_update_io_add+0x58/0x100 >> [ 5466.076055] bfq_insert_requests+0xec/0xd80 >> [ 5466.076059] ? blk_rq_append_bio+0x8f/0xa0 >> [ 5466.076061] ? 
blk_rq_map_user_iov+0xc3/0x1d0 >> [ 5466.076065] blk_mq_sched_insert_request+0xa3/0x130 >> [ 5466.076068] blk_execute_rq+0x3a/0x50 >> [ 5466.076070] sg_io+0x197/0x3e0 >> [ 5466.076073] ? dput+0xca/0x210 >> [ 5466.076077] ? mntput_no_expire+0x11/0x1a0 >> [ 5466.076079] scsi_cmd_ioctl+0x289/0x400 >> [ 5466.076082] ? filename_lookup+0xe1/0x170 >> [ 5466.076085] sd_ioctl+0xc7/0x1a0 >> [ 5466.076088] blkdev_ioctl+0x4d4/0x8c0 >> [ 5466.076091] block_ioctl+0x39/0x40 >> [ 5466.076094] do_vfs_ioctl+0x92/0x5e0 >> [ 5466.076097] ? __fget+0x73/0xc0 >> [ 5466.076099] SyS_ioctl+0x74/0x80 >> [ 5466.076102] do_syscall_64+0x60/0x110 >> [ 5466.076106] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 >> [ 5466.076109] RIP: 0033:0x7f25f75fef47 >> [ 5466.076110] RSP: 002b:7f25f53049a8 EFLAGS: 0246 ORIG_RAX: >> 0010 >> [ 5466.076112] RAX: ffda RBX: 000c RCX: >> 7f25f75fef47 >> [ 5466.076114] RDX: 7f25f53049b0 RSI: 2285 RDI: >> 000c >> [ 5466.076115] RBP: 0010 R08: 7f25e8007818 R09: >> 0200 >> [ 5466.076116] R10: 0001 R11: 0246 R12: >> >> [ 5466.076118] R13: R14: 7f25f8a6b5e0 R15: >> 7f25e80173e0 >> [ 5466.076120] Code: 41 55 49 89 fd bf 01 00 00 00 41 54 49 89 f4 55 89 >> d5 53 e8 18 e1 bb ff 48 c7 c7 c4 95 64 89 e8 dc e9 fb ff 49 8b 45 20 48 >> 63 d5 <65> 8b 18 48 63 db 4c 01 e3 48 39 d3 7d 0a f7 dd 48 63 ed 48 39 >> [ 5466.076147] RIP: percpu_counter_add_batch+0x2b/0xb0 RSP: a556c47afb58 >> [ 5466.076149] ---[ end trace 8d7eb80aafef4494 ]--- >> [ 5466.670153] note: pool[10573] exited with preempt_count 2 >> >> (I only have the one instance right this minute as a result of not >> having remote syslog setup before now.) >> >> This is clearly deep in the blk_mq code, and it goes away when I remove >> the use_blk_mq kernel command line parameters. >> >> My next obvious step is to try and disable the load of the vbox modules. >> >> I can include the full dmesg output if it would be helpful. 
>> >> The system is an older HP Ultrabook, and the root partition is, sda1 (a >> SSD) -> a LUKS encrypted partition -> LVM -> BTRFS. >> >> The kernel is a stock 4.15.11, however I only recently added the blk_mq >> options, so while I can state that I have seen this on multiple kernels >> in the 4.15.x series, I have not tested earlier kernels in this >> configuration. >> >> Looking through the code, I'd guess that this is dying inside >> blkg_rwstat_add, which calls percpu_counter_add_batch, which is what RIP >> is pointing at. > > Leaving the whole thing here for Paolo - it's crashing off insertion of > a request coming out of SG_IO. Don't think we've seen this BFQ failure > case before. > Actually, we have. Found and reported by Ming
[PATCH BUGFIX] block, bfq: lower-bound the estimated peak rate to 1
If a storage device handled by BFQ happens to be slower than 7.5 KB/s for a certain amount of time (in the order of a second), then the estimated peak rate of the device, maintained in BFQ, becomes equal to 0. The reason is the limited precision with which the rate is represented (details on the range of representable values in the comments introduced by this commit). This leads to a division-by-zero error where the estimated peak rate is used as divisor. Such a type of failure has been reported in [1]. This commit addresses this issue by: 1. Lower-bounding the estimated peak rate to 1 2. Adding and improving comments on the range of rates representable [1] https://www.spinics.net/lists/kernel/msg2739205.html Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru> Signed-off-by: Paolo Valente <paolo.vale...@linaro.org> --- block/bfq-iosched.c | 25 - block/bfq-iosched.h | 2 +- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index aeca22d91101..f0ecd98509d8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -201,7 +201,20 @@ static struct kmem_cache *bfq_pool; /* Target observation time interval for a peak-rate update (ns) */ #define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -/* Shift used for peak rate fixed precision calculations. */ +/* + * Shift used for peak-rate fixed precision calculations. 
+ * With + * - the current shift: 16 positions + * - the current type used to store rate: u32 + * - the current unit of measure for rate: [sectors/usec], or, more precisely, + * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, + * the range of rates that can be stored is + * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = + * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = + * [15, 65G] sectors/sec + * Which, assuming a sector size of 512B, corresponds to a range of + * [7.5K, 33T] B/sec + */ #define BFQ_RATE_SHIFT 16 /* @@ -2637,6 +2650,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) rate /= divisor; /* smoothing constant alpha = 1/divisor */ bfqd->peak_rate += rate; + + /* +* For a very slow device, bfqd->peak_rate can reach 0 (see +* the minimum representable values reported in the comments +* on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid +* divisions by zero where bfqd->peak_rate is used as a +* divisor. +*/ + bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); + update_thr_responsiveness_params(bfqd); reset_computation: diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 350c39ae2896..ae2f3dadec44 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -499,7 +499,7 @@ struct bfq_data { u64 delta_from_first; /* * Current estimate of the device peak rate, measured in -* [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by +* [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by * BFQ_RATE_SHIFT is performed to increase precision in * fixed-point calculations. */ -- 2.16.1
[PATCH BUGFIX] block, bfq: lower-bound the estimated peak rate to 1
If a storage device handled by BFQ happens to be slower than 7.5 KB/s for a certain amount of time (in the order of a second), then the estimated peak rate of the device, maintained in BFQ, becomes equal to 0. The reason is the limited precision with which the rate is represented (details on the range of representable values in the comments introduced by this commit). This leads to a division-by-zero error where the estimated peak rate is used as divisor. Such a type of failure has been reported in [1]. This commit addresses this issue by: 1. Lower-bounding the estimated peak rate to 1 2. Adding and improving comments on the range of rates representable [1] https://www.spinics.net/lists/kernel/msg2739205.html Signed-off-by: Konstantin Khlebnikov Signed-off-by: Paolo Valente --- block/bfq-iosched.c | 25 - block/bfq-iosched.h | 2 +- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index aeca22d91101..f0ecd98509d8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -201,7 +201,20 @@ static struct kmem_cache *bfq_pool; /* Target observation time interval for a peak-rate update (ns) */ #define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -/* Shift used for peak rate fixed precision calculations. */ +/* + * Shift used for peak-rate fixed precision calculations. 
+ * With + * - the current shift: 16 positions + * - the current type used to store rate: u32 + * - the current unit of measure for rate: [sectors/usec], or, more precisely, + * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, + * the range of rates that can be stored is + * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = + * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = + * [15, 65G] sectors/sec + * Which, assuming a sector size of 512B, corresponds to a range of + * [7.5K, 33T] B/sec + */ #define BFQ_RATE_SHIFT 16 /* @@ -2637,6 +2650,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) rate /= divisor; /* smoothing constant alpha = 1/divisor */ bfqd->peak_rate += rate; + + /* +* For a very slow device, bfqd->peak_rate can reach 0 (see +* the minimum representable values reported in the comments +* on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid +* divisions by zero where bfqd->peak_rate is used as a +* divisor. +*/ + bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); + update_thr_responsiveness_params(bfqd); reset_computation: diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 350c39ae2896..ae2f3dadec44 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -499,7 +499,7 @@ struct bfq_data { u64 delta_from_first; /* * Current estimate of the device peak rate, measured in -* [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by +* [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by * BFQ_RATE_SHIFT is performed to increase precision in * fixed-point calculations. */ -- 2.16.1
Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1
> Il giorno 26 mar 2018, alle ore 12:28, Konstantin Khlebnikov > <khlebni...@yandex-team.ru> ha scritto: > > > > On 26.03.2018 11:01, Paolo Valente wrote: >>> Il giorno 21 mar 2018, alle ore 00:49, Paolo Valente >>> <paolo.vale...@linaro.org> ha scritto: >>> >>> >>> >>>> Il giorno 20 mar 2018, alle ore 15:41, Konstantin Khlebnikov >>>> <khlebni...@yandex-team.ru> ha scritto: >>>> >>>> On 20.03.2018 06:00, Paolo Valente wrote: >>>>>> Il giorno 19 mar 2018, alle ore 14:28, Konstantin Khlebnikov >>>>>> <khlebni...@yandex-team.ru> ha scritto: >>>>>> >>>>>> On 19.03.2018 09:03, Paolo Valente wrote: >>>>>>>> Il giorno 05 mar 2018, alle ore 04:48, Konstantin Khlebnikov >>>>>>>> <khlebni...@yandex-team.ru> ha scritto: >>>>>>>> >>>>>>>> Rate should never overflow or become zero because it is used as >>>>>>>> divider. >>>>>>>> This patch accumulates it with saturation. >>>>>>>> >>>>>>>> Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru> >>>>>>>> --- >>>>>>>> block/bfq-iosched.c |8 +--- >>>>>>>> 1 file changed, 5 insertions(+), 3 deletions(-) >>>>>>>> >>>>>>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c >>>>>>>> index aeca22d91101..a236c8d541b5 100644 >>>>>>>> --- a/block/bfq-iosched.c >>>>>>>> +++ b/block/bfq-iosched.c >>>>>>>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct >>>>>>>> bfq_data *bfqd, >>>>>>>> >>>>>>>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct >>>>>>>> request *rq) >>>>>>>> { >>>>>>>> - u32 rate, weight, divisor; >>>>>>>> + u32 weight, divisor; >>>>>>>> + u64 rate; >>>>>>>> >>>>>>>>/* >>>>>>>> * For the convergence property to hold (see comments on >>>>>>>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct >>>>>>>> bfq_data *bfqd, struct request *rq) >>>>>>>> */ >>>>>>>>bfqd->peak_rate *= divisor-1; >>>>>>>>bfqd->peak_rate /= divisor; >>>>>>>> - rate /= divisor; /* smoothing constant alpha = 1/divisor */ >>>>>>>> + do_div(rate, divisor); /* smoothing constant alpha = 1/divisor 
>>>>>>>> */ >>>>>>>> >>>>>>>> - bfqd->peak_rate += rate; >>>>>>>> + /* rate should never overlow or become zero */ >>>>>>> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate >>>>>>> doesn't risk to be zero even if the variable 'rate' is zero here. >>>>>>> So I guess the reason why you consider the possibility that >>>>>>> bfqd->peak_rate becomes zero is because of an overflow when summing >>>>>>> 'rate'. But, according to my calculations, this should be impossible >>>>>>> with devices with sensible speeds. >>>>>>> These are the reasons why I decided I could make it with a 32-bit >>>>>>> variable, without any additional clamping. Did I make any mistake in my >>>>>>> evaluation? >>>>>> >>>>>> According to Murphy's law this is inevitable.. >>>>>> >>>>> Yep. Actually Murphy has been even clement this time, by making the >>>>> failure occur to a kernel expert :) >>>>>> I've seen couple division by zero crashes in bfq_wr_duration. >>>>>> Unfortunately logs weren't recorded. >>>>>> >>>>>>> Anyway, even if I made some mistake about the maximum possible value of >>>>>>> the device rate, and the latter may be too high for bfqd->peak_rate to >>>>>>> contain it, then I guess the right solution would not be to clamp the >>>>>>> actual rate to U32_MAX, but to mov
Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1
> Il giorno 26 mar 2018, alle ore 12:28, Konstantin Khlebnikov > ha scritto: > > > > On 26.03.2018 11:01, Paolo Valente wrote: >>> Il giorno 21 mar 2018, alle ore 00:49, Paolo Valente >>> ha scritto: >>> >>> >>> >>>> Il giorno 20 mar 2018, alle ore 15:41, Konstantin Khlebnikov >>>> ha scritto: >>>> >>>> On 20.03.2018 06:00, Paolo Valente wrote: >>>>>> Il giorno 19 mar 2018, alle ore 14:28, Konstantin Khlebnikov >>>>>> ha scritto: >>>>>> >>>>>> On 19.03.2018 09:03, Paolo Valente wrote: >>>>>>>> Il giorno 05 mar 2018, alle ore 04:48, Konstantin Khlebnikov >>>>>>>> ha scritto: >>>>>>>> >>>>>>>> Rate should never overflow or become zero because it is used as >>>>>>>> divider. >>>>>>>> This patch accumulates it with saturation. >>>>>>>> >>>>>>>> Signed-off-by: Konstantin Khlebnikov >>>>>>>> --- >>>>>>>> block/bfq-iosched.c |8 +--- >>>>>>>> 1 file changed, 5 insertions(+), 3 deletions(-) >>>>>>>> >>>>>>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c >>>>>>>> index aeca22d91101..a236c8d541b5 100644 >>>>>>>> --- a/block/bfq-iosched.c >>>>>>>> +++ b/block/bfq-iosched.c >>>>>>>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct >>>>>>>> bfq_data *bfqd, >>>>>>>> >>>>>>>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct >>>>>>>> request *rq) >>>>>>>> { >>>>>>>> - u32 rate, weight, divisor; >>>>>>>> + u32 weight, divisor; >>>>>>>> + u64 rate; >>>>>>>> >>>>>>>>/* >>>>>>>> * For the convergence property to hold (see comments on >>>>>>>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct >>>>>>>> bfq_data *bfqd, struct request *rq) >>>>>>>> */ >>>>>>>>bfqd->peak_rate *= divisor-1; >>>>>>>>bfqd->peak_rate /= divisor; >>>>>>>> - rate /= divisor; /* smoothing constant alpha = 1/divisor */ >>>>>>>> + do_div(rate, divisor); /* smoothing constant alpha = 1/divisor >>>>>>>> */ >>>>>>>> >>>>>>>> - bfqd->peak_rate += rate; >>>>>>>> + /* rate should never overlow or become zero */ >>>>>>> It is bfqd->peak_rate that is used as a 
divider, and bfqd->peak_rate >>>>>>> doesn't risk to be zero even if the variable 'rate' is zero here. >>>>>>> So I guess the reason why you consider the possibility that >>>>>>> bfqd->peak_rate becomes zero is because of an overflow when summing >>>>>>> 'rate'. But, according to my calculations, this should be impossible >>>>>>> with devices with sensible speeds. >>>>>>> These are the reasons why I decided I could make it with a 32-bit >>>>>>> variable, without any additional clamping. Did I make any mistake in my >>>>>>> evaluation? >>>>>> >>>>>> According to Murphy's law this is inevitable.. >>>>>> >>>>> Yep. Actually Murphy has been even clement this time, by making the >>>>> failure occur to a kernel expert :) >>>>>> I've seen couple division by zero crashes in bfq_wr_duration. >>>>>> Unfortunately logs weren't recorded. >>>>>> >>>>>>> Anyway, even if I made some mistake about the maximum possible value of >>>>>>> the device rate, and the latter may be too high for bfqd->peak_rate to >>>>>>> contain it, then I guess the right solution would not be to clamp the >>>>>>> actual rate to U32_MAX, but to move bfqd->peak_rate to 64 bits. Or am I >>>>>>> missing something else? >>>>>>>>> + bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate,
Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1
> Il giorno 21 mar 2018, alle ore 00:49, Paolo Valente > <paolo.vale...@linaro.org> ha scritto: > > > >> Il giorno 20 mar 2018, alle ore 15:41, Konstantin Khlebnikov >> <khlebni...@yandex-team.ru> ha scritto: >> >> On 20.03.2018 06:00, Paolo Valente wrote: >>>> Il giorno 19 mar 2018, alle ore 14:28, Konstantin Khlebnikov >>>> <khlebni...@yandex-team.ru> ha scritto: >>>> >>>> On 19.03.2018 09:03, Paolo Valente wrote: >>>>>> Il giorno 05 mar 2018, alle ore 04:48, Konstantin Khlebnikov >>>>>> <khlebni...@yandex-team.ru> ha scritto: >>>>>> >>>>>> Rate should never overflow or become zero because it is used as divider. >>>>>> This patch accumulates it with saturation. >>>>>> >>>>>> Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru> >>>>>> --- >>>>>> block/bfq-iosched.c |8 +--- >>>>>> 1 file changed, 5 insertions(+), 3 deletions(-) >>>>>> >>>>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c >>>>>> index aeca22d91101..a236c8d541b5 100644 >>>>>> --- a/block/bfq-iosched.c >>>>>> +++ b/block/bfq-iosched.c >>>>>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct >>>>>> bfq_data *bfqd, >>>>>> >>>>>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request >>>>>> *rq) >>>>>> { >>>>>> -u32 rate, weight, divisor; >>>>>> +u32 weight, divisor; >>>>>> +u64 rate; >>>>>> >>>>>> /* >>>>>> * For the convergence property to hold (see comments on >>>>>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data >>>>>> *bfqd, struct request *rq) >>>>>> */ >>>>>> bfqd->peak_rate *= divisor-1; >>>>>> bfqd->peak_rate /= divisor; >>>>>> -rate /= divisor; /* smoothing constant alpha = 1/divisor */ >>>>>> +do_div(rate, divisor); /* smoothing constant alpha = 1/divisor >>>>>> */ >>>>>> >>>>>> -bfqd->peak_rate += rate; >>>>>> +/* rate should never overlow or become zero */ >>>>> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate >>>>> doesn't risk to be zero even if the variable 'rate' is zero here. 
>>>>> So I guess the reason why you consider the possibility that >>>>> bfqd->peak_rate becomes zero is because of an overflow when summing >>>>> 'rate'. But, according to my calculations, this should be impossible with >>>>> devices with sensible speeds. >>>>> These are the reasons why I decided I could make it with a 32-bit >>>>> variable, without any additional clamping. Did I make any mistake in my >>>>> evaluation? >>>> >>>> According to Murphy's law this is inevitable.. >>>> >>> Yep. Actually Murphy has been even clement this time, by making the >>> failure occur to a kernel expert :) >>>> I've seen couple division by zero crashes in bfq_wr_duration. >>>> Unfortunately logs weren't recorded. >>>> >>>>> Anyway, even if I made some mistake about the maximum possible value of >>>>> the device rate, and the latter may be too high for bfqd->peak_rate to >>>>> contain it, then I guess the right solution would not be to clamp the >>>>> actual rate to U32_MAX, but to move bfqd->peak_rate to 64 bits. Or am I >>>>> missing something else? >>>>>>> + bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, >>>>>>> U32_MAX); >>>> >>>> 32-bit should be enough and better for division. >>>> My patch makes sure it never overflows/underflows. >>>> That's cheaper than full 64-bit/64-bit division. >>>> Anyway 64-bit speed could overflow too. =) >>>> >>> I see your point. Still, if the mistake is not in sizing, then you >>> bumped into some odd bug. In this respect, I don't like much the idea >>> of sweeping the dust under the carpet. So, let me ask you for a >>> little bit more help. With
Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1
> Il giorno 21 mar 2018, alle ore 00:49, Paolo Valente > ha scritto: > > > >> Il giorno 20 mar 2018, alle ore 15:41, Konstantin Khlebnikov >> ha scritto: >> >> On 20.03.2018 06:00, Paolo Valente wrote: >>>> Il giorno 19 mar 2018, alle ore 14:28, Konstantin Khlebnikov >>>> ha scritto: >>>> >>>> On 19.03.2018 09:03, Paolo Valente wrote: >>>>>> Il giorno 05 mar 2018, alle ore 04:48, Konstantin Khlebnikov >>>>>> ha scritto: >>>>>> >>>>>> Rate should never overflow or become zero because it is used as divider. >>>>>> This patch accumulates it with saturation. >>>>>> >>>>>> Signed-off-by: Konstantin Khlebnikov >>>>>> --- >>>>>> block/bfq-iosched.c |8 +--- >>>>>> 1 file changed, 5 insertions(+), 3 deletions(-) >>>>>> >>>>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c >>>>>> index aeca22d91101..a236c8d541b5 100644 >>>>>> --- a/block/bfq-iosched.c >>>>>> +++ b/block/bfq-iosched.c >>>>>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct >>>>>> bfq_data *bfqd, >>>>>> >>>>>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request >>>>>> *rq) >>>>>> { >>>>>> -u32 rate, weight, divisor; >>>>>> +u32 weight, divisor; >>>>>> +u64 rate; >>>>>> >>>>>> /* >>>>>> * For the convergence property to hold (see comments on >>>>>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data >>>>>> *bfqd, struct request *rq) >>>>>> */ >>>>>> bfqd->peak_rate *= divisor-1; >>>>>> bfqd->peak_rate /= divisor; >>>>>> -rate /= divisor; /* smoothing constant alpha = 1/divisor */ >>>>>> +do_div(rate, divisor); /* smoothing constant alpha = 1/divisor >>>>>> */ >>>>>> >>>>>> -bfqd->peak_rate += rate; >>>>>> +/* rate should never overlow or become zero */ >>>>> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate >>>>> doesn't risk to be zero even if the variable 'rate' is zero here. 
>>>>> So I guess the reason why you consider the possibility that >>>>> bfqd->peak_rate becomes zero is because of an overflow when summing >>>>> 'rate'. But, according to my calculations, this should be impossible with >>>>> devices with sensible speeds. >>>>> These are the reasons why I decided I could make it with a 32-bit >>>>> variable, without any additional clamping. Did I make any mistake in my >>>>> evaluation? >>>> >>>> According to Murphy's law this is inevitable.. >>>> >>> Yep. Actually Murphy has been even clement this time, by making the >>> failure occur to a kernel expert :) >>>> I've seen couple division by zero crashes in bfq_wr_duration. >>>> Unfortunately logs weren't recorded. >>>> >>>>> Anyway, even if I made some mistake about the maximum possible value of >>>>> the device rate, and the latter may be too high for bfqd->peak_rate to >>>>> contain it, then I guess the right solution would not be to clamp the >>>>> actual rate to U32_MAX, but to move bfqd->peak_rate to 64 bits. Or am I >>>>> missing something else? >>>>>>> + bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, >>>>>>> U32_MAX); >>>> >>>> 32-bit should be enough and better for division. >>>> My patch makes sure it never overflows/underflows. >>>> That's cheaper than full 64-bit/64-bit division. >>>> Anyway 64-bit speed could overflow too. =) >>>> >>> I see your point. Still, if the mistake is not in sizing, then you >>> bumped into some odd bug. In this respect, I don't like much the idea >>> of sweeping the dust under the carpet. So, let me ask you for a >>> little bit more help. With your patch applied, and thus with no risk >>> of crashes, what about adding, right before your clamp_t, something >>> like: >>> if (!bfqd->
Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1
> Il giorno 20 mar 2018, alle ore 15:41, Konstantin Khlebnikov > <khlebni...@yandex-team.ru> ha scritto: > > On 20.03.2018 06:00, Paolo Valente wrote: >>> Il giorno 19 mar 2018, alle ore 14:28, Konstantin Khlebnikov >>> <khlebni...@yandex-team.ru> ha scritto: >>> >>> On 19.03.2018 09:03, Paolo Valente wrote: >>>>> Il giorno 05 mar 2018, alle ore 04:48, Konstantin Khlebnikov >>>>> <khlebni...@yandex-team.ru> ha scritto: >>>>> >>>>> Rate should never overflow or become zero because it is used as divider. >>>>> This patch accumulates it with saturation. >>>>> >>>>> Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru> >>>>> --- >>>>> block/bfq-iosched.c |8 +--- >>>>> 1 file changed, 5 insertions(+), 3 deletions(-) >>>>> >>>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c >>>>> index aeca22d91101..a236c8d541b5 100644 >>>>> --- a/block/bfq-iosched.c >>>>> +++ b/block/bfq-iosched.c >>>>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct >>>>> bfq_data *bfqd, >>>>> >>>>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request >>>>> *rq) >>>>> { >>>>> - u32 rate, weight, divisor; >>>>> + u32 weight, divisor; >>>>> + u64 rate; >>>>> >>>>> /* >>>>>* For the convergence property to hold (see comments on >>>>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data >>>>> *bfqd, struct request *rq) >>>>>*/ >>>>> bfqd->peak_rate *= divisor-1; >>>>> bfqd->peak_rate /= divisor; >>>>> - rate /= divisor; /* smoothing constant alpha = 1/divisor */ >>>>> + do_div(rate, divisor); /* smoothing constant alpha = 1/divisor */ >>>>> >>>>> - bfqd->peak_rate += rate; >>>>> + /* rate should never overlow or become zero */ >>>> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate >>>> doesn't risk to be zero even if the variable 'rate' is zero here. >>>> So I guess the reason why you consider the possibility that >>>> bfqd->peak_rate becomes zero is because of an overflow when summing >>>> 'rate'. 
But, according to my calculations, this should be impossible with >>>> devices with sensible speeds. >>>> These are the reasons why I decided I could make it with a 32-bit >>>> variable, without any additional clamping. Did I make any mistake in my >>>> evaluation? >>> >>> According to Murphy's law this is inevitable.. >>> >> Yep. Actually Murphy has been even clement this time, by making the >> failure occur to a kernel expert :) >>> I've seen couple division by zero crashes in bfq_wr_duration. >>> Unfortunately logs weren't recorded. >>> >>>> Anyway, even if I made some mistake about the maximum possible value of >>>> the device rate, and the latter may be too high for bfqd->peak_rate to >>>> contain it, then I guess the right solution would not be to clamp the >>>> actual rate to U32_MAX, but to move bfqd->peak_rate to 64 bits. Or am I >>>> missing something else? >>>>>> +bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, >>>>>> U32_MAX); >>> >>> 32-bit should be enough and better for division. >>> My patch makes sure it never overflows/underflows. >>> That's cheaper than full 64-bit/64-bit division. >>> Anyway 64-bit speed could overflow too. =) >>> >> I see your point. Still, if the mistake is not in sizing, then you >> bumped into some odd bug. In this respect, I don't like much the idea >> of sweeping the dust under the carpet. So, let me ask you for a >> little bit more help. With your patch applied, and thus with no risk >> of crashes, what about adding, right before your clamp_t, something >> like: >> if (!bfqd->peak_rate) >> pr_crit(<dump of the variables involved in the computation of bfqd->peak_rate>); >> Once the failure shows up (Murphy permitting), we might have hints to >> the bug causing it. >> Apart from that, I have no problem with patches that make bfq more >> robust, even in a sort of black-box way. > > This rate estimator is already works as a black-box
Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1
> Il giorno 20 mar 2018, alle ore 15:41, Konstantin Khlebnikov > ha scritto: > > On 20.03.2018 06:00, Paolo Valente wrote: >>> Il giorno 19 mar 2018, alle ore 14:28, Konstantin Khlebnikov >>> ha scritto: >>> >>> On 19.03.2018 09:03, Paolo Valente wrote: >>>>> Il giorno 05 mar 2018, alle ore 04:48, Konstantin Khlebnikov >>>>> ha scritto: >>>>> >>>>> Rate should never overflow or become zero because it is used as divider. >>>>> This patch accumulates it with saturation. >>>>> >>>>> Signed-off-by: Konstantin Khlebnikov >>>>> --- >>>>> block/bfq-iosched.c |8 +--- >>>>> 1 file changed, 5 insertions(+), 3 deletions(-) >>>>> >>>>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c >>>>> index aeca22d91101..a236c8d541b5 100644 >>>>> --- a/block/bfq-iosched.c >>>>> +++ b/block/bfq-iosched.c >>>>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct >>>>> bfq_data *bfqd, >>>>> >>>>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request >>>>> *rq) >>>>> { >>>>> - u32 rate, weight, divisor; >>>>> + u32 weight, divisor; >>>>> + u64 rate; >>>>> >>>>> /* >>>>>* For the convergence property to hold (see comments on >>>>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data >>>>> *bfqd, struct request *rq) >>>>>*/ >>>>> bfqd->peak_rate *= divisor-1; >>>>> bfqd->peak_rate /= divisor; >>>>> - rate /= divisor; /* smoothing constant alpha = 1/divisor */ >>>>> + do_div(rate, divisor); /* smoothing constant alpha = 1/divisor */ >>>>> >>>>> - bfqd->peak_rate += rate; >>>>> + /* rate should never overlow or become zero */ >>>> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate >>>> doesn't risk to be zero even if the variable 'rate' is zero here. >>>> So I guess the reason why you consider the possibility that >>>> bfqd->peak_rate becomes zero is because of an overflow when summing >>>> 'rate'. But, according to my calculations, this should be impossible with >>>> devices with sensible speeds. 
>>>> These are the reasons why I decided I could make it with a 32-bit >>>> variable, without any additional clamping. Did I make any mistake in my >>>> evaluation? >>> >>> According to Murphy's law this is inevitable.. >>> >> Yep. Actually Murphy has been even clement this time, by making the >> failure occur to a kernel expert :) >>> I've seen couple division by zero crashes in bfq_wr_duration. >>> Unfortunately logs weren't recorded. >>> >>>> Anyway, even if I made some mistake about the maximum possible value of >>>> the device rate, and the latter may be too high for bfqd->peak_rate to >>>> contain it, then I guess the right solution would not be to clamp the >>>> actual rate to U32_MAX, but to move bfqd->peak_rate to 64 bits. Or am I >>>> missing something else? >>>>>> +bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, >>>>>> U32_MAX); >>> >>> 32-bit should be enough and better for division. >>> My patch makes sure it never overflows/underflows. >>> That's cheaper than full 64-bit/64-bit division. >>> Anyway 64-bit speed could overflow too. =) >>> >> I see your point. Still, if the mistake is not in sizing, then you >> bumped into some odd bug. In this respect, I don't like much the idea >> of sweeping the dust under the carpet. So, let me ask you for a >> little bit more help. With your patch applied, and thus with no risk >> of crashes, what about adding, right before your clamp_t, something >> like: >> if (!bfqd->peak_rate) >> pr_crit(<dump of the variables involved in the computation of bfqd->peak_rate>); >> Once the failure shows up (Murphy permitting), we might have hints to >> the bug causing it. >> Apart from that, I have no problem with patches that make bfq more >> robust, even in a sort of black-box way. > > This rate estimator is already works as a black-box with smoothing and > low-pass filter inside. Actually, it is just what you say: a low-pass filter, which works by deflating
Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1
> Il giorno 19 mar 2018, alle ore 14:28, Konstantin Khlebnikov > <khlebni...@yandex-team.ru> ha scritto: > > On 19.03.2018 09:03, Paolo Valente wrote: >>> Il giorno 05 mar 2018, alle ore 04:48, Konstantin Khlebnikov >>> <khlebni...@yandex-team.ru> ha scritto: >>> >>> Rate should never overflow or become zero because it is used as divider. >>> This patch accumulates it with saturation. >>> >>> Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru> >>> --- >>> block/bfq-iosched.c |8 +--- >>> 1 file changed, 5 insertions(+), 3 deletions(-) >>> >>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c >>> index aeca22d91101..a236c8d541b5 100644 >>> --- a/block/bfq-iosched.c >>> +++ b/block/bfq-iosched.c >>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct >>> bfq_data *bfqd, >>> >>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) >>> { >>> - u32 rate, weight, divisor; >>> + u32 weight, divisor; >>> + u64 rate; >>> >>> /* >>> * For the convergence property to hold (see comments on >>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data >>> *bfqd, struct request *rq) >>> */ >>> bfqd->peak_rate *= divisor-1; >>> bfqd->peak_rate /= divisor; >>> - rate /= divisor; /* smoothing constant alpha = 1/divisor */ >>> + do_div(rate, divisor); /* smoothing constant alpha = 1/divisor */ >>> >>> - bfqd->peak_rate += rate; >>> + /* rate should never overlow or become zero */ >> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate doesn't >> risk to be zero even if the variable 'rate' is zero here. >> So I guess the reason why you consider the possibility that bfqd->peak_rate >> becomes zero is because of an overflow when summing 'rate'. But, according >> to my calculations, this should be impossible with devices with sensible >> speeds. >> These are the reasons why I decided I could make it with a 32-bit variable, >> without any additional clamping. Did I make any mistake in my evaluation? 
> > According to Murphy's law this is inevitable.. > Yep. Actually Murphy has been even clement this time, by making the failure occur to a kernel expert :) > I've seen couple division by zero crashes in bfq_wr_duration. > Unfortunately logs weren't recorded. > >> Anyway, even if I made some mistake about the maximum possible value of the >> device rate, and the latter may be too high for bfqd->peak_rate to contain >> it, then I guess the right solution would not be to clamp the actual rate to >> U32_MAX, but to move bfqd->peak_rate to 64 bits. Or am I missing something >> else? > >>> + bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, U32_MAX); > > 32-bit should be enough and better for division. > My patch makes sure it never overflows/underflows. > That's cheaper than full 64-bit/64-bit division. > Anyway 64-bit speed could overflow too. =) > I see your point. Still, if the mistake is not in sizing, then you bumped into some odd bug. In this respect, I don't like much the idea of sweeping the dust under the carpet. So, let me ask you for a little bit more help. With your patch applied, and thus with no risk of crashes, what about adding, right before your clamp_t, something like: if (!bfqd->peak_rate) pr_crit(<dump of the variables involved in the computation of bfqd->peak_rate>); Once the failure shows up (Murphy permitting), we might have hints to the bug causing it. Apart from that, I have no problem with patches that make bfq more robust, even in a sort of black-box way. Thanks a lot, Paolo > >>> update_thr_responsiveness_params(bfqd); >>> >>> reset_computation:
Re: [PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1
> Il giorno 19 mar 2018, alle ore 14:28, Konstantin Khlebnikov > ha scritto: > > On 19.03.2018 09:03, Paolo Valente wrote: >>> Il giorno 05 mar 2018, alle ore 04:48, Konstantin Khlebnikov >>> ha scritto: >>> >>> Rate should never overflow or become zero because it is used as divider. >>> This patch accumulates it with saturation. >>> >>> Signed-off-by: Konstantin Khlebnikov >>> --- >>> block/bfq-iosched.c |8 +--- >>> 1 file changed, 5 insertions(+), 3 deletions(-) >>> >>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c >>> index aeca22d91101..a236c8d541b5 100644 >>> --- a/block/bfq-iosched.c >>> +++ b/block/bfq-iosched.c >>> @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct >>> bfq_data *bfqd, >>> >>> static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) >>> { >>> - u32 rate, weight, divisor; >>> + u32 weight, divisor; >>> + u64 rate; >>> >>> /* >>> * For the convergence property to hold (see comments on >>> @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data >>> *bfqd, struct request *rq) >>> */ >>> bfqd->peak_rate *= divisor-1; >>> bfqd->peak_rate /= divisor; >>> - rate /= divisor; /* smoothing constant alpha = 1/divisor */ >>> + do_div(rate, divisor); /* smoothing constant alpha = 1/divisor */ >>> >>> - bfqd->peak_rate += rate; >>> + /* rate should never overlow or become zero */ >> It is bfqd->peak_rate that is used as a divider, and bfqd->peak_rate doesn't >> risk to be zero even if the variable 'rate' is zero here. >> So I guess the reason why you consider the possibility that bfqd->peak_rate >> becomes zero is because of an overflow when summing 'rate'. But, according >> to my calculations, this should be impossible with devices with sensible >> speeds. >> These are the reasons why I decided I could make it with a 32-bit variable, >> without any additional clamping. Did I make any mistake in my evaluation? > > According to Murphy's law this is inevitable.. > Yep. 
Actually Murphy has been even clement this time, by making the failure occur to a kernel expert :) > I've seen couple division by zero crashes in bfq_wr_duration. > Unfortunately logs weren't recorded. > >> Anyway, even if I made some mistake about the maximum possible value of the >> device rate, and the latter may be too high for bfqd->peak_rate to contain >> it, then I guess the right solution would not be to clamp the actual rate to >> U32_MAX, but to move bfqd->peak_rate to 64 bits. Or am I missing something >> else? > >>> + bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, U32_MAX); > > 32-bit should be enough and better for division. > My patch makes sure it never overflows/underflows. > That's cheaper than full 64-bit/64-bit division. > Anyway 64-bit speed could overflow too. =) > I see your point. Still, if the mistake is not in sizing, then you bumped into some odd bug. In this respect, I don't like much the idea of sweeping the dust under the carpet. So, let me ask you for a little bit more help. With your patch applied, and thus with no risk of crashes, what about adding, right before your clamp_t, something like: if (!bfqd->peak_rate) pr_crit(<dump of the variables involved in the computation of bfqd->peak_rate>); Once the failure shows up (Murphy permitting), we might have hints to the bug causing it. Apart from that, I have no problem with patches that make bfq more robust, even in a sort of black-box way. Thanks a lot, Paolo > >>> update_thr_responsiveness_params(bfqd); >>> >>> reset_computation: