Re: performance drop after using blkcg
gt;cfq_write_isolation = 0; /* added by joeytao, 2013.8.16 */ +#else + cfqd->cfq_write_isolation = 1; /* added by joeytao, 2013.8.21 */ +#endif cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_group_idle = cfq_group_idle; @@ -4154,6 +4183,8 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); +SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1); +SHOW_FUNCTION(cfq_write_isolation_show, cfqd->cfq_write_isolation, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -4187,6 +4218,8 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); +STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_write_isolation_store, &cfqd->cfq_write_isolation, 0, UINT_MAX, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -4204,6 +4237,8 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_idle), CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), + CFQ_ATTR(target_latency), + CFQ_ATTR(write_isolation), __ATTR_NULL }; diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index b2eee89..0c45b09 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -18,6 +18,11 @@ struct cfq_io_context { unsigned long ttime_samples; unsigned long ttime_mean; + /* added by joeytao */ + unsigned long awtime_total; + unsigned long awtime_samples; + unsigned long awtime_mean; + struct list_head queue_list; struct hlist_node cic_list; -- 1.7.1 -- View this message in context: 
http://linux-kernel.2935.n7.nabble.com/performance-drop-after-using-blkcg-tp567957p710886.html Sent from the Linux Kernel mailing list archive at Nabble.com. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
gt;cfq_write_isolation = 0; /* added by joeytao, 2013.8.16 */ +#else + cfqd->cfq_write_isolation = 1; /* added by joeytao, 2013.8.21 */ +#endif cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_group_idle = cfq_group_idle; @@ -4154,6 +4183,8 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); +SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1); +SHOW_FUNCTION(cfq_write_isolation_show, cfqd->cfq_write_isolation, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -4187,6 +4218,8 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); +STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_write_isolation_store, &cfqd->cfq_write_isolation, 0, UINT_MAX, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -4204,6 +4237,8 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_idle), CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), + CFQ_ATTR(target_latency), + CFQ_ATTR(write_isolation), __ATTR_NULL }; diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index b2eee89..0c45b09 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -18,6 +18,11 @@ struct cfq_io_context { unsigned long ttime_samples; unsigned long ttime_mean; + /* added by joeytao */ + unsigned long awtime_total; + unsigned long awtime_samples; + unsigned long awtime_mean; + struct list_head queue_list; struct hlist_node cic_list; -- 1.7.1 -- View this message in context: 
http://linux-kernel.2935.n7.nabble.com/performance-drop-after-using-blkcg-tp567957p710883.html Sent from the Linux Kernel mailing list archive at Nabble.com. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
2012/12/11 Vivek Goyal : > These results are with slice_idle=0? Yes, slice_idle is disabled. > What's the storage you are using. Looking at the speed of IO I would > guess it is not one of those rotational disks. I have done the same test on 3 different types of boxes, and all of them show a performance drop(30%-40%) after using blkcg. Though they have different type of disk, all the storage they use are traditional rotational devices(e.g."HP EG0146FAWHU", "IBM-ESXS"). > So if somebody wants to experiment, just tweak the code a bit to allow > preemption when a queue which lost share gets backlogged and you > practically have a prototype of iops based group scheduling. Could you please explain more on this? How to adjust the code? I have tested the following code piece, the result is we lost group differentiation. cfq_group_served() { if (iops_mode(cfqd)) charge = cfqq->slice_dispatch; cfqg->vdisktime += cfq_scale_slice(charge, cfqg); } -- Regards, Zhao Shuai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
Hello, Vivek. On Tue, Dec 11, 2012 at 11:18:20AM -0500, Vivek Goyal wrote: > - Controlling device queue should bring down throughput too as it > should bring down level of parallelism at device level. Also asking > user to tune device queue depth seems bad interface. How would a > user know what's the right queue depth. May be software can try to > be intelligent about it and if IO latencies cross a threshold then > try to decrese queue depth. (We do things like that in CFQ). Yeah, it should definitely be something automatic. Command completion latencies are visible to iosched, so it should be doable. > - Passing prio to device sounds something new and promising. If they > can do a good job at it, why not. I think at minimum they need to > make sure READs are prioritized over writes by default. And may > be provide a way to signal important writes which need to go to > the disk now. > > If READs are prioritized in device, then it takes care of one very > important use case. Then we just have to worry about other case of > fairness between different readers or fairness between different > writers and there we do not idle and try our best to give fair share. > In case group is not backlogged, it is bound to loose some share. I think it can be good enough if we have queue at the head / tail choice. No idea how it'll actually fan out tho. Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
On Tue, Dec 11, 2012 at 08:01:37AM -0800, Tejun Heo wrote: [..] > > Only way to provide effective isolation seemed to be idling and the > > moment we idle we kill the performance. It does not matter whether we > > are scheduling time or iops. > > If the completion latency of IOs fluctuates heavily depend on queue > depth, queue depth would need to be throttled so that lower priority > queue can't overwhelm the device queue while prospect higher priority > accessors exist. Another aspect is that devices are getting a lot > more consistent in terms of latency. > > While idling would also solve isolation issue with unordered deep > device queue, it really is a solution for a rotational device with > large seek penalty as the time lost while idling can often/somtimes > made up by the save from lower seeks. For non-rot devices with deep > queue, the right thing to do would be controlling queue depth or > propagate priority to the device queue (from what I hear, people are > working on it. dunno how well it would turn out tho). - Controlling device queue should bring down throughput too as it should bring down level of parallelism at device level. Also asking user to tune device queue depth seems bad interface. How would a user know what's the right queue depth. May be software can try to be intelligent about it and if IO latencies cross a threshold then try to decrese queue depth. (We do things like that in CFQ). - Passing prio to device sounds something new and promising. If they can do a good job at it, why not. I think at minimum they need to make sure READs are prioritized over writes by default. And may be provide a way to signal important writes which need to go to the disk now. If READs are prioritized in device, then it takes care of one very important use case. Then we just have to worry about other case of fairness between different readers or fairness between different writers and there we do not idle and try our best to give fair share. 
In case group is not backlogged, it is bound to loose some share. > > > > cfq is way too heavy and > > > ill-suited for high speed non-rot devices which are becoming more and > > > more consistent in terms of iops they can handle. > > > > > > I think we need something better suited for the maturing non-rot > > > devices. They're becoming very different from what cfq was built for > > > and we really shouldn't be maintaining several rb trees which need > > > full synchronization for each IO. We're doing way too much and it > > > just isn't scalable. > > > > I am fine with doing things differently in a different scheduler. But > > what I am aruging here is that atleast with CFQ we should be able to > > experiment and figure out what works. In CFQ all the code is there and > > if this iops based scheduling has merit, one should be able to quickly > > experiment and demonstrate how would one do things differently. > > > > To me I have not been able to understand yet that what is iops based > > scheduling doing differently. Will we idle there or not. If we idle > > we again have performance problems. > > When the device can do tens of thousands ios per sec, I don't think it > makes much sense to idle the device. You just lose too much. Agreed. idling starts showing soon on fast SATA rotational devices itself so idling on faster devices will lead to bad results on most of the workloads. > > > So doing things out of CFQ is fine. I am only after understanding the > > technical idea which will solve the problem of provinding isolation > > as well as fairness without losing throughput. And I have not been > > able to get a hang of it yet. > > I think it already has some aspect of it. It has the half-iops mode > for a reason, right? It just is very inefficient and way more complex > than it needs to be. I introduced this iops_mode() in an attempt to try to provide fair disk share in terms of iops instead of disk slices. 
It might not be most efficient one but atleast it can provide answers whether it is something useful or not and for what workload and devices this iops based scheduling is useful. So if somebody wants to experiment, just tweak the code a bit to allow preemption when a queue which lost share gets backlogged and you practially have a prototype of iops based group scheduling. Thanks Vivek -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
Hello, Vivek. On Tue, Dec 11, 2012 at 10:37:25AM -0500, Vivek Goyal wrote: > I have experimented with schemes like that but did not see any very > promising resutls. Assume device supports queue depth of 128, and there > is one dependent reader and one writer. If reader goes away and comes > back and preempts low priority writer, in that small time window writer > has dispatched enough requests to introduce read delays. So preemption > helps only so much. I am curious to know how iops based scheduler solve > these issues. > > Only way to provide effective isolation seemed to be idling and the > moment we idle we kill the performance. It does not matter whether we > are scheduling time or iops. If the completion latency of IOs fluctuates heavily depend on queue depth, queue depth would need to be throttled so that lower priority queue can't overwhelm the device queue while prospect higher priority accessors exist. Another aspect is that devices are getting a lot more consistent in terms of latency. While idling would also solve isolation issue with unordered deep device queue, it really is a solution for a rotational device with large seek penalty as the time lost while idling can often/somtimes made up by the save from lower seeks. For non-rot devices with deep queue, the right thing to do would be controlling queue depth or propagate priority to the device queue (from what I hear, people are working on it. dunno how well it would turn out tho). > > cfq is way too heavy and > > ill-suited for high speed non-rot devices which are becoming more and > > more consistent in terms of iops they can handle. > > > > I think we need something better suited for the maturing non-rot > > devices. They're becoming very different from what cfq was built for > > and we really shouldn't be maintaining several rb trees which need > > full synchronization for each IO. We're doing way too much and it > > just isn't scalable. 
> > I am fine with doing things differently in a different scheduler. But > what I am aruging here is that atleast with CFQ we should be able to > experiment and figure out what works. In CFQ all the code is there and > if this iops based scheduling has merit, one should be able to quickly > experiment and demonstrate how would one do things differently. > > To me I have not been able to understand yet that what is iops based > scheduling doing differently. Will we idle there or not. If we idle > we again have performance problems. When the device can do tens of thousands ios per sec, I don't think it makes much sense to idle the device. You just lose too much. > So doing things out of CFQ is fine. I am only after understanding the > technical idea which will solve the problem of provinding isolation > as well as fairness without losing throughput. And I have not been > able to get a hang of it yet. I think it already has some aspect of it. It has the half-iops mode for a reason, right? It just is very inefficient and way more complex than it needs to be. Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
On Tue, Dec 11, 2012 at 07:14:12AM -0800, Tejun Heo wrote: > Hello, Vivek. > > On Tue, Dec 11, 2012 at 10:02:34AM -0500, Vivek Goyal wrote: > > cfq_group_served() { > > if (iops_mode(cfqd)) > > charge = cfqq->slice_dispatch; > > cfqg->vdisktime += cfq_scale_slice(charge, cfqg); > > } > > > > Isn't it effectively IOPS scheduling. One should get IOPS rate in > > proportion to > > their weight (as long as they can throw enough traffic at device to keep > > it busy). If not, can you please give more details about your proposal. > > The problem is that we lose a lot of isolation w/o idling between > queues or groups. This is because we switch between slices and while > a slice is in progress only ios belongint to that slice can be issued. > ie. higher priority cfqgs / cfqqs, after dispatching the ios they have > ready, lose their slice immmediately. Lower priority slice takes over > and when hgiher priority ones get ready, they have to wait for the > lower priority one before submitting the new IOs. In many cases, they > end up not being able to generate IOs any faster than the ones in > lower priority cfqqs/cfqgs. > > This is becase we switch slices rather than iops. I am not sure how any of the above problems will go away if we start scheduling iops. > We can make cfq > essentially switch iops by implementing very aggressive preemption but > I really don't see much point in that. Yes, this should be easily doable. Once a queue/group is being removed and is losing its share, just keep track of last vdisktime. When more IO comes in this group and current group is preempted (if its vdisktime is greater than one being queued). And new group is probably queued at the front. I have experimented with schemes like that but did not see any very promising resutls. Assume device supports queue depth of 128, and there is one dependent reader and one writer. 
If reader goes away and comes back and preempts low priority writer, in that small time window writer has dispatched enough requests to introduce read delays. So preemption helps only so much. I am curious to know how iops based scheduler solves these issues. Only way to provide effective isolation seemed to be idling and the moment we idle we kill the performance. It does not matter whether we are scheduling time or iops. > cfq is way too heavy and > ill-suited for high speed non-rot devices which are becoming more and > more consistent in terms of iops they can handle. > > I think we need something better suited for the maturing non-rot > devices. They're becoming very different from what cfq was built for > and we really shouldn't be maintaining several rb trees which need > full synchronization for each IO. We're doing way too much and it > just isn't scalable. I am fine with doing things differently in a different scheduler. But what I am arguing here is that at least with CFQ we should be able to experiment and figure out what works. In CFQ all the code is there and if this iops based scheduling has merit, one should be able to quickly experiment and demonstrate how one would do things differently. To me I have not been able to understand yet what iops based scheduling is doing differently. Will we idle there or not. If we idle we again have performance problems. So doing things out of CFQ is fine. I am only after understanding the technical idea which will solve the problem of providing isolation as well as fairness without losing throughput. And I have not been able to get a hang of it yet. Thanks Vivek -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
Hello, Vivek. On Tue, Dec 11, 2012 at 10:02:34AM -0500, Vivek Goyal wrote: > cfq_group_served() { > if (iops_mode(cfqd)) > charge = cfqq->slice_dispatch; > cfqg->vdisktime += cfq_scale_slice(charge, cfqg); > } > > Isn't it effectively IOPS scheduling. One should get IOPS rate in proportion > to > their weight (as long as they can throw enough traffic at device to keep > it busy). If not, can you please give more details about your proposal. The problem is that we lose a lot of isolation w/o idling between queues or groups. This is because we switch between slices and while a slice is in progress only ios belonging to that slice can be issued. ie. higher priority cfqgs / cfqqs, after dispatching the ios they have ready, lose their slice immediately. Lower priority slice takes over and when higher priority ones get ready, they have to wait for the lower priority one before submitting the new IOs. In many cases, they end up not being able to generate IOs any faster than the ones in lower priority cfqqs/cfqgs. This is because we switch slices rather than iops. We can make cfq essentially switch iops by implementing very aggressive preemption but I really don't see much point in that. cfq is way too heavy and ill-suited for high speed non-rot devices which are becoming more and more consistent in terms of iops they can handle. I think we need something better suited for the maturing non-rot devices. They're becoming very different from what cfq was built for and we really shouldn't be maintaining several rb trees which need full synchronization for each IO. We're doing way too much and it just isn't scalable. Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
On Tue, Dec 11, 2012 at 06:47:18AM -0800, Tejun Heo wrote: > Hello, > > On Tue, Dec 11, 2012 at 09:43:36AM -0500, Vivek Goyal wrote: > > I think if one sets slice_idle=0 and group_idle=0 in CFQ, for all practical > > purposes it should become and IOPS based group scheduling. > > No, I don't think it is. You can't achieve isolation without idling > between group switches. We're measuring slices in terms of iops but > what cfq actually schedules are still time slices, not IOs. I think I have not been able to understand your proposal. Can you explain a bit more. This is what CFQ does in iops_mode(). It will calculate the number of requests dispatched from a group and scale that number based on weight and put the group back on service tree. So if you have not got your fair share in terms of number of requests dispatched to the device, you will be put ahead in the queue and given a chance to dispatch requests first. Now couple of things. - There is no idling here. If device is asking for more requests (deep queue depth) then this group will be removed from service tree and CFQ will move on to serve other queued group. So if there is a dependent reader it will lose its share. If we try to idle here, then we have solved nothing in terms of performance problems. Device is faster but your workload can't cope with it so you are artificially slowing down the device. - But if all the contending workloads/groups are throwing enough IO traffic on the device and don't get expired, they they should be able to dispatch number of requests to device in proportion to their weight. So this is effectively trying to keep track of number of reqeusts dispatched from the group instead of time slice consumed by group and then do the scheduling. cfq_group_served() { if (iops_mode(cfqd)) charge = cfqq->slice_dispatch; cfqg->vdisktime += cfq_scale_slice(charge, cfqg); } Isn't it effectively IOPS scheduling. 
One should get IOPS rate in proportion to their weight (as long as they can throw enough traffic at device to keep it busy). If not, can you please give more details about your proposal. Thanks Vivek -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
Hello, On Tue, Dec 11, 2012 at 09:43:36AM -0500, Vivek Goyal wrote: > I think if one sets slice_idle=0 and group_idle=0 in CFQ, for all practical > purposes it should become and IOPS based group scheduling. No, I don't think it is. You can't achieve isolation without idling between group switches. We're measuring slices in terms of iops but what cfq actually schedules are still time slices, not IOs. > For group accounting then CFQ uses number of requests from each cgroup > and uses that information to schedule groups. > > I have not been able to figure out the practical benefits of that > approach. At least not for the simple workloads I played with. This > approach will not work for simple things like trying to improve dependent > read latencies in presence of heavery writers. That's the single biggest > use case CFQ solves, IMO. As I wrote above, it's not about accounting. It's about scheduling unit. > And that happens because we stop writes and don't let them go to device > and device is primarily dealing with reads. If some process is doing > dependent reads and we want to improve read latencies, then either > we need to stop flow of writes or devices are good and they always > prioritize READs over WRITEs. If devices are good then we probably > don't even need blkcg. > > So yes, iops based appraoch is fine just that number of cases where you > will see any service differentiation should significantly less. No, using iops to schedule time slices would lead to that. We just need to be allocating and scheduling iops, and I don't think we should be doing that from cfq. Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
On Tue, Dec 11, 2012 at 06:27:42AM -0800, Tejun Heo wrote: > On Tue, Dec 11, 2012 at 09:25:18AM -0500, Vivek Goyal wrote: > > In general, do not use blkcg on faster storage. In current form it > > is at best suitable for single rotational SATA/SAS disk. I have not > > been able to figure out how to provide fairness without group idling. > > I think cfq is just the wrong approach for faster non-rotational > devices. We should be allocating iops instead of time slices. I think if one sets slice_idle=0 and group_idle=0 in CFQ, for all practical purposes it should become and IOPS based group scheduling. For group accounting then CFQ uses number of requests from each cgroup and uses that information to schedule groups. I have not been able to figure out the practical benefits of that approach. At least not for the simple workloads I played with. This approach will not work for simple things like trying to improve dependent read latencies in presence of heavery writers. That's the single biggest use case CFQ solves, IMO. And that happens because we stop writes and don't let them go to device and device is primarily dealing with reads. If some process is doing dependent reads and we want to improve read latencies, then either we need to stop flow of writes or devices are good and they always prioritize READs over WRITEs. If devices are good then we probably don't even need blkcg. So yes, iops based appraoch is fine just that number of cases where you will see any service differentiation should significantly less. Thanks Vivek -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
On Tue, Dec 11, 2012 at 09:25:18AM -0500, Vivek Goyal wrote: > In general, do not use blkcg on faster storage. In current form it > is at best suitable for single rotational SATA/SAS disk. I have not > been able to figure out how to provide fairness without group idling. I think cfq is just the wrong approach for faster non-rotational devices. We should be allocating iops instead of time slices. Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: performance drop after using blkcg
On Mon, Dec 10, 2012 at 08:28:54PM +0800, Zhao Shuai wrote: > Hi, > > I plan to use blkcg(proportional BW) in my system. But I encounter > great performance drop after enabling blkcg. > The testing tool is fio(version 2.0.7) and both the BW and IOPS fields > are recorded. Two instances of fio program are carried out simultaneously, > each opearting on a separate disk file (say /data/testfile1, > /data/testfile2). > System environment: > kernel: 3.7.0-rc5 > CFQ's slice_idle is disabled(slice_idle=0) while group_idle is > enabled(group_idle=8). > > FIO configuration(e.g. "read") for the first fio program(say FIO1): > > [global] > description=Emulation of Intel IOmeter File Server Access Pattern > > [iometer] > bssplit=4k/30:8k/40:16k/30 > rw=read > direct=1 > time_based > runtime=180s > ioengine=sync > filename=/data/testfile1 > numjobs=32 > group_reporting > > > result before using blkcg: (the value of BW is KB/s) > >FIO1 BW/IOPSFIO2 BW/IOPS > --- > read 26799/2911 25861/2810 > write 138618/15071138578/15069 > rw 72159/7838(r) 71851/7811(r) >72171/7840(w) 71799/7805(w) > randread 4982/5435370/585 > randwrite 5192/5666010/654 > randrw 2369/258(r) 3027/330(r) >2369/258(w) 3016/328(w) > > result after using blkcg(create two blkio cgroups with > default blkio.weight(500) and put FIO1 and FIO2 into these > cgroups respectively) These results are with slice_idle=0? > >FIO1 BW/IOPSFIO2 BW/IOPS > --- > read 36651/3985 36470/3943 > write 75738/8229 75641/8221 > rw 49169/5342(r) 49168/5346(r) >49200/5348(w) 49140/5341(w) > randread 4876/5324905/534 > randwrite 5535/6035497/599 > randrw 2521/274(r) 2527/275(r) >2510/273(w) 2532/274(w) > > Comparing with those results, we found greate performance drop > (30%-40%) in some test cases(especially for the "write", "rw" case). > Is it normal to see write/rw bandwidth decrease by 40% after using > blkio-cgroup? If not, any way to improve or tune the performace? What's the storage you are using. 
Looking at the speed of IO I would guess it is not one of those rotational disks. blkcg does cause the drop in performance (due to idling at group level). Faster the storage or more the number of cgroups, drop becomes even more visible. Only optimization I could think of was disabling slice_idle and you have already done that. There might be some opportunities to cut down the group idling in some cases and lose on fairness but we will have to identify those and modify code. In general, do not use blkcg on faster storage. In current form it is at best suitable for single rotational SATA/SAS disk. I have not been able to figure out how to provide fairness without group idling. Thanks Vivek -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
performance drop after using blkcg
Hi, I plan to use blkcg(proportional BW) in my system. But I encounter great performance drop after enabling blkcg. The testing tool is fio(version 2.0.7) and both the BW and IOPS fields are recorded. Two instances of fio program are carried out simultaneously, each opearting on a separate disk file (say /data/testfile1, /data/testfile2). System environment: kernel: 3.7.0-rc5 CFQ's slice_idle is disabled(slice_idle=0) while group_idle is enabled(group_idle=8). FIO configuration(e.g. "read") for the first fio program(say FIO1): [global] description=Emulation of Intel IOmeter File Server Access Pattern [iometer] bssplit=4k/30:8k/40:16k/30 rw=read direct=1 time_based runtime=180s ioengine=sync filename=/data/testfile1 numjobs=32 group_reporting result before using blkcg: (the value of BW is KB/s) FIO1 BW/IOPSFIO2 BW/IOPS --- read 26799/2911 25861/2810 write 138618/15071138578/15069 rw 72159/7838(r) 71851/7811(r) 72171/7840(w) 71799/7805(w) randread 4982/5435370/585 randwrite 5192/5666010/654 randrw 2369/258(r) 3027/330(r) 2369/258(w) 3016/328(w) result after using blkcg(create two blkio cgroups with default blkio.weight(500) and put FIO1 and FIO2 into these cgroups respectively) FIO1 BW/IOPSFIO2 BW/IOPS --- read 36651/3985 36470/3943 write 75738/8229 75641/8221 rw 49169/5342(r) 49168/5346(r) 49200/5348(w) 49140/5341(w) randread 4876/5324905/534 randwrite 5535/6035497/599 randrw 2521/274(r) 2527/275(r) 2510/273(w) 2532/274(w) Comparing with those results, we found greate performance drop (30%-40%) in some test cases(especially for the "write", "rw" case). Is it normal to see write/rw bandwidth decrease by 40% after using blkio-cgroup? If not, any way to improve or tune the performace? Thanks. -- Regards, Zhao Shuai -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/