Re: performance drop after using blkcg

2013-08-28 Thread joeytao
+	cfqd->cfq_write_isolation = 0; /* added by joeytao, 2013.8.16 */
+#else
+	cfqd->cfq_write_isolation = 1; /* added by joeytao, 2013.8.21 */
+#endif
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
 	cfqd->cfq_group_idle = cfq_group_idle;
@@ -4154,6 +4183,8 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
+SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
+SHOW_FUNCTION(cfq_write_isolation_show, cfqd->cfq_write_isolation, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
@@ -4187,6 +4218,8 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
+STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
+STORE_FUNCTION(cfq_write_isolation_store, &cfqd->cfq_write_isolation, 0, UINT_MAX, 0);
 #undef STORE_FUNCTION
 
 #define CFQ_ATTR(name) \
@@ -4204,6 +4237,8 @@ static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(slice_idle),
 	CFQ_ATTR(group_idle),
 	CFQ_ATTR(low_latency),
+	CFQ_ATTR(target_latency),
+	CFQ_ATTR(write_isolation),
 	__ATTR_NULL
 };
 
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index b2eee89..0c45b09 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -18,6 +18,11 @@ struct cfq_io_context {
 	unsigned long ttime_samples;
 	unsigned long ttime_mean;
 
+	/* added by joeytao */
+	unsigned long awtime_total;
+	unsigned long awtime_samples;
+	unsigned long awtime_mean;
+
 	struct list_head queue_list;
 	struct hlist_node cic_list;
 
-- 
1.7.1
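
For reference, tunables registered through CFQ's elv_fs_entry table, as the
target_latency and write_isolation attributes above are, show up as files
under /sys/block/<disk>/queue/iosched/. A minimal userspace sketch of reading
and then setting such a knob (the disk name "sda" is illustrative, and the
write_isolation attribute exists only with the patch above applied):

#include <stdio.h>

/* Read and update a CFQ tunable exposed under the elevator's sysfs
 * directory. The path pattern is the standard iosched location; "sda" and
 * "write_isolation" are only examples. */
static int rw_iosched_attr(const char *dev, const char *attr, const char *newval)
{
        char path[256], buf[64];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/block/%s/queue/iosched/%s", dev, attr);

        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return -1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s = %s", attr, buf);
        fclose(f);

        if (!newval)
                return 0;

        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return -1;
        }
        fprintf(f, "%s\n", newval);
        fclose(f);
        return 0;
}

int main(void)
{
        /* Requires root; turns write isolation off on sda. */
        return rw_iosched_attr("sda", "write_isolation", "0");
}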




Re: performance drop after using blkcg

2012-12-11 Thread Zhao Shuai
2012/12/11 Vivek Goyal :
> These results are with slice_idle=0?

Yes, slice_idle is disabled.

> What's the storage you are using? Looking at the speed of IO, I would
> guess it is not one of those rotational disks.

I have done the same test on three different types of boxes, and all of them
show a performance drop (30%-40%) after using blkcg. Though they have
different types of disks, all the storage they use is traditional rotational
devices (e.g. "HP EG0146FAWHU", "IBM-ESXS").

> So if somebody wants to experiment, just tweak the code a bit to allow
> preemption when a queue which lost share gets backlogged and you
> practially have a prototype of iops based group scheduling.

Could you please explain this in more detail? How should the code be adjusted?
I have tested the following code piece; the result is that we lost group
differentiation.

cfq_group_served() {
        if (iops_mode(cfqd))
                charge = cfqq->slice_dispatch;
        cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
}


-- 
Regards,
Zhao Shuai


Re: performance drop after using blkcg

2012-12-11 Thread Tejun Heo
Hello, Vivek.

On Tue, Dec 11, 2012 at 11:18:20AM -0500, Vivek Goyal wrote:
> - Controlling the device queue should bring down throughput too, as it
>   should bring down the level of parallelism at the device level. Also, asking
>   the user to tune the device queue depth seems like a bad interface. How would
>   a user know what the right queue depth is? Maybe software can try to be
>   intelligent about it and, if IO latencies cross a threshold, try to
>   decrease the queue depth. (We do things like that in CFQ).

Yeah, it should definitely be something automatic.  Command completion
latencies are visible to iosched, so it should be doable.

> - Passing prio to the device sounds like something new and promising. If they
>   can do a good job at it, why not. I think at minimum they need to
>   make sure READs are prioritized over writes by default, and maybe
>   provide a way to signal important writes which need to go to
>   the disk now.
> 
>   If READs are prioritized in the device, then that takes care of one very
>   important use case. Then we just have to worry about the other case of
>   fairness between different readers or fairness between different
>   writers, and there we do not idle and try our best to give a fair share.
>   In case a group is not backlogged, it is bound to lose some share.

I think it can be good enough if we have queue at the head / tail
choice.  No idea how it'll actually fan out tho.

Thanks.

-- 
tejun


Re: performance drop after using blkcg

2012-12-11 Thread Vivek Goyal
On Tue, Dec 11, 2012 at 08:01:37AM -0800, Tejun Heo wrote:

[..]
> > The only way to provide effective isolation seemed to be idling, and the
> > moment we idle we kill the performance. It does not matter whether we
> > are scheduling time or iops.
> 
> If the completion latency of IOs fluctuates heavily depending on queue
> depth, queue depth would need to be throttled so that a lower priority
> queue can't overwhelm the device queue while prospective higher priority
> accessors exist.  Another aspect is that devices are getting a lot
> more consistent in terms of latency.
> 
> While idling would also solve the isolation issue with an unordered deep
> device queue, it really is a solution for a rotational device with a
> large seek penalty, as the time lost while idling can often/sometimes be
> made up by the savings from fewer seeks.  For non-rot devices with a deep
> queue, the right thing to do would be controlling queue depth or
> propagating priority to the device queue (from what I hear, people are
> working on it. dunno how well it would turn out tho).

- Controlling the device queue should bring down throughput too, as it
  should bring down the level of parallelism at the device level. Also, asking
  the user to tune the device queue depth seems like a bad interface. How would
  a user know what the right queue depth is? Maybe software can try to be
  intelligent about it and, if IO latencies cross a threshold, try to
  decrease the queue depth. (We do things like that in CFQ; see the sketch
  after this list.)

- Passing prio to the device sounds like something new and promising. If they
  can do a good job at it, why not. I think at minimum they need to
  make sure READs are prioritized over writes by default, and maybe
  provide a way to signal important writes which need to go to
  the disk now.

  If READs are prioritized in the device, then that takes care of one very
  important use case. Then we just have to worry about the other case of
  fairness between different readers or fairness between different
  writers, and there we do not idle and try our best to give a fair share.
  In case a group is not backlogged, it is bound to lose some share.
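
A rough sketch of the kind of automatic queue-depth control mentioned in the
first point above (purely illustrative; the threshold, the AIMD-style step
sizes and all of the names are assumptions, not CFQ code):

#include <stdbool.h>

/* Hypothetical latency-driven depth throttling: back off the allowed device
 * queue depth sharply when completion latency crosses a threshold, and creep
 * back up while latencies stay healthy. */
struct depth_ctl {
        unsigned int cur_depth;      /* depth we currently allow */
        unsigned int max_depth;      /* hardware queue depth, e.g. 128 */
        unsigned long lat_thresh_us; /* acceptable completion latency */
};

static void depth_ctl_update(struct depth_ctl *dc, unsigned long completion_lat_us)
{
        if (completion_lat_us > dc->lat_thresh_us) {
                /* Latency too high: halve the depth to restore isolation. */
                dc->cur_depth = dc->cur_depth > 1 ? dc->cur_depth / 2 : 1;
        } else if (dc->cur_depth < dc->max_depth) {
                /* Latency fine: slowly allow more parallelism again. */
                dc->cur_depth++;
        }
}

/* Dispatch gating: only let a new request go out while fewer than
 * cur_depth requests are already in flight. */
static bool depth_ctl_may_dispatch(const struct depth_ctl *dc, unsigned int in_flight)
{
        return in_flight < dc->cur_depth;
}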

> 
> > >  cfq is way too heavy and
> > > ill-suited for high speed non-rot devices which are becoming more and
> > > more consistent in terms of iops they can handle.
> > > 
> > > I think we need something better suited for the maturing non-rot
> > > devices.  They're becoming very different from what cfq was built for
> > > and we really shouldn't be maintaining several rb trees which need
> > > full synchronization for each IO.  We're doing way too much and it
> > > just isn't scalable.
> > 
> > I am fine with doing things differently in a different scheduler. But 
> > what I am arguing here is that at least with CFQ we should be able to
> > experiment and figure out what works.  In CFQ all the code is there and
> > if this iops based scheduling has merit, one should be able to quickly
> > experiment and demonstrate how one would do things differently.
> > 
> > I have not yet been able to understand what iops based
> > scheduling does differently. Will we idle there or not? If we idle
> > we again have performance problems.
> 
> When the device can do tens of thousands of IOs per sec, I don't think it
> makes much sense to idle the device.  You just lose too much.

Agreed. The cost of idling starts showing up even on fast SATA rotational
devices, so idling on faster devices will lead to bad results on most
workloads.

> 
> > So doing things out of CFQ is fine. I am only after understanding the
> > technical idea which will solve the problem of providing isolation
> > as well as fairness without losing throughput. And I have not been
> > able to get the hang of it yet.
> 
> I think it already has some aspect of it.  It has the half-iops mode
> for a reason, right?  It just is very inefficient and way more complex
> than it needs to be.

I introduced this iops_mode() in an attempt to provide fair disk
share in terms of iops instead of disk time slices. It might not be the most
efficient approach, but at least it can provide answers about whether it is
something useful or not, and for what workloads and devices this iops based
scheduling is useful.

So if somebody wants to experiment, just tweak the code a bit to allow
preemption when a queue which lost its share gets backlogged, and you
practically have a prototype of iops based group scheduling.
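
A minimal sketch of the preemption tweak being suggested here, using
simplified stand-in structures rather than the real cfq-iosched.c types (the
field and helper names are assumptions for illustration only):

#include <stdbool.h>

/* Simplified stand-ins for the relevant scheduler state; only what is
 * needed to illustrate the check is shown. */
struct cfq_group {
        unsigned long long vdisktime;  /* scaled service received so far */
};

struct cfq_data {
        struct cfq_group *serving_group;
};

/* Hypothetical check: when a group that lost its share becomes backlogged
 * again, let it preempt the currently serving group if it has received
 * less scaled service, i.e. its vdisktime is smaller. */
static bool cfq_group_should_preempt(struct cfq_data *cfqd,
                                     struct cfq_group *new_cfqg)
{
        struct cfq_group *cur = cfqd->serving_group;

        if (!cur || cur == new_cfqg)
                return false;

        return new_cfqg->vdisktime < cur->vdisktime;
}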

Thanks
Vivek


Re: performance drop after using blkcg

2012-12-11 Thread Tejun Heo
Hello, Vivek.

On Tue, Dec 11, 2012 at 10:37:25AM -0500, Vivek Goyal wrote:
> I have experimented with schemes like that but did not see any very
> promising results. Assume the device supports a queue depth of 128, and there
> is one dependent reader and one writer. If the reader goes away and comes
> back and preempts the low priority writer, in that small time window the writer
> has dispatched enough requests to introduce read delays. So preemption
> helps only so much. I am curious to know how an iops based scheduler solves
> these issues.
> 
> The only way to provide effective isolation seemed to be idling, and the
> moment we idle we kill the performance. It does not matter whether we
> are scheduling time or iops.

If the completion latency of IOs fluctuates heavily depending on queue
depth, queue depth would need to be throttled so that a lower priority
queue can't overwhelm the device queue while prospective higher priority
accessors exist.  Another aspect is that devices are getting a lot
more consistent in terms of latency.

While idling would also solve the isolation issue with an unordered deep
device queue, it really is a solution for a rotational device with a
large seek penalty, as the time lost while idling can often/sometimes be
made up by the savings from fewer seeks.  For non-rot devices with a deep
queue, the right thing to do would be controlling queue depth or
propagating priority to the device queue (from what I hear, people are
working on it. dunno how well it would turn out tho).

> >  cfq is way too heavy and
> > ill-suited for high speed non-rot devices which are becoming more and
> > more consistent in terms of iops they can handle.
> > 
> > I think we need something better suited for the maturing non-rot
> > devices.  They're becoming very different from what cfq was built for
> > and we really shouldn't be maintaining several rb trees which need
> > full synchronization for each IO.  We're doing way too much and it
> > just isn't scalable.
> 
> I am fine with doing things differently in a different scheduler. But 
> what I am arguing here is that at least with CFQ we should be able to
> experiment and figure out what works.  In CFQ all the code is there and
> if this iops based scheduling has merit, one should be able to quickly
> experiment and demonstrate how one would do things differently.
> 
> I have not yet been able to understand what iops based
> scheduling does differently. Will we idle there or not? If we idle
> we again have performance problems.

When the device can do tens of thousands of IOs per sec, I don't think it
makes much sense to idle the device.  You just lose too much.

> So doing things out of CFQ is fine. I am only after understanding the
> technical idea which will solve the problem of providing isolation
> as well as fairness without losing throughput. And I have not been
> able to get the hang of it yet.

I think it already has some aspect of it.  It has the half-iops mode
for a reason, right?  It just is very inefficient and way more complex
than it needs to be.

Thanks.

-- 
tejun


Re: performance drop after using blkcg

2012-12-11 Thread Vivek Goyal
On Tue, Dec 11, 2012 at 07:14:12AM -0800, Tejun Heo wrote:
> Hello, Vivek.
> 
> On Tue, Dec 11, 2012 at 10:02:34AM -0500, Vivek Goyal wrote:
> > cfq_group_served() {
> >         if (iops_mode(cfqd))
> >                 charge = cfqq->slice_dispatch;
> >         cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
> > }
> > 
> > Isn't that effectively IOPS scheduling? One should get an IOPS rate in
> > proportion to their weight (as long as they can throw enough traffic at
> > the device to keep it busy). If not, can you please give more details
> > about your proposal.
> 
> The problem is that we lose a lot of isolation w/o idling between
> queues or groups.  This is because we switch between slices and while
> a slice is in progress only ios belonging to that slice can be issued.
> ie. higher priority cfqgs / cfqqs, after dispatching the ios they have
> ready, lose their slice immediately.  Lower priority slice takes over
> and when higher priority ones get ready, they have to wait for the
> lower priority one before submitting the new IOs.  In many cases, they
> end up not being able to generate IOs any faster than the ones in
> lower priority cfqqs/cfqgs.
> 
> This is because we switch slices rather than iops.

I am not sure how any of the above problems will go away if we start
scheduling iops. 

>  We can make cfq
> essentially switch iops by implementing very aggressive preemption but
> I really don't see much point in that.

Yes, this should be easily doable. Once a queue/group is being removed
and is losing its share, just keep track of its last vdisktime. When more IO
comes in for this group, the current group is preempted (if its vdisktime is
greater than that of the group being queued), and the new group is probably
queued at the front.

I have experimented with schemes like that but did not see any very
promising results. Assume the device supports a queue depth of 128, and there
is one dependent reader and one writer. If the reader goes away and comes
back and preempts the low priority writer, in that small time window the writer
has dispatched enough requests to introduce read delays. So preemption
helps only so much. I am curious to know how an iops based scheduler solves
these issues.

The only way to provide effective isolation seemed to be idling, and the
moment we idle we kill the performance. It does not matter whether we
are scheduling time or iops.

>  cfq is way too heavy and
> ill-suited for high speed non-rot devices which are becoming more and
> more consistent in terms of iops they can handle.
> 
> I think we need something better suited for the maturing non-rot
> devices.  They're becoming very different from what cfq was built for
> and we really shouldn't be maintaining several rb trees which need
> full synchronization for each IO.  We're doing way too much and it
> just isn't scalable.

I am fine with doing things differently in a different scheduler. But 
what I am arguing here is that at least with CFQ we should be able to
experiment and figure out what works.  In CFQ all the code is there and
if this iops based scheduling has merit, one should be able to quickly
experiment and demonstrate how one would do things differently.

I have not yet been able to understand what iops based
scheduling does differently. Will we idle there or not? If we idle
we again have performance problems.

So doing things out of CFQ is fine. I am only after understanding the
technical idea which will solve the problem of providing isolation
as well as fairness without losing throughput. And I have not been
able to get the hang of it yet.

Thanks
Vivek


Re: performance drop after using blkcg

2012-12-11 Thread Tejun Heo
Hello, Vivek.

On Tue, Dec 11, 2012 at 10:02:34AM -0500, Vivek Goyal wrote:
> cfq_group_served() {
>         if (iops_mode(cfqd))
>                 charge = cfqq->slice_dispatch;
>         cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
> }
> 
> Isn't that effectively IOPS scheduling? One should get an IOPS rate in
> proportion to their weight (as long as they can throw enough traffic at
> the device to keep it busy). If not, can you please give more details
> about your proposal.

The problem is that we lose a lot of isolation w/o idling between
queues or groups.  This is because we switch between slices and while
a slice is in progress only ios belonging to that slice can be issued.
ie. higher priority cfqgs / cfqqs, after dispatching the ios they have
ready, lose their slice immediately.  Lower priority slice takes over
and when higher priority ones get ready, they have to wait for the
lower priority one before submitting the new IOs.  In many cases, they
end up not being able to generate IOs any faster than the ones in
lower priority cfqqs/cfqgs.

This is because we switch slices rather than iops.  We can make cfq
essentially switch iops by implementing very aggressive preemption but
I really don't see much point in that.  cfq is way too heavy and
ill-suited for high speed non-rot devices which are becoming more and
more consistent in terms of iops they can handle.

I think we need something better suited for the maturing non-rot
devices.  They're becoming very different from what cfq was built for
and we really shouldn't be maintaining several rb trees which need
full synchronization for each IO.  We're doing way too much and it
just isn't scalable.

Thanks.

-- 
tejun


Re: performance drop after using blkcg

2012-12-11 Thread Vivek Goyal
On Tue, Dec 11, 2012 at 06:47:18AM -0800, Tejun Heo wrote:
> Hello,
> 
> On Tue, Dec 11, 2012 at 09:43:36AM -0500, Vivek Goyal wrote:
> > I think if one sets slice_idle=0 and group_idle=0 in CFQ, for all practical
> > purposes it should become IOPS-based group scheduling.
> 
> No, I don't think it is.  You can't achieve isolation without idling
> between group switches.  We're measuring slices in terms of iops but
> what cfq actually schedules are still time slices, not IOs.

I think I have not been able to understand your proposal. Can you explain
a bit more?

This is what CFQ does in iops_mode(). It will calculate the number of
requests dispatched from a group and scale that number based on weight
and put the group back on the service tree. So if you have not got your
fair share in terms of the number of requests dispatched to the device,
you will be put ahead in the queue and given a chance to dispatch 
requests first. 

Now, a couple of things.

- There is no idling here. If the device is asking for more requests (deep
  queue depth) then this group will be removed from the service tree and
  CFQ will move on to serve other queued groups. So if there is a dependent
  reader, it will lose its share.

  If we try to idle here, then we have solved nothing in terms of
  performance problems.  Device is faster but your workload can't cope
  with it so you are artificially slowing down the device.

- But if all the contending workloads/groups are throwing enough IO
  traffic at the device and don't get expired, then they should be able
  to dispatch a number of requests to the device in proportion to their weight.

So this is effectively trying to keep track of the number of requests
dispatched from the group, instead of the time slice consumed by the group,
and then do the scheduling.

cfq_group_served() {
        if (iops_mode(cfqd))
                charge = cfqq->slice_dispatch;
        cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
}

Isn't that effectively IOPS scheduling? One should get an IOPS rate in
proportion to their weight (as long as they can throw enough traffic at
the device to keep it busy). If not, can you please give more details
about your proposal.
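
As a back-of-the-envelope illustration of why charging slice_dispatch and
scaling it by weight yields a weight-proportional IOPS split, here is a small
standalone simulation (not kernel code; the constants and the scaling are
only meant to mirror the idea behind cfq_scale_slice):

#include <stdio.h>

#define DEFAULT_WEIGHT 500

/* Toy model of iops_mode() accounting: each dispatched request advances the
 * group's vdisktime by charge * DEFAULT_WEIGHT / weight, and the group with
 * the smallest vdisktime is served next. */
struct group {
        const char *name;
        unsigned int weight;
        unsigned long long vdisktime;
        unsigned long dispatched;
};

int main(void)
{
        struct group g[2] = {
                { "cgroup A", 500,  0, 0 },
                { "cgroup B", 1000, 0, 0 },
        };
        int i;

        /* Both groups stay backlogged; dispatch 30000 requests in total. */
        for (i = 0; i < 30000; i++) {
                struct group *next = g[0].vdisktime <= g[1].vdisktime ? &g[0] : &g[1];

                next->dispatched++;
                next->vdisktime += 1000ULL * DEFAULT_WEIGHT / next->weight;
        }

        for (i = 0; i < 2; i++)
                printf("%s (weight %u): %lu requests\n",
                       g[i].name, g[i].weight, g[i].dispatched);

        /* Prints roughly a 10000 vs 20000 split, i.e. 1:2, matching the
         * 500:1000 weight ratio, as long as both groups stay busy. */
        return 0;
}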

Thanks
Vivek


Re: performance drop after using blkcg

2012-12-11 Thread Tejun Heo
Hello,

On Tue, Dec 11, 2012 at 09:43:36AM -0500, Vivek Goyal wrote:
> I think if one sets slice_idle=0 and group_idle=0 in CFQ, for all practical
> purposes it should become IOPS-based group scheduling.

No, I don't think it is.  You can't achieve isolation without idling
between group switches.  We're measuring slices in terms of iops but
what cfq actually schedules are still time slices, not IOs.

> For group accounting, CFQ then uses the number of requests from each cgroup
> and uses that information to schedule groups.
> 
> I have not been able to figure out the practical benefits of that
> approach. At least not for the simple workloads I played with. This
> approach will not work for simple things like trying to improve dependent
> read latencies in the presence of heavy writers. That's the single biggest
> use case CFQ solves, IMO.

As I wrote above, it's not about accounting.  It's about scheduling
unit.

> And that happens because we stop writes and don't let them go to device
> and device is primarily dealing with reads. If some process is doing
> dependent reads and we want to improve read latencies, then either
> we need to stop flow of writes or devices are good and they always
> prioritize READs over WRITEs. If devices are good then we probably
> don't even need blkcg.
> 
> So yes, an iops based approach is fine, just that the number of cases where
> you will see any service differentiation should be significantly less.

No, using iops to schedule time slices would lead to that.  We just
need to be allocating and scheduling iops, and I don't think we should
be doing that from cfq.

Thanks.

-- 
tejun


Re: performance drop after using blkcg

2012-12-11 Thread Vivek Goyal
On Tue, Dec 11, 2012 at 06:27:42AM -0800, Tejun Heo wrote:
> On Tue, Dec 11, 2012 at 09:25:18AM -0500, Vivek Goyal wrote:
> > In general, do not use blkcg on faster storage. In its current form it
> > is at best suitable for a single rotational SATA/SAS disk. I have not
> > been able to figure out how to provide fairness without group idling.
> 
> I think cfq is just the wrong approach for faster non-rotational
> devices.  We should be allocating iops instead of time slices.

I think if one sets slice_idle=0 and group_idle=0 in CFQ, for all practical
purposes it should become IOPS-based group scheduling.

For group accounting, CFQ then uses the number of requests from each cgroup
and uses that information to schedule groups.

I have not been able to figure out the practical benefits of that
approach. At least not for the simple workloads I played with. This
approach will not work for simple things like trying to improve dependent
read latencies in the presence of heavy writers. That's the single biggest
use case CFQ solves, IMO.

And that happens because we stop writes and don't let them go to device
and device is primarily dealing with reads. If some process is doing
dependent reads and we want to improve read latencies, then either
we need to stop flow of writes or devices are good and they always
prioritize READs over WRITEs. If devices are good then we probably
don't even need blkcg.

So yes, an iops based approach is fine, just that the number of cases where
you will see any service differentiation should be significantly less.

Thanks
Vivek


Re: performance drop after using blkcg

2012-12-11 Thread Tejun Heo
On Tue, Dec 11, 2012 at 09:25:18AM -0500, Vivek Goyal wrote:
> In general, do not use blkcg on faster storage. In its current form it
> is at best suitable for a single rotational SATA/SAS disk. I have not
> been able to figure out how to provide fairness without group idling.

I think cfq is just the wrong approach for faster non-rotational
devices.  We should be allocating iops instead of time slices.

Thanks.

-- 
tejun


Re: performance drop after using blkcg

2012-12-11 Thread Vivek Goyal
On Mon, Dec 10, 2012 at 08:28:54PM +0800, Zhao Shuai wrote:
> Hi,
> 
> I plan to use blkcg (proportional BW) in my system. But I encounter a
> great performance drop after enabling blkcg.
> The testing tool is fio (version 2.0.7) and both the BW and IOPS fields
> are recorded. Two instances of the fio program are run simultaneously,
> each operating on a separate disk file (say /data/testfile1,
> /data/testfile2).
> System environment:
> kernel: 3.7.0-rc5
> CFQ's slice_idle is disabled (slice_idle=0) while group_idle is
> enabled (group_idle=8).
> 
> FIO configuration(e.g. "read") for the first fio program(say FIO1):
> 
> [global]
> description=Emulation of Intel IOmeter File Server Access Pattern
> 
> [iometer]
> bssplit=4k/30:8k/40:16k/30
> rw=read
> direct=1
> time_based
> runtime=180s
> ioengine=sync
> filename=/data/testfile1
> numjobs=32
> group_reporting
> 
> 
> result before using blkcg: (the value of BW is KB/s)
> 
>            FIO1 BW/IOPS     FIO2 BW/IOPS
> -----------------------------------------
> read       26799/2911       25861/2810
> write      138618/15071     138578/15069
> rw         72159/7838(r)    71851/7811(r)
>            72171/7840(w)    71799/7805(w)
> randread   4982/543         5370/585
> randwrite  5192/566         6010/654
> randrw     2369/258(r)      3027/330(r)
>            2369/258(w)      3016/328(w)
> 
> result after using blkcg(create two blkio cgroups with
> default blkio.weight(500) and put FIO1 and FIO2 into these
> cgroups respectively)

These results are with slice_idle=0?

> 
>            FIO1 BW/IOPS     FIO2 BW/IOPS
> -----------------------------------------
> read       36651/3985       36470/3943
> write      75738/8229       75641/8221
> rw         49169/5342(r)    49168/5346(r)
>            49200/5348(w)    49140/5341(w)
> randread   4876/532         4905/534
> randwrite  5535/603         5497/599
> randrw     2521/274(r)      2527/275(r)
>            2510/273(w)      2532/274(w)
> 
> Comparing those results, we found a great performance drop
> (30%-40%) in some test cases (especially the "write" and "rw" cases).
> Is it normal to see write/rw bandwidth decrease by 40% after using
> blkio-cgroup? If not, is there any way to improve or tune the performance?

What's the storage you are using? Looking at the speed of IO, I would
guess it is not one of those rotational disks.

blkcg does cause a drop in performance (due to idling at the group level).
The faster the storage or the more cgroups there are, the more visible the
drop becomes.

The only optimization I could think of was disabling slice_idle, and you
have already done that.

There might be some opportunities to cut down on group idling in
some cases and lose a bit of fairness, but we will have to identify those
and modify the code.

In general, do not use blkcg on faster storage. In its current form it
is at best suitable for a single rotational SATA/SAS disk. I have not
been able to figure out how to provide fairness without group idling.

Thanks
Vivek


performance drop after using blkcg

2012-12-10 Thread Zhao Shuai
Hi,

I plan to use blkcg (proportional BW) in my system. But I encounter a
great performance drop after enabling blkcg.

The testing tool is fio (version 2.0.7) and both the BW and IOPS fields
are recorded. Two instances of the fio program are run simultaneously,
each operating on a separate disk file (say /data/testfile1, /data/testfile2).

System environment:
kernel: 3.7.0-rc5
CFQ's slice_idle is disabled (slice_idle=0) while group_idle is
enabled (group_idle=8).

FIO configuration(e.g. "read") for the first fio program(say FIO1):

[global]
description=Emulation of Intel IOmeter File Server Access Pattern

[iometer]
bssplit=4k/30:8k/40:16k/30
rw=read
direct=1
time_based
runtime=180s
ioengine=sync
filename=/data/testfile1
numjobs=32
group_reporting


result before using blkcg: (the value of BW is KB/s)

           FIO1 BW/IOPS     FIO2 BW/IOPS
-----------------------------------------
read       26799/2911       25861/2810
write      138618/15071     138578/15069
rw         72159/7838(r)    71851/7811(r)
           72171/7840(w)    71799/7805(w)
randread   4982/543         5370/585
randwrite  5192/566         6010/654
randrw     2369/258(r)      3027/330(r)
           2369/258(w)      3016/328(w)

result after using blkcg (create two blkio cgroups with
default blkio.weight(500) and put FIO1 and FIO2 into these
cgroups respectively)

           FIO1 BW/IOPS     FIO2 BW/IOPS
-----------------------------------------
read       36651/3985       36470/3943
write      75738/8229       75641/8221
rw         49169/5342(r)    49168/5346(r)
           49200/5348(w)    49140/5341(w)
randread   4876/532         4905/534
randwrite  5535/603         5497/599
randrw     2521/274(r)      2527/275(r)
           2510/273(w)      2532/274(w)

Comparing those results, we found a great performance drop
(30%-40%) in some test cases (especially the "write" and "rw" cases).
Is it normal to see write/rw bandwidth decrease by 40% after using
blkio-cgroup? If not, is there any way to improve or tune the performance?
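
For reference, the blkio cgroup setup described above (two groups with the
default blkio.weight of 500, one fio instance in each) can be done roughly as
follows. This is only a sketch against the cgroup v1 blkio interface; the
mount point, group names and PIDs are assumptions about the test box:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

static int write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return -1;
        }
        fprintf(f, "%s\n", val);
        fclose(f);
        return 0;
}

/* Create a blkio group, set its proportional weight and move one process
 * (e.g. a fio instance) into it. Assumes the v1 blkio controller is
 * mounted at /sys/fs/cgroup/blkio. */
static int setup_group(const char *name, const char *weight, pid_t pid)
{
        char path[256], pidbuf[32];

        snprintf(path, sizeof(path), "/sys/fs/cgroup/blkio/%s", name);
        mkdir(path, 0755);

        snprintf(path, sizeof(path), "/sys/fs/cgroup/blkio/%s/blkio.weight", name);
        if (write_str(path, weight))
                return -1;

        snprintf(path, sizeof(path), "/sys/fs/cgroup/blkio/%s/tasks", name);
        snprintf(pidbuf, sizeof(pidbuf), "%d", (int)pid);
        return write_str(path, pidbuf);
}

int main(void)
{
        /* The fio PIDs would normally be discovered or passed in;
         * 1111 and 2222 are placeholders. */
        setup_group("fio1", "500", 1111);
        setup_group("fio2", "500", 2222);
        return 0;
}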

Thanks.

--
Regards,
Zhao Shuai