With context switching ratelimiting enabled, the following
pattern is quite common in a scheduling trace:

     0.000845622 |||||||||||.x||| d32768v12 csched2:runq_insert d0v13, position 0
     0.000845831 |||||||||||.x||| d32768v12 csched2:runq_tickle_new d0v13, processor = 12, credit = 10135529
     0.000846546 |||||||||||.x||| d32768v12 csched2:burn_credits d2v7, credit = 2619231, delta = 255937
 [1] 0.000846739 |||||||||||.x||| d32768v12 csched2:runq_tickle cpu 12
     [...]
 [2] 0.000850597 ||||||||||||x||| d32768v12 csched2:schedule cpu 12, rq# 1, busy, SMT busy, tickled
     0.000850760 ||||||||||||x||| d32768v12 csched2:burn_credits d2v7, credit = 2614028, delta = 5203
 [3] 0.000851022 ||||||||||||x||| d32768v12 csched2:ratelimit triggered
 [4] 0.000851614 ||||||||||||x||| d32768v12 runstate_continue d2v7 running->running

Basically, what happens is that runq_tickle() realizes
d0v13 should preempt d2v7, running on cpu 12, as it
has higher credits (10135529 vs. 2619231). It therefore
tickles cpu 12 [1], which, in turn, schedules [2].

But --surprise surprise-- d2v7 has run for less than the
ratelimit interval [3], and hence it is _not_ preempted,
and continues to run. This indeed looks fine. Actually,
this is what ratelimiting is there for. Note, however,
that:
 1) we interrupted cpu 12 for nothing;
 2) what if, say on cpu 8, there is a vcpu that has:
    + less credit than d0v13 (so d0v13 can well
      preempt it),
    + more credit than d2v7 (that's why it was not
      selected to be preempted),
    + run for more than the ratelimiting interval
      (so it can really be scheduled out)?

This patch tries to figure out whether the situation
is the one described at 2) and, if it is, tickles cpu 8
(in the example above) instead of cpu 12.

Signed-off-by: Dario Faggioli <dario.faggi...@citrix.com>
---
Cc: George Dunlap <george.dun...@citrix.com>
Cc: Anshul Makkar <anshul.mak...@citrix.com>
---
 xen/common/sched_credit2.c |   31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index f03ecce..3bb764d 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -146,6 +146,8 @@
 #define CSCHED2_MIGRATE_RESIST       ((opt_migrate_resist)*MICROSECS(1))
 /* How much to "compensate" a vcpu for L2 migration */
 #define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50)
+/* How tolerant we should be when peeking at runtime of vcpus on other cpus */
+#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50)
 /* How big of a bias we should have against a yielding vcpu */
 #define CSCHED2_YIELD_BIAS           ((opt_yield_bias)*MICROSECS(1))
 #define CSCHED2_YIELD_BIAS_MIN       CSCHED2_MIN_TIMER
@@ -972,6 +974,27 @@ static inline bool_t soft_aff_check_preempt(unsigned int bs, unsigned int cpu)
     return !cpumask_test_cpu(cpu, cpumask_scratch);
 }
 
+/*
+ * What we want to know is whether svc, which we assume to be running on some
+ * pcpu, can be interrupted and preempted. So far, the only reason why a
+ * preemption would be deferred is context switch ratelimiting, so check for
+ * that.
+ *
+ * Use a caller provided value of ratelimit, instead of the scheduler's own
+ * prv->ratelimit_us so the caller can play some tricks, if he wants (which,
+ * as a matter of fact, he does, by applying the tolerance).
+ */
+static inline bool_t is_preemptable(const struct csched2_vcpu *svc,
+                                    s_time_t now, s_time_t ratelimit)
+{
+    s_time_t runtime;
+
+    ASSERT(svc->vcpu->is_running);
+    runtime = now - svc->vcpu->runstate.state_entry_time;
+
+    return runtime > ratelimit;
+}
+
 void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_vcpu *, s_time_t);
 
 /*
@@ -997,6 +1020,8 @@ runq_tickle(const struct scheduler *ops, struct csched2_vcpu *new, s_time_t now)
     s_time_t lowest = (1<<30);
     unsigned int bs, cpu = new->vcpu->processor;
     struct csched2_runqueue_data *rqd = RQD(ops, cpu);
+    s_time_t ratelimit = MICROSECS(CSCHED2_PRIV(ops)->ratelimit_us) -
+                         CSCHED2_RATELIMIT_TICKLE_TOLERANCE;
     cpumask_t mask, skip_mask;
     struct csched2_vcpu * cur;
 
@@ -1104,7 +1129,8 @@ runq_tickle(const struct scheduler *ops, struct csched2_vcpu *new, s_time_t now)
                                 (unsigned char *)&d);
                 }
 
-                if ( cur->credit < new->credit )
+                if ( cur->credit < new->credit &&
+                     is_preemptable(cur, now, ratelimit) )
                 {
                     SCHED_STAT_CRANK(tickled_busy_cpu);
                     ipid = cpu;
@@ -1155,7 +1181,8 @@ runq_tickle(const struct scheduler *ops, struct csched2_vcpu *new, s_time_t now)
                                 (unsigned char *)&d);
                 }
 
-                if ( cur->credit < lowest )
+                if ( cur->credit < lowest &&
+                     is_preemptable(cur, now, ratelimit) )
                 {
                     ipid = i;
                     lowest = cur->credit;


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

Reply via email to