On Fri, Aug 04, 2017 at 03:40:21PM +0200, Vincent Guittot wrote:

> There were several comments on v1:
> - As raised by Peter for v1, if IRQ time is taken into account in
>   rt_avg, it will not be accounted for in rq->clock_task. This means that
>   cfs utilization is not affected by extra contributions or decays due
>   to IRQ.

Right.
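
A toy model of that split (plain userspace C with made-up numbers, not
the kernel code), showing why anything sampled off rq_clock_task() never
sees IRQ time:

#include <stdio.h>

/*
 * Toy model, not kernel code: IRQ/steal time is carved out of the delta
 * before clock_task advances, so a PELT signal driven by clock_task
 * never accrues or decays across time spent in IRQ context.
 */
int main(void)
{
	unsigned long long clock = 0, clock_task = 0;
	unsigned long long delta = 1000000;	/* 1ms update, in ns */
	unsigned long long irq_delta = 200000;	/* 0.2ms of that spent in IRQ */

	clock += delta;				/* rq->clock sees everything */
	clock_task += delta - irq_delta;	/* rq->clock_task excludes IRQ */

	printf("clock=%llu clock_task=%llu\n", clock, clock_task);
	return 0;
}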

> - Regarding the sync of rt and cfs utilization, both cfs and rt use the same
>   rq->clock_task, so rt has the same issue as cfs regarding blocked values:
>   the utilization of idle cfs/rt rqs is not updated regularly but only when
>   a load_balance is triggered (more precisely, a call to
>   update_blocked_averages). I'd like to fix this issue for both cfs and rt
>   with a separate patch that will ensure that utilization (and load) are
>   updated regularly even for idle CPUs.

Yeah, that needs help.
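
To put a rough number on how stale that can get, a back-of-the-envelope
sketch (plain C, link with -lm; the 32-period half-life is the usual PELT
constant, the utilization value is made up):

#include <math.h>
#include <stdio.h>

/*
 * Rough sketch, not kernel code: PELT decays a blocked signal by y per
 * 1024us period, with y^32 == 0.5.  If an idle rq is only updated when
 * some load_balance pass happens to look at it, all of that decay is
 * applied in one lump at that point instead of trickling in.
 */
int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* ~0.97857 */
	double util = 512.0;			/* blocked util when the CPU went idle */
	int periods = 64;			/* ~64ms with no update at all */

	/* What the signal should have decayed to by now: ~128. */
	printf("decayed util after ~%dms: %.1f\n",
	       periods, util * pow(y, periods));
	return 0;
}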

> - One last open question is the location of the rt utilization functions in
>   fair.c. The PELT-related functions should probably move to a dedicated
>   pelt.c file. This would also help address one comment about having a place
>   to update the metrics of NOHZ idle CPUs. Thoughts?

Probably, but I have a bunch of patches lined up changing that code, so
let's not do that now.

In any case, would something like the attached patches make sense? They
completely replace rt_avg with separate IRQ, RT and DL tracking.
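
The net effect on CFS capacity is what the new scale_rt_capacity() in the
last patch computes; roughly, with made-up utilization values (standalone
C, just the arithmetic):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

/*
 * Standalone illustration of the new scale_rt_capacity() arithmetic;
 * the utilization values below are invented.
 */
int main(void)
{
	unsigned long irq_util = 60;	/* rq->irq_avg.util_avg */
	unsigned long rt_util = 200;	/* rq->rt.avg.util_avg  */
	unsigned long dl_util = 100;	/* rq->dl.avg.util_avg  */
	unsigned long used = irq_util + rt_util + dl_util;

	/* Whatever IRQ, RT and DL didn't use is left over for CFS. */
	if (used < SCHED_CAPACITY_SCALE)
		printf("cfs capacity: %lu\n", SCHED_CAPACITY_SCALE - used); /* 664 */
	else
		printf("cfs capacity: 1\n");

	return 0;
}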


Subject: sched: Add DL utilization tracking
From: Peter Zijlstra <[email protected]>
Date: Mon Aug 7 16:49:48 CEST 2017

Track how much time we spend running DL tasks on average.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
 kernel/sched/deadline.c |   11 ++++++++++-
 kernel/sched/fair.c     |   12 ++++++++++++
 kernel/sched/sched.h    |    2 ++
 3 files changed, 24 insertions(+), 1 deletion(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1655,6 +1655,8 @@ static struct sched_dl_entity *pick_next
 	return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
+extern int update_dl_rq_load_avg(u64 now, int cpu, struct dl_rq *dl_rq, int running);
+
 struct task_struct *
 pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -1702,19 +1704,25 @@ pick_next_task_dl(struct rq *rq, struct
 	p->se.exec_start = rq_clock_task(rq);
 
 	/* Running task will never be pushed. */
-       dequeue_pushable_dl_task(rq, p);
+	dequeue_pushable_dl_task(rq, p);
 
 	if (hrtick_enabled(rq))
 		start_hrtick_dl(rq, p);
 
 	queue_push_tasks(rq);
 
+	if (p) {
+		update_dl_rq_load_avg(rq_clock_task(rq), cpu_of(rq), dl_rq,
+				rq->curr->sched_class == &dl_sched_class);
+	}
+
 	return p;
 }
 
 static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
 {
 	update_curr_dl(rq);
+	update_dl_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->dl, 1);
 
 	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
@@ -1723,6 +1731,7 @@ static void put_prev_task_dl(struct rq *
 static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
 {
 	update_curr_dl(rq);
+	update_dl_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->dl, 1);
 
 	/*
 	 * Even when we have runtime, update_curr_dl() might have resulted in us
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3017,6 +3017,11 @@ int update_rt_rq_load_avg(u64 now, int c
 	return ___update_load_avg(now, cpu, &rt_rq->avg, 0, running, NULL);
 }
 
+int update_dl_rq_load_avg(u64 now, int cpu, struct dl_rq *dl_rq, int running)
+{
+	return ___update_load_avg(now, cpu, &dl_rq->avg, 0, running, NULL);
+}
+
 
 /*
  * Signed add and clamp on underflow.
@@ -3550,6 +3555,11 @@ int update_rt_rq_load_avg(u64 now, int c
 	return 0;
 }
 
+int update_dl_rq_load_avg(u64 now, int cpu, struct dl_rq *dl_rq, int running)
+{
+	return 0;
+}
+
 #define UPDATE_TG	0x0
 #define SKIP_AGE_LOAD	0x0
 
@@ -6945,6 +6955,7 @@ static void update_blocked_averages(int
 	}
 
 	update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
+	update_dl_rq_load_avg(rq_clock_task(rq), cpu, &rq->dl, 0);
 	rq_unlock_irqrestore(rq, &rf);
 }
 
@@ -7005,6 +7016,7 @@ static inline void update_blocked_averag
 	update_rq_clock(rq);
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
 	update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
+	update_dl_rq_load_avg(rq_clock_task(rq), cpu, &rq->dl, 0);
 	rq_unlock_irqrestore(rq, &rf);
 }
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -580,6 +580,8 @@ struct dl_rq {
 	 */
 	struct rb_root pushable_dl_tasks_root;
 	struct rb_node *pushable_dl_tasks_leftmost;
+
+	struct sched_avg avg;
 #else
 	struct dl_bw dl_bw;
 #endif
Subject: sched: Add IRQ utilization tracking
From: Peter Zijlstra <[email protected]>
Date: Mon Aug 7 17:22:46 CEST 2017

Track how much time we spend on IRQ...

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
 kernel/sched/core.c  |    6 +++++-
 kernel/sched/fair.c  |   12 ++++++++++++
 kernel/sched/sched.h |    2 ++
 3 files changed, 19 insertions(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -151,6 +151,8 @@ struct rq *task_rq_lock(struct task_stru
 	}
 }
 
+extern int update_irq_load_avg(u64 now, int cpu, struct rq *rq, int running);
+
 /*
  * RQ-clock updating methods:
  */
@@ -204,8 +206,10 @@ static void update_rq_clock_task(struct
 	rq->clock_task += delta;
 
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
+	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) {
 		sched_rt_avg_update(rq, irq_delta + steal);
+		update_irq_load_avg(rq->clock, cpu_of(rq), rq, 1);
+	}
 #endif
 }
 
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3012,6 +3012,11 @@ __update_load_avg_cfs_rq(u64 now, int cp
 			cfs_rq->curr != NULL, cfs_rq);
 }
 
+int update_irq_load_avg(u64 now, int cpu, struct rq *rq, int running)
+{
+	return ___update_load_avg(now, cpu, &rq->irq_avg, 0, running, NULL);
+}
+
 int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running)
 {
 	return ___update_load_avg(now, cpu, &rt_rq->avg, 0, running, NULL);
@@ -3550,6 +3555,11 @@ update_cfs_rq_load_avg(u64 now, struct c
 	return 0;
 }
 
+int update_irq_load_avg(u64 now, int cpu, struct rq *rq, int running)
+{
+	return 0;
+}
+
 int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running)
 {
 	return 0;
@@ -6926,6 +6936,7 @@ static void update_blocked_averages(int
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
+	update_irq_load_avg(rq_clock(rq), cpu, rq, 0);
 
 	/*
 	 * Iterates the task_group tree in a bottom up fashion, see
@@ -7014,6 +7025,7 @@ static inline void update_blocked_averag
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
+	update_irq_load_avg(rq_clock(rq), cpu, rq, 0);
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
 	update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
 	update_dl_rq_load_avg(rq_clock_task(rq), cpu, &rq->dl, 0);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -748,6 +748,8 @@ struct rq {
 
 	struct list_head cfs_tasks;
 
+	struct sched_avg irq_avg;
+
 	u64 rt_avg;
 	u64 age_stamp;
 	u64 idle_stamp;
Subject: sched: Replace rt_avg
From: Peter Zijlstra <[email protected]>
Date: Mon Aug  7 18:26:25 CEST 2017

Now that we have separate IRQ, RT and DL utilization tracking, we can
fully replace the old rt_avg code, reducing the amount of statistics.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
 include/linux/sched/sysctl.h |    1 -
 kernel/sched/core.c          |   40 ++--------------------------------------
 kernel/sched/deadline.c      |    2 --
 kernel/sched/fair.c          |   21 +++++----------------
 kernel/sched/rt.c            |    2 --
 kernel/sched/sched.h         |   26 +-------------------------
 kernel/sysctl.c              |    7 -------
 7 files changed, 8 insertions(+), 91 deletions(-)

--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,7 +39,6 @@ extern unsigned int sysctl_numa_balancin
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_time_avg;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -62,14 +62,6 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we average the RT time consumption, measured
- * in ms.
- *
- * default: 1s
- */
-const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
-
-/*
  * period over which we measure -rt task CPU usage in us.
  * default: 1s
  */
@@ -161,7 +153,7 @@ static void update_rq_clock_task(struct
 {
 /*
  * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
+ * to update_irq_load_avg(). But I don't trust it...
  */
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 	s64 steal = 0, irq_delta = 0;
@@ -206,10 +198,8 @@ static void update_rq_clock_task(struct
 	rq->clock_task += delta;
 
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) {
-		sched_rt_avg_update(rq, irq_delta + steal);
+	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 		update_irq_load_avg(rq->clock, cpu_of(rq), rq, 1);
-	}
 #endif
 }
 
@@ -673,23 +663,6 @@ bool sched_can_stop_tick(struct rq *rq)
 	return true;
 }
 #endif /* CONFIG_NO_HZ_FULL */
-
-void sched_avg_update(struct rq *rq)
-{
-	s64 period = sched_avg_period();
-
-	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
-		/*
-		 * Inline assembly required to prevent the compiler
-		 * optimising this loop into a divmod call.
-		 * See __iter_div_u64_rem() for another example of this.
-		 */
-		asm("" : "+rm" (rq->age_stamp));
-		rq->age_stamp += period;
-		rq->rt_avg /= 2;
-	}
-}
-
 #endif /* CONFIG_SMP */
 
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -5515,13 +5488,6 @@ void set_rq_offline(struct rq *rq)
 	}
 }
 
-static void set_cpu_rq_start_time(unsigned int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	rq->age_stamp = sched_clock_cpu(cpu);
-}
-
 /*
  * used to mark begin/end of suspend/resume:
  */
@@ -5640,7 +5606,6 @@ static void sched_rq_cpu_starting(unsign
 
 int sched_cpu_starting(unsigned int cpu)
 {
-	set_cpu_rq_start_time(cpu);
 	sched_rq_cpu_starting(cpu);
 	return 0;
 }
@@ -5919,7 +5884,6 @@ void __init sched_init(void)
 	if (cpu_isolated_map == NULL)
 		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
-	set_cpu_rq_start_time(smp_processor_id());
 #endif
 	init_sched_fair_class();
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1147,8 +1147,6 @@ static void update_curr_dl(struct rq *rq
 	curr->se.exec_start = rq_clock_task(rq);
 	cpuacct_charge(curr, delta_exec);
 
-	sched_rt_avg_update(rq, delta_exec);
-
 	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
 		delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
 	dl_se->runtime -= delta_exec;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5151,8 +5151,6 @@ static void cpu_load_update(struct rq *t
 
 		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
 	}
-
-	sched_avg_update(this_rq);
 }
 
 /* Used instead of source_load when we know the type == 0 */
@@ -7134,23 +7132,14 @@ static inline int get_sd_load_idx(struct
 static unsigned long scale_rt_capacity(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	u64 total, used, age_stamp, avg;
-	s64 delta;
+	unsigned long used;
 
 	/*
-	 * Since we're reading these variables without serialization make sure
-	 * we read them once before doing sanity checks on them.
+	 * Subtract IRQ, RT and DL time as all of those preempt CFS.
 	 */
-	age_stamp = READ_ONCE(rq->age_stamp);
-	avg = READ_ONCE(rq->rt_avg);
-	delta = __rq_clock_broken(rq) - age_stamp;
-
-	if (unlikely(delta < 0))
-		delta = 0;
-
-	total = sched_avg_period() + delta;
-
-	used = div_u64(avg, total);
+	used = READ_ONCE(rq->irq_avg.util_avg) +
+	       READ_ONCE(rq->rt.avg.util_avg)  +
+	       READ_ONCE(rq->dl.avg.util_avg);
 
 	if (likely(used < SCHED_CAPACITY_SCALE))
 		return SCHED_CAPACITY_SCALE - used;
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -981,8 +981,6 @@ static void update_curr_rt(struct rq *rq
 	curr->se.exec_start = rq_clock_task(rq);
 	cpuacct_charge(curr, delta_exec);
 
-	sched_rt_avg_update(rq, delta_exec);
-
 	if (!rt_bandwidth_enabled())
 		return;
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -750,8 +750,6 @@ struct rq {
 
 	struct sched_avg irq_avg;
 
-	u64 rt_avg;
-	u64 age_stamp;
 	u64 idle_stamp;
 	u64 avg_idle;
 
@@ -843,11 +841,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define raw_rq()		raw_cpu_ptr(&runqueues)
 
-static inline u64 __rq_clock_broken(struct rq *rq)
-{
-	return READ_ONCE(rq->clock);
-}
-
 /*
  * rq::clock_update_flags bits
  *
@@ -1621,15 +1614,9 @@ extern void deactivate_task(struct rq *r
 
 extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
 
-extern const_debug unsigned int sysctl_sched_time_avg;
 extern const_debug unsigned int sysctl_sched_nr_migrate;
 extern const_debug unsigned int sysctl_sched_migration_cost;
 
-static inline u64 sched_avg_period(void)
-{
-	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
-}
-
 #ifdef CONFIG_SCHED_HRTICK
 
 /*
@@ -1658,8 +1645,6 @@ static inline int hrtick_enabled(struct
 #endif /* CONFIG_SCHED_HRTICK */
 
 #ifdef CONFIG_SMP
-extern void sched_avg_update(struct rq *rq);
-
 #ifndef arch_scale_freq_capacity
 static __always_inline
 unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
@@ -1678,16 +1663,7 @@ unsigned long arch_scale_cpu_capacity(st
 	return SCHED_CAPACITY_SCALE;
 }
 #endif
-
-static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
-{
-	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
-	sched_avg_update(rq);
-}
-#else
-static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
-static inline void sched_avg_update(struct rq *rq) { }
-#endif
+#endif /* CONFIG_SMP */
 
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 	__acquires(rq->lock);
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -362,13 +362,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "sched_time_avg_ms",
-		.data		= &sysctl_sched_time_avg,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",
