Warning: non-merge-ready in any sense

Under CONFIG_FAIR_SOFTIRQ_SCHEDULE, each sched tick will account the CPU time spent processing softirqs to the ksoftirqd of the softirq's group, then update ksoftirqd->se.sum_exec_runtime and recalculate ksoftirqd->se.vruntime.
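For illustration only (not part of the patch): a minimal userspace sketch of the per-tick accounting described above, assuming the usual nice-0 weight of 1024 and the calc_delta_fair()-style scaling delta * NICE_0_LOAD / weight. The struct and names below are toy stand-ins, not the kernel's sched_entity.

/* Toy model of charging softirq time to a group's ksoftirqd (not kernel code). */
#include <stdint.h>
#include <stdio.h>

#define NICE_0_WEIGHT 1024ULL	/* CFS weight of a nice-0 task */

struct fake_se {
	uint64_t weight;		/* load weight of the ksoftirqd thread */
	uint64_t sum_exec_runtime;	/* total ns charged so far */
	uint64_t vruntime;		/* weighted virtual runtime */
};

/* Charge "delta" ns of softirq processing time to the group's ksoftirqd. */
static void charge_softirq_time(struct fake_se *se, uint64_t delta)
{
	se->sum_exec_runtime += delta;
	/* Same idea as calc_delta_fair(): heavier threads accrue vruntime slower. */
	se->vruntime += delta * NICE_0_WEIGHT / se->weight;
}

int main(void)
{
	struct fake_se ksoftirqd_net = { .weight = NICE_0_WEIGHT };

	charge_softirq_time(&ksoftirqd_net, 2000000);	/* 2 ms of softirq work */
	printf("runtime=%llu ns vruntime=%llu\n",
	       (unsigned long long)ksoftirqd_net.sum_exec_runtime,
	       (unsigned long long)ksoftirqd_net.vruntime);
	return 0;
}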
Use CFS's vruntime to decide whether a softirq needs to be served right away or deferred. This can be tuned with the ksoftirqd nice policy.

Signed-off-by: Dmitry Safonov <d...@arista.com>
---
 include/linux/interrupt.h |  1 +
 kernel/sched/fair.c       | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h      | 19 +++++++++++++++++++
 kernel/softirq.c          | 45 +++++++++++++++++++++++++++++++++++++--------
 4 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 17e1a04445fa..a0b5c24c088a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -512,6 +512,7 @@ extern struct task_struct *__percpu **ksoftirqd;
 extern unsigned nr_softirq_groups;
 
 extern bool servicing_softirq(unsigned nr);
+extern unsigned group_softirqs(unsigned nr);
 
 static inline bool current_is_ksoftirqd(void)
 {
 	unsigned i;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2fe3aa853e4d..d0105739551f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -813,6 +813,42 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 }
 #endif /* CONFIG_SMP */
 
+static void update_ksoftirqd(struct cfs_rq *cfs_rq)
+{
+#ifdef CONFIG_FAIR_SOFTIRQ_SCHEDULE
+	int rq_cpu = cpu_of(rq_of(cfs_rq));
+	u64 si_times[NR_SOFTIRQS], delta[NR_SOFTIRQS];
+	unsigned i;
+
+	if (unlikely(!ksoftirqd))
+		return;
+
+	softirq_time_read(rq_cpu, si_times);
+
+	for (i = 0; i < NR_SOFTIRQS; i++) {
+		delta[i] = si_times[i] - cfs_rq->prev_si_time[i];
+		cfs_rq->prev_si_time[i] = si_times[i];
+		if (unlikely((s64)delta[i] < 0))
+			delta[i] = 0;
+	}
+
+	for (i = 0; i < nr_softirq_groups; i++) {
+		unsigned j, softirq = 0, group_mask = group_softirqs(i);
+		struct task_struct *tsk = *this_cpu_ptr(ksoftirqd[i]);
+		u64 sum_delta = 0;
+
+		while ((j = ffs(group_mask))) {
+			softirq += j - 1;
+			group_mask >>= j;
+			sum_delta += delta[softirq];
+		}
+
+		tsk->se.sum_exec_runtime += sum_delta;
+		tsk->se.vruntime += calc_delta_fair(sum_delta, &tsk->se);
+	}
+#endif
+}
+
 /*
  * Update the current task's runtime statistics.
  */
@@ -822,6 +858,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	u64 now = rq_clock_task(rq_of(cfs_rq));
 	u64 delta_exec;
 
+	update_ksoftirqd(cfs_rq);
+
 	if (unlikely(!curr))
 		return;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14e154c86dc5..e95d8d4f9146 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -487,6 +487,10 @@ struct cfs_rq {
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
+#ifdef CONFIG_FAIR_SOFTIRQ_SCHEDULE
+	u64 prev_si_time[NR_SOFTIRQS];
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 	int runtime_enabled;
 	u64 runtime_expires;
@@ -2081,6 +2085,21 @@ static inline u64 irq_time_read(int cpu)
 }
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+static inline void softirq_time_read(int cpu, u64 si_times[NR_SOFTIRQS])
+{
+#ifdef CONFIG_FAIR_SOFTIRQ_SCHEDULE
+	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+	unsigned int seq, i;
+
+	for (i = 0; i < NR_SOFTIRQS; i++) {
+		do {
+			seq = __u64_stats_fetch_begin(&irqtime->sync);
+			si_times[i] = irqtime->total_si[i];
+		} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
+	}
+#endif
+}
+
 #ifdef CONFIG_CPU_FREQ
 DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 516e31d3d5b4..a123bafa11c2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -82,6 +82,11 @@ bool servicing_softirq(unsigned nr)
 	return false;
 }
 
+unsigned group_softirqs(unsigned nr)
+{
+	return group_to_softirqs[nr];
+}
+
 /*
  * we cannot loop indefinitely here to avoid userspace starvation,
  * but we also don't want to introduce a worst case 1/HZ latency
@@ -112,15 +117,10 @@ static void wakeup_softirqd(u32 softirq_mask)
  * If ksoftirqd is scheduled, we do not want to process pending softirqs
  * right now. Let ksoftirqd handle this at its own rate, to get fairness.
  */
-static bool ksoftirqd_running(void)
+static bool ksoftirqd_running(__u32 pending)
 {
-	/* We rely that there are pending softirqs */
-	__u32 pending = local_softirq_pending();
 	unsigned i;
 
-	if (!ksoftirqd)
-		return false;
-
 	for (i = 0; i < nr_softirq_groups && pending; i++) {
 		/* Interrupts are disabled: no need to stop preemption */
 		struct task_struct *tsk = *this_cpu_ptr(ksoftirqd[i]);
@@ -137,6 +137,33 @@ static bool ksoftirqd_running(void)
 	return !pending;
 }
 
+static __u32 softirqs_to_serve(__u32 pending)
+{
+	unsigned i;
+	__u32 unserve = pending;
+
+	if (!ksoftirqd || !current || is_idle_task(current))
+		return pending;
+
+	if (!IS_ENABLED(CONFIG_FAIR_SOFTIRQ_SCHEDULE))
+		return ksoftirqd_running(pending) ?
+			0 : pending;
+	for (i = 0; i < nr_softirq_groups && unserve; i++) {
+		/* Interrupts are disabled: no need to stop preemption */
+		struct task_struct *tsk = *this_cpu_ptr(ksoftirqd[i]);
+
+		if (tsk && (s64)(current->se.vruntime - tsk->se.vruntime) < 0) {
+			if (tsk->state != TASK_RUNNING)
+				wake_up_process(tsk);
+			continue;
+		}
+
+		unserve &= ~group_to_softirqs[i];
+	}
+
+	return pending & ~unserve;
+}
+
 /*
  * preempt_count and SOFTIRQ_OFFSET usage:
  * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
@@ -385,7 +412,8 @@ asmlinkage __visible void do_softirq(void)
 
 	local_irq_save(flags);
 
-	if (!ksoftirqd_running())
+	pending = softirqs_to_serve(pending);
+	if (pending)
 		do_softirq_own_stack(pending);
 
 	local_irq_restore(flags);
@@ -414,7 +442,8 @@ static inline void invoke_softirq(void)
 {
 	__u32 pending = local_softirq_pending();
 
-	if (!pending || !ksoftirqd_running())
+	pending = softirqs_to_serve(pending);
+	if (!pending)
 		return;
 
 	if (!force_irqthreads) {
-- 
2.13.6
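For illustration only (not part of the patch): a minimal userspace sketch of the serve-or-defer decision that softirqs_to_serve() makes. The two softirq groups and all names below are hypothetical; the signed vruntime comparison mirrors the check in the patch.

/* Toy model of the vruntime-based serve-or-defer policy (not kernel code). */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical softirq groups, e.g. "net" and "everything else". */
enum { GROUP_NET, GROUP_OTHER, NR_GROUPS };

struct fake_task {
	const char *name;
	uint64_t vruntime;
};

/* Return the subset of pending groups that may be serviced inline now. */
static unsigned groups_to_serve(const struct fake_task *curr,
				const struct fake_task ksoftirqd[NR_GROUPS],
				unsigned pending_groups)
{
	unsigned serve = 0;

	for (unsigned i = 0; i < NR_GROUPS; i++) {
		if (!(pending_groups & (1u << i)))
			continue;
		/* Signed compare, as with vruntime in CFS, to tolerate wrap. */
		if ((int64_t)(curr->vruntime - ksoftirqd[i].vruntime) >= 0)
			serve |= 1u << i;	/* curr already had its share: pay inline */
		/* else: curr is behind in vruntime, defer to ksoftirqd instead */
	}
	return serve;
}

int main(void)
{
	struct fake_task curr = { .name = "app", .vruntime = 5000 };
	struct fake_task ksoftirqd[NR_GROUPS] = {
		{ .name = "ksoftirqd-net",   .vruntime = 4000 },	/* behind curr: serve */
		{ .name = "ksoftirqd-other", .vruntime = 9000 },	/* ahead of curr: defer */
	};
	unsigned pending = (1u << GROUP_NET) | (1u << GROUP_OTHER);

	printf("serve mask: 0x%x\n", groups_to_serve(&curr, ksoftirqd, pending));
	return 0;	/* expect 0x1: only the net group is served inline */
}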