From: Rik van Riel <r...@redhat.com>

Currently, if there was any irq or softirq time during 'ticks'
jiffies, the entire period will be accounted as irq or softirq
time.

This is inaccurate if only a subset of 'ticks' jiffies was
actually spent handling irqs, and could conceivably mis-count
all of the ticks during a period as irq time, when there was
some irq and some softirq time.

This can actually happen when irqtime_account_process_tick
is called from account_idle_ticks, which can pass a larger
number of ticks down all at once.

Fix this by changing irqtime_account_hi_update and
irqtime_account_si_update to round elapsed irq and softirq
time to jiffies, and return the number of jiffies spent in
each mode, similar to how steal time is handled.

Additionally, have irqtime_account_process_tick take into
account how much time was spent in each of steal, irq,
and softirq time.

The latter could help improve the accuracy of timekeeping
when returning from idle on a NO_HZ_IDLE CPU.

Properly accounting how much time was spent in hardirq and
softirq time will also allow the NO_HZ_FULL code to re-use
these same functions for hardirq and softirq accounting.

Signed-off-by: Rik van Riel <r...@redhat.com>
---
 kernel/sched/cputime.c | 69 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 3d60e5d76fdb..9bd2d4f42037 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -79,34 +79,36 @@ void irqtime_account_irq(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
-static int irqtime_account_hi_update(void)
+static unsigned long irqtime_account_hi_update(unsigned long max_jiffies)
 {
        u64 *cpustat = kcpustat_this_cpu->cpustat;
+       unsigned long irq_jiffies;
        unsigned long flags;
-       u64 latest_ns;
-       int ret = 0;
+       u64 irq;
 
        local_irq_save(flags);
-       latest_ns = this_cpu_read(cpu_hardirq_time);
-       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
-               ret = 1;
+       irq = this_cpu_read(cpu_hardirq_time) - cpustat[CPUTIME_IRQ];
+       irq_jiffies = min(cputime_to_jiffies(irq), max_jiffies);
+       if (irq_jiffies)
+               cpustat[CPUTIME_IRQ] += jiffies_to_cputime(irq_jiffies);
        local_irq_restore(flags);
-       return ret;
+       return irq_jiffies;
 }
 
-static int irqtime_account_si_update(void)
+static unsigned long irqtime_account_si_update(unsigned long max_jiffies)
 {
        u64 *cpustat = kcpustat_this_cpu->cpustat;
+       unsigned long si_jiffies;
        unsigned long flags;
-       u64 latest_ns;
-       int ret = 0;
+       u64 softirq;
 
        local_irq_save(flags);
-       latest_ns = this_cpu_read(cpu_softirq_time);
-       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
-               ret = 1;
+       softirq = this_cpu_read(cpu_softirq_time) - cpustat[CPUTIME_SOFTIRQ];
+       si_jiffies = min(cputime_to_jiffies(softirq), max_jiffies);
+       if (si_jiffies)
+               cpustat[CPUTIME_SOFTIRQ] += jiffies_to_cputime(si_jiffies);
        local_irq_restore(flags);
-       return ret;
+       return si_jiffies;
 }
 
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
@@ -283,6 +285,26 @@ static __always_inline unsigned long 
steal_account_process_tick(unsigned long ma
 }
 
 /*
+ * Account how much elapsed time was spent in steal, irq, or softirq time.
+ * Due to rounding errors, the calculated amount can sometimes exceed
+ * max_jiffies; be careful not to account more than max_jiffies.
+ */
+static inline int account_other_ticks(unsigned long max_jiffies)
+{
+       unsigned long accounted;
+
+       accounted = steal_account_process_tick(max_jiffies);
+
+       if (accounted < max_jiffies)
+               accounted += irqtime_account_hi_update(max_jiffies - accounted);
+
+       if (accounted < max_jiffies)
+               accounted += irqtime_account_si_update(max_jiffies - accounted);
+
+       return accounted;
+}
+
+/*
  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
  * tasks (sum on group iteration) belonging to @tsk's group.
  */
@@ -344,19 +366,24 @@ static void irqtime_account_process_tick(struct 
task_struct *p, int user_tick,
 {
        cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
        u64 cputime = (__force u64) cputime_one_jiffy;
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
+       unsigned long other;
 
-       if (steal_account_process_tick(ULONG_MAX))
+       /*
+        * When returning from idle, many ticks can get accounted at
+        * once, including some ticks of steal, irq, and softirq time.
+        * Subtract those ticks from the amount of time accounted to
+        * idle, or potentially user or system time. Due to rounding,
+        * other time can exceed ticks occasionally.
+        */
+       other = account_other_ticks(ticks);
+       if (other >= ticks)
                return;
+       ticks -= other;
 
        cputime *= ticks;
        scaled *= ticks;
 
-       if (irqtime_account_hi_update()) {
-               cpustat[CPUTIME_IRQ] += cputime;
-       } else if (irqtime_account_si_update()) {
-               cpustat[CPUTIME_SOFTIRQ] += cputime;
-       } else if (this_cpu_ksoftirqd() == p) {
+       if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time do not get accounted in cpu_softirq_time.
                 * So, we have to handle it separately here.
-- 
2.5.5

Reply via email to