From: Martin Schwidefsky <schwidef...@de.ibm.com>

The account_system_time() function is called with a cputime that
occurred while running in the kernel. The function detects which
context the CPU is currently running in and accounts the time to
the correct bucket. This forces the arch code to account the
cputime for hardirq and softirq immediately.

Such an accounting function can be costly and perform unwelcome
divisions and multiplications, among other things.

The arch code can delay the accounting for system time. For s390
the accounting is done once per timer tick and for each task switch.

Signed-off-by: Martin Schwidefsky <schwidef...@de.ibm.com>
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Michael Ellerman <m...@ellerman.id.au>
Cc: Heiko Carstens <heiko.carst...@de.ibm.com>
Cc: Martin Schwidefsky <schwidef...@de.ibm.com>
Cc: Tony Luck <tony.l...@intel.com>
Cc: Fenghua Yu <fenghua...@intel.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Rik van Riel <r...@redhat.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Ingo Molnar <mi...@kernel.org>
Cc: Stanislaw Gruszka <sgrus...@redhat.com>
Cc: Wanpeng Li <wanpeng...@hotmail.com>
[rebase against latest cputime tree, massaged changelog accordingly]
Signed-off-by: Frederic Weisbecker <fweis...@gmail.com>
---
 arch/s390/include/asm/lowcore.h   |  65 +++++++++++-----------
 arch/s390/include/asm/processor.h |   3 +
 arch/s390/kernel/vtime.c          | 112 +++++++++++++++++++++++---------------
 3 files changed, 106 insertions(+), 74 deletions(-)

diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 9bfad2a..61261e0e 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -85,53 +85,56 @@ struct lowcore {
        __u64   mcck_enter_timer;               /* 0x02c0 */
        __u64   exit_timer;                     /* 0x02c8 */
        __u64   user_timer;                     /* 0x02d0 */
-       __u64   system_timer;                   /* 0x02d8 */
-       __u64   steal_timer;                    /* 0x02e0 */
-       __u64   last_update_timer;              /* 0x02e8 */
-       __u64   last_update_clock;              /* 0x02f0 */
-       __u64   int_clock;                      /* 0x02f8 */
-       __u64   mcck_clock;                     /* 0x0300 */
-       __u64   clock_comparator;               /* 0x0308 */
+       __u64   guest_timer;                    /* 0x02d8 */
+       __u64   system_timer;                   /* 0x02e0 */
+       __u64   hardirq_timer;                  /* 0x02e8 */
+       __u64   softirq_timer;                  /* 0x02f0 */
+       __u64   steal_timer;                    /* 0x02f8 */
+       __u64   last_update_timer;              /* 0x0300 */
+       __u64   last_update_clock;              /* 0x0308 */
+       __u64   int_clock;                      /* 0x0310 */
+       __u64   mcck_clock;                     /* 0x0318 */
+       __u64   clock_comparator;               /* 0x0320 */
 
        /* Current process. */
-       __u64   current_task;                   /* 0x0310 */
-       __u8    pad_0x318[0x320-0x318];         /* 0x0318 */
-       __u64   kernel_stack;                   /* 0x0320 */
+       __u64   current_task;                   /* 0x0328 */
+       __u8    pad_0x318[0x320-0x318];         /* 0x0330 */
+       __u64   kernel_stack;                   /* 0x0338 */
 
        /* Interrupt, panic and restart stack. */
-       __u64   async_stack;                    /* 0x0328 */
-       __u64   panic_stack;                    /* 0x0330 */
-       __u64   restart_stack;                  /* 0x0338 */
+       __u64   async_stack;                    /* 0x0340 */
+       __u64   panic_stack;                    /* 0x0348 */
+       __u64   restart_stack;                  /* 0x0350 */
 
        /* Restart function and parameter. */
-       __u64   restart_fn;                     /* 0x0340 */
-       __u64   restart_data;                   /* 0x0348 */
-       __u64   restart_source;                 /* 0x0350 */
+       __u64   restart_fn;                     /* 0x0358 */
+       __u64   restart_data;                   /* 0x0360 */
+       __u64   restart_source;                 /* 0x0368 */
 
        /* Address space pointer. */
-       __u64   kernel_asce;                    /* 0x0358 */
-       __u64   user_asce;                      /* 0x0360 */
+       __u64   kernel_asce;                    /* 0x0370 */
+       __u64   user_asce;                      /* 0x0378 */
 
        /*
         * The lpp and current_pid fields form a
         * 64-bit value that is set as program
         * parameter with the LPP instruction.
         */
-       __u32   lpp;                            /* 0x0368 */
-       __u32   current_pid;                    /* 0x036c */
+       __u32   lpp;                            /* 0x0380 */
+       __u32   current_pid;                    /* 0x0384 */
 
        /* SMP info area */
-       __u32   cpu_nr;                         /* 0x0370 */
-       __u32   softirq_pending;                /* 0x0374 */
-       __u64   percpu_offset;                  /* 0x0378 */
-       __u64   vdso_per_cpu_data;              /* 0x0380 */
-       __u64   machine_flags;                  /* 0x0388 */
-       __u32   preempt_count;                  /* 0x0390 */
-       __u8    pad_0x0394[0x0398-0x0394];      /* 0x0394 */
-       __u64   gmap;                           /* 0x0398 */
-       __u32   spinlock_lockval;               /* 0x03a0 */
-       __u32   fpu_flags;                      /* 0x03a4 */
-       __u8    pad_0x03a8[0x0400-0x03a8];      /* 0x03a8 */
+       __u32   cpu_nr;                         /* 0x0388 */
+       __u32   softirq_pending;                /* 0x038c */
+       __u64   percpu_offset;                  /* 0x0390 */
+       __u64   vdso_per_cpu_data;              /* 0x0398 */
+       __u64   machine_flags;                  /* 0x03a0 */
+       __u32   preempt_count;                  /* 0x03a8 */
+       __u8    pad_0x03ac[0x03b0-0x03ac];      /* 0x03ac */
+       __u64   gmap;                           /* 0x03b0 */
+       __u32   spinlock_lockval;               /* 0x03b8 */
+       __u32   fpu_flags;                      /* 0x03bc */
+       __u8    pad_0x03c0[0x0400-0x03c0];      /* 0x03c0 */
 
        /* Per cpu primary space access list */
        __u32   paste[16];                      /* 0x0400 */
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 9c00351..6f07907 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -111,7 +111,10 @@ struct thread_struct {
        unsigned int  acrs[NUM_ACRS];
         unsigned long ksp;              /* kernel stack pointer             */
        unsigned long user_timer;       /* task cputime in user space */
+       unsigned long guest_timer;      /* task cputime in kvm guest */
        unsigned long system_timer;     /* task cputime in kernel space */
+       unsigned long hardirq_timer;    /* task cputime in hardirq context */
+       unsigned long softirq_timer;    /* task cputime in softirq context */
        unsigned long sys_call_table;   /* system call table address */
        mm_segment_t mm_segment;
        unsigned long gmap_addr;        /* address of last gmap fault. */
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 6b246aa..0fdcaca 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -90,14 +90,23 @@ static void update_mt_scaling(void)
        __this_cpu_write(mt_scaling_jiffies, jiffies_64);
 }
 
+static inline u64 scale_vtime(u64 vtime)
+{
+       u64 mult = __this_cpu_read(mt_scaling_mult);
+       u64 div = __this_cpu_read(mt_scaling_div);
+
+       if (smp_cpu_mtid)
+               return vtime * mult / div;
+       return vtime;
+}
+
 /*
  * Update process times based on virtual cpu times stored by entry.S
  * to the lowcore fields user_timer, system_timer & steal_clock.
  */
 static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
 {
-       u64 timer, clock, user, system, steal;
-       u64 user_scaled, system_scaled;
+       u64 timer, clock, user, guest, system, hardirq, softirq, steal;
 
        timer = S390_lowcore.last_update_timer;
        clock = S390_lowcore.last_update_clock;
@@ -110,36 +119,57 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
 #endif
                : "=m" (S390_lowcore.last_update_timer),
                  "=m" (S390_lowcore.last_update_clock));
-       S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
-       S390_lowcore.steal_timer += S390_lowcore.last_update_clock - clock;
+       clock = S390_lowcore.last_update_clock - clock;
+       timer -= S390_lowcore.last_update_timer;
+
+       if ((tsk->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
+               S390_lowcore.guest_timer += timer;
+       else if (hardirq_count() - hardirq_offset)
+               S390_lowcore.hardirq_timer += timer;
+       else if (in_serving_softirq())
+               S390_lowcore.softirq_timer += timer;
+       else
+               S390_lowcore.system_timer += timer;
 
        /* Update MT utilization calculation */
        if (smp_cpu_mtid &&
            time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies)))
                update_mt_scaling();
 
+       /* Calculate cputime delta */
        user = S390_lowcore.user_timer - tsk->thread.user_timer;
-       S390_lowcore.steal_timer -= user;
        tsk->thread.user_timer = S390_lowcore.user_timer;
-
+       guest = S390_lowcore.guest_timer - tsk->thread.guest_timer;
+       tsk->thread.guest_timer = S390_lowcore.guest_timer;
        system = S390_lowcore.system_timer - tsk->thread.system_timer;
-       S390_lowcore.steal_timer -= system;
        tsk->thread.system_timer = S390_lowcore.system_timer;
+       hardirq = S390_lowcore.hardirq_timer - tsk->thread.hardirq_timer;
+       tsk->thread.hardirq_timer = S390_lowcore.hardirq_timer;
+       softirq = S390_lowcore.softirq_timer - tsk->thread.softirq_timer;
+       tsk->thread.softirq_timer = S390_lowcore.softirq_timer;
+       S390_lowcore.steal_timer +=
+               clock - user - guest - system - hardirq - softirq;
 
-       user_scaled = user;
-       system_scaled = system;
-       /* Do MT utilization scaling */
-       if (smp_cpu_mtid) {
-               u64 mult = __this_cpu_read(mt_scaling_mult);
-               u64 div = __this_cpu_read(mt_scaling_div);
+       /* Push account value */
+       if (user) {
+               account_user_time(tsk, user);
+               tsk->utimescaled += scale_vtime(user);
+       }
 
-               user_scaled = (user_scaled * mult) / div;
-               system_scaled = (system_scaled * mult) / div;
+       if (guest) {
+               account_guest_time(tsk, guest);
+               tsk->utimescaled += scale_vtime(guest);
        }
-       account_user_time(tsk, user);
-       tsk->utimescaled += user_scaled;
-       account_system_time(tsk, hardirq_offset, system);
-       tsk->stimescaled += system_scaled;
+
+       if (system)
+               account_system_index_scaled(tsk, system, scale_vtime(system),
+                                           CPUTIME_SYSTEM);
+       if (hardirq)
+               account_system_index_scaled(tsk, hardirq, scale_vtime(hardirq),
+                                           CPUTIME_IRQ);
+       if (softirq)
+               account_system_index_scaled(tsk, softirq, scale_vtime(softirq),
+                                           CPUTIME_SOFTIRQ);
 
        steal = S390_lowcore.steal_timer;
        if ((s64) steal > 0) {
@@ -147,16 +177,22 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
                account_steal_time(steal);
        }
 
-       return virt_timer_forward(user + system);
+       return virt_timer_forward(user + guest + system + hardirq + softirq);
 }
 
 void vtime_task_switch(struct task_struct *prev)
 {
        do_account_vtime(prev, 0);
        prev->thread.user_timer = S390_lowcore.user_timer;
+       prev->thread.guest_timer = S390_lowcore.guest_timer;
        prev->thread.system_timer = S390_lowcore.system_timer;
+       prev->thread.hardirq_timer = S390_lowcore.hardirq_timer;
+       prev->thread.softirq_timer = S390_lowcore.softirq_timer;
        S390_lowcore.user_timer = current->thread.user_timer;
+       S390_lowcore.guest_timer = current->thread.guest_timer;
        S390_lowcore.system_timer = current->thread.system_timer;
+       S390_lowcore.hardirq_timer = current->thread.hardirq_timer;
+       S390_lowcore.softirq_timer = current->thread.softirq_timer;
 }
 
 /*
@@ -176,32 +212,22 @@ void vtime_account_user(struct task_struct *tsk)
  */
 void vtime_account_irq_enter(struct task_struct *tsk)
 {
-       u64 timer, system, system_scaled;
+       u64 timer;
 
        timer = S390_lowcore.last_update_timer;
        S390_lowcore.last_update_timer = get_vtimer();
-       S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
-
-       /* Update MT utilization calculation */
-       if (smp_cpu_mtid &&
-           time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies)))
-               update_mt_scaling();
-
-       system = S390_lowcore.system_timer - tsk->thread.system_timer;
-       S390_lowcore.steal_timer -= system;
-       tsk->thread.system_timer = S390_lowcore.system_timer;
-       system_scaled = system;
-       /* Do MT utilization scaling */
-       if (smp_cpu_mtid) {
-               u64 mult = __this_cpu_read(mt_scaling_mult);
-               u64 div = __this_cpu_read(mt_scaling_div);
-
-               system_scaled = (system_scaled * mult) / div;
-       }
-       account_system_time(tsk, 0, system);
-       tsk->stimescaled += system_scaled;
-
-       virt_timer_forward(system);
+       timer -= S390_lowcore.last_update_timer;
+
+       if ((tsk->flags & PF_VCPU) && (irq_count() == 0))
+               S390_lowcore.guest_timer += timer;
+       else if (hardirq_count())
+               S390_lowcore.hardirq_timer += timer;
+       else if (in_serving_softirq())
+               S390_lowcore.softirq_timer += timer;
+       else
+               S390_lowcore.system_timer += timer;
+
+       virt_timer_forward(timer);
 }
 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 
-- 
2.7.4

Reply via email to