On Sat, 2012-12-29 at 17:42 +0100, Frederic Weisbecker wrote:
> While remotely reading the cputime of a task running on a
> full dynticks CPU, the values stored in the utime/stime fields
> of struct task_struct may be stale. They may be those
> of the last kernel <-> user transition time snapshot, and
> we need to add the tickless time spent since this snapshot.
> 
> To fix this, flush the cputime of the dynticks CPUs on
> kernel <-> user transition and record the time / context
> where we did this. Then on top of this snapshot and the current
> time, perform the fixup on the reader side from task_times()
> accessors.
> 
> FIXME: do the same for idle and guest time.
> 
> Signed-off-by: Frederic Weisbecker <fweis...@gmail.com>
> Cc: Alessio Igor Bogani <abog...@kernel.org>
> Cc: Andrew Morton <a...@linux-foundation.org>
> Cc: Chris Metcalf <cmetc...@tilera.com>
> Cc: Christoph Lameter <c...@linux.com>
> Cc: Geoff Levand <ge...@infradead.org>
> Cc: Gilad Ben Yossef <gi...@benyossef.com>
> Cc: Hakan Akkan <hakanak...@gmail.com>
> Cc: Ingo Molnar <mi...@kernel.org>
> Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
> Cc: Paul Gortmaker <paul.gortma...@windriver.com>
> Cc: Peter Zijlstra <pet...@infradead.org>
> Cc: Steven Rostedt <rost...@goodmis.org>
> Cc: Thomas Gleixner <t...@linutronix.de>
> ---
>  arch/s390/kernel/vtime.c      |    6 +-
>  include/asm-generic/cputime.h |    1 +
>  include/linux/hardirq.h       |    4 +-
>  include/linux/init_task.h     |   11 ++++
>  include/linux/sched.h         |   16 +++++
>  include/linux/vtime.h         |   40 +++++++-------
>  kernel/context_tracking.c     |    2 +-
>  kernel/fork.c                 |    6 ++
>  kernel/sched/cputime.c        |  123 ++++++++++++++++++++++++++++++-----------
>  kernel/softirq.c              |    6 +-
>  10 files changed, 154 insertions(+), 61 deletions(-)
> 
> diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
> index e84b8b6..ce9cc5a 100644
> --- a/arch/s390/kernel/vtime.c
> +++ b/arch/s390/kernel/vtime.c
> @@ -127,7 +127,7 @@ void vtime_account_user(struct task_struct *tsk)
>   * Update process times based on virtual cpu times stored by entry.S
>   * to the lowcore fields user_timer, system_timer & steal_clock.
>   */
> -void vtime_account(struct task_struct *tsk)
> +void vtime_account_irq_enter(struct task_struct *tsk)
>  {
>       struct thread_info *ti = task_thread_info(tsk);
>       u64 timer, system;
> @@ -145,10 +145,10 @@ void vtime_account(struct task_struct *tsk)
> 
>       virt_timer_forward(system);
>  }
> -EXPORT_SYMBOL_GPL(vtime_account);
> +EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
> 
>  void vtime_account_system(struct task_struct *tsk)
> -__attribute__((alias("vtime_account")));
> +__attribute__((alias("vtime_account_irq_enter")));
>  EXPORT_SYMBOL_GPL(vtime_account_system);
> 
>  void __kprobes vtime_stop_cpu(void)
> diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
> index 9a62937..3e704d5 100644
> --- a/include/asm-generic/cputime.h
> +++ b/include/asm-generic/cputime.h
> @@ -10,6 +10,7 @@ typedef unsigned long __nocast cputime_t;
>  #define cputime_to_jiffies(__ct)     (__force unsigned long)(__ct)
>  #define cputime_to_scaled(__ct)              (__ct)
>  #define jiffies_to_cputime(__hz)     (__force cputime_t)(__hz)
> +#define jiffies_to_scaled(__hz)              (__force cputime_t)(__hz)
> 
>  typedef u64 __nocast cputime64_t;
> 
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index 624ef3f..7105d5c 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -153,7 +153,7 @@ extern void rcu_nmi_exit(void);
>   */
>  #define __irq_enter()                                        \
>       do {                                            \
> -             vtime_account_irq_enter(current);       \
> +             account_irq_enter_time(current);        \
>               add_preempt_count(HARDIRQ_OFFSET);      \
>               trace_hardirq_enter();                  \
>       } while (0)
> @@ -169,7 +169,7 @@ extern void irq_enter(void);
>  #define __irq_exit()                                 \
>       do {                                            \
>               trace_hardirq_exit();                   \
> -             vtime_account_irq_exit(current);        \
> +             account_irq_exit_time(current);         \
>               sub_preempt_count(HARDIRQ_OFFSET);      \
>       } while (0)
> 
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 6d087c5..a6ef59f 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -10,6 +10,7 @@
>  #include <linux/pid_namespace.h>
>  #include <linux/user_namespace.h>
>  #include <linux/securebits.h>
> +#include <linux/seqlock.h>
>  #include <net/net_namespace.h>
> 
>  #ifdef CONFIG_SMP
> @@ -141,6 +142,15 @@ extern struct task_group root_task_group;
>  # define INIT_PERF_EVENTS(tsk)
>  #endif
> 
> +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> +# define INIT_VTIME(tsk)                                             \
> +     .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \
> +     .prev_jiffies = INITIAL_JIFFIES, /* CHECKME */          \
> +     .prev_jiffies_whence = JIFFIES_SYS,
> +#else
> +# define INIT_VTIME(tsk)
> +#endif
> +
>  #define INIT_TASK_COMM "swapper"
> 
>  /*
> @@ -210,6 +220,7 @@ extern struct task_group root_task_group;
>       INIT_TRACE_RECURSION                                            \
>       INIT_TASK_RCU_PREEMPT(tsk)                                      \
>       INIT_CPUSET_SEQ                                                 \
> +     INIT_VTIME(tsk)                                                 \
>  }
> 
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d57e20f..3bca36e 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1368,6 +1368,15 @@ struct task_struct {
>  #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
>       struct cputime prev_cputime;
>  #endif
> +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> +     seqlock_t vtime_seqlock;
> +     long prev_jiffies;
> +     enum {
> +             JIFFIES_SLEEPING = 0,
> +             JIFFIES_USER,
> +             JIFFIES_SYS,
> +     } prev_jiffies_whence;
> +#endif
>       unsigned long nvcsw, nivcsw; /* context switch counts */
>       struct timespec start_time;             /* monotonic time */
>       struct timespec real_start_time;        /* boot based time */
> @@ -1792,6 +1801,12 @@ static inline void put_task_struct(struct task_struct *t)
>               __put_task_struct(t);
>  }
> 
> +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> +extern void task_cputime(struct task_struct *t,
> +                      cputime_t *utime, cputime_t *stime);
> +extern void task_cputime_scaled(struct task_struct *t,
> +                             cputime_t *utimescaled, cputime_t *stimescaled);
> +#else
>  static inline void task_cputime(struct task_struct *t,
>                               cputime_t *utime, cputime_t *stime)
>  {
> @@ -1810,6 +1825,7 @@ static inline void task_cputime_scaled(struct task_struct *t,
>       if (stimescaled)
>               *stimescaled = t->stimescaled;
>  }
> +#endif
>  extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
>  extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
> 
> diff --git a/include/linux/vtime.h b/include/linux/vtime.h
> index e57020d..81c7d84 100644
> --- a/include/linux/vtime.h
> +++ b/include/linux/vtime.h
> @@ -9,52 +9,52 @@ extern void vtime_account_system(struct task_struct *tsk);
>  extern void vtime_account_system_irqsafe(struct task_struct *tsk);
>  extern void vtime_account_idle(struct task_struct *tsk);
>  extern void vtime_account_user(struct task_struct *tsk);
> -extern void vtime_account(struct task_struct *tsk);
> +extern void vtime_account_irq_enter(struct task_struct *tsk);
> 
> -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> -extern bool vtime_accounting(void);
> -#else
> +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
>  static inline bool vtime_accounting(void) { return true; }
>  #endif
> 
>  #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
> +
>  static inline void vtime_task_switch(struct task_struct *prev) { }
>  static inline void vtime_account_system(struct task_struct *tsk) { }
>  static inline void vtime_account_system_irqsafe(struct task_struct *tsk) { }
>  static inline void vtime_account_user(struct task_struct *tsk) { }
> -static inline void vtime_account(struct task_struct *tsk) { }
> +static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
>  static inline bool vtime_accounting(void) { return false; }
>  #endif
> 
>  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> -static inline void arch_vtime_task_switch(struct task_struct *tsk) { }
> +extern void arch_vtime_task_switch(struct task_struct *tsk);
> +extern void vtime_account_irq_exit(struct task_struct *tsk);
> +extern void vtime_user_enter(struct task_struct *tsk);
> +extern bool vtime_accounting(void);
> +#else
> +static inline void vtime_account_irq_exit(struct task_struct *tsk)
> +{
> +     /* On hard|softirq exit we always account to hard|softirq cputime */
> +     vtime_account_system(tsk);
> +}
> +static inline void vtime_enter_user(struct task_struct *tsk) { }

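I guess the stub above should be named "vtime_user_enter" rather than
"vtime_enter_user", to match the extern declaration in the
CONFIG_VIRT_CPU_ACCOUNTING_GEN branch and the call in user_enter()?
Otherwise, presumably, a build without CONFIG_VIRT_CPU_ACCOUNTING_GEN
has no vtime_user_enter() for user_enter() to call.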

Thanks, Zhong

>  #endif
> 
> +
>  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
>  extern void irqtime_account_irq(struct task_struct *tsk);
>  #else
>  static inline void irqtime_account_irq(struct task_struct *tsk) { }
>  #endif
> 
> -static inline void vtime_account_irq_enter(struct task_struct *tsk)
> +static inline void account_irq_enter_time(struct task_struct *tsk)
>  {
> -     /*
> -      * Hardirq can interrupt idle task anytime. So we need vtime_account()
> -      * that performs the idle check in CONFIG_VIRT_CPU_ACCOUNTING.
> -      * Softirq can also interrupt idle task directly if it calls
> -      * local_bh_enable(). Such case probably don't exist but we never know.
> -      * Ksoftirqd is not concerned because idle time is flushed on context
> -      * switch. Softirqs in the end of hardirqs are also not a problem because
> -      * the idle time is flushed on hardirq time already.
> -      */
> -     vtime_account(tsk);
> +     vtime_account_irq_enter(tsk);
>       irqtime_account_irq(tsk);
>  }
> 
> -static inline void vtime_account_irq_exit(struct task_struct *tsk)
> +static inline void account_irq_exit_time(struct task_struct *tsk)
>  {
> -     /* On hard|softirq exit we always account to hard|softirq cputime */
> -     vtime_account_system(tsk);
> +     vtime_account_irq_exit(tsk);
>       irqtime_account_irq(tsk);
>  }
> 
> diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
> index ca1e073..bd2f2fc 100644
> --- a/kernel/context_tracking.c
> +++ b/kernel/context_tracking.c
> @@ -56,7 +56,7 @@ void user_enter(void)
>       local_irq_save(flags);
>       if (__this_cpu_read(context_tracking.active) &&
>           __this_cpu_read(context_tracking.state) != IN_USER) {
> -             vtime_account_system(current);
> +             vtime_user_enter(current);
>               /*
>                * At this stage, only low level arch entry code remains and
>                * then we'll run in userspace. We can assume there won't be
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 8e934d2..62892a5 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1225,6 +1225,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
>       p->prev_cputime.utime = p->prev_cputime.stime = 0;
>  #endif
> +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> +     seqlock_init(&p->vtime_seqlock);
> +     p->prev_jiffies_whence = JIFFIES_SLEEPING; /*CHECKME: idle tasks? */
> +     p->prev_jiffies = jiffies;
> +#endif
> +
>  #if defined(SPLIT_RSS_COUNTING)
>       memset(&p->rss_stat, 0, sizeof(p->rss_stat));
>  #endif
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index 0603671..bad19b2 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -484,7 +484,7 @@ void vtime_task_switch(struct task_struct *prev)
>   * vtime_account().
>   */
>  #ifndef __ARCH_HAS_VTIME_ACCOUNT
> -void vtime_account(struct task_struct *tsk)
> +void vtime_account_irq_enter(struct task_struct *tsk)
>  {
>       if (!in_interrupt()) {
>               /*
> @@ -505,7 +505,7 @@ void vtime_account(struct task_struct *tsk)
>       }
>       vtime_account_system(tsk);
>  }
> -EXPORT_SYMBOL_GPL(vtime_account);
> +EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
>  #endif /* __ARCH_HAS_VTIME_ACCOUNT */
>  #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
> 
> @@ -616,41 +616,67 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
>  #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
> 
>  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> -static DEFINE_PER_CPU(long, last_jiffies) = INITIAL_JIFFIES;
> -
> -static cputime_t get_vtime_delta(void)
> +static cputime_t get_vtime_delta(struct task_struct *tsk)
>  {
>       long delta;
> 
> -     delta = jiffies - __this_cpu_read(last_jiffies);
> -     __this_cpu_add(last_jiffies, delta);
> +     delta = jiffies - tsk->prev_jiffies;
> +     tsk->prev_jiffies += delta;
> 
>       return jiffies_to_cputime(delta);
>  }
> 
> -void vtime_account_system(struct task_struct *tsk)
> +static void __vtime_account_system(struct task_struct *tsk)
>  {
> -     cputime_t delta_cpu = get_vtime_delta();
> +     cputime_t delta_cpu = get_vtime_delta(tsk);
> 
>       account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
>  }
> 
> +void vtime_account_system(struct task_struct *tsk)
> +{
> +     write_seqlock(&tsk->vtime_seqlock);
> +     __vtime_account_system(tsk);
> +     write_sequnlock(&tsk->vtime_seqlock);
> +}
> +
> +void vtime_account_irq_exit(struct task_struct *tsk)
> +{
> +     write_seqlock(&tsk->vtime_seqlock);
> +     if (context_tracking_in_user())
> +             tsk->prev_jiffies_whence = JIFFIES_USER;
> +     __vtime_account_system(tsk);
> +     write_sequnlock(&tsk->vtime_seqlock);
> +}
> +
>  void vtime_account_user(struct task_struct *tsk)
>  {
> -     cputime_t delta_cpu = get_vtime_delta();
> +     cputime_t delta_cpu = get_vtime_delta(tsk);
> 
>       /*
>        * This is an unfortunate hack: if we flush user time only on
>        * irq entry, we miss the jiffies update and the time is spuriously
>        * accounted to system time.
>        */
> -     if (context_tracking_in_user())
> +     if (context_tracking_in_user()) {
> +             write_seqlock(&tsk->vtime_seqlock);
> +             tsk->prev_jiffies_whence = JIFFIES_SYS;
>               account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
> +             write_sequnlock(&tsk->vtime_seqlock);
> +     }
> +}
> +
> +void vtime_user_enter(struct task_struct *tsk)
> +{
> +     write_seqlock(&tsk->vtime_seqlock);
> +     tsk->prev_jiffies_whence = JIFFIES_USER;
> +     __vtime_account_system(tsk);
> +     write_sequnlock(&tsk->vtime_seqlock);
>  }
> 
>  void vtime_account_idle(struct task_struct *tsk)
>  {
> -     cputime_t delta_cpu = get_vtime_delta();
> +     cputime_t delta_cpu = get_vtime_delta(tsk);
> 
>       account_idle_time(delta_cpu);
>  }
> @@ -660,31 +686,64 @@ bool vtime_accounting(void)
>       return context_tracking_active();
>  }
> 
> -static int __cpuinit vtime_cpu_notify(struct notifier_block *self,
> -                                   unsigned long action, void *hcpu)
> +void arch_vtime_task_switch(struct task_struct *prev)
>  {
> -     long cpu = (long)hcpu;
> -     long *last_jiffies_cpu = per_cpu_ptr(&last_jiffies, cpu);
> +     write_seqlock(&prev->vtime_seqlock);
> +     prev->prev_jiffies_whence = JIFFIES_SLEEPING;
> +     write_sequnlock(&prev->vtime_seqlock);
> 
> -     switch (action) {
> -     case CPU_UP_PREPARE:
> -     case CPU_UP_PREPARE_FROZEN:
> -             /*
> -              * CHECKME: ensure that's visible by the CPU
> -              * once it wakes up
> -              */
> -             *last_jiffies_cpu = jiffies;
> -     default:
> -             break;
> -     }
> +     write_seqlock(&current->vtime_seqlock);
> +     current->prev_jiffies_whence = JIFFIES_SYS;
> +     current->prev_jiffies = jiffies;
> +     write_sequnlock(&current->vtime_seqlock);
> +}
> +
> +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
> +{
> +     unsigned int seq;
> +     long delta;
> +
> +     do {
> +             seq = read_seqbegin(&t->vtime_seqlock);
> +
> +             *utime = t->utime;
> +             *stime = t->stime;
> +
> +             if (t->prev_jiffies_whence == JIFFIES_SLEEPING || 
> +                 is_idle_task(t))
> +                     continue;
> 
> -     return NOTIFY_OK;
> +             delta = jiffies - t->prev_jiffies;
> +
> +             if (t->prev_jiffies_whence == JIFFIES_USER)
> +                     *utime += delta;
> +             else if (t->prev_jiffies_whence == JIFFIES_SYS)
> +                     *stime += delta;
> +     } while (read_seqretry(&t->vtime_seqlock, seq));
>  }
> 
> -static int __init init_vtime(void)
> +void task_cputime_scaled(struct task_struct *t,
> +                      cputime_t *utimescaled, cputime_t *stimescaled)
>  {
> -     cpu_notifier(vtime_cpu_notify, 0);
> -     return 0;
> +     unsigned int seq;
> +     long delta;
> +
> +     do {
> +             seq = read_seqbegin(&t->vtime_seqlock);
> +
> +             *utimescaled = t->utimescaled;
> +             *stimescaled = t->stimescaled;
> +
> +             if (t->prev_jiffies_whence == JIFFIES_SLEEPING || 
> +                 is_idle_task(t))
> +                     continue;
> +
> +             delta = jiffies - t->prev_jiffies;
> +
> +             if (t->prev_jiffies_whence == JIFFIES_USER)
> +                     *utimescaled += jiffies_to_scaled(delta);
> +             else if (t->prev_jiffies_whence == JIFFIES_SYS)
> +                     *stimescaled += jiffies_to_scaled(delta);
> +     } while (read_seqretry(&t->vtime_seqlock, seq));
>  }
> -early_initcall(init_vtime);
>  #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index ed567ba..f5cc25f 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
>       current->flags &= ~PF_MEMALLOC;
> 
>       pending = local_softirq_pending();
> -     vtime_account_irq_enter(current);
> +     account_irq_enter_time(current);
> 
>       __local_bh_disable((unsigned long)__builtin_return_address(0),
>                               SOFTIRQ_OFFSET);
> @@ -272,7 +272,7 @@ restart:
> 
>       lockdep_softirq_exit();
> 
> -     vtime_account_irq_exit(current);
> +     account_irq_exit_time(current);
>       __local_bh_enable(SOFTIRQ_OFFSET);
>       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
>  }
> @@ -341,7 +341,7 @@ static inline void invoke_softirq(void)
>   */
>  void irq_exit(void)
>  {
> -     vtime_account_irq_exit(current);
> +     account_irq_exit_time(current);
>       trace_hardirq_exit();
>       sub_preempt_count(IRQ_EXIT_OFFSET);
>       if (!in_interrupt() && local_softirq_pending())


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to