On Wed, 13 Aug 2014 19:22:30 +0200
Oleg Nesterov <o...@redhat.com> wrote:

> On 08/12, Rik van Riel wrote:
> >
> > Any other ideas?
> 
> To simplify, let's suppose that we only need sum_exec_runtime.
> 
> Perhaps we can do something like this

That would probably work, indeed.

However, it turns out that a seqcount doesn't look too bad either.

The following patch has only been compile-tested so far; I am about to
give it a real test.

I believe k_getrusage can probably be changed in the same way.

---8<---

Subject: time,signal: protect cpu use statistics with seqcount

Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability
issues on large systems, due to both functions being serialized with a
lock.

The lock protects against reporting a wrong value when a thread in the
task group exits and its statistics are folded into the signal struct;
without it, the exited task's statistics could be counted twice (or not
at all).

Protecting that with a lock results in times() and clock_gettime() being
completely serialized on large systems.

This can be fixed by using a seqcount around the events that gather and
propagate statistics. As an additional benefit, the protection code can
be moved into thread_group_cputime, slightly simplifying the calling
functions.

This way the statistics reporting code can run lockless.

Signed-off-by: Rik van Riel <r...@redhat.com>
---
 include/linux/sched.h          |  1 +
 kernel/exit.c                  |  4 ++++
 kernel/fork.c                  |  1 +
 kernel/sched/cputime.c         | 36 +++++++++++++++++++++---------------
 kernel/sys.c                   |  2 --
 kernel/time/posix-cpu-timers.c |  9 ++++-----
 6 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 857ba40..5670d33 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -461,6 +461,7 @@ struct sighand_struct {
        atomic_t                count;
        struct k_sigaction      action[_NSIG];
        spinlock_t              siglock;
+       seqcount_t              stats_seq; /* write nests inside spinlock */
        wait_queue_head_t       signalfd_wqh;
 };
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 32c58f7..019c263 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -126,6 +126,7 @@ static void __exit_signal(struct task_struct *tsk)
                 * will have been the last reference on the signal_struct.
                 */
                task_cputime(tsk, &utime, &stime);
+               write_seqcount_begin(&sighand->stats_seq);
                sig->utime += utime;
                sig->stime += stime;
                sig->gtime += task_gtime(tsk);
@@ -137,6 +138,7 @@ static void __exit_signal(struct task_struct *tsk)
                sig->oublock += task_io_get_oublock(tsk);
                task_io_accounting_add(&sig->ioac, &tsk->ioac);
                sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
+               write_seqcount_end(&sighand->stats_seq);
        }
 
        sig->nr_threads--;
@@ -1041,6 +1043,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                 */
                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
                spin_lock_irq(&p->real_parent->sighand->siglock);
+               write_seqcount_begin(&p->real_parent->sighand->stats_seq);
                psig = p->real_parent->signal;
                sig = p->signal;
                psig->cutime += tgutime + sig->cutime;
@@ -1065,6 +1068,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                        psig->cmaxrss = maxrss;
                task_io_accounting_add(&psig->ioac, &p->ioac);
                task_io_accounting_add(&psig->ioac, &sig->ioac);
+               write_seqcount_end(&p->real_parent->sighand->stats_seq);
                spin_unlock_irq(&p->real_parent->sighand->siglock);
        }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1380d8a..4681694 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1749,6 +1749,7 @@ static void sighand_ctor(void *data)
        struct sighand_struct *sighand = data;
 
        spin_lock_init(&sighand->siglock);
+       seqcount_init(&sighand->stats_seq);
        init_waitqueue_head(&sighand->signalfd_wqh);
 }
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06..370fd67 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -286,25 +286,34 @@ static __always_inline bool steal_account_process_tick(void)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 {
        struct signal_struct *sig = tsk->signal;
+       struct sighand_struct *sighand;
        cputime_t utime, stime;
        struct task_struct *t;
-
-       times->utime = sig->utime;
-       times->stime = sig->stime;
-       times->sum_exec_runtime = sig->sum_sched_runtime;
+       int seq;
 
        rcu_read_lock();
-       /* make sure we can trust tsk->thread_group list */
-       if (!likely(pid_alive(tsk)))
+       sighand = rcu_dereference(tsk->sighand);
+       if (unlikely(!sighand))
                goto out;
 
-       t = tsk;
        do {
-               task_cputime(t, &utime, &stime);
-               times->utime += utime;
-               times->stime += stime;
-               times->sum_exec_runtime += task_sched_runtime(t);
-       } while_each_thread(tsk, t);
+               seq = read_seqcount_begin(&sighand->stats_seq);
+               times->utime = sig->utime;
+               times->stime = sig->stime;
+               times->sum_exec_runtime = sig->sum_sched_runtime;
+
+               /* make sure we can trust tsk->thread_group list */
+               if (!likely(pid_alive(tsk)))
+                       goto out;
+
+               t = tsk;
+               do {
+                       task_cputime(t, &utime, &stime);
+                       times->utime += utime;
+                       times->stime += stime;
+                       times->sum_exec_runtime += task_sched_runtime(t);
+               } while_each_thread(tsk, t);
+       } while (read_seqcount_retry(&sighand->stats_seq, seq));
 out:
        rcu_read_unlock();
 }
@@ -617,9 +626,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
        cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
 
-/*
- * Must be called with siglock held.
- */
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
        struct task_cputime cputime;
diff --git a/kernel/sys.c b/kernel/sys.c
index ce81291..b663664 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms)
 {
        cputime_t tgutime, tgstime, cutime, cstime;
 
-       spin_lock_irq(&current->sighand->siglock);
        thread_group_cputime_adjusted(current, &tgutime, &tgstime);
        cutime = current->signal->cutime;
        cstime = current->signal->cstime;
-       spin_unlock_irq(&current->sighand->siglock);
        tms->tms_utime = cputime_to_clock_t(tgutime);
        tms->tms_stime = cputime_to_clock_t(tgstime);
        tms->tms_cutime = cputime_to_clock_t(cutime);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b89464..1bde818 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -781,14 +781,14 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
                cpu_clock_sample(timer->it_clock, p, &now);
        } else {
                struct sighand_struct *sighand;
-               unsigned long flags;
 
                /*
                 * Protect against sighand release/switch in exit/exec and
                 * also make timer sampling safe if it ends up calling
                 * thread_group_cputime().
                 */
-               sighand = lock_task_sighand(p, &flags);
+               rcu_read_lock();
+               sighand = rcu_dereference(p->sighand);
                if (unlikely(sighand == NULL)) {
                        /*
                         * The process has been reaped.
@@ -798,10 +798,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
                        timer->it.cpu.expires = 0;
                        sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
                                           &itp->it_value);
-               } else {
+               } else
                        cpu_timer_sample_group(timer->it_clock, p, &now);
-                       unlock_task_sighand(p, &flags);
-               }
+               rcu_read_unlock();
        }
 
        if (now < timer->it.cpu.expires) {
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to