real_parent->kin_lock protects:
        child's threads, place in sibling list etc.

parent->kin_lock protects:
        ptrace operations

Note: sighand is still stable under tasklist_lock.

Signed-off-by: Kirill Tkhai <[email protected]>
---
 fs/exec.c                |   16 ++++++++---
 fs/proc/array.c          |    4 +--
 kernel/exit.c            |   68 +++++++++++++++++++++++++++++++++++++---------
 kernel/fork.c            |    4 +++
 kernel/ptrace.c          |   45 ++++++++++++++++++++++++++----
 kernel/signal.c          |    9 ++++++
 kernel/sys.c             |    8 ++++-
 mm/oom_kill.c            |    9 ++++--
 security/keys/keyctl.c   |    4 +--
 security/selinux/hooks.c |    4 +--
 10 files changed, 136 insertions(+), 35 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 1977c2a..4de7770 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,14 +932,16 @@ static int de_thread(struct task_struct *tsk)
                for (;;) {
                        threadgroup_change_begin(tsk);
                        write_lock_irq(&tasklist_lock);
+                       write_real_parent_lock(tsk);
                        /*
-                        * Do this under tasklist_lock to ensure that
-                        * exit_notify() can't miss ->group_exit_task
+                        * Do this under real_parent's kin_lock to ensure
+                        * that exit_notify() can't miss ->group_exit_task.
                         */
                        sig->notify_count = -1;
                        if (likely(leader->exit_state))
                                break;
                        __set_current_state(TASK_KILLABLE);
+                       write_real_parent_unlock(tsk);
                        write_unlock_irq(&tasklist_lock);
                        threadgroup_change_end(tsk);
                        schedule();
@@ -995,9 +997,13 @@ static int de_thread(struct task_struct *tsk)
                 * We are going to release_task()->ptrace_unlink() silently,
                 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
                 * the tracer wont't block again waiting for this thread.
+                *
+                * Also, we not need hold leader->parent->kin_lock, because
+                * it can't change till we release real_parent->kin_lock.
                 */
                if (unlikely(leader->ptrace))
                        __wake_up_parent(leader, leader->parent);
+               write_real_parent_unlock(tsk);
                write_unlock_irq(&tasklist_lock);
                threadgroup_change_end(tsk);
 
@@ -1029,9 +1035,11 @@ static int de_thread(struct task_struct *tsk)
                       sizeof(newsighand->action));
 
                write_lock_irq(&tasklist_lock);
+               write_parent_and_real_parent_lock(tsk);
                spin_lock(&oldsighand->siglock);
                rcu_assign_pointer(tsk->sighand, newsighand);
                spin_unlock(&oldsighand->siglock);
+               write_parent_and_real_parent_unlock(tsk);
                write_unlock_irq(&tasklist_lock);
 
                __cleanup_sighand(oldsighand);
@@ -1042,10 +1050,10 @@ static int de_thread(struct task_struct *tsk)
 
 killed:
        /* protects against exit_notify() and __exit_signal() */
-       read_lock(&tasklist_lock);
+       read_real_parent_lock(tsk);
        sig->group_exit_task = NULL;
        sig->notify_count = 0;
-       read_unlock(&tasklist_lock);
+       read_real_parent_unlock(tsk);
        return -EAGAIN;
 }
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index e2f21c0..820611cc 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -588,7 +588,7 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, 
loff_t pos)
        if (!task)
                goto put_start;
 
-       read_lock(&tasklist_lock);
+       read_lock(&start->kin_lock);
        if (task->real_parent == start && !(list_empty(&task->sibling))) {
                put_task_struct(task);
                if (!list_is_last(&task->sibling, &start->children)) {
@@ -623,7 +623,7 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, 
loff_t pos)
        }
 
 unlock:
-       read_unlock(&tasklist_lock);
+       read_unlock(&start->kin_lock);
 put_start:
        put_task_struct(start);
 out:
diff --git a/kernel/exit.c b/kernel/exit.c
index 5e82787..a268093 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -180,6 +180,7 @@ void release_task(struct task_struct *p)
        proc_flush_task(p);
 
        write_lock_irq(&tasklist_lock);
+       write_parent_and_real_parent_lock(p);
        ptrace_release_task(p);
        __exit_signal(p);
 
@@ -202,6 +203,7 @@ void release_task(struct task_struct *p)
                        leader->exit_state = EXIT_DEAD;
        }
 
+       write_parent_and_real_parent_unlock(p);
        write_unlock_irq(&tasklist_lock);
        release_thread(p);
        call_rcu(&p->rcu, delayed_put_task_struct);
@@ -317,43 +319,50 @@ void mm_update_next_owner(struct mm_struct *mm)
                return;
        }
 
-       read_lock(&tasklist_lock);
        /*
         * Search in the children
         */
+       read_lock(&p->kin_lock);
        list_for_each_entry(c, &p->children, sibling) {
                if (c->mm == mm) {
                        get_task_struct(c);
+                       read_unlock(&p->kin_lock);
                        goto assign_new_owner;
                }
        }
+       read_unlock(&p->kin_lock);
 
        /*
         * Search in the siblings
         */
+       read_real_parent_lock(p);
        list_for_each_entry(c, &p->real_parent->children, sibling) {
                if (c->mm == mm) {
                        get_task_struct(c);
+                       read_real_parent_unlock(p);
                        goto assign_new_owner;
                }
        }
+       read_real_parent_unlock(p);
 
        /*
         * Search through everything else, we should not get here often.
         */
+       rcu_read_lock();
        for_each_process(g) {
                if (g->flags & PF_KTHREAD)
                        continue;
                for_each_thread(g, c) {
                        if (c->mm == mm) {
                                get_task_struct(c);
+                               rcu_read_unlock();
                                goto assign_new_owner;
                        }
                        if (c->mm)
                                break;
                }
        }
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
        /*
         * We found no owner yet mm_users > 1: this implies that we are
         * most likely racing with swapoff (try_to_unuse()) or /proc or
@@ -369,11 +378,6 @@ void mm_update_next_owner(struct mm_struct *mm)
         * We always want mm->owner->mm == mm
         */
        task_lock(c);
-       /*
-        * Delay read_unlock() till we have the task_lock()
-        * to ensure that c does not slip away underneath us
-        */
-       read_unlock(&tasklist_lock);
        if (c->mm != mm) {
                task_unlock(c);
                put_task_struct(c);
@@ -470,9 +474,13 @@ static void check_pid_ns_reaper_exit(struct task_struct 
*father)
                return;
 
        write_lock(&pid_ns->cr_lock);
+       read_real_parent_lock(father);
+
        reaper = find_alive_thread(father);
        if (reaper)
                pid_ns->child_reaper = reaper;
+
+       read_real_parent_unlock(father);
        write_unlock(&pid_ns->cr_lock);
 
        if (reaper)
@@ -494,13 +502,22 @@ static void check_pid_ns_reaper_exit(struct task_struct 
*father)
  *    child_subreaper for its children (like a service manager)
  * 3. give it to the init process (PID 1) in our pid namespace
  */
-static struct task_struct *find_new_reaper(struct task_struct *father)
+static struct task_struct *find_double_lock_new_reaper(struct task_struct 
*father)
 {
        struct task_struct *thread, *reaper, *child_reaper;
 
+       rcu_read_lock();
+again:
        thread = find_alive_thread(father);
-       if (thread)
+       if (thread) {
+               double_write_lock(&father->kin_lock, &thread->kin_lock);
+               if (thread->flags & PF_EXITING) {
+                       double_write_unlock(&father->kin_lock, 
&thread->kin_lock);
+                       goto again;
+               }
+               rcu_read_unlock();
                return thread;
+       }
 
        child_reaper = task_active_pid_ns(father)->child_reaper;
 
@@ -518,12 +535,26 @@ static struct task_struct *find_new_reaper(struct 
task_struct *father)
                                break;
                        if (!reaper->signal->is_child_subreaper)
                                continue;
+find_again:
                        thread = find_alive_thread(reaper);
-                       if (thread)
-                               return thread;
+                       if (thread) {
+                               double_write_lock(&father->kin_lock,
+                                                 &thread->kin_lock);
+                               if (!(thread->flags & PF_EXITING)) {
+                                       rcu_read_unlock();
+                                       return thread;
+                               }
+                               double_write_unlock(&father->kin_lock,
+                                                   &thread->kin_lock);
+                               goto find_again;
+                       }
                }
        }
 
+       rcu_read_unlock();
+
+       double_write_lock(&child_reaper->kin_lock, &father->kin_lock);
+
        return child_reaper;
 }
 
@@ -569,11 +600,18 @@ static void forget_original_parent(struct task_struct 
*father,
 
        /* Can drop and reacquire tasklist_lock */
        check_pid_ns_reaper_exit(father);
-       if (list_empty(&father->children))
+
+       /* read_lock() guarantees concurrent thread sees our PF_EXITING */
+       read_lock(&father->kin_lock);
+       if (list_empty(&father->children)) {
+               read_unlock(&father->kin_lock);
                return;
+       }
+       read_unlock(&father->kin_lock);
 
        read_lock(&task_active_pid_ns(father)->cr_lock);
-       reaper = find_new_reaper(father);
+       reaper = find_double_lock_new_reaper(father);
+
        list_for_each_entry(p, &father->children, sibling) {
                for_each_thread(p, t) {
                        t->real_parent = reaper;
@@ -592,6 +630,8 @@ static void forget_original_parent(struct task_struct 
*father,
                        reparent_leader(father, p, dead);
        }
        list_splice_tail_init(&father->children, &reaper->children);
+
+       double_write_unlock(&reaper->kin_lock, &father->kin_lock);
        read_unlock(&task_active_pid_ns(father)->cr_lock);
 }
 
@@ -608,6 +648,7 @@ static void exit_notify(struct task_struct *tsk, int 
group_dead)
        write_lock_irq(&tasklist_lock);
        forget_original_parent(tsk, &dead);
 
+       write_parent_and_real_parent_lock(tsk);
        if (group_dead)
                kill_orphaned_pgrp(tsk->group_leader, NULL);
 
@@ -631,6 +672,7 @@ static void exit_notify(struct task_struct *tsk, int 
group_dead)
        /* mt-exec, de_thread() is waiting for group leader */
        if (unlikely(tsk->signal->notify_count < 0))
                wake_up_process(tsk->signal->group_exit_task);
+       write_parent_and_real_parent_unlock(tsk);
        write_unlock_irq(&tasklist_lock);
 
        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
diff --git a/kernel/fork.c b/kernel/fork.c
index ee389ba..63e225b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1524,9 +1524,11 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
 
        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
+               write_real_parent_lock(current);
                p->real_parent = current->real_parent;
                p->parent_exec_id = current->parent_exec_id;
        } else {
+               write_lock(&current->kin_lock);
                p->real_parent = current;
                p->parent_exec_id = current->self_exec_id;
        }
@@ -1550,6 +1552,7 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
        recalc_sigpending();
        if (signal_pending(current)) {
                spin_unlock(&current->sighand->siglock);
+               write_real_parent_unlock(p);
                write_unlock_irq(&tasklist_lock);
                retval = -ERESTARTNOINTR;
                goto bad_fork_free_pid;
@@ -1591,6 +1594,7 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
 
        total_forks++;
        spin_unlock(&current->sighand->siglock);
+       write_real_parent_unlock(p);
        syscall_tracepoint_update(p);
        write_unlock_irq(&tasklist_lock);
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e05..817288d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,6 +181,7 @@ static int ptrace_check_attach(struct task_struct *child, 
bool ignore_state)
         * be changed by us so it's not changing right after this.
         */
        read_lock(&tasklist_lock);
+       read_parent_lock(child);
        if (child->ptrace && child->parent == current) {
                WARN_ON(child->state == __TASK_TRACED);
                /*
@@ -190,6 +191,7 @@ static int ptrace_check_attach(struct task_struct *child, 
bool ignore_state)
                if (ignore_state || ptrace_freeze_traced(child))
                        ret = 0;
        }
+       read_parent_unlock(child);
        read_unlock(&tasklist_lock);
 
        if (!ret && !ignore_state) {
@@ -275,6 +277,7 @@ static int ptrace_attach(struct task_struct *task, long 
request,
                         unsigned long flags)
 {
        bool seize = (request == PTRACE_SEIZE);
+       struct task_struct *old_parent;
        int retval;
 
        retval = -EIO;
@@ -312,11 +315,13 @@ static int ptrace_attach(struct task_struct *task, long 
request,
                goto unlock_creds;
 
        write_lock_irq(&tasklist_lock);
+       write_parent_and_given_lock(task, current);
+       old_parent = task->parent;
        retval = -EPERM;
-       if (unlikely(task->exit_state))
-               goto unlock_tasklist;
-       if (task->ptrace)
-               goto unlock_tasklist;
+       if (unlikely(task->exit_state) || task->ptrace) {
+               write_unlock(&old_parent->kin_lock);
+               goto unlock_current;
+       }
 
        if (seize)
                flags |= PT_SEIZED;
@@ -327,6 +332,7 @@ static int ptrace_attach(struct task_struct *task, long 
request,
        task->ptrace = flags;
 
        __ptrace_link(task, current);
+       write_unlock(&old_parent->kin_lock);
 
        /* SEIZE doesn't trap tracee on attach */
        if (!seize)
@@ -358,7 +364,8 @@ static int ptrace_attach(struct task_struct *task, long 
request,
        spin_unlock(&task->sighand->siglock);
 
        retval = 0;
-unlock_tasklist:
+unlock_current:
+       write_unlock(&current->kin_lock);
        write_unlock_irq(&tasklist_lock);
 unlock_creds:
        mutex_unlock(&task->signal->cred_guard_mutex);
@@ -383,6 +390,7 @@ static int ptrace_traceme(void)
        int ret = -EPERM;
 
        write_lock_irq(&tasklist_lock);
+       write_parent_lock(current);
        /* Are we already being traced? */
        if (!current->ptrace) {
                ret = security_ptrace_traceme(current->parent);
@@ -392,10 +400,12 @@ static int ptrace_traceme(void)
                 * pretend ->real_parent untraces us right after return.
                 */
                if (!ret && !(current->real_parent->flags & PF_EXITING)) {
+                       BUG_ON(current->parent != current->real_parent);
                        current->ptrace = PT_PTRACED;
                        __ptrace_link(current, current->real_parent);
                }
        }
+       write_parent_unlock(current);
        write_unlock_irq(&tasklist_lock);
 
        return ret;
@@ -456,6 +466,7 @@ static bool __ptrace_detach(struct task_struct *tracer, 
struct task_struct *p)
 
 static int ptrace_detach(struct task_struct *child, unsigned int data)
 {
+       struct task_struct *old_parent;
        if (!valid_signal(data))
                return -EIO;
 
@@ -464,6 +475,8 @@ static int ptrace_detach(struct task_struct *child, 
unsigned int data)
        clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 
        write_lock_irq(&tasklist_lock);
+       write_parent_and_real_parent_lock(child);
+       old_parent = child->parent;
        /*
         * We rely on ptrace_freeze_traced(). It can't be killed and
         * untraced by another thread, it can't be a zombie.
@@ -475,6 +488,9 @@ static int ptrace_detach(struct task_struct *child, 
unsigned int data)
         */
        child->exit_code = data;
        __ptrace_detach(current, child);
+       if (old_parent != child->real_parent)
+               write_unlock(&old_parent->kin_lock);
+       write_real_parent_unlock(child);
        write_unlock_irq(&tasklist_lock);
 
        proc_ptrace_connector(child, PTRACE_DETACH);
@@ -488,15 +504,30 @@ static int ptrace_detach(struct task_struct *child, 
unsigned int data)
  */
 void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
 {
-       struct task_struct *p, *n;
+       struct task_struct *p;
+
+       rcu_read_lock();
+       for (;;) {
+               p = list_first_or_null_rcu(&tracer->ptraced, struct task_struct,
+                                          ptrace_entry);
+               if (!p)
+                       break;
+
+               write_parent_and_real_parent_lock(p);
+               if (p->parent != tracer) {
+                       write_parent_and_real_parent_unlock(p);
+                       continue;
+               }
 
-       list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
                if (unlikely(p->ptrace & PT_EXITKILL))
                        send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
 
                if (__ptrace_detach(tracer, p))
                        list_add(&p->ptrace_entry, dead);
+
+               write_parent_and_real_parent_unlock(p);
        }
+       rcu_read_unlock();
 }
 
 int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user 
*dst, int len)
diff --git a/kernel/signal.c b/kernel/signal.c
index f19833b..8288019 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1882,6 +1882,7 @@ static void ptrace_stop(int exit_code, int why, int 
clear_code, siginfo_t *info)
 
        spin_unlock_irq(&current->sighand->siglock);
        read_lock(&tasklist_lock);
+       read_parent_and_real_parent_lock(current);
        if (may_ptrace_stop()) {
                /*
                 * Notify parents of the stop.
@@ -1904,6 +1905,7 @@ static void ptrace_stop(int exit_code, int why, int 
clear_code, siginfo_t *info)
                 * XXX: implement read_unlock_no_resched().
                 */
                preempt_disable();
+               read_parent_and_real_parent_unlock(current);
                read_unlock(&tasklist_lock);
                preempt_enable_no_resched();
                freezable_schedule();
@@ -1925,6 +1927,7 @@ static void ptrace_stop(int exit_code, int why, int 
clear_code, siginfo_t *info)
                __set_current_state(TASK_RUNNING);
                if (clear_code)
                        current->exit_code = 0;
+               read_parent_and_real_parent_unlock(current);
                read_unlock(&tasklist_lock);
        }
 
@@ -2079,7 +2082,9 @@ static bool do_signal_stop(int signr)
                 */
                if (notify) {
                        read_lock(&tasklist_lock);
+                       read_parent_and_real_parent_lock(current);
                        do_notify_parent_cldstop(current, false, notify);
+                       read_parent_and_real_parent_unlock(current);
                        read_unlock(&tasklist_lock);
                }
 
@@ -2225,11 +2230,13 @@ int get_signal(struct ksignal *ksig)
                 * a duplicate.
                 */
                read_lock(&tasklist_lock);
+               read_parent_and_real_parent_lock(current->group_leader);
                do_notify_parent_cldstop(current, false, why);
 
                if (ptrace_reparented(current->group_leader))
                        do_notify_parent_cldstop(current->group_leader,
                                                true, why);
+               read_parent_and_real_parent_unlock(current->group_leader);
                read_unlock(&tasklist_lock);
 
                goto relock;
@@ -2476,7 +2483,9 @@ void exit_signals(struct task_struct *tsk)
         */
        if (unlikely(group_stop)) {
                read_lock(&tasklist_lock);
+               read_parent_and_real_parent_lock(tsk);
                do_notify_parent_cldstop(tsk, false, group_stop);
+               read_parent_and_real_parent_unlock(tsk);
                read_unlock(&tasklist_lock);
        }
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index a4e372b..783aec4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -939,7 +939,9 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
        err = -ESRCH;
        p = find_task_by_vpid(pid);
        if (!p)
-               goto out;
+               goto out2;
+
+       write_real_parent_lock(p);
 
        err = -EINVAL;
        if (!thread_group_leader(p))
@@ -981,7 +983,9 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 
        err = 0;
 out:
-       /* All paths lead to here, thus we are safe. -DaveM */
+       /* All (almost) paths lead to here, thus we are safe. -DaveM */
+       write_real_parent_unlock(p);
+out2:
        write_unlock_irq(&tasklist_lock);
        rcu_read_unlock();
        return err;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2b665da..aa89beb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -538,13 +538,14 @@ void oom_kill_process(struct task_struct *p, gfp_t 
gfp_mask, int order,
         * parent.  This attempts to lose the minimal amount of work done while
         * still freeing memory.
         */
-       read_lock(&tasklist_lock);
+       rcu_read_lock();
        for_each_thread(p, t) {
+               read_lock(&t->kin_lock);
                list_for_each_entry(child, &t->children, sibling) {
                        unsigned int child_points;
 
                        if (child->mm == p->mm)
-                               continue;
+                               goto unlock;
                        /*
                         * oom_badness() returns 0 if the thread is unkillable
                         */
@@ -557,8 +558,10 @@ void oom_kill_process(struct task_struct *p, gfp_t 
gfp_mask, int order,
                                get_task_struct(victim);
                        }
                }
+unlock:
+               read_unlock(&t->kin_lock);
        }
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
 
        p = find_lock_task_mm(victim);
        if (!p) {
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 0b9ec78..64eea7b 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1491,7 +1491,7 @@ long keyctl_session_to_parent(void)
 
        me = current;
        rcu_read_lock();
-       write_lock_irq(&tasklist_lock);
+       write_real_parent_lock_irq(me);
 
        ret = -EPERM;
        oldwork = NULL;
@@ -1540,7 +1540,7 @@ long keyctl_session_to_parent(void)
        if (!ret)
                newwork = NULL;
 unlock:
-       write_unlock_irq(&tasklist_lock);
+       write_real_parent_unlock_irq(me);
        rcu_read_unlock();
        if (oldwork)
                put_cred(container_of(oldwork, struct cred, rcu));
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7dade28..0feda4b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2461,9 +2461,9 @@ static void selinux_bprm_committed_creds(struct 
linux_binprm *bprm)
 
        /* Wake up the parent if it is waiting so that it can recheck
         * wait permission to the new task SID. */
-       read_lock(&tasklist_lock);
+       read_real_parent_lock(current);
        __wake_up_parent(current, current->real_parent);
-       read_unlock(&tasklist_lock);
+       read_real_parent_unlock(current);
 }
 
 /* superblock security operations */



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to