Add a special case for waiting on a specific pid (via waitpid, waitid,
wait4, etc.) that does an O(1) lookup of the target task instead of an
O(n) scan of children and tracees. This improves performance when
waiting on a pid from a thread group with many children and/or tracees.

Time to fork and then call waitpid on the child, from a task that
already has N children [1]:

N    | Before  | After
-----|---------|------
1    | 74 us   | 74 us
20   | 72 us   | 75 us
100  | 83 us   | 77 us
500  | 99 us   | 74 us
1000 | 179 us  | 75 us
5000 | 804 us  | 79 us
8000 | 1268 us | 78 us

[1]: https://lkml.org/lkml/2021/3/12/1567

This can make a substantial performance improvement for applications
with a thread that has many children or tracees and frequently needs to
wait on them. Tools that use ptrace to intercept syscalls for a large
number of processes are likely to fall into this category. In
particular, this patch was developed while building a ptrace-based
second generation of the Shadow emulator [2], where it lets us avoid
quadratic scaling without resorting to a workaround that introduces a
~40% performance penalty [3]. Other tools in this category that this
patch may help include User Mode Linux [4] and DetTrace [5].

[2]: https://shadow.github.io/
[3]: https://github.com/shadow/shadow/issues/1134#issuecomment-798992292
[4]: https://en.wikipedia.org/wiki/User-mode_Linux
[5]: https://github.com/dettrace/dettrace

Signed-off-by: James Newsome <[email protected]>
---

v5: https://lkml.org/lkml/2021/3/12/1134

* Switched back to explicitly looking up by tgid and then pid.
* Added further motivation and context in the patch description.

 kernel/exit.c | 69 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 59 insertions(+), 10 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 04029e35e69a..65c862c604a7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1439,9 +1439,50 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
                           TASK_INTERRUPTIBLE, p);
 }
 
+static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
+                                struct task_struct *target)
+{
+       struct task_struct *parent =
+               !ptrace ? target->real_parent : target->parent;
+
+       return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
+                                    same_thread_group(current, parent));
+}
+
+/*
+ * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
+ * and tracee lists to find the target task.
+ */
+static int do_wait_pid(struct wait_opts *wo)
+{
+       bool ptrace;
+       struct task_struct *target;
+       int retval;
+
+       target = pid_task(wo->wo_pid, PIDTYPE_PID);
+       if (!target)
+               return 0;
+
+       ptrace = false;
+       if (thread_group_leader(target) &&
+           is_effectively_child(wo, ptrace, target)) {
+               retval = wait_consider_task(wo, ptrace, target);
+               if (retval)
+                       return retval;
+       }
+
+       ptrace = true;
+       if (target->ptrace && is_effectively_child(wo, ptrace, target)) {
+               retval = wait_consider_task(wo, ptrace, target);
+               if (retval)
+                       return retval;
+       }
+
+       return 0;
+}
+
 static long do_wait(struct wait_opts *wo)
 {
-       struct task_struct *tsk;
        int retval;
 
        trace_sched_process_wait(wo->wo_pid);
@@ -1463,19 +1504,27 @@ static long do_wait(struct wait_opts *wo)
 
        set_current_state(TASK_INTERRUPTIBLE);
        read_lock(&tasklist_lock);
-       tsk = current;
-       do {
-               retval = do_wait_thread(wo, tsk);
-               if (retval)
-                       goto end;
 
-               retval = ptrace_do_wait(wo, tsk);
+       if (wo->wo_type == PIDTYPE_PID) {
+               retval = do_wait_pid(wo);
                if (retval)
                        goto end;
+       } else {
+               struct task_struct *tsk = current;
+
+               do {
+                       retval = do_wait_thread(wo, tsk);
+                       if (retval)
+                               goto end;
 
-               if (wo->wo_flags & __WNOTHREAD)
-                       break;
-       } while_each_thread(current, tsk);
+                       retval = ptrace_do_wait(wo, tsk);
+                       if (retval)
+                               goto end;
+
+                       if (wo->wo_flags & __WNOTHREAD)
+                               break;
+               } while_each_thread(current, tsk);
+       }
        read_unlock(&tasklist_lock);
 
 notask:
-- 
2.30.1

Reply via email to