musl implements system call cancellation in an unusual but clever way.
When a thread issues a cancellable syscall, musl issues the syscall
through a special thunk that looks roughly like this:

cancellable_syscall:
        test whether a cancel is queued
        jnz cancel_me
        int $0x80
end_cancellable_syscall:

If a pthread cancellation signal hits with
cancellable_syscall <= EIP < end_cancellable_syscall, then the
signal interrupted a cancellation point before the syscall in
question started.  If so, musl's signal handler rewrites the calling
context to skip the syscall and simulate a -EINTR return.  The caller
will detect this simulated -EINTR or an actual -EINTR and handle a
possible cancellation event.

This technique doesn't work if int $0x80 is replaced by a call to
AT_SYSINFO: the signal handler can no longer tell whether it's
interrupting a call to AT_SYSINFO or, if it is, where AT_SYSINFO was
called from.

Add minimal helpers so that musl's signal handler can learn the
status of a possible pending AT_SYSINFO invocation and, if it hasn't
entered the kernel yet, abort it without needing to parse the vdso
DWARF unwind data.

Signed-off-by: Andy Lutomirski <[email protected]>
---

musl people-

Does this solve your AT_SYSINFO cancellation problem?  I'd like to
make sure it survives an actual implementation before I commit to the ABI.

x86 people-

Are you okay with this idea?


 arch/x86/entry/vdso/Makefile                      |   3 +-
 arch/x86/entry/vdso/vdso32/cancellation_helpers.c | 116 ++++++++++++++++++++++
 arch/x86/entry/vdso/vdso32/vdso32.lds.S           |   2 +
 tools/testing/selftests/x86/unwind_vdso.c         |  57 +++++++++--
 4 files changed, 171 insertions(+), 7 deletions(-)
 create mode 100644 arch/x86/entry/vdso/vdso32/cancellation_helpers.c

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index b88846471247..465052b49603 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -130,7 +130,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
 
 targets += vdso32/vdso32.lds
 targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
-targets += vdso32/vclock_gettime.o
+targets += vdso32/vclock_gettime.o vdso32/cancellation_helpers.o
 
 KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
 $(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
@@ -150,6 +150,7 @@ $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
 $(obj)/vdso32.so.dbg: FORCE \
                      $(obj)/vdso32/vdso32.lds \
                      $(obj)/vdso32/vclock_gettime.o \
+                     $(obj)/vdso32/cancellation_helpers.o \
                      $(obj)/vdso32/note.o \
                      $(obj)/vdso32/system_call.o \
                      $(obj)/vdso32/sigreturn.o
diff --git a/arch/x86/entry/vdso/vdso32/cancellation_helpers.c b/arch/x86/entry/vdso/vdso32/cancellation_helpers.c
new file mode 100644
index 000000000000..3cb2e88baec6
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/cancellation_helpers.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016 Andrew Lutomirski
+ * Subject to the GNU Public License, v.2
+ *
+ * This provides helpers to enable libc implementations to cancel
+ * interrupted AT_SYSINFO invocations without needing to parse the
+ * DWARF unwinding instructions.
+ */
+
+#include <asm/signal.h>
+#include <asm/sigframe.h>
+
+extern char __kernel_vsyscall[] __attribute__((visibility("hidden")));
+extern char int80_landing_pad[] __attribute__((visibility("hidden")));
+
+static unsigned long *pending_syscall_retaddr_ptr(const void *context)
+{
+       const struct ucontext_ia32 *uc = context;
+       unsigned long ctx_eip = uc->uc_mcontext.ip;
+       unsigned long offset_into_vsyscall;
+       unsigned long *retaddr;
+
+       /*
+        * An AT_SYSINFO system call is pending if and only if we're in
+        * __kernel_vsyscall before int80_landing_pad.  If we're at
+        * int80_landing_pad or beyond, we've finished the system call
+        * and are on our way out.
+        *
+        * If we're at int80_landing_pad-2, then either we're using the
+        * int $0x80 slow path because we have no fast system call
+        * support or we are restarting a fast system call.  Either way,
+        * the system call is still pending.
+        */
+
+       if (ctx_eip < (unsigned long)__kernel_vsyscall ||
+           ctx_eip >= (unsigned long)int80_landing_pad)
+               return NULL;
+
+       /*
+        * The first three instructions of __kernel_vsyscall are one-byte
+        * pushes.
+        */
+       offset_into_vsyscall = (ctx_eip - (unsigned long)__kernel_vsyscall);
+       retaddr = (unsigned long *)uc->uc_mcontext.sp;
+       if (offset_into_vsyscall < 3)
+               retaddr += offset_into_vsyscall;
+       else
+               retaddr += 3;
+
+       /*
+        * GCC (correctly) fails to deduce that retaddr can't be NULL
+        * in the success path.  Helping it out reduces code size.
+        */
+       if (!retaddr)
+               __builtin_unreachable();
+
+       return retaddr;
+}
+
+/*
+ * If context is a sigcontext for a pending AT_SYSINFO syscall, returns
+ * the return address of that syscall.  Otherwise returns -1UL.
+ */
+unsigned long __vdso_pending_syscall_return_address(const void *context)
+{
+       unsigned long *retaddr = pending_syscall_retaddr_ptr(context);
+       return retaddr ? *retaddr : -1UL;
+}
+
+/*
+ * If context is a sigcontext for a pending AT_SYSINFO syscall, then
+ * this will pop off the call frame and point the context to
+ * AT_SYSINFO's return address.  ESP will contain whatever value it had
+ * immediately prior to the call instruction (i.e. ESP acts as though
+ * the system call returned normally).  EAX will be set to -EINTR.  All
+ * other GPRs will be clobbered.  __vdso_abort_pending_syscall will
+ * return 0.
+ *
+ * If context is a valid sigcontext that does not represent a pending
+ * AT_SYSINFO syscall, then __vdso_abort_pending_syscall returns
+ * -EINVAL.
+ *
+ * If context is not a valid sigcontext at all, behavior is undefined.
+ */
+long __vdso_abort_pending_syscall(void *context)
+{
+       struct ucontext_ia32 *uc = context;
+       unsigned long *retaddr = pending_syscall_retaddr_ptr(context);
+
+       if (!retaddr)
+               return -EINVAL;
+
+       uc->uc_mcontext.ip = *retaddr;
+       uc->uc_mcontext.sp = (unsigned long)(retaddr + 1);
+
+       /*
+        * Clobber GPRs -- we don't want to implement full unwinding, and we
+        * don't want userspace to start expecting anything about the final
+        * state of the GPRs.
+        *
+        * (There really are subtleties here.  EAX can be clobbered by
+        *  syscall restart, and register limitations mean that the
+        *  saved context has at least one of the argument registers
+        *  used for a different purpose by the calling sequence just
+        *  prior to kernel entry.  In the current implementation, that
+        *  register is EBP, but it could change.)
+        */
+       uc->uc_mcontext.ax = -EINTR;
+       uc->uc_mcontext.bx = 0xFFFFFFFF;
+       uc->uc_mcontext.cx = 0xFFFFFFFF;
+       uc->uc_mcontext.dx = 0xFFFFFFFF;
+       uc->uc_mcontext.si = 0xFFFFFFFF;
+       uc->uc_mcontext.di = 0xFFFFFFFF;
+       uc->uc_mcontext.bp = 0xFFFFFFFF;
+       return 0;
+}
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 31056cf294bf..f04e8bd30755 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,6 +25,8 @@ VERSION
                __vdso_clock_gettime;
                __vdso_gettimeofday;
                __vdso_time;
+               __vdso_pending_syscall_return_address;
+               __vdso_abort_pending_syscall;
        };
 
        LINUX_2.5 {
diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c
index 00a26a82fa98..7c649b4b6834 100644
--- a/tools/testing/selftests/x86/unwind_vdso.c
+++ b/tools/testing/selftests/x86/unwind_vdso.c
@@ -35,6 +35,7 @@ int main()
 #include <syscall.h>
 #include <unistd.h>
 #include <string.h>
+#include <errno.h>
 #include <inttypes.h>
 #include <sys/mman.h>
 #include <signal.h>
@@ -88,8 +89,12 @@ static unsigned long sysinfo;
 static bool got_sysinfo = false;
 static unsigned long return_address;
 
+static unsigned long (*vdso_pending_syscall_return_address)(
+       const void *context);
+
 struct unwind_state {
        unsigned long ip;       /* trap source */
+       unsigned long ax;       /* ax at call site */
        int depth;              /* -1 until we hit the trap source */
 };
 
@@ -115,7 +120,7 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque)
                unsigned long ebp = _Unwind_GetGR(ctx, 5);
                unsigned long esi = _Unwind_GetGR(ctx, 6);
                unsigned long edi = _Unwind_GetGR(ctx, 7);
-               bool ok = (eax == SYS_getpid || eax == getpid()) &&
+               bool ok = (eax == SYS_break || eax == -ENOSYS) &&
                        ebx == 1 && ecx == 2 && edx == 3 &&
                        esi == 4 && edi == 5 && ebp == 6;
 
@@ -125,6 +130,8 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque)
                       (ok ? "OK" : "FAIL"),
                       eax, ebx, ecx, edx, esi, edi, ebp);
 
+               state->ax = eax;
+
                return _URC_NORMAL_STOP;
        } else {
                state->depth++;
@@ -137,6 +144,7 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
        ucontext_t *ctx = (ucontext_t *)ctx_void;
        struct unwind_state state;
        unsigned long ip = ctx->uc_mcontext.gregs[REG_EIP];
+       unsigned long reported_return_address = 0;
 
        if (!got_sysinfo && ip == sysinfo) {
                got_sysinfo = true;
@@ -148,8 +156,15 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
                       ip, return_address);
        }
 
-       if (!got_sysinfo)
-               return;         /* Not there yet */
+       if (!got_sysinfo) {
+               if (vdso_pending_syscall_return_address &&
+                   vdso_pending_syscall_return_address(ctx_void) != -1UL) {
+                       printf("[FAIL]\t__vdso_pending_syscall_return_address incorrectly detected a pending syscall\n");
+                       nerrs++;
+               }
+
+               return;         /* We haven't started AT_SYSINFO yet */
+       }
 
        if (ip == return_address) {
                ctx->uc_mcontext.gregs[REG_EFL] &= ~X86_EFLAGS_TF;
@@ -157,11 +172,32 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
                return;
        }
 
-       printf("\tSIGTRAP at 0x%lx\n", ip);
+       if (vdso_pending_syscall_return_address) {
+               reported_return_address =
+                       vdso_pending_syscall_return_address(ctx_void);
+               if (reported_return_address != -1UL)
+                       printf("\tSIGTRAP at 0x%lx, pending syscall will return to 0x%lx\n",
+                              ip, reported_return_address);
+               else
+                       printf("\tSIGTRAP at 0x%lx, no syscall pending\n", ip);
+       } else {
+               printf("\tSIGTRAP at 0x%lx\n", ip);
+       }
 
        state.ip = ip;
        state.depth = -1;
        _Unwind_Backtrace(trace_fn, &state);
+
+       if (vdso_pending_syscall_return_address) {
+               unsigned long expected =
+                       (state.ax == SYS_break ? return_address : -1UL);
+               if (reported_return_address != expected) {
+                       printf("[FAIL]\t  __vdso_pending_syscall_return_address returned 0x%lx; expected 0x%lx\n", reported_return_address, expected);
+                       nerrs++;
+               } else {
+                       printf("[OK]\t  __vdso_pending_syscall_return_address returned the correct value\n");
+               }
+       }
 }
 
 int main()
@@ -177,12 +213,21 @@ int main()
                       info.dli_fname, info.dli_fbase);
        }
 
+       void *vdso = dlopen("linux-gate.so.1", RTLD_NOW);
+       if (vdso)
+               vdso_pending_syscall_return_address = dlsym(vdso, "__vdso_pending_syscall_return_address");
+
        sethandler(SIGTRAP, sigtrap, 0);
 
-       syscall(SYS_getpid);  /* Force symbol binding without TF set. */
+       syscall(SYS_break);  /* Force symbol binding without TF set. */
        printf("[RUN]\tSet TF and check a fast syscall\n");
        set_eflags(get_eflags() | X86_EFLAGS_TF);
-       syscall(SYS_getpid, 1, 2, 3, 4, 5, 6);
+
+       /*
+        * We need a harmless syscall that will never return its own syscall
+        * nr.  SYS_break is not implemented and returns -ENOSYS.
+        */
+       syscall(SYS_break, 1, 2, 3, 4, 5, 6);
        if (!got_sysinfo) {
                set_eflags(get_eflags() & ~X86_EFLAGS_TF);
 
-- 
2.5.0

Reply via email to