This is more complicated than the 32-bit and compat cases because it
preserves an asm fast path for the case where the callee-saved regs
aren't needed in pt_regs and no entry or exit work needs to be done.

This appears to slow down fastpath syscalls by no more than one cycle
on my Skylake laptop.

Signed-off-by: Andy Lutomirski <l...@kernel.org>
---
 arch/x86/entry/common.c   |  26 ++++++++++
 arch/x86/entry/entry_64.S | 124 +++++++++++++++-------------------------------
 2 files changed, 67 insertions(+), 83 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a89fdbc1f0be..d45119e770ef 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -344,6 +344,32 @@ __visible inline void syscall_return_slowpath(struct 
pt_regs *regs)
        prepare_exit_to_usermode(regs);
 }
 
+#ifdef CONFIG_X86_64
+__visible void do_syscall_64(struct pt_regs *regs)
+{
+       struct thread_info *ti = pt_regs_to_thread_info(regs);
+       unsigned long nr = regs->orig_ax;
+
+       local_irq_enable();
+
+       if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
+               nr = syscall_trace_enter(regs);
+
+       /*
+        * NB: Native and x32 syscalls are dispatched from the same
+        * table.  The only functional difference is the x32 bit in
+        * regs->orig_ax, which changes the behavior of some syscalls.
+        */
+       if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
+               regs->ax = sys_call_table[nr & __SYSCALL_MASK](
+                       regs->di, regs->si, regs->dx,
+                       regs->r10, regs->r8, regs->r9);
+       }
+
+       syscall_return_slowpath(regs);
+}
+#endif
+
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 /*
  * Does a 32-bit syscall.  Called with IRQs on and does all entry and
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 81b0944708c5..1ab5362f241d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -145,17 +145,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
        movq    %rsp, PER_CPU_VAR(rsp_scratch)
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
+       TRACE_IRQS_OFF
+
        /* Construct struct pt_regs on stack */
        pushq   $__USER_DS                      /* pt_regs->ss */
        pushq   PER_CPU_VAR(rsp_scratch)        /* pt_regs->sp */
-       /*
-        * Re-enable interrupts.
-        * We use 'rsp_scratch' as a scratch space, hence irq-off block above
-        * must execute atomically in the face of possible interrupt-driven
-        * task preemption. We must enable interrupts only after we're done
-        * with using rsp_scratch:
-        */
-       ENABLE_INTERRUPTS(CLBR_NONE)
        pushq   %r11                            /* pt_regs->flags */
        pushq   $__USER_CS                      /* pt_regs->cs */
        pushq   %rcx                            /* pt_regs->ip */
@@ -171,9 +165,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
        pushq   %r11                            /* pt_regs->r11 */
        sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not 
saved */
 
-       testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, 
SIZEOF_PTREGS)
-       jnz     tracesys
+       /*
+        * If we need to do entry work or if we guess we'll need to do
+        * exit work, go straight to the slow path.
+        */
+       testl   $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, 
ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+       jnz     entry_SYSCALL64_slow_path
+
 entry_SYSCALL_64_fastpath:
+       /*
+        * Easy case: enable interrupts and issue the syscall.  If the syscall
+        * needs pt_regs, we'll call a stub that disables interrupts again
+        * and jumps to the slow path.
+        */
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
 #if __SYSCALL_MASK == ~0
        cmpq    $__NR_syscall_max, %rax
 #else
@@ -185,93 +191,43 @@ entry_SYSCALL_64_fastpath:
        call    *sys_call_table_fastpath_64(, %rax, 8)
        movq    %rax, RAX(%rsp)
 1:
-/*
- * Syscall return path ending with SYSRET (fast path).
- * Has incompletely filled pt_regs.
- */
-       LOCKDEP_SYS_EXIT
-       /*
-        * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-        * it is too small to ever cause noticeable irq latency.
-        */
-       DISABLE_INTERRUPTS(CLBR_NONE)
 
        /*
-        * We must check ti flags with interrupts (or at least preemption)
-        * off because we must *never* return to userspace without
-        * processing exit work that is enqueued if we're preempted here.
-        * In particular, returning to userspace with any of the one-shot
-        * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
-        * very bad.
+        * If we get here, then we know that pt_regs is clean for SYSRET64.
+        * If we see that no exit work is required (which we are required
+        * to check with IRQs off), then we can go straight to SYSRET64.
         */
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
        testl   $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, 
SIZEOF_PTREGS)
-       jnz     int_ret_from_sys_call_irqs_off  /* Go to the slow path */
+       jnz     1f
 
-       RESTORE_C_REGS_EXCEPT_RCX_R11
-       movq    RIP(%rsp), %rcx
-       movq    EFLAGS(%rsp), %r11
+       LOCKDEP_SYS_EXIT
+       TRACE_IRQS_ON           /* user mode is traced as IRQs on */
+       RESTORE_C_REGS
        movq    RSP(%rsp), %rsp
-       /*
-        * 64-bit SYSRET restores rip from rcx,
-        * rflags from r11 (but RF and VM bits are forced to 0),
-        * cs and ss are loaded from MSRs.
-        * Restoration of rflags re-enables interrupts.
-        *
-        * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
-        * descriptor is not reinitialized.  This means that we should
-        * avoid SYSRET with SS == NULL, which could happen if we schedule,
-        * exit the kernel, and re-enter using an interrupt vector.  (All
-        * interrupt entries on x86_64 set SS to NULL.)  We prevent that
-        * from happening by reloading SS in __switch_to.  (Actually
-        * detecting the failure in 64-bit userspace is tricky but can be
-        * done.)
-        */
        USERGS_SYSRET64
 
-GLOBAL(int_ret_from_sys_call_irqs_off)
+1:
+       /*
+        * The fast path looked good when we started, but something changed
+        * along the way and we need to switch to the slow path.  Calling
+        * raise(3) will trigger this, for example.  IRQs are off.
+        */
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_NONE)
-       jmp int_ret_from_sys_call
-
-       /* Do syscall entry tracing */
-tracesys:
-       movq    %rsp, %rdi
-       movl    $AUDIT_ARCH_X86_64, %esi
-       call    syscall_trace_enter_phase1
        SAVE_EXTRA_REGS
        movq    %rsp, %rdi
-       movl    $AUDIT_ARCH_X86_64, %esi
-       movq    %rax, %rdx
-       call    syscall_trace_enter_phase2
-
-       /*
-        * Reload registers from stack in case ptrace changed them.
-        * We don't reload %rax because syscall_trace_entry_phase2() returned
-        * the value it wants us to use in the table lookup.
-        */
-       RESTORE_C_REGS_EXCEPT_RAX
-#if __SYSCALL_MASK == ~0
-       cmpq    $__NR_syscall_max, %rax
-#else
-       andl    $__SYSCALL_MASK, %eax
-       cmpl    $__NR_syscall_max, %eax
-#endif
-       ja      1f                              /* return -ENOSYS (already in 
pt_regs->ax) */
-       movq    %r10, %rcx                      /* fixup for C */
-       call    *sys_call_table(, %rax, 8)
-       movq    %rax, RAX(%rsp)
-       RESTORE_EXTRA_REGS
-1:
-       /* Use IRET because user could have changed pt_regs->foo */
+       call    syscall_return_slowpath /* returns with IRQs disabled */
+       jmp     return_from_SYSCALL_64
 
-/*
- * Syscall return path ending with IRET.
- * Has correct iret frame.
- */
-GLOBAL(int_ret_from_sys_call)
+entry_SYSCALL64_slow_path:
+       /* IRQs are off. */
        SAVE_EXTRA_REGS
        movq    %rsp, %rdi
-       call    syscall_return_slowpath /* returns with IRQs disabled */
+       call    do_syscall_64           /* returns with IRQs disabled */
+
+return_from_SYSCALL_64:
        RESTORE_EXTRA_REGS
        TRACE_IRQS_IRETQ                /* we're about to change IF */
 
@@ -353,8 +309,10 @@ ENTRY(stub_ptregs_64)
         * Syscalls marked as needing ptregs that go through the fast path
         * land here.  We transfer to the slow path.
         */
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
        addq    $8, %rsp
-       jmp     tracesys
+       jmp     entry_SYSCALL64_slow_path
 END(stub_ptregs_64)
 
 /*
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to