This fixes a couple of minor holes that we could hit if we took an IRQ
very early in syscall processing:

 - We could enter the IRQ with CONTEXT_USER.  Everything worked (RCU
   was fine), but we could warn if all the debugging options were
   set.

 - We could have the IRQ regs overlap task_pt_regs.  I'm not aware
   of anything important that would break, but some of the /proc
   stuff could plausibly have gotten confused.

Fix it the straightforward way: finish filling in pt_regs and call
enter_from_user_mode before enabling interrupts if _TIF_NOHZ is set.

This should be the last piece of the puzzle needed to get rid of most
remaining exception_enter calls.  (vmalloc faults are still tricky,
but they're mostly fatal in the syscall prologue already.)

Signed-off-by: Andy Lutomirski <[email protected]>
---

This is the last significant functionality change I'll send for 4.3, I
hope.  With this applied, context tracking for all non-NMI, non-debug
entries should be exact.

There's probably some (minor) performance regression on
CONFIG_CONTEXT_TRACKING=y kernels that aren't using nohz.  If so
(I'll benchmark it later this week), I'll try to rig up a simple
patch to NOP out the hooks if nohz is off.

Sasha, this should fix the intermittent DEBUG_LOCKS splat you're
seeing.

I don't intend to send v2 of the #BP stuff for 4.3.  The pile is plenty
big already.

 arch/x86/entry/common.c            | 12 +-------
 arch/x86/entry/entry_64.S          | 32 ++++++++++++++------
 arch/x86/entry/entry_64_compat.S   | 60 +++++++++++++++++++++++++++++---------
 arch/x86/include/asm/thread_info.h |  3 +-
 4 files changed, 71 insertions(+), 36 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 80dcc9261ca3..b570cea2f469 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -70,21 +70,11 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs 
*regs, u32 arch)
        u32 work;
 
        BUG_ON(regs != task_pt_regs(current));
+       CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 
        work = ACCESS_ONCE(current_thread_info()->flags) &
                _TIF_WORK_SYSCALL_ENTRY;
 
-#ifdef CONFIG_CONTEXT_TRACKING
-       /*
-        * If TIF_NOHZ is set, we are required to call user_exit() before
-        * doing anything that could touch RCU.
-        */
-       if (work & _TIF_NOHZ) {
-               enter_from_user_mode();
-               work &= ~_TIF_NOHZ;
-       }
-#endif
-
 #ifdef CONFIG_SECCOMP
        /*
         * Do seccomp first -- it should minimize exposure of other
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e2d078c9dfe4..6bf0c7ecf399 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,20 +142,16 @@ ENTRY(entry_SYSCALL_64)
         */
 GLOBAL(entry_SYSCALL_64_after_swapgs)
 
+       /*
+        * IRQs must be off while we use rsp_scratch to keep it from
+        * being clobbered by a different task.
+        */
        movq    %rsp, PER_CPU_VAR(rsp_scratch)
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
        /* Construct struct pt_regs on stack */
        pushq   $__USER_DS                      /* pt_regs->ss */
        pushq   PER_CPU_VAR(rsp_scratch)        /* pt_regs->sp */
-       /*
-        * Re-enable interrupts.
-        * We use 'rsp_scratch' as a scratch space, hence irq-off block above
-        * must execute atomically in the face of possible interrupt-driven
-        * task preemption. We must enable interrupts only after we're done
-        * with using rsp_scratch:
-        */
-       ENABLE_INTERRUPTS(CLBR_NONE)
        pushq   %r11                            /* pt_regs->flags */
        pushq   $__USER_CS                      /* pt_regs->cs */
        pushq   %rcx                            /* pt_regs->ip */
@@ -171,8 +167,17 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
        pushq   %r11                            /* pt_regs->r11 */
        sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not 
saved */
 
-       testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, 
SIZEOF_PTREGS)
+       testl   $(_TIF_WORK_SYSCALL_ENTRY | _TIF_NOHZ), 
ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        jnz     tracesys
+
+       /*
+        * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
+        * on (since we treat user mode as having IRQs on), and the
+        * prologue above is too short for it to be worth adding a
+        * tracing round trip.
+        */
+       ENABLE_INTERRUPTS(CLBR_NONE)
+
 entry_SYSCALL_64_fastpath:
 #if __SYSCALL_MASK == ~0
        cmpq    $__NR_syscall_max, %rax
@@ -235,6 +240,15 @@ GLOBAL(int_ret_from_sys_call_irqs_off)
 
        /* Do syscall entry tracing */
 tracesys:
+#ifdef CONFIG_CONTEXT_TRACKING
+       /* This is slow enough that it's worth tracing. */
+       TRACE_IRQS_OFF
+       call enter_from_user_mode
+       TRACE_IRQS_ON
+#endif
+
+       ENABLE_INTERRUPTS(CLBR_NONE)
+
        movq    %rsp, %rdi
        movl    $AUDIT_ARCH_X86_64, %esi
        call    syscall_trace_enter_phase1
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index ff32a289b5d1..099ec1174ff9 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -103,11 +103,19 @@ ENTRY(entry_SYSENTER_compat)
        jnz     sysenter_fix_flags
 sysenter_flags_fixed:
 
+#ifdef CONFIG_CONTEXT_TRACKING
+       /* This is slow enough that it's worth tracing. */
+       TRACE_IRQS_OFF
+       call enter_from_user_mode
+       TRACE_IRQS_ON
+#endif
+
        /*
         * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
         * on (since we treat user mode as having IRQs on), and the
         * prologue above is too short for it to be worth adding a
-        * tracing round trip.
+        * tracing round trip except in the CONFIG_CONTEXT_TRACKING
+        * case.
         */
        ENABLE_INTERRUPTS(CLBR_NONE)
 
@@ -318,15 +326,10 @@ ENDPROC(entry_SYSENTER_compat)
  * with the int 0x80 path.
  */
 ENTRY(entry_SYSCALL_compat)
-       /*
-        * Interrupts are off on entry.
-        * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-        * it is too small to ever cause noticeable irq latency.
-        */
+       /* Interrupts are off on entry. */
        SWAPGS_UNSAFE_STACK
        movl    %esp, %r8d
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-       ENABLE_INTERRUPTS(CLBR_NONE)
 
        /* Zero-extending 32-bit regs, do not remove */
        movl    %eax, %eax
@@ -346,6 +349,22 @@ ENTRY(entry_SYSCALL_compat)
        pushq   $-ENOSYS                /* pt_regs->ax */
        sub     $(10*8), %rsp           /* pt_regs->r8-11, bp, bx, r12-15 not 
saved */
 
+#ifdef CONFIG_CONTEXT_TRACKING
+       /* This is slow enough that it's worth tracing. */
+       TRACE_IRQS_OFF
+       call enter_from_user_mode
+       TRACE_IRQS_ON
+#endif
+
+       /*
+        * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
+        * on (since we treat user mode as having IRQs on), and the
+        * prologue above is too short for it to be worth adding a
+        * tracing round trip except in the CONFIG_CONTEXT_TRACKING
+        * case.
+        */
+       ENABLE_INTERRUPTS(CLBR_NONE)
+
        /*
         * No need to do an access_ok check here because r8 has been
         * 32-bit zero extended:
@@ -354,6 +373,7 @@ ENTRY(entry_SYSCALL_compat)
 1:     movl    (%r8), %r9d
        _ASM_EXTABLE(1b, ia32_badarg)
        ASM_CLAC
+
        orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
        testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, 
SIZEOF_PTREGS)
        jnz     cstar_tracesys
@@ -518,14 +538,9 @@ ia32_ret_from_sys_call:
  */
 
 ENTRY(entry_INT80_compat)
-       /*
-        * Interrupts are off on entry.
-        * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-        * it is too small to ever cause noticeable irq latency.
-        */
+       /* Interrupts are off on entry. */
        PARAVIRT_ADJUST_EXCEPTION_FRAME
        SWAPGS
-       ENABLE_INTERRUPTS(CLBR_NONE)
 
        /* Zero-extending 32-bit regs, do not remove */
        movl    %eax, %eax
@@ -545,9 +560,17 @@ ENTRY(entry_INT80_compat)
        sub     $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
 
        orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-       testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, 
SIZEOF_PTREGS)
+       testl   $(_TIF_WORK_SYSCALL_ENTRY | _TIF_NOHZ), 
ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        jnz     ia32_tracesys
 
+       /*
+        * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
+        * on (since we treat user mode as having IRQs on), and the
+        * prologue above is too short for it to be worth adding a
+        * tracing round trip.
+        */
+       ENABLE_INTERRUPTS(CLBR_NONE)
+
 ia32_do_call:
        /* 32-bit syscall -> 64-bit C ABI argument conversion */
        movl    %edi, %r8d              /* arg5 */
@@ -564,6 +587,15 @@ ia32_do_call:
        jmp     int_ret_from_sys_call
 
 ia32_tracesys:
+#ifdef CONFIG_CONTEXT_TRACKING
+       /* This is slow enough that it's worth tracing. */
+       TRACE_IRQS_OFF
+       call enter_from_user_mode
+       TRACE_IRQS_ON
+#endif
+
+       ENABLE_INTERRUPTS(CLBR_NONE)
+
        SAVE_EXTRA_REGS
        movq    %rsp, %rdi                      /* &pt_regs -> arg1 */
        call    syscall_trace_enter
diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h
index 8afdc3e44247..3c5a96815dec 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -140,8 +140,7 @@ struct thread_info {
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY        \
        (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |   \
-        _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |     \
-        _TIF_NOHZ)
+        _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
 
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK                                              \
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to