On Wed, Jul 3, 2019 at 3:28 AM root <pet...@infradead.org> wrote: > > Despire the current efforts to read CR2 before tracing happens there > still exist a number of possible holes: > > idtentry page_fault do_page_fault has_error_code=1 > call error_entry > TRACE_IRQS_OFF > call trace_hardirqs_off* > #PF // modifies CR2 > > CALL_enter_from_user_mode > __context_tracking_exit() > trace_user_exit(0) > #PF // modifies CR2 > > call do_page_fault > address = read_cr2(); /* whoopsie */ > > And similar for i386. > > Fix it by pulling the CR2 read into the entry code, before any of that > stuff gets a chance to run and ruin things. > > Ideally we'll clean up the entry code by moving this tracing and > context tracking nonsense into C some day, but let's not delay fixing > this longer. > > Reported-by: He Zhe <zhe...@windriver.com> > Reported-by: Eiichi Tsukata <de...@etsukata.com> > Debugged-by: Steven Rostedt <rost...@goodmis.org> > Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org> > --- > arch/x86/entry/entry_32.S | 25 ++++++++++++++++++++++--- > arch/x86/entry/entry_64.S | 28 ++++++++++++++-------------- > arch/x86/include/asm/kvm_para.h | 2 +- > arch/x86/include/asm/traps.h | 2 +- > arch/x86/kernel/kvm.c | 8 ++++---- > arch/x86/mm/fault.c | 28 ++++++++++------------------ > 6 files changed, 52 insertions(+), 41 deletions(-) > > --- a/arch/x86/entry/entry_32.S > +++ b/arch/x86/entry/entry_32.S > @@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vec > > ENTRY(page_fault) > ASM_CLAC > - pushl $do_page_fault > - ALIGN > - jmp common_exception > + pushl $0; /* %gs's slot on the stack */ > + > + SAVE_ALL switch_stacks=1 skip_gs=1 > + > + ENCODE_FRAME_POINTER > + UNWIND_ESPFIX_STACK > + > + /* fixup %gs */ > + GS_TO_REG %ecx > + REG_TO_PTGS %ecx > + SET_KERNEL_GS %ecx > + > + GET_CR2_INTO(%ecx) # might clobber %eax > + > + /* fixup orig %eax */ > + movl PT_ORIG_EAX(%esp), %edx # get the error code > + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart > + > + TRACE_IRQS_OFF > + movl %esp, %eax # pt_regs pointer > + call do_page_fault > + jmp ret_from_exception > END(page_fault) > > common_exception: > --- a/arch/x86/entry/entry_64.S > +++ b/arch/x86/entry/entry_64.S > @@ -901,7 +901,7 @@ apicinterrupt IRQ_WORK_VECTOR > irq_work > * @paranoid == 2 is special: the stub will never switch stacks. This is for > * #DF: if the thread stack is somehow unusable, we'll still get a useful > OOPS. > */ > -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 > ist_offset=0 create_gap=0 > +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 > ist_offset=0 create_gap=0 read_cr2=0 > ENTRY(\sym) > UNWIND_HINT_IRET_REGS offset=\has_error_code*8 > > @@ -937,18 +937,27 @@ ENTRY(\sym) > > .if \paranoid > call paranoid_entry > + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ > .else > call error_entry > .endif > UNWIND_HINT_REGS > - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ > > - .if \paranoid > + .if \read_cr2 > + GET_CR2_INTO(%rdx); /* can clobber %rax */ > + .endif > + > .if \shift_ist != -1 > TRACE_IRQS_OFF_DEBUG /* reload IDT in case of > recursion */ > .else > TRACE_IRQS_OFF > .endif > + > + .if \paranoid == 0 > + testb $3, CS(%rsp) > + jz .Lfrom_kernel_no_context_tracking_\@ > + CALL_enter_from_user_mode > +.Lfrom_kernel_no_context_tracking_\@: > .endif > > movq %rsp, %rdi /* pt_regs pointer */ > @@ -1180,10 +1189,10 @@ idtentry xenint3 do_int3 > has_error_co > #endif > > idtentry general_protection do_general_protection has_error_code=1 > -idtentry page_fault do_page_fault has_error_code=1 > +idtentry page_fault do_page_fault has_error_code=1 > read_cr2=1 > > #ifdef CONFIG_KVM_GUEST > -idtentry async_page_fault do_async_page_fault has_error_code=1 > +idtentry async_page_fault do_async_page_fault has_error_code=1 > read_cr2=1 > #endif > > #ifdef CONFIG_X86_MCE > @@ -1338,18 +1347,9 @@ ENTRY(error_entry) > movq %rax, %rsp /* switch stack */ > ENCODE_FRAME_POINTER > pushq %r12 > - > - /* > - * We need to tell lockdep that IRQs are off. We can't do this until > - * we fix gsbase, and we should do it before enter_from_user_mode > - * (which can take locks). > - */ > - TRACE_IRQS_OFF
This hunk looks wrong. Am I missing some other place that handles the case where we enter from kernel mode and IRQs were on?