Currently, the execution context is identified through the preempt counter
(preempt_count), but the counter is only set after the first functions of an
IRQ/NMI handler have already run, so the current context can be misidentified
during that early window. For instance, ftrace/perf might drop events in the
early stage of IRQ/NMI handlers because the preempt counter has not been set
yet.
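For reference, those context checks reduce to preempt_count masks along these
lines (simplified from include/linux/preempt.h); they only give the right
answer once irq_enter()/nmi_enter() have bumped the counter, which happens in
C code well after the assembly entry point:

	#define in_irq()	(preempt_count() & HARDIRQ_MASK)
	#define in_nmi()	(preempt_count() & NMI_MASK)
	#define in_interrupt()	(preempt_count() & \
				 (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))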
The proposed approach uses a dedicated per-cpu variable to keep track of the
execution context, with its value set before the first C function of the
interrupt handler runs. This is a PoC for x86_64.

Signed-off-by: Daniel Bristot de Oliveira <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: "Joel Fernandes (Google)" <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Tommaso Cucinotta <[email protected]>
Cc: Romulo Silva de Oliveira <[email protected]>
Cc: Clark Williams <[email protected]>
Cc: [email protected]
Cc: [email protected]
---
 arch/x86/entry/entry_64.S       |  9 +++++++++
 arch/x86/include/asm/irqflags.h | 30 ++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/common.c    |  4 ++++
 include/linux/irqflags.h        |  4 ++++
 kernel/softirq.c                |  5 ++++-
 5 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1f0efdb7b629..1471b544241f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -545,6 +545,7 @@ ENTRY(interrupt_entry)
 	testb	$3, CS+8(%rsp)
 	jz	1f
+	TASK_CONTEXT_SET_BIT context=TASK_CTX_IRQ
 
 	/*
 	 * IRQ from user mode.
 	 *
@@ -561,6 +562,8 @@ ENTRY(interrupt_entry)
 
 1:
 	ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
+
+	TASK_CONTEXT_SET_BIT context=TASK_CTX_IRQ
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 
@@ -586,6 +589,7 @@ ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 
+	TASK_CONTEXT_RESET_BIT context=TASK_CTX_IRQ
 	LEAVE_IRQ_STACK
 
 	testb	$3, CS(%rsp)
@@ -780,6 +784,7 @@ ENTRY(\sym)
 	call	interrupt_entry
 	UNWIND_HINT_REGS indirect=1
 	call	\do_sym	/* rdi points to pt_regs */
+	TASK_CONTEXT_RESET_BIT context=TASK_CTX_IRQ
 	jmp	ret_from_intr
 END(\sym)
 _ASM_NOKPROBE(\sym)
@@ -1403,9 +1408,11 @@ ENTRY(nmi)
 	 * done with the NMI stack.
 	 */
 
+	TASK_CONTEXT_SET_BIT context=TASK_CTX_NMI
 	movq	%rsp, %rdi
 	movq	$-1, %rsi
 	call	do_nmi
+	TASK_CONTEXT_RESET_BIT context=TASK_CTX_NMI
 
 	/*
 	 * Return back to user mode. We must *not* do the normal exit
@@ -1615,10 +1622,12 @@ end_repeat_nmi:
 	call	paranoid_entry
 	UNWIND_HINT_REGS
 
+	TASK_CONTEXT_SET_BIT context=TASK_CTX_NMI
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
 	movq	%rsp, %rdi
 	movq	$-1, %rsi
 	call	do_nmi
+	TASK_CONTEXT_RESET_BIT context=TASK_CTX_NMI
 
 	/* Always restore stashed CR3 value (see paranoid_entry) */
 	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 058e40fed167..5a12bc3ea02b 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -3,6 +3,7 @@
 #define _X86_IRQFLAGS_H_
 
 #include <asm/processor-flags.h>
+#include <asm/percpu.h>
 
 #ifndef __ASSEMBLY__
 
@@ -202,4 +203,33 @@ static inline int arch_irqs_disabled(void)
 #endif
 #endif /* __ASSEMBLY__ */
 
+#ifdef CONFIG_X86_64
+/*
+ * NOTE: I know I need to implement this to the 32 bits as well.
+ * But... this is just a POC.
+ */
+#define ARCH_HAS_TASK_CONTEXT	1
+
+#define TASK_CTX_THREAD		0x0
+#define TASK_CTX_SOFTIRQ	0x1
+#define TASK_CTX_IRQ		0x2
+#define TASK_CTX_NMI		0x4
+
+#ifdef __ASSEMBLY__
+.macro TASK_CONTEXT_SET_BIT context:req
+	orb $\context, PER_CPU_VAR(task_context)
+.endm
+
+.macro TASK_CONTEXT_RESET_BIT context:req
+	andb $~\context, PER_CPU_VAR(task_context)
+.endm
+#else /* __ASSEMBLY__ */
+DECLARE_PER_CPU(unsigned char, task_context);
+
+static __always_inline void task_context_set(unsigned char context)
+{
+	raw_cpu_write_1(task_context, context);
+}
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_X86_64 */
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cb28e98a0659..1acbec22319b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1531,6 +1531,8 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
+DEFINE_PER_CPU(unsigned char, task_context) __visible = 0;
+
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
@@ -1604,6 +1606,8 @@ EXPORT_PER_CPU_SYMBOL(current_task);
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
+DEFINE_PER_CPU(unsigned char, task_context) __visible = 0;
+
 /*
  * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
  * the top of the kernel stack. Use an extra percpu variable to track the
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 21619c92c377..1c3473bbe5d2 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -168,4 +168,8 @@ do { \
 
 #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
 
+#ifndef ARCH_HAS_TASK_CONTEXT
+#define task_context_set(context) do {} while (0)
+#endif
+
 #endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 10277429ed84..324de769dc07 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -410,8 +410,11 @@ void irq_exit(void)
 #endif
 	account_irq_exit_time(current);
 	preempt_count_sub(HARDIRQ_OFFSET);
-	if (!in_interrupt() && local_softirq_pending())
+	if (!in_interrupt() && local_softirq_pending()) {
+		task_context_set(TASK_CTX_SOFTIRQ);
 		invoke_softirq();
+		task_context_set(TASK_CTX_IRQ);
+	}
 
 	tick_irq_exit();
 	rcu_irq_exit();
-- 
2.20.1
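
As a usage sketch (not part of the patch): with the per-cpu variable in
place, a tracer could classify the context of an event even before
irq_enter() updates preempt_count. task_context_read() below is a
hypothetical helper mirroring the raw_cpu_write_1() style above; the patch
itself only adds task_context_set():

	/* Hypothetical helper; not provided by this patch. */
	static __always_inline unsigned char task_context_read(void)
	{
		return raw_cpu_read_1(task_context);
	}

	/* Sketch: decide early whether the event fired in atomic context. */
	static __always_inline bool event_in_atomic_context(void)
	{
		/* Valid even before preempt_count is updated on entry. */
		return task_context_read() &
			(TASK_CTX_NMI | TASK_CTX_IRQ | TASK_CTX_SOFTIRQ);
	}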

