On Sep 7, 2015, at 11:48 PM, James Morse wrote: Hi James,
> On 04/09/15 15:23, Jungseok Lee wrote: >> Currently, kernel context and interrupts are handled using a single >> kernel stack navigated by sp_el1. This forces many systems to use >> 16KB stack, not 8KB one. Low memory platforms naturally suffer from >> both memory pressure and performance degradation simultaneously as >> VM page allocator falls into slowpath frequently. >> >> This patch, thus, solves the problem as introducing a separate percpu >> IRQ stack to handle both hard and soft interrupts with two ground rules: >> >> - Utilize sp_el0 in EL1 context, which is not used currently >> - Do *not* complicate current_thread_info calculation >> >> struct thread_info can be tracked easily using sp_el0, not sp_el1 when >> this feature is enabled. >> >> Signed-off-by: Jungseok Lee <jungseokle...@gmail.com> >> --- >> arch/arm64/Kconfig.debug | 10 ++ >> arch/arm64/include/asm/irq.h | 8 ++ >> arch/arm64/include/asm/thread_info.h | 11 ++ >> arch/arm64/kernel/asm-offsets.c | 8 ++ >> arch/arm64/kernel/entry.S | 83 +++++++++++++++- >> arch/arm64/kernel/head.S | 7 ++ >> arch/arm64/kernel/irq.c | 18 ++++ >> 7 files changed, 142 insertions(+), 3 deletions(-) >> >> diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug >> index d6285ef..e16d91f 100644 >> --- a/arch/arm64/Kconfig.debug >> +++ b/arch/arm64/Kconfig.debug >> @@ -18,6 +18,16 @@ config ARM64_PTDUMP >> kernel. >> If in doubt, say "N" >> >> +config IRQ_STACK >> + bool "Use separate kernel stack when handling interrupts" >> + depends on ARM64_4K_PAGES >> + help >> + Say Y here if you want to use separate kernel stack to handle both >> + hard and soft interrupts. As reduceing memory footprint regarding >> + kernel stack, it benefits low memory platforms. >> + >> + If in doubt, say N. >> + > > I don't think it is necessary to have a debug-only Kconfig option for this. > Reducing memory use is good for everyone! > > This would let you get rid of all the #ifdefs Hmm.. A single concern is stability. 
However, I agree that this is not an optional feature. Especially, it definitely benefits low memory platforms. I will remove this snippet in the next version. >> config STRICT_DEVMEM >> bool "Filter access to /dev/mem" >> depends on MMU >> diff --git a/arch/arm64/include/asm/thread_info.h >> b/arch/arm64/include/asm/thread_info.h >> index dcd06d1..5345a67 100644 >> --- a/arch/arm64/include/asm/thread_info.h >> +++ b/arch/arm64/include/asm/thread_info.h >> @@ -71,11 +71,22 @@ register unsigned long current_stack_pointer asm ("sp"); >> */ >> static inline struct thread_info *current_thread_info(void) >> __attribute_const__; >> >> +#ifndef CONFIG_IRQ_STACK >> static inline struct thread_info *current_thread_info(void) >> { >> return (struct thread_info *) >> (current_stack_pointer & ~(THREAD_SIZE - 1)); >> } >> +#else >> +static inline struct thread_info *current_thread_info(void) >> +{ >> + unsigned long sp_el0; >> + >> + asm volatile("mrs %0, sp_el0" : "=r" (sp_el0)); >> + >> + return (struct thread_info *)(sp_el0 & ~(THREAD_SIZE - 1)); >> +} >> +#endif > > Because sp_el0 is only used as a stack value to find struct thread_info, > you could just store the struct thread_info pointer in sp_el0, and save the > masking on each read of the value. IMHO, this logic, masking SP with ~(THREAD_SIZE - 1), is a well-known idiom. So, I don't want to change the expression. In addition, the same process is needed before storing the address of struct thread_info. 
>> >> #define thread_saved_pc(tsk) \ >> ((unsigned long)(tsk->thread.cpu_context.pc)) >> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S >> index d23ca0d..f1fdfa9 100644 >> --- a/arch/arm64/kernel/entry.S >> +++ b/arch/arm64/kernel/entry.S >> @@ -88,7 +88,11 @@ >> >> .if \el == 0 >> mrs x21, sp_el0 >> +#ifndef CONFIG_IRQ_STACK >> get_thread_info tsk // Ensure MDSCR_EL1.SS is clear, >> +#else >> + get_thread_info \el, tsk >> +#endif >> ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug >> disable_step_tsk x19, x20 // exceptions when scheduling. >> .endif >> @@ -168,11 +172,56 @@ >> eret // return to kernel >> .endm >> >> +#ifndef CONFIG_IRQ_STACK >> .macro get_thread_info, rd >> mov \rd, sp >> - and \rd, \rd, #~(THREAD_SIZE - 1) // top of stack >> + and \rd, \rd, #~(THREAD_SIZE - 1) // bottom of stack >> + .endm >> +#else >> + .macro get_thread_info, el, rd >> + .if \el == 0 >> + mov \rd, sp >> + .else >> + mrs \rd, sp_el0 >> + .endif >> + and \rd, \rd, #~(THREAD_SIZE - 1) // bottom of thread stack >> + .endm >> + >> + .macro get_irq_stack >> + get_thread_info 1, tsk >> + ldr w22, [tsk, #TI_CPU] >> + adr_l x21, irq_stacks >> + mov x23, #IRQ_STACK_SIZE >> + madd x21, x22, x23, x21 >> .endm > > Using per_cpu variables would save the multiply here. > You then wouldn't need IRQ_STACK_SIZE. Agree. >> >> + .macro irq_stack_entry >> + get_irq_stack >> + ldr w23, [x21, #IRQ_COUNT] >> + cbnz w23, 1f >> + mov x23, sp >> + str x23, [x21, #IRQ_THREAD_SP] >> + ldr x23, [x21, #IRQ_STACK] >> + mov sp, x23 >> + mov x23, xzr >> +1: add w23, w23, #1 >> + str w23, [x21, #IRQ_COUNT] > > Why do you need to count the number of irqs? > struct thread_info:preempt_count already does something similar (but its > not usable this early...) > > An alternative way would be to compare the top bits of the two stack > pointers - all we care about is irq recursion right? Exactly. A point is a recursion check, not an actual number in this context. 
I'd like to address the recursion issue with minimal load and store operations. The suggestion sounds good. I will figure out a better and simpler way and update it with comments in the code. > This would save storing 'int count' for each cpu, and the logic to > inc/decrement it. > > >> + .endm >> + >> + .macro irq_stack_exit >> + get_irq_stack >> + ldr w23, [x21, #IRQ_COUNT] >> + sub w23, w23, #1 >> + cbnz w23, 1f >> + mov x23, sp >> + str x23, [x21, #IRQ_STACK] >> + ldr x23, [x21, #IRQ_THREAD_SP] >> + mov sp, x23 >> + mov x23, xzr >> +1: str w23, [x21, #IRQ_COUNT] >> + .endm >> +#endif >> + >> /* >> * These are the registers used in the syscall handler, and allow us to >> * have in theory up to 7 arguments to a function - x0 to x6. >> @@ -188,10 +237,15 @@ tsk .req x28 // current thread_info >> * Interrupt handling. >> */ >> .macro irq_handler >> - adrp x1, handle_arch_irq >> - ldr x1, [x1, #:lo12:handle_arch_irq] >> + ldr_l x1, handle_arch_irq >> mov x0, sp >> +#ifdef CONFIG_IRQ_STACK >> + irq_stack_entry >> +#endif >> blr x1 >> +#ifdef CONFIG_IRQ_STACK >> + irq_stack_exit >> +#endif >> .endm >> >> .text >> @@ -366,7 +420,11 @@ el1_irq: >> irq_handler >> >> #ifdef CONFIG_PREEMPT >> +#ifndef CONFIG_IRQ_STACK >> get_thread_info tsk >> +#else >> + get_thread_info 1, tsk >> +#endif >> ldr w24, [tsk, #TI_PREEMPT] // get preempt count >> cbnz w24, 1f // preempt count != 0 >> ldr x0, [tsk, #TI_FLAGS] // get flags >> @@ -395,6 +453,10 @@ el1_preempt: >> .align 6 >> el0_sync: >> kernel_entry 0 >> +#ifdef CONFIG_IRQ_STACK >> + mov x25, sp >> + msr sp_el0, x25 >> +#endif > > Could you do this in kernel_entry? Yes, I could. It would make the code neater. 
>> mrs x25, esr_el1 // read the syndrome register >> lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class >> cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state >> @@ -423,6 +485,10 @@ el0_sync: >> .align 6 >> el0_sync_compat: >> kernel_entry 0, 32 >> +#ifdef CONFIG_IRQ_STACK >> + mov x25, sp >> + msr sp_el0, x25 >> +#endif >> mrs x25, esr_el1 // read the syndrome register >> lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class >> cmp x24, #ESR_ELx_EC_SVC32 // SVC in 32-bit state >> @@ -560,6 +626,10 @@ ENDPROC(el0_sync) >> el0_irq: >> kernel_entry 0 >> el0_irq_naked: >> +#ifdef CONFIG_IRQ_STACK >> + mov x22, sp >> + msr sp_el0, x22 >> +#endif >> enable_dbg >> #ifdef CONFIG_TRACE_IRQFLAGS >> bl trace_hardirqs_off >> @@ -602,6 +672,9 @@ ENTRY(cpu_switch_to) >> ldp x29, x9, [x8], #16 >> ldr lr, [x8] >> mov sp, x9 >> +#ifdef CONFIG_IRQ_STACK >> + msr sp_el0, x9 >> +#endif >> ret >> ENDPROC(cpu_switch_to) >> >> @@ -661,7 +734,11 @@ ENTRY(ret_from_fork) >> cbz x19, 1f // not a kernel thread >> mov x0, x20 >> blr x19 >> +#ifndef CONFIG_IRQ_STACK >> 1: get_thread_info tsk >> +#else >> +1: get_thread_info 1, tsk >> +#endif >> b ret_to_user >> ENDPROC(ret_from_fork) >> >> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S >> index c0ff3ce..3142766 100644 >> --- a/arch/arm64/kernel/head.S >> +++ b/arch/arm64/kernel/head.S >> @@ -446,6 +446,10 @@ __mmap_switched: >> b 1b >> 2: >> adr_l sp, initial_sp, x4 >> +#ifdef CONFIG_IRQ_STACK >> + mov x4, sp >> + msr sp_el0, x4 >> +#endif >> str_l x21, __fdt_pointer, x5 // Save FDT pointer >> str_l x24, memstart_addr, x6 // Save PHYS_OFFSET >> mov x29, #0 >> @@ -619,6 +623,9 @@ ENDPROC(secondary_startup) >> ENTRY(__secondary_switched) >> ldr x0, [x21] // get secondary_data.stack >> mov sp, x0 >> +#ifdef CONFIG_IRQ_STACK >> + msr sp_el0, x0 >> +#endif >> mov x29, #0 >> b secondary_start_kernel >> ENDPROC(__secondary_switched) >> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c >> index 463fa2e..fb0f522 100644 >> --- 
a/arch/arm64/kernel/irq.c >> +++ b/arch/arm64/kernel/irq.c >> @@ -31,6 +31,10 @@ >> >> unsigned long irq_err_count; >> >> +#ifdef CONFIG_IRQ_STACK >> +struct irq_stack irq_stacks[NR_CPUS]; >> +#endif >> + > > DEFINE_PER_CPU()? Yeah, I will update it with per_cpu in the next version. >> int arch_show_interrupts(struct seq_file *p, int prec) >> { >> #ifdef CONFIG_SMP >> @@ -52,6 +56,20 @@ void __init set_handle_irq(void (*handle_irq)(struct >> pt_regs *)) >> >> void __init init_IRQ(void) >> { >> +#ifdef CONFIG_IRQ_STACK >> + unsigned int cpu; >> + >> + for_each_possible_cpu(cpu) { >> + void *stack = (void *)__get_free_pages(THREADINFO_GFP, >> + THREAD_SIZE_ORDER); >> + if (!stack) >> + panic("CPU%u: No IRQ stack", cpu); >> + >> + irq_stacks[cpu].stack = stack + THREAD_START_SP; >> + } >> + pr_info("IRQ: Allocated IRQ stack successfully\n"); >> +#endif >> + > > Wouldn't it be better to only allocate the stack when the CPU is about to > be brought up? (put the alloc code in __cpu_up()). __cpu_up could be called very frequently if a power management scheme based on CPU hotplug is actively used. So, I'd like to avoid even simple logic that checks for a previously allocated IRQ stack. Thanks for the valuable feedback. Best Regards Jungseok Lee -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/