[tip:x86/fpu] x86/fpu: Defer FPU state load until return to userspace
Commit-ID: 5f409e20b794565e2d60ad333e79334630a6c798 Gitweb: https://git.kernel.org/tip/5f409e20b794565e2d60ad333e79334630a6c798 Author: Rik van Riel AuthorDate: Wed, 3 Apr 2019 18:41:52 +0200 Committer: Borislav Petkov CommitDate: Fri, 12 Apr 2019 19:34:47 +0200 x86/fpu: Defer FPU state load until return to userspace Defer loading of FPU state until return to userspace. This gives the kernel the potential to skip loading FPU state for tasks that stay in kernel mode, or for tasks that end up with repeated invocations of kernel_fpu_begin() & kernel_fpu_end(). The fpregs_lock/unlock() section ensures that the registers remain unchanged. Otherwise a context switch or a bottom half could save the registers to its FPU context and the processor's FPU registers would become random if modified at the same time. KVM swaps the host/guest registers on entry/exit path. This flow has been kept as is. First it ensures that the registers are loaded and then saves the current (host) state before it loads the guest's registers. The swap is done at the very end with disabled interrupts so it should not change anymore before the guest is entered. The read/save version seems to be cheaper compared to memcpy() in a micro benchmark. Each thread gets TIF_NEED_FPU_LOAD set as part of fork() / fpu__copy(). For kernel threads, this flag never gets cleared which avoids saving / restoring the FPU state for kernel threads and during in-kernel usage of the FPU registers. [ bp: Correct and update commit message and fix checkpatch warnings. s/register/registers/ where it is used in plural. minor comment corrections. remove unused trace_x86_fpu_activate_state() TP. ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: Babu Moger Cc: "Chang S. Bae" Cc: Dmitry Safonov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: kvm ML Cc: Nicolai Stange Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Tim Chen Cc: Waiman Long Cc: x86-ml Cc: Yi Wang Link: https://lkml.kernel.org/r/20190403164156.19645-24-bige...@linutronix.de --- arch/x86/entry/common.c | 10 +++- arch/x86/include/asm/fpu/api.h | 22 +++- arch/x86/include/asm/fpu/internal.h | 27 + arch/x86/include/asm/trace/fpu.h| 10 ++-- arch/x86/kernel/fpu/core.c | 106 +++- arch/x86/kernel/fpu/signal.c| 49 ++--- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/process_32.c| 5 +- arch/x86/kernel/process_64.c| 5 +- arch/x86/kvm/x86.c | 20 +-- 10 files changed, 184 insertions(+), 72 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7bc105f47d21..51beb8d29123 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -25,12 +25,13 @@ #include #include #include +#include #include #include #include -#include #include +#include #define CREATE_TRACE_POINTS #include @@ -196,6 +197,13 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) exit_to_usermode_loop(regs, cached_flags); + /* Reload ti->flags; we may have rescheduled above. */ + cached_flags = READ_ONCE(ti->flags); + + fpregs_assert_state_consistent(); + if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + #ifdef CONFIG_COMPAT /* * Compat syscalls set TS_COMPAT. Make sure we clear it before diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 73e684160f35..b774c52e5411 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -10,7 +10,7 @@ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H -#include +#include /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. 
It @@ -22,17 +22,37 @@ extern void kernel_fpu_begin(void); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); +extern void fpregs_mark_activate(void); +/* + * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. + * A context switch will (and softirq might) save CPU's FPU registers to + * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * a random state. + */ static inline void fpregs_lock(void) { preempt_disable(); + local_bh_disable(); } static inline void fpregs_unlock(void) { + local_bh_enable(); preempt_enable(); } +#ifdef CONFIG_X86_DEBUG_FPU +extern void fpregs_assert_state_consistent(void); +#else +static inline void fpregs_assert_state_consistent(void) { } +#endif + +/* + * Load the task FPU state before returning to userspace. + */ +extern
[tip:x86/fpu] x86/fpu: Prepare copy_fpstate_to_sigframe() for TIF_NEED_FPU_LOAD
Commit-ID: a352a3b7b7920212ee4c45a41500c66826318e92 Gitweb: https://git.kernel.org/tip/a352a3b7b7920212ee4c45a41500c66826318e92 Author: Rik van Riel AuthorDate: Wed, 3 Apr 2019 18:41:47 +0200 Committer: Borislav Petkov CommitDate: Thu, 11 Apr 2019 18:20:04 +0200 x86/fpu: Prepare copy_fpstate_to_sigframe() for TIF_NEED_FPU_LOAD The FPU registers need only to be saved if TIF_NEED_FPU_LOAD is not set. Otherwise this has been already done and can be skipped. [ bp: Massage a bit. ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-19-bige...@linutronix.de --- arch/x86/kernel/fpu/signal.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 8f23f5237218..9b9dfdc96285 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,7 +171,17 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - copy_fpregs_to_fpstate(fpu); + /* +* If we do not need to load the FPU registers at return to userspace +* then the CPU has the current state and we need to save it. Otherwise, +* it has already been done and we can skip it. +*/ + fpregs_lock(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + copy_fpregs_to_fpstate(fpu); + set_thread_flag(TIF_NEED_FPU_LOAD); + } + fpregs_unlock(); if (using_compacted_format()) { if (copy_xstate_to_user(buf_fx, xsave, 0, size))
[tip:x86/fpu] x86/fpu: Always store the registers in copy_fpstate_to_sigframe()
Commit-ID: 69277c98f5eef0d9839699b7825923c3985f665f Gitweb: https://git.kernel.org/tip/69277c98f5eef0d9839699b7825923c3985f665f Author: Rik van Riel AuthorDate: Wed, 3 Apr 2019 18:41:46 +0200 Committer: Borislav Petkov CommitDate: Thu, 11 Apr 2019 18:08:57 +0200 x86/fpu: Always store the registers in copy_fpstate_to_sigframe() copy_fpstate_to_sigframe() stores the registers directly to user space. This is okay because the FPU registers are valid and saving them directly avoids saving them into kernel memory and making a copy. However, this cannot be done anymore if the FPU registers are going to be restored on the return to userland. It is possible that the FPU registers will be invalidated in the middle of the save operation and this should be done with disabled preemption / BH. Save the FPU registers to the task's FPU struct and copy them to the user memory later on. Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-18-bige...@linutronix.de --- arch/x86/kernel/fpu/signal.c | 19 ++- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 155f4552413e..8f23f5237218 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -144,8 +144,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Save the state directly to the user frame pointed by the aligned pointer - * 'buf_fx'. + * Save the state to task's fpu->state and then copy it to the user frame + * pointed to by the aligned pointer 'buf_fx'. 
* * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -155,6 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { + struct fpu *fpu = &current->thread.fpu; + struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -169,9 +171,16 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - /* Save the live registers state to the user frame directly. */ - if (copy_fpregs_to_sigframe(buf_fx)) - return -1; + copy_fpregs_to_fpstate(fpu); + + if (using_compacted_format()) { + if (copy_xstate_to_user(buf_fx, xsave, 0, size)) + return -1; + } else { + fpstate_sanitize_xstate(fpu); + if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) + return -1; + } /* Save the fsave header for the 32-bit frames. */ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
[tip:x86/fpu] x86/fpu: Eager switch PKRU state
Commit-ID: 0cecca9d03c964abbd2b7927d0670eb70db4ebf2 Gitweb: https://git.kernel.org/tip/0cecca9d03c964abbd2b7927d0670eb70db4ebf2 Author: Rik van Riel AuthorDate: Wed, 3 Apr 2019 18:41:44 +0200 Committer: Borislav Petkov CommitDate: Thu, 11 Apr 2019 15:57:10 +0200 x86/fpu: Eager switch PKRU state While most of a task's FPU state is only needed in user space, the protection keys need to be in place immediately after a context switch. The reason is that any access to userspace memory while running in kernel mode also needs to abide by the memory permissions specified in the protection keys. The "eager switch" is a preparation for loading the FPU state on return to userland. Instead of decoupling PKRU state from xstate, update PKRU within xstate on write operations by the kernel. For user tasks the PKRU should be always read from the xsave area and it should not change anything because the PKRU value was loaded as part of FPU restore. For kernel threads the default "init_pkru_value" will be written. Before this commit, the kernel thread would end up with a random value which it inherited from the previous user task. [ bigeasy: save pkru to xstate, no cache, don't use __raw_xsave_addr() ] [ bp: update commit message, sort headers properly in asm/fpu/xstate.h ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andi Kleen Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: Joerg Roedel Cc: Juergen Gross Cc: "Kirill A. 
Shutemov" Cc: kvm ML Cc: Michal Hocko Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-16-bige...@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 24 ++-- arch/x86/include/asm/fpu/xstate.h | 4 +++- arch/x86/include/asm/pgtable.h | 6 ++ arch/x86/mm/pkeys.c | 1 - 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3e0c2c496f2d..6eb4a0b1ad0e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -534,8 +535,27 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU)) - __fpregs_load_activate(new_fpu, cpu); + u32 pkru_val = init_pkru_value; + struct pkru_state *pk; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return; + + __fpregs_load_activate(new_fpu, cpu); + + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + /* +* PKRU state is switched eagerly because it needs to be valid before we +* return to userland e.g. for a copy_to_user() operation. 
+*/ + if (current->mm) { + pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); + if (pk) + pkru_val = pk->pkru; + } + __write_pkru(pkru_val); } /* diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index fbe41f808e5d..7e42b285c856 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -2,9 +2,11 @@ #ifndef __ASM_X86_XSAVE_H #define __ASM_X86_XSAVE_H +#include #include + #include -#include +#include /* Bit 63 of XCR0 is reserved for future expansion */ #define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e8875ca75623..9beb371b1adf 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1355,6 +1355,12 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #define PKRU_WD_BIT 0x2 #define PKRU_BITS_PER_PKEY 2 +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +extern u32 init_pkru_value; +#else +#define init_pkru_value 0 +#endif + static inline bool __pkru_allows_read(u32 pkru, u16 pkey) { int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 50f65fc1b9a3..2ecbf4155f98 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -126,7 +126,6 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey * in the process's lifetime will not accidentally get access * to data which is pkey-protected later on. */ -static u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
[tip:x86/fpu] x86/fpu: Add an __fpregs_load_activate() internal helper
Commit-ID: 4ee91519e1dccc175665fe24bb20a47c6053575c Gitweb: https://git.kernel.org/tip/4ee91519e1dccc175665fe24bb20a47c6053575c Author: Rik van Riel AuthorDate: Wed, 3 Apr 2019 18:41:38 +0200 Committer: Borislav Petkov CommitDate: Wed, 10 Apr 2019 16:23:14 +0200 x86/fpu: Add an __fpregs_load_activate() internal helper Add a helper function that ensures the floating point registers for the current task are active. Use with preemption disabled. While at it, add fpregs_lock/unlock() helpers too, to be used in later patches. [ bp: Add a comment about its intended usage. ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-10-bige...@linutronix.de --- arch/x86/include/asm/fpu/api.h | 11 +++ arch/x86/include/asm/fpu/internal.h | 22 ++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index b56d504af654..73e684160f35 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -10,6 +10,7 @@ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H +#include /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It @@ -22,6 +23,16 @@ extern void kernel_fpu_begin(void); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); +static inline void fpregs_lock(void) +{ + preempt_disable(); +} + +static inline void fpregs_unlock(void) +{ + preempt_enable(); +} + /* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. 
* diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 54f70cae2f15..3e0c2c496f2d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -484,6 +484,18 @@ static inline void fpregs_activate(struct fpu *fpu) trace_x86_fpu_regs_activated(fpu); } +/* + * Internal helper, do not use directly. Use switch_fpu_return() instead. + */ +static inline void __fpregs_load_activate(struct fpu *fpu, int cpu) +{ + if (!fpregs_state_valid(fpu, cpu)) { + if (current->mm) + copy_kernel_to_fpregs(&fpu->state); + fpregs_activate(fpu); + } +} + /* * FPU state switching for scheduling. * @@ -522,14 +534,8 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU)) { - if (!fpregs_state_valid(new_fpu, cpu)) { - if (current->mm) - copy_kernel_to_fpregs(&new_fpu->state); - } - - fpregs_activate(new_fpu); - } + if (static_cpu_has(X86_FEATURE_FPU)) + __fpregs_load_activate(new_fpu, cpu); } /*
[tip:x86/mm] x86/mm/tlb: Make lazy TLB mode lazier
Commit-ID: 145f573b89a62bf53cfc0144fa9b1c56b0f70b45 Gitweb: https://git.kernel.org/tip/145f573b89a62bf53cfc0144fa9b1c56b0f70b45 Author: Rik van Riel AuthorDate: Tue, 25 Sep 2018 23:58:44 -0400 Committer: Peter Zijlstra CommitDate: Tue, 9 Oct 2018 16:51:12 +0200 x86/mm/tlb: Make lazy TLB mode lazier Lazy TLB mode can result in an idle CPU being woken up by a TLB flush, when all it really needs to do is reload %CR3 at the next context switch, assuming no page table pages got freed. Memory ordering is used to prevent race conditions between switch_mm_irqs_off, which checks whether .tlb_gen changed, and the TLB invalidation code, which increments .tlb_gen whenever page table entries get invalidated. The atomic increment in inc_mm_tlb_gen is its own barrier; the context switch code adds an explicit barrier between reading tlbstate.is_lazy and next->context.tlb_gen. CPUs in lazy TLB mode remain part of the mm_cpumask(mm), both because that allows TLB flush IPIs to be sent at page table freeing time, and because the cache line bouncing on the mm_cpumask(mm) was responsible for about half the CPU use in switch_mm_irqs_off(). We can change native_flush_tlb_others() without touching other (paravirt) implementations of flush_tlb_others() because we'll be flushing less. The existing implementations flush more and are therefore still correct. 
Cc: npig...@gmail.com Cc: mi...@kernel.org Cc: will.dea...@arm.com Cc: kernel-t...@fb.com Cc: l...@kernel.org Cc: h...@zytor.com Tested-by: Song Liu Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180926035844.1420-8-r...@surriel.com --- arch/x86/mm/tlb.c | 67 +++ 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 92e46f4c058c..7d68489cfdb1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -185,6 +185,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; bool need_flush; @@ -242,17 +243,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next->context.ctx_id); /* -* We don't currently support having a real mm loaded without -* our cpu set in mm_cpumask(). We have all the bookkeeping -* in place to figure out whether we would need to flush -* if our cpu were cleared in mm_cpumask(), but we don't -* currently use it. +* Even in lazy TLB mode, the CPU should stay set in the +* mm_cpumask. The TLB shootdown code can figure out from +* from cpu_tlbstate.is_lazy whether or not to send an IPI. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); - return; + /* +* If the CPU is not in lazy TLB mode, we are just switching +* from one thread in a process to another thread in the same +* process. No TLB flush required. +*/ + if (!was_lazy) + return; + + /* +* Read the tlb_gen to check whether a flush is needed. +* If the TLB is up to date, just use it. +* The barrier synchronizes with the tlb_gen increment in +* the TLB shootdown code. 
+*/ + smp_mb(); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == + next_tlb_gen) + return; + + /* +* TLB contents went out of date while we were in lazy +* mode. Fall through to the TLB switching code below. +*/ + new_asid = prev_asid; + need_flush = true; } else { u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); @@ -346,8 +370,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); - load_mm_cr4(next); - switch_ldt(real_prev, next); + if (next != real_prev) { + load_mm_cr4(next); + switch_ldt(real_prev, next); + } } /* @@ -455,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
[tip:x86/mm] x86/mm/tlb: Add freed_tables element to flush_tlb_info
Commit-ID: 97807813fe7074ee865d6bc1df1d0f8fb878ee9d Gitweb: https://git.kernel.org/tip/97807813fe7074ee865d6bc1df1d0f8fb878ee9d Author: Rik van Riel AuthorDate: Tue, 25 Sep 2018 23:58:43 -0400 Committer: Peter Zijlstra CommitDate: Tue, 9 Oct 2018 16:51:12 +0200 x86/mm/tlb: Add freed_tables element to flush_tlb_info Pass the information on to native_flush_tlb_others. No functional changes. Cc: npig...@gmail.com Cc: mi...@kernel.org Cc: will.dea...@arm.com Cc: songliubrav...@fb.com Cc: kernel-t...@fb.com Cc: h...@zytor.com Cc: l...@kernel.org Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180926035844.1420-7-r...@surriel.com --- arch/x86/include/asm/tlbflush.h | 1 + arch/x86/mm/tlb.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 1dea9860ce5b..323a313947e0 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -532,6 +532,7 @@ struct flush_tlb_info { unsigned long end; u64 new_tlb_gen; unsigned int stride_shift; + bool freed_tables; }; #define local_flush_tlb() __flush_tlb() diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 14bf39fc0447..92e46f4c058c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -617,6 +617,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { .mm = mm, .stride_shift = stride_shift, + .freed_tables = freed_tables, }; cpu = get_cpu();
[tip:x86/mm] x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
Commit-ID: 016c4d92cd16f569c6485ae62b076c1a4b779536 Gitweb: https://git.kernel.org/tip/016c4d92cd16f569c6485ae62b076c1a4b779536 Author: Rik van Riel AuthorDate: Tue, 25 Sep 2018 23:58:42 -0400 Committer: Peter Zijlstra CommitDate: Tue, 9 Oct 2018 16:51:12 +0200 x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range Add an argument to flush_tlb_mm_range to indicate whether page tables are about to be freed after this TLB flush. This allows for an optimization of flush_tlb_mm_range to skip CPUs in lazy TLB mode. No functional changes. Cc: npig...@gmail.com Cc: mi...@kernel.org Cc: will.dea...@arm.com Cc: songliubrav...@fb.com Cc: kernel-t...@fb.com Cc: l...@kernel.org Cc: h...@zytor.com Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180926035844.1420-6-r...@surriel.com --- arch/x86/include/asm/tlb.h | 2 +- arch/x86/include/asm/tlbflush.h | 10 ++ arch/x86/kernel/ldt.c | 2 +- arch/x86/kernel/vm86_32.c | 2 +- arch/x86/mm/tlb.c | 3 ++- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index afbe7d1e68cf..404b8b1d44f5 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -20,7 +20,7 @@ static inline void tlb_flush(struct mmu_gather *tlb) end = tlb->end; } - flush_tlb_mm_range(tlb->mm, start, end, stride_shift); + flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); } /* diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d6c0cd9e9591..1dea9860ce5b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -536,22 +536,24 @@ struct flush_tlb_info { #define local_flush_tlb() __flush_tlb() -#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) +#define flush_tlb_mm(mm) \ + flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true) #define flush_tlb_range(vma, start, end) \ flush_tlb_mm_range((vma)->vm_mm, start, end,\ ((vma)->vm_flags & 
VM_HUGETLB) \ ? huge_page_shift(hstate_vma(vma)) \ - : PAGE_SHIFT) + : PAGE_SHIFT, false) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned int stride_shift); + unsigned long end, unsigned int stride_shift, + bool freed_tables); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) { - flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT); + flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 7fdb2414ca65..ab18e0884dc6 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) map_ldt_struct_to_user(mm); va = (unsigned long)ldt_slot_va(slot); - flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false); ldt->slot = slot; return 0; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 52fed70f671e..c2fd39752da8 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_unmap_unlock(pte, ptl); out: up_write(&mm->mmap_sem); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT); + flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 9fb30d27854b..14bf39fc0447 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -609,7 +609,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask, static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned int stride_shift) + unsigned 
long end, unsigned int stride_shift, + bool freed_tables) { int cpu;
[tip:x86/mm] smp,cpumask: introduce on_each_cpu_cond_mask
Commit-ID: 7d49b28a80b830c3ca876d33bedc58d62a78e16f Gitweb: https://git.kernel.org/tip/7d49b28a80b830c3ca876d33bedc58d62a78e16f Author: Rik van Riel AuthorDate: Tue, 25 Sep 2018 23:58:41 -0400 Committer: Peter Zijlstra CommitDate: Tue, 9 Oct 2018 16:51:11 +0200 smp,cpumask: introduce on_each_cpu_cond_mask Introduce a variant of on_each_cpu_cond that iterates only over the CPUs in a cpumask, in order to avoid making callbacks for every single CPU in the system when we only need to test a subset. Cc: npig...@gmail.com Cc: mi...@kernel.org Cc: will.dea...@arm.com Cc: songliubrav...@fb.com Cc: kernel-t...@fb.com Cc: h...@zytor.com Cc: l...@kernel.org Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180926035844.1420-5-r...@surriel.com --- include/linux/smp.h | 4 kernel/smp.c| 17 + kernel/up.c | 14 +++--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index 9fb239e12b82..a56f08ff3097 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags); +void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags, const struct cpumask *mask); + int smp_call_function_single_async(int cpu, call_single_data_t *csd); #ifdef CONFIG_SMP diff --git a/kernel/smp.c b/kernel/smp.c index a7d4f9f50a49..163c451af42e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. 
*/ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), +void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) + gfp_t gfp_flags, const struct cpumask *mask) { cpumask_var_t cpus; int cpu, ret; @@ -680,7 +680,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) if (cond_func(cpu, info)) __cpumask_set_cpu(cpu, cpus); on_each_cpu_mask(cpus, func, info, wait); @@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), * just have to IPI them one by one. */ preempt_disable(); - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); @@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), preempt_enable(); } } +EXPORT_SYMBOL(on_each_cpu_cond_mask); + +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, + cpu_online_mask); +} EXPORT_SYMBOL(on_each_cpu_cond); static void do_nothing(void *unused) diff --git a/kernel/up.c b/kernel/up.c index 42c46bf3e0a5..ff536f9cc8a2 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); * Preemption is disabled here to make sure the cond_func is called under the * same condtions in UP and SMP. 
*/ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) +void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags, const struct cpumask *mask) { unsigned long flags; @@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), } preempt_enable(); } +EXPORT_SYMBOL(on_each_cpu_cond_mask); + +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL); +} EXPORT_SYMBOL(on_each_cpu_cond); int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
[tip:x86/mm] smp: use __cpumask_set_cpu in on_each_cpu_cond
Commit-ID: c3f7f2c7eba1a53d2e5ffbc2dcc9a20c5f094890 Gitweb: https://git.kernel.org/tip/c3f7f2c7eba1a53d2e5ffbc2dcc9a20c5f094890 Author: Rik van Riel AuthorDate: Tue, 25 Sep 2018 23:58:40 -0400 Committer: Peter Zijlstra CommitDate: Tue, 9 Oct 2018 16:51:11 +0200 smp: use __cpumask_set_cpu in on_each_cpu_cond The code in on_each_cpu_cond sets CPUs in a locally allocated bitmask, which should never be used by other CPUs simultaneously. There is no need to use locked memory accesses to set the bits in this bitmap. Switch to __cpumask_set_cpu. Cc: npig...@gmail.com Cc: mi...@kernel.org Cc: will.dea...@arm.com Cc: songliubrav...@fb.com Cc: kernel-t...@fb.com Cc: h...@zytor.com Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180926035844.1420-4-r...@surriel.com --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/smp.c b/kernel/smp.c index d86eec5f51c1..a7d4f9f50a49 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -682,7 +682,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), preempt_disable(); for_each_online_cpu(cpu) if (cond_func(cpu, info)) - cpumask_set_cpu(cpu, cpus); + __cpumask_set_cpu(cpu, cpus); on_each_cpu_mask(cpus, func, info, wait); preempt_enable(); free_cpumask_var(cpus);
[tip:x86/mm] x86/mm/tlb: Restructure switch_mm_irqs_off()
Commit-ID: 12c4d978fd170ccdd7260ec11f93b11e46904228 Gitweb: https://git.kernel.org/tip/12c4d978fd170ccdd7260ec11f93b11e46904228 Author: Rik van Riel AuthorDate: Tue, 25 Sep 2018 23:58:39 -0400 Committer: Peter Zijlstra CommitDate: Tue, 9 Oct 2018 16:51:11 +0200 x86/mm/tlb: Restructure switch_mm_irqs_off() Move some code that will be needed for the lazy -> !lazy state transition when a lazy TLB CPU has gotten out of date. No functional changes, since the if (real_prev == next) branch always returns. (cherry picked from commit 61d0beb5796ab11f7f3bf38cb2eccc6579aaa70b) Cc: npig...@gmail.com Cc: efa...@gmx.de Cc: will.dea...@arm.com Cc: Linus Torvalds Cc: Thomas Gleixner Cc: songliubrav...@fb.com Cc: kernel-t...@fb.com Cc: h...@zytor.com Suggested-by: Andy Lutomirski Signed-off-by: Rik van Riel Acked-by: Dave Hansen Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180716190337.26133-4-r...@surriel.com --- arch/x86/mm/tlb.c | 66 +++ 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 54a5870190a6..9fb30d27854b 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -187,6 +187,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; + bool need_flush; + u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching @@ -252,8 +254,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, return; } else { - u16 new_asid; - bool need_flush; u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); /* @@ -308,44 +308,44 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, /* Let nmi_uaccess_okay() know that we're changing CR3. 
*/ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); + } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); - - /* -* NB: This gets called via leave_mm() in the idle path -* where RCU functions differently. Tracing normally -* uses RCU, so we need to use the _rcuidle variant. -* -* (There is no good reason for this. The idle code should -* be rearranged to call this before rcu_idle_enter().) -*/ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); - - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); - } + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); /* -* Record last user mm's context id, so we can avoid -* flushing branch buffer with IBPB if we switch back -* to the same user. +* NB: This gets called via leave_mm() in the idle path +* where RCU functions differently. Tracing normally +* uses RCU, so we need to use the _rcuidle variant. +* +* (There is no good reason for this. The idle code should +* be rearranged to call this before rcu_idle_enter().) */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - - /* Make sure we write CR3 before loaded_mm. */ - barrier(); + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + /* See above wrt _rcuidle. 
*/ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } + /* +* Record last user mm's context id,
[tip:x86/mm] x86/mm/tlb: Always use lazy TLB mode
Commit-ID: 5462bc3a9a3c38328bbbd276d51164c7cf21d6a8 Gitweb: https://git.kernel.org/tip/5462bc3a9a3c38328bbbd276d51164c7cf21d6a8 Author: Rik van Riel AuthorDate: Tue, 25 Sep 2018 23:58:38 -0400 Committer: Peter Zijlstra CommitDate: Tue, 9 Oct 2018 16:51:11 +0200 x86/mm/tlb: Always use lazy TLB mode On most workloads, the number of context switches far exceeds the number of TLB flushes sent. Optimizing the context switches, by always using lazy TLB mode, speeds up those workloads. This patch results in about a 1% reduction in CPU use on a two socket Broadwell system running a memcache like workload. Cc: npig...@gmail.com Cc: efa...@gmx.de Cc: will.dea...@arm.com Cc: Linus Torvalds Cc: Thomas Gleixner Cc: kernel-t...@fb.com Cc: h...@zytor.com Cc: l...@kernel.org Tested-by: Song Liu Signed-off-by: Rik van Riel (cherry picked from commit 95b0e6357d3e4e05349668940d7ff8f3b7e7e11e) Acked-by: Dave Hansen Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20180716190337.26133-7-r...@surriel.com --- arch/x86/include/asm/tlbflush.h | 16 arch/x86/mm/tlb.c | 15 +-- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 671f65309ce7..d6c0cd9e9591 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif -static inline bool tlb_defer_switch_to_init_mm(void) -{ - /* -* If we have PCID, then switching to init_mm is reasonably -* fast. If we don't have PCID, then switching to init_mm is -* quite slow, so we try to defer it in the hopes that we can -* avoid it entirely. The latter approach runs the risk of -* receiving otherwise unnecessary IPIs. -* -* This choice is just a heuristic. 
The tlb code can handle this -* function returning true or false regardless of whether we have -* PCID. -*/ - return !static_cpu_has(X86_FEATURE_PCID); -} - struct tlb_context { u64 ctx_id; u64 tlb_gen; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6aa195796dec..54a5870190a6 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -368,20 +368,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - if (tlb_defer_switch_to_init_mm()) { - /* -* There's a significant optimization that may be possible -* here. We have accurate enough TLB flush tracking that we -* don't need to maintain coherence of TLB per se when we're -* lazy. We do, however, need to maintain coherence of -* paging-structure caches. We could, in principle, leave our -* old mm loaded and only switch to init_mm when -* tlb_remove_page() happens. -*/ - this_cpu_write(cpu_tlbstate.is_lazy, true); - } else { - switch_mm(NULL, &init_mm, NULL); - } + this_cpu_write(cpu_tlbstate.is_lazy, true); } /*
[tip:x86/mm] x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off()
Commit-ID: e9d8c61557687b7126101e9550bdf243223f0d8f Gitweb: https://git.kernel.org/tip/e9d8c61557687b7126101e9550bdf243223f0d8f Author: Rik van Riel AuthorDate: Mon, 16 Jul 2018 15:03:37 -0400 Committer: Ingo Molnar CommitDate: Tue, 17 Jul 2018 09:35:34 +0200 x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off() Song Liu noticed switch_mm_irqs_off() taking a lot of CPU time in recent kernels, using 1.8% of a 48 CPU system during a netperf to localhost run. Digging into the profile, we noticed that cpumask_clear_cpu and cpumask_set_cpu together take about half of the CPU time taken by switch_mm_irqs_off(). However, the CPUs running netperf end up switching back and forth between netperf and the idle task, which does not require changes to the mm_cpumask. Furthermore, the init_mm cpumask ends up being the most heavily contended one in the system. Simply skipping changes to mm_cpumask(&init_mm) reduces overhead. Reported-and-tested-by: Song Liu Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efa...@gmx.de Cc: kernel-t...@fb.com Cc: l...@kernel.org Link: http://lkml.kernel.org/r/20180716190337.26133-8-r...@surriel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 17 - 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 493559cae2d5..f086195f644c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -310,15 +310,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, sync_current_stack_to_mm(next); } - /* Stop remote flushes for the previous mm */ - VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && - real_prev != &init_mm); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + /* +* Stop remote flushes for the previous mm. +* Skip kernel threads; we never send init_mm TLB flushing IPIs, +* but the bitmap manipulation can cause cache line contention. 
+*/ + if (real_prev != &init_mm) { + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, + mm_cpumask(real_prev))); + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + } /* * Start remote flushes and then read tlb_gen. */ - cpumask_set_cpu(cpu, mm_cpumask(next)); + if (next != &init_mm) + cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
[tip:x86/mm] x86/mm/tlb: Always use lazy TLB mode
Commit-ID: 95b0e6357d3e4e05349668940d7ff8f3b7e7e11e Gitweb: https://git.kernel.org/tip/95b0e6357d3e4e05349668940d7ff8f3b7e7e11e Author: Rik van Riel AuthorDate: Mon, 16 Jul 2018 15:03:36 -0400 Committer: Ingo Molnar CommitDate: Tue, 17 Jul 2018 09:35:34 +0200 x86/mm/tlb: Always use lazy TLB mode Now that CPUs in lazy TLB mode no longer receive TLB shootdown IPIs, except at page table freeing time, and idle CPUs will no longer get shootdown IPIs for things like mprotect and madvise, we can always use lazy TLB mode. Tested-by: Song Liu Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efa...@gmx.de Cc: kernel-t...@fb.com Cc: l...@kernel.org Link: http://lkml.kernel.org/r/20180716190337.26133-7-r...@surriel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 16 arch/x86/mm/tlb.c | 15 +-- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 3aa3204b5dc0..511bf5fae8b8 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif -static inline bool tlb_defer_switch_to_init_mm(void) -{ - /* -* If we have PCID, then switching to init_mm is reasonably -* fast. If we don't have PCID, then switching to init_mm is -* quite slow, so we try to defer it in the hopes that we can -* avoid it entirely. The latter approach runs the risk of -* receiving otherwise unnecessary IPIs. -* -* This choice is just a heuristic. The tlb code can handle this -* function returning true or false regardless of whether we have -* PCID. 
-*/ - return !static_cpu_has(X86_FEATURE_PCID); -} - struct tlb_context { u64 ctx_id; u64 tlb_gen; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e4156e37aa71..493559cae2d5 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -379,20 +379,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - if (tlb_defer_switch_to_init_mm()) { - /* -* There's a significant optimization that may be possible -* here. We have accurate enough TLB flush tracking that we -* don't need to maintain coherence of TLB per se when we're -* lazy. We do, however, need to maintain coherence of -* paging-structure caches. We could, in principle, leave our -* old mm loaded and only switch to init_mm when -* tlb_remove_page() happens. -*/ - this_cpu_write(cpu_tlbstate.is_lazy, true); - } else { - switch_mm(NULL, &init_mm, NULL); - } + this_cpu_write(cpu_tlbstate.is_lazy, true); } /*
[tip:x86/mm] x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
Commit-ID: 64482aafe55fc7e84d0741c356f8176ee7bde357 Gitweb: https://git.kernel.org/tip/64482aafe55fc7e84d0741c356f8176ee7bde357 Author: Rik van Riel AuthorDate: Mon, 16 Jul 2018 15:03:35 -0400 Committer: Ingo Molnar CommitDate: Tue, 17 Jul 2018 09:35:33 +0200 x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs CPUs in !is_lazy have either received TLB flush IPIs earlier on during the munmap (when the user memory was unmapped), or have context switched and reloaded during that stage of the munmap. Page table free TLB flushes only need to be sent to CPUs in lazy TLB mode, whose TLB contents might not be up to date yet. Tested-by: Song Liu Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efa...@gmx.de Cc: kernel-t...@fb.com Cc: l...@kernel.org Link: http://lkml.kernel.org/r/20180716190337.26133-6-r...@surriel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 43 +++ 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 26542cc17043..e4156e37aa71 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -712,15 +712,50 @@ void tlb_flush_remove_tables_local(void *arg) } } +static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm, + struct cpumask *lazy_cpus) +{ + int cpu; + + for_each_cpu(cpu, mm_cpumask(mm)) { + if (!per_cpu(cpu_tlbstate.is_lazy, cpu)) + cpumask_set_cpu(cpu, lazy_cpus); + } +} + void tlb_flush_remove_tables(struct mm_struct *mm) { int cpu = get_cpu(); + cpumask_var_t lazy_cpus; + + if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) { + put_cpu(); + return; + } + + if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) { + /* +* If the cpumask allocation fails, do a brute force flush +* on all the CPUs that have this mm loaded. 
+*/ + smp_call_function_many(mm_cpumask(mm), + tlb_flush_remove_tables_local, (void *)mm, 1); + put_cpu(); + return; + } + /* -* XXX: this really only needs to be called for CPUs in lazy TLB mode. +* CPUs with !is_lazy either received a TLB flush IPI while the user +* pages in this address range were unmapped, or have context switched +* and reloaded %CR3 since then. +* +* Shootdown IPIs at page table freeing time only need to be sent to +* CPUs that may have out of date TLB contents. */ - if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) - smp_call_function_many(mm_cpumask(mm), tlb_flush_remove_tables_local, (void *)mm, 1); - + mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus); + smp_call_function_many(lazy_cpus, + tlb_flush_remove_tables_local, (void *)mm, 1); + free_cpumask_var(lazy_cpus); put_cpu(); }
[tip:x86/mm] x86/mm/tlb: Make lazy TLB mode lazier
Commit-ID: ac0315896970d8589291e9d8a1569fc65967b7f1 Gitweb: https://git.kernel.org/tip/ac0315896970d8589291e9d8a1569fc65967b7f1 Author: Rik van Riel AuthorDate: Mon, 16 Jul 2018 15:03:34 -0400 Committer: Ingo Molnar CommitDate: Tue, 17 Jul 2018 09:35:33 +0200 x86/mm/tlb: Make lazy TLB mode lazier Lazy TLB mode can result in an idle CPU being woken up by a TLB flush, when all it really needs to do is reload %CR3 at the next context switch, assuming no page table pages got freed. Memory ordering is used to prevent race conditions between switch_mm_irqs_off, which checks whether .tlb_gen changed, and the TLB invalidation code, which increments .tlb_gen whenever page table entries get invalidated. The atomic increment in inc_mm_tlb_gen is its own barrier; the context switch code adds an explicit barrier between reading tlbstate.is_lazy and next->context.tlb_gen. Unlike the 2016 version of this patch, CPUs with cpu_tlbstate.is_lazy set are not removed from the mm_cpumask(mm), since that would prevent the TLB flush IPIs at page table free time from being sent to all the CPUs that need them. This patch reduces total CPU use in the system by about 1-2% for a memcache workload on two socket systems, and by about 1% for a heavily multi-process netperf between two systems. 
Tested-by: Song Liu Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efa...@gmx.de Cc: kernel-t...@fb.com Cc: l...@kernel.org Link: http://lkml.kernel.org/r/20180716190337.26133-5-r...@surriel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 68 +++ 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4b73fe835c95..26542cc17043 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -185,6 +186,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; bool need_flush; @@ -242,17 +244,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next->context.ctx_id); /* -* We don't currently support having a real mm loaded without -* our cpu set in mm_cpumask(). We have all the bookkeeping -* in place to figure out whether we would need to flush -* if our cpu were cleared in mm_cpumask(), but we don't -* currently use it. +* Even in lazy TLB mode, the CPU should stay set in the +* mm_cpumask. The TLB shootdown code can figure out from +* from cpu_tlbstate.is_lazy whether or not to send an IPI. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next cpumask_set_cpu(cpu, mm_cpumask(next)); - return; + /* +* If the CPU is not in lazy TLB mode, we are just switching +* from one thread in a process to another thread in the same +* process. No TLB flush required. +*/ + if (!was_lazy) + return; + + /* +* Read the tlb_gen to check whether a flush is needed. +* If the TLB is up to date, just use it. +* The barrier synchronizes with the tlb_gen increment in +* the TLB shootdown code. 
+*/ + smp_mb(); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == + next_tlb_gen) + return; + + /* +* TLB contents went out of date while we were in lazy +* mode. Fall through to the TLB switching code below. +*/ + new_asid = prev_asid; + need_flush = true; } else { u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); @@ -454,6 +479,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. +* +* This should be rare, with native_flush_tlb_others skipping +* IPIs to lazy TLB mode CPUs. */
[tip:x86/mm] x86/mm/tlb: Restructure switch_mm_irqs_off()
Commit-ID: 61d0beb5796ab11f7f3bf38cb2eccc6579aaa70b Gitweb: https://git.kernel.org/tip/61d0beb5796ab11f7f3bf38cb2eccc6579aaa70b Author: Rik van Riel AuthorDate: Mon, 16 Jul 2018 15:03:33 -0400 Committer: Ingo Molnar CommitDate: Tue, 17 Jul 2018 09:35:32 +0200 x86/mm/tlb: Restructure switch_mm_irqs_off() Move some code that will be needed for the lazy -> !lazy state transition when a lazy TLB CPU has gotten out of date. No functional changes, since the if (real_prev == next) branch always returns. Suggested-by: Andy Lutomirski Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efa...@gmx.de Cc: kernel-t...@fb.com Link: http://lkml.kernel.org/r/20180716190337.26133-4-r...@surriel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 60 +++ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 9a893673c56b..4b73fe835c95 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -187,6 +187,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; + bool need_flush; + u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching @@ -252,8 +254,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, return; } else { - u16 new_asid; - bool need_flush; u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); /* @@ -297,41 +297,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); - - /* -* NB: This gets called via leave_mm() in the idle 
path -* where RCU functions differently. Tracing normally -* uses RCU, so we need to use the _rcuidle variant. -* -* (There is no good reason for this. The idle code should -* be rearranged to call this before rcu_idle_enter().) -*/ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); - - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); - } + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); /* -* Record last user mm's context id, so we can avoid -* flushing branch buffer with IBPB if we switch back -* to the same user. +* NB: This gets called via leave_mm() in the idle path +* where RCU functions differently. Tracing normally +* uses RCU, so we need to use the _rcuidle variant. +* +* (There is no good reason for this. The idle code should +* be rearranged to call this before rcu_idle_enter().) */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } + /* +* Record last user mm's context id, so we can avoid +* flushing branch buffer with IBPB if we switch back +* to the same user. +*/ + if (next != &init_mm) + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); + + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_w
[tip:x86/mm] x86/mm/tlb: Leave lazy TLB mode at page table free time
Commit-ID: 2ff6ddf19c0ec40633bd14d8fe28a289816bd98d Gitweb: https://git.kernel.org/tip/2ff6ddf19c0ec40633bd14d8fe28a289816bd98d Author: Rik van Riel AuthorDate: Mon, 16 Jul 2018 15:03:32 -0400 Committer: Ingo Molnar CommitDate: Tue, 17 Jul 2018 09:35:31 +0200 x86/mm/tlb: Leave lazy TLB mode at page table free time Andy discovered that speculative memory accesses while in lazy TLB mode can crash a system, when a CPU tries to dereference a speculative access using memory contents that used to be valid page table memory, but have since been reused for something else and point into la-la land. The latter problem can be prevented in two ways. The first is to always send a TLB shootdown IPI to CPUs in lazy TLB mode, while the second one is to only send the TLB shootdown at page table freeing time. The second should result in fewer IPIs, since operations like mprotect and madvise are very common with some workloads, but do not involve page table freeing. Also, on munmap, batching of page table freeing covers much larger ranges of virtual memory than the batching of unmapped user pages. 
Tested-by: Song Liu Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efa...@gmx.de Cc: kernel-t...@fb.com Cc: l...@kernel.org Link: http://lkml.kernel.org/r/20180716190337.26133-3-r...@surriel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 5 + arch/x86/mm/tlb.c | 27 +++ include/asm-generic/tlb.h | 10 ++ mm/memory.c | 22 ++ 4 files changed, 56 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6690cd3fc8b1..3aa3204b5dc0 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -554,4 +554,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); native_flush_tlb_others(mask, info) #endif +extern void tlb_flush_remove_tables(struct mm_struct *mm); +extern void tlb_flush_remove_tables_local(void *arg); + +#define HAVE_TLB_FLUSH_REMOVE_TABLES + #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6eb1f34c3c85..9a893673c56b 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -646,6 +646,33 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_cpu(); } +void tlb_flush_remove_tables_local(void *arg) +{ + struct mm_struct *mm = arg; + + if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm && + this_cpu_read(cpu_tlbstate.is_lazy)) { + /* +* We're in lazy mode. We need to at least flush our +* paging-structure cache to avoid speculatively reading +* garbage into our TLB. Since switching to init_mm is barely +* slower than a minimal flush, just switch to init_mm. +*/ + switch_mm_irqs_off(NULL, &init_mm, NULL); + } +} + +void tlb_flush_remove_tables(struct mm_struct *mm) +{ + int cpu = get_cpu(); + /* +* XXX: this really only needs to be called for CPUs in lazy TLB mode. 
+*/ + if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) + smp_call_function_many(mm_cpumask(mm), tlb_flush_remove_tables_local, (void *)mm, 1); + + put_cpu(); +} static void do_flush_tlb_all(void *info) { diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 3063125197ad..e811ef7b8350 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -303,4 +303,14 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, #define tlb_migrate_finish(mm) do {} while (0) +/* + * Used to flush the TLB when page tables are removed, when lazy + * TLB mode may cause a CPU to retain intermediate translations + * pointing to about-to-be-freed page table memory. + */ +#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES +#define tlb_flush_remove_tables(mm) do {} while (0) +#define tlb_flush_remove_tables_local(mm) do {} while (0) +#endif + #endif /* _ASM_GENERIC__TLB_H */ diff --git a/mm/memory.c b/mm/memory.c index 7206a634270b..18355e0b971a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ #ifdef CONFIG_HAVE_RCU_TABLE_FREE -/* - * See the comment near struct mmu_table_batch. - */ - static void tlb_remove_table_smp_sync(void *arg) { - /* Simply deliver the interrupt */ + struct mm_struct __maybe_unused *mm = arg; + /* +* On most architectures this does nothing. Simply delivering the +* interrupt is enough to prevent races with software page table +* walking like that done in get_user_pages_fast. +* +* See the comment near struct mmu_table_batch. +*/ + tlb_flush_re
[tip:x86/mm] mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids
Commit-ID: c1a2f7f0c06454387c2cd7b93ff1491c715a8c69 Gitweb: https://git.kernel.org/tip/c1a2f7f0c06454387c2cd7b93ff1491c715a8c69 Author: Rik van Riel AuthorDate: Mon, 16 Jul 2018 15:03:31 -0400 Committer: Ingo Molnar CommitDate: Tue, 17 Jul 2018 09:35:30 +0200 mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids The mm_struct always contains a cpumask bitmap, regardless of CONFIG_CPUMASK_OFFSTACK. That means the first step can be to simplify things, and simply have one bitmask at the end of the mm_struct for the mm_cpumask. This does necessitate moving everything else in mm_struct into an anonymous sub-structure, which can be randomized when struct randomization is enabled. The second step is to determine the correct size for the mm_struct slab object from the size of the mm_struct (excluding the CPU bitmap) and the size of the cpumask. For init_mm we can simply allocate the maximum size this kernel is compiled for, since we only have one init_mm in the system, anyway. Pointer magic by Mike Galbraith, to evade -Wstringop-overflow getting confused by the dynamically sized array. 
Tested-by: Song Liu Signed-off-by: Rik van Riel Signed-off-by: Mike Galbraith Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-t...@fb.com Cc: l...@kernel.org Link: http://lkml.kernel.org/r/20180716190337.26133-2-r...@surriel.com Signed-off-by: Ingo Molnar --- drivers/firmware/efi/efi.c | 1 + include/linux/mm_types.h | 241 +++-- kernel/fork.c | 15 +-- mm/init-mm.c | 11 +++ 4 files changed, 145 insertions(+), 123 deletions(-) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 232f4915223b..7f0b19410a95 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -82,6 +82,7 @@ struct mm_struct efi_mm = { .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), .page_table_lock= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), + .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, }; static bool disable_runtime; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 99ce070e7dcb..efdc24dd9e97 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -335,176 +335,183 @@ struct core_state { struct kioctx_table; struct mm_struct { - struct vm_area_struct *mmap;/* list of VMAs */ - struct rb_root mm_rb; - u32 vmacache_seqnum; /* per-thread vmacache */ + struct { + struct vm_area_struct *mmap;/* list of VMAs */ + struct rb_root mm_rb; + u32 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU - unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #endif - unsigned long mmap_base;/* base of mmap area */ - unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ + unsigned long mmap_base;/* base of mmap area */ + unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES - /* 
Base adresses for compatible mmap() */ - unsigned long mmap_compat_base; - unsigned long mmap_compat_legacy_base; + /* Base adresses for compatible mmap() */ + unsigned long mmap_compat_base; + unsigned long mmap_compat_legacy_base; #endif - unsigned long task_size;/* size of task vm space */ - unsigned long highest_vm_end; /* highest vma end address */ - pgd_t * pgd; - - /** -* @mm_users: The number of users including userspace. -* -* Use mmget()/mmget_not_zero()/mmput() to modify. When this drops -* to 0 (i.e. when the task exits and there are no other temporary -* reference holders), we also release a reference on @mm_count -* (which may then free the &struct mm_struct if @mm_count also -* drops to 0). -*/ - atomic_t mm_users; - - /** -* @mm_count: The number of references to &struct mm_struct -* (@mm_users count as 1). -* -* Use mmgrab()/mmdrop() to modify. When this drops to 0, the -* &struct mm_struct is freed. -*/ - atomic_t mm_count; + unsigned long task_size;/* size of task vm space */ + unsigned long highest_vm_end; /* highest vma end address */ + pgd_t * pgd; + + /** +
[tip:x86/fpu] x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs
Commit-ID: 0852b374173bb57f870d78e6c6839c77b339be5f Gitweb: http://git.kernel.org/tip/0852b374173bb57f870d78e6c6839c77b339be5f Author: Rik van Riel AuthorDate: Sat, 23 Sep 2017 15:00:04 +0200 Committer: Ingo Molnar CommitDate: Sun, 24 Sep 2017 13:04:34 +0200 x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs On Skylake CPUs I noticed that XRSTOR is unable to deal with states created by copyout_from_xsaves() if the xstate has only SSE/YMM state, and no FP state. That is, xfeatures had XFEATURE_MASK_SSE set, but not XFEATURE_MASK_FP. The reason is that part of the SSE/YMM state lives in the MXCSR and MXCSR_FLAGS fields of the FP state. Ensure that whenever we copy SSE or YMM state around, the MXCSR and MXCSR_FLAGS fields are also copied around. Signed-off-by: Rik van Riel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170210085445.0f1cc...@annuminas.surriel.com Link: http://lkml.kernel.org/r/20170923130016.21448-22-mi...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 3 +++ arch/x86/kernel/fpu/xstate.c | 42 2 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 0c314a3..71db45c 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -68,6 +68,9 @@ struct fxregs_state { /* Default value for fxregs_state.mxcsr: */ #define MXCSR_DEFAULT 0x1f80 +/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */ +#define MXCSR_AND_FLAGS_SIZE sizeof(u64) + /* * Software based FPU emulation state. 
This is arbitrary really, * it matches the x87 format to make it easier to understand: diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0ef3504..41c5225 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -921,6 +921,23 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, #endif /* ! CONFIG_ARCH_HAS_PKEYS */ /* + * Weird legacy quirk: SSE and YMM states store information in the + * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP + * area is marked as unused in the xfeatures header, we need to copy + * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. + */ +static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) +{ + if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) + return 0; + + if (xfeatures & XFEATURE_MASK_FP) + return 0; + + return 1; +} + +/* * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. */ @@ -988,6 +1005,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ @@ -1070,6 +1093,12 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ @@ -1122,6 +1151,12 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } } + if (xfeatures_mxcsr_quirk(xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + 
memcpy(&xsave->i387.mxcsr, kbuf + offset, size); + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1177,6 +1212,13 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) } } + if (xfeatures_mxcsr_quirk(xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) + return -EFAULT; + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures':
[tip:sched/core] sched/numa: Slow down scan rate if shared faults dominate
Commit-ID: 37ec97deb3a8c68a7adfab61beb261ffeab19d09 Gitweb: http://git.kernel.org/tip/37ec97deb3a8c68a7adfab61beb261ffeab19d09 Author: Rik van Riel AuthorDate: Mon, 31 Jul 2017 15:28:46 -0400 Committer: Ingo Molnar CommitDate: Thu, 10 Aug 2017 12:18:16 +0200 sched/numa: Slow down scan rate if shared faults dominate The comment above update_task_scan_period() says the scan period should be increased (scanning slows down) if the majority of memory accesses are on the local node, or if the majority of the page accesses are shared with other tasks. However, with the current code, all a high ratio of shared accesses does is slow down the rate at which scanning is made faster. This patch changes things so either lots of shared accesses or lots of local accesses will slow down scanning, and numa scanning is sped up only when there are lots of private faults on remote memory pages. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhla...@redhat.com Cc: lvena...@redhat.com Link: http://lkml.kernel.org/r/20170731192847.23050-2-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 39 +-- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef5b66b..cb6b7c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1892,7 +1892,7 @@ static void update_task_scan_period(struct task_struct *p, unsigned long shared, unsigned long private) { unsigned int period_slot; - int ratio; + int lr_ratio, ps_ratio; int diff; unsigned long remote = p->numa_faults_locality[0]; @@ -1922,25 +1922,36 @@ static void update_task_scan_period(struct task_struct *p, * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) */ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); - ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); - if (ratio >= NUMA_PERIOD_THRESHOLD) { - int slot = ratio - 
NUMA_PERIOD_THRESHOLD; + lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared); + + if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { + /* +* Most memory accesses are local. There is no need to +* do fast NUMA scanning, since memory is already local. +*/ + int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) { + /* +* Most memory accesses are shared with other tasks. +* There is no point in continuing fast NUMA scanning, +* since other tasks may just move the memory elsewhere. +*/ + int slot = lr_ratio - NUMA_PERIOD_THRESHOLD; if (!slot) slot = 1; diff = slot * period_slot; } else { - diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; - /* -* Scale scan rate increases based on sharing. There is an -* inverse relationship between the degree of sharing and -* the adjustment made to the scanning period. Broadly -* speaking the intent is that there is little point -* scanning faster if shared accesses dominate as it may -* simply bounce migrations uselessly +* Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, +* yet they are not on the local NUMA node. Speed up +* NUMA scanning to get the memory moved over. */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); - diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + int ratio = max(lr_ratio, ps_ratio); + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; } p->numa_scan_period = clamp(p->numa_scan_period + diff,
[tip:sched/core] sched/numa: Scale scan period with tasks in group and shared/private
Commit-ID: b5dd77c8bdada7b6262d0cba02a6ed525bf4e6e1 Gitweb: http://git.kernel.org/tip/b5dd77c8bdada7b6262d0cba02a6ed525bf4e6e1 Author: Rik van Riel AuthorDate: Mon, 31 Jul 2017 15:28:47 -0400 Committer: Ingo Molnar CommitDate: Thu, 10 Aug 2017 12:18:16 +0200 sched/numa: Scale scan period with tasks in group and shared/private Running 80 tasks in the same group, or as threads of the same process, results in the memory getting scanned 80x as fast as it would be if a single task was using the memory. This really hurts some workloads. Scale the scan period by the number of tasks in the numa group, and the shared / private ratio, so the average rate at which memory in the group is scanned corresponds roughly to the rate at which a single task would scan its memory. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhla...@redhat.com Cc: lvena...@redhat.com Link: http://lkml.kernel.org/r/20170731192847.23050-3-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 111 1 file changed, 86 insertions(+), 25 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cb6b7c8..a7f1c3b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + pid_t gid; + int active_nodes; + + struct rcu_head rcu; + unsigned long total_faults; + unsigned long max_faults_cpu; + /* +* Faults_cpu is used to decide whether memory should move +* towards the CPU. As a consequence, these stats are weighted +* more by CPU use than by memory faults. 
+*/ + unsigned long *faults_cpu; + unsigned long faults[0]; +}; + +static inline unsigned long group_faults_priv(struct numa_group *ng); +static inline unsigned long group_faults_shared(struct numa_group *ng); + static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; @@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p) return max_t(unsigned int, floor, scan); } +static unsigned int task_scan_start(struct task_struct *p) +{ + unsigned long smin = task_scan_min(p); + unsigned long period = smin; + + /* Scale the maximum scan period with the amount of shared memory. */ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + } + + return max(smin, period); +} + static unsigned int task_scan_max(struct task_struct *p) { - unsigned int smin = task_scan_min(p); - unsigned int smax; + unsigned long smin = task_scan_min(p); + unsigned long smax; /* Watch for min being lower than max due to floor calculations */ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + + /* Scale the maximum scan period with the amount of shared memory. 
*/ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + unsigned long period = smax; + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + + smax = max(smax, period); + } + return max(smin, smax); } @@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } -struct numa_group { - atomic_t refcount; - - spinlock_t lock; /* nr_tasks, tasks */ - int nr_tasks; - pid_t gid; - int active_nodes; - - struct rcu_head rcu; - unsigned long total_faults; - unsigned long max_faults_cpu; - /* -* Faults_cpu is used to decide whether memory should move -* towards the CPU. As a consequence, these stats are weighted -* more by CPU use than by memory faults. -*/ - unsigned long *faults_cpu; - unsigned long faults[0]; -}; - /* Shared or private faults. */ #define NR_NUMA_HINT_FAULT_TYPES 2 @@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group-
[tip:sched/core] sched/fair: Remove effective_load()
Commit-ID: 815abf5af45f04f759f12f3172afd15226fd7f71 Gitweb: http://git.kernel.org/tip/815abf5af45f04f759f12f3172afd15226fd7f71 Author: Rik van Riel AuthorDate: Fri, 23 Jun 2017 12:55:30 -0400 Committer: Ingo Molnar CommitDate: Sat, 24 Jun 2017 08:57:53 +0200 sched/fair: Remove effective_load() The effective_load() function was only used by the NUMA balancing code, and not by the regular load balancing code. Now that the NUMA balancing code no longer uses it either, get rid of it. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhla...@redhat.com Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-5-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 124 +--- 1 file changed, 1 insertion(+), 123 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79ac078..6f4f155 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1382,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long capacity_of(int cpu); -static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -3045,8 +3044,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) * differential update where we store the last value we propagated. This in * turn allows skipping updates if the differential is 'small'. * - * Updating tg's load_avg is necessary before update_cfs_share() (which is - * done) and effective_load() (which is not done because it is too costly). + * Updating tg's load_avg is necessary before update_cfs_share(). 
*/ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -5298,126 +5296,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* - * effective_load() calculates the load change as seen from the root_task_group - * - * Adding load to a group doesn't make a group heavier, but can cause movement - * of group shares between cpus. Assuming the shares were perfectly aligned one - * can calculate the shift in shares. - * - * Calculate the effective load difference if @wl is added (subtracted) to @tg - * on this @cpu and results in a total addition (subtraction) of @wg to the - * total group weight. - * - * Given a runqueue weight distribution (rw_i) we can compute a shares - * distribution (s_i) using: - * - * s_i = rw_i / \Sum rw_j(1) - * - * Suppose we have 4 CPUs and our @tg is a direct child of the root group and - * has 7 equal weight tasks, distributed as below (rw_i), with the resulting - * shares distribution (s_i): - * - * rw_i = { 2, 4, 1, 0 } - * s_i = { 2/7, 4/7, 1/7, 0 } - * - * As per wake_affine() we're interested in the load of two CPUs (the CPU the - * task used to run on and the CPU the waker is running on), we need to - * compute the effect of waking a task on either CPU and, in case of a sync - * wakeup, compute the effect of the current task going to sleep. - * - * So for a change of @wl to the local @cpu with an overall group weight change - * of @wl we can compute the new shares distribution (s'_i) using: - * - * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) - * - * Suppose we're interested in CPUs 0 and 1, and want to compute the load - * differences in waking a task to CPU 0. The additional task changes the - * weight and shares distributions like: - * - * rw'_i = { 3, 4, 1, 0 } - * s'_i = { 3/8, 4/8, 1/8, 0 } - * - * We can then compute the difference in effective weight by using: - * - * dw_i = S * (s'_i - s_i) (3) - * - * Where 'S' is the group weight as seen by its parent. 
- * - * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) - * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - - * 4/7) times the weight of the group. - */ -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) -{ - struct sched_entity *se = tg->se[cpu]; - - if (!tg->parent)/* the trivial, non-cgroup case */ - return wl; - - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = se->my_q; - long W, w = cfs_rq_load_avg(cfs_rq); - - tg = cfs_rq->tg; - - /* -* W = @wg + \Sum rw_j -*/ - W = wg + atomic_long_read(&tg->load_avg); - - /* Ensure \Sum rw_j >= rw_i */ - W -= cfs_rq->tg_load_avg_contrib; - W += w; - - /* -* w = rw_i + @wl -*/ -
[tip:sched/core] sched/numa: Implement NUMA node level wake_affine()
Commit-ID: 3fed382b46baac83703130fe4cd3d9147f427fb9 Gitweb: http://git.kernel.org/tip/3fed382b46baac83703130fe4cd3d9147f427fb9 Author: Rik van Riel AuthorDate: Fri, 23 Jun 2017 12:55:29 -0400 Committer: Ingo Molnar CommitDate: Sat, 24 Jun 2017 08:57:52 +0200 sched/numa: Implement NUMA node level wake_affine() Since select_idle_sibling() can place a task anywhere on a socket, comparing loads between individual CPU cores makes no real sense for deciding whether to do an affine wakeup across sockets, either. Instead, compare the load between the sockets in a similar way the load balancer and the numa balancing code do. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhla...@redhat.com Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-4-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 130 1 file changed, 71 insertions(+), 59 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe19016..79ac078 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2586,6 +2586,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } } + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? 
+ */ +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + struct numa_stats prev_load, this_load; + s64 this_eff_load, prev_eff_load; + + update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); + update_numa_stats(&this_load, cpu_to_node(this_cpu)); + + /* +* If sync wakeup then subtract the (maximum possible) +* effect of the currently running task from the load +* of the current CPU: +*/ + if (sync) { + unsigned long current_load = task_h_load(current); + + if (this_load.load > current_load) + this_load.load -= current_load; + else + this_load.load = 0; + } + + /* +* In low-load situations, where this_cpu's node is idle due to the +* sync cause above having dropped this_load.load to 0, move the task. +* Moving to an idle socket will not create a bad imbalance. +* +* Otherwise check if the nodes are near enough in load to allow this +* task to be woken on this_cpu's node. +*/ + if (this_load.load > 0) { + unsigned long task_load = task_h_load(p); + + this_eff_load = 100; + this_eff_load *= prev_load.compute_capacity; + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= this_load.compute_capacity; + + this_eff_load *= this_load.load + task_load; + prev_eff_load *= prev_load.load - task_load; + + return this_eff_load <= prev_eff_load; + } + + return true; +} #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2598,6 +2652,13 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } + +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + return true; +} #endif /* CONFIG_NUMA_BALANCING */ static void @@ -5407,74 +5468,25 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { 
- s64 this_load, load; - s64 this_eff_load, prev_eff_load; - int idx, this_cpu; - struct task_group *tg; - unsigned long weight; - int balanced; - - idx = sd->wake_idx; - this_cpu = smp_processor_id(); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + int this_cpu = smp_processor_id(); + bool affine = false; /* * Common case: CPUs are in the same socket, and select_idle_sibling() * will do its thing regardless of what we return: */ if (cpus_share_cache(prev_cpu, this_cpu)) - return true; - - /* -* If sync wakeup then subtract the (maximum possible) -* effect of the currently running task from the load -* of the current CPU: -*/ - if (sync) { - tg = task_group(current); - weight = current->se.avg.load_avg; - - this_load += effective_load(tg, this_cpu, -weight,
[tip:sched/core] sched/fair: Simplify wake_affine() for the single socket case
Commit-ID: 7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf Gitweb: http://git.kernel.org/tip/7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf Author: Rik van Riel AuthorDate: Fri, 23 Jun 2017 12:55:28 -0400 Committer: Ingo Molnar CommitDate: Sat, 24 Jun 2017 08:57:52 +0200 sched/fair: Simplify wake_affine() for the single socket case When 'this_cpu' and 'prev_cpu' are in the same socket, select_idle_sibling() will do its thing regardless of the return value of wake_affine(). Just return true and don't look at all the other things. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhla...@redhat.com Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-3-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e0c052..fe19016 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5420,6 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, this_load = target_load(this_cpu, idx); /* +* Common case: CPUs are in the same socket, and select_idle_sibling() +* will do its thing regardless of what we return: +*/ + if (cpus_share_cache(prev_cpu, this_cpu)) + return true; + + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load * of the current CPU: @@ -6007,11 +6014,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) + if (cpu == prev_cpu) + goto pick_cpu; + + if (wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { + pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
[tip:sched/core] sched/numa: Override part of migrate_degrades_locality() when idle balancing
Commit-ID: 739294fb03f590401bbd7faa6d31a507e3ffada5 Gitweb: http://git.kernel.org/tip/739294fb03f590401bbd7faa6d31a507e3ffada5 Author: Rik van Riel AuthorDate: Fri, 23 Jun 2017 12:55:27 -0400 Committer: Ingo Molnar CommitDate: Sat, 24 Jun 2017 08:57:46 +0200 sched/numa: Override part of migrate_degrades_locality() when idle balancing Several tests in the NAS benchmark seem to run a lot slower with NUMA balancing enabled, than with NUMA balancing disabled. The slower run time corresponds with increased idle time. Overriding the final test of migrate_degrades_locality (but still doing the other NUMA tests first) seems to improve performance of those benchmarks. Reported-by: Jirka Hladky Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-2-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 694c258..6e0c052 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6688,6 +6688,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (dst_nid == p->numa_preferred_nid) return 0; + /* Leaving a core idle is often worse than degrading locality. */ + if (env->idle != CPU_NOT_IDLE) + return -1; + if (numa_group) { src_faults = group_faults(p, src_nid); dst_faults = group_faults(p, dst_nid);
[tip:perf/core] x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs
Commit-ID: 85fb989d3a58cb9c7904bb7dd8264be61e18b185 Gitweb: http://git.kernel.org/tip/85fb989d3a58cb9c7904bb7dd8264be61e18b185 Author: Rik van Riel AuthorDate: Fri, 10 Feb 2017 08:54:45 -0500 Committer: Ingo Molnar CommitDate: Sat, 11 Feb 2017 11:00:22 +0100 x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs On Skylake CPUs I noticed that XRSTOR is unable to deal with states created by copyout_from_xsaves() if the xstate has only SSE/YMM state, and no FP state. That is, xfeatures had XFEATURE_MASK_SSE set, but not XFEATURE_MASK_FP. The reason is that part of the SSE/YMM state lives in the MXCSR and MXCSR_FLAGS fields of the FP state. Ensure that whenever we copy SSE or YMM state around, the MXCSR and MXCSR_FLAGS fields are also copied around. Signed-off-by: Rik van Riel Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170210085445.0f1cc...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 3 +++ arch/x86/kernel/fpu/xstate.c | 42 2 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index d15cbfe..ea65ab2 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -68,6 +68,9 @@ struct fxregs_state { /* Default value for fxregs_state.mxcsr: */ #define MXCSR_DEFAULT 0x1f80 +/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */ +#define MXCSR_AND_FLAGS_SIZE sizeof(u64) + /* * Software based FPU emulation state. 
This is arbitrary really, * it matches the x87 format to make it easier to understand: diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 772a069..2e89383 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -920,6 +920,23 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, #endif /* ! CONFIG_ARCH_HAS_PKEYS */ /* + * Weird legacy quirk: SSE and YMM states store information in the + * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP + * area is marked as unused in the xfeatures header, we need to copy + * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. + */ +static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) +{ + if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) + return 0; + + if (xfeatures & XFEATURE_MASK_FP) + return 0; + + return 1; +} + +/* * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. */ @@ -987,6 +1004,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ @@ -1069,6 +1092,12 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ @@ -1121,6 +1150,12 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } } + if (xfeatures_mxcsr_quirk(xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + 
memcpy(&xsave->i387.mxcsr, kbuf + offset, size); + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1176,6 +1211,13 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) } } + if (xfeatures_mxcsr_quirk(xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) + return -EFAULT; + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures':
[tip:x86/fpu] x86/fpu: Split old_fpu & new_fpu handling into separate functions
Commit-ID: c474e50711aa79b7bd0ea30b44744baca5650375 Gitweb: http://git.kernel.org/tip/c474e50711aa79b7bd0ea30b44744baca5650375 Author: Rik van Riel AuthorDate: Fri, 14 Oct 2016 08:15:31 -0400 Committer: Ingo Molnar CommitDate: Sun, 16 Oct 2016 11:38:41 +0200 x86/fpu: Split old_fpu & new_fpu handling into separate functions By moving all of the new_fpu state handling into switch_fpu_finish(), the code can be simplified some more. This gets rid of the prefetch, but given the size of the FPU register state on modern CPUs, and the amount of work done by __switch_to() in between both functions, the value of a single cache line prefetch seems somewhat dubious anyway. Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1476447331-21566-3-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 48 - arch/x86/kernel/process_32.c| 5 ++-- arch/x86/kernel/process_64.c| 5 ++-- 3 files changed, 19 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 590f274..d4a6849 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -552,27 +552,15 @@ static inline int fpregs_active(void) * * This is a two-stage process: * - * - switch_fpu_prepare() saves the old state and - *sets the new state of the CR0.TS bit. This is - *done within the context of the old process. + * - switch_fpu_prepare() saves the old state. + *This is done within the context of the old process. * * - switch_fpu_finish() restores the new state as *necessary. 
*/ -typedef struct { int preload; } fpu_switch_t; - -static inline fpu_switch_t -switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) +static inline void +switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - fpu_switch_t fpu; - - /* -* If the task has used the math, pre-load the FPU on xsave processors -* or if the past 5 consecutive context-switches used math. -*/ - fpu.preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->fpstate_active; - if (old_fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; @@ -584,16 +572,6 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) trace_x86_fpu_regs_deactivated(old_fpu); } else old_fpu->last_cpu = -1; - - if (fpu.preload) { - if (fpregs_state_valid(new_fpu, cpu)) - fpu.preload = 0; - else - prefetch(&new_fpu->state); - fpregs_activate(new_fpu); - } - - return fpu; } /* @@ -601,15 +579,19 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) */ /* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. + * Set up the userspace FPU context for the new task, if the task + * has used the FPU. 
*/ -static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) +static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - if (fpu_switch.preload) - copy_kernel_to_fpregs(&new_fpu->state); + bool preload = static_cpu_has(X86_FEATURE_FPU) && + new_fpu->fpstate_active; + + if (preload) { + if (!fpregs_state_valid(new_fpu, cpu)) + copy_kernel_to_fpregs(&new_fpu->state); + fpregs_activate(new_fpu); + } } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd7be8e..7dc8c9c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -232,11 +232,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - fpu_switch_t fpu_switch; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -295,7 +294,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switc
[tip:x86/fpu] x86/fpu: Remove 'cpu' argument from __cpu_invalidate_fpregs_state()
Commit-ID: 317b622cb2fda1812d8646e211cdb23dce2564d0 Gitweb: http://git.kernel.org/tip/317b622cb2fda1812d8646e211cdb23dce2564d0 Author: Rik van Riel AuthorDate: Fri, 14 Oct 2016 08:15:30 -0400 Committer: Ingo Molnar CommitDate: Sun, 16 Oct 2016 11:38:31 +0200 x86/fpu: Remove 'cpu' argument from __cpu_invalidate_fpregs_state() The __{fpu,cpu}_invalidate_fpregs_state() functions can only be used to invalidate a resource they control. Document that, and change the API a little bit to reflect that. Go back to open coding the fpu_fpregs_owner_ctx write in the CPU hotplug code, which should be the exception, and move __kernel_fpu_begin() to this API. This patch has no functional changes to the current code. Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1476447331-21566-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 13 +++-- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 1dcb29e..590f274 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -488,15 +488,16 @@ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); * * Any code that clobbers the FPU registers or updates the in-memory * FPU state for a task MUST let the rest of the kernel know that the - * FPU registers are no longer valid for this task. Calling either of - * these two invalidate functions is enough, use whichever is convenient. + * FPU registers are no longer valid for this task. * - * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, - * on this CPU. 
+ * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. */ -static inline void __cpu_invalidate_fpregs_state(unsigned int cpu) +static inline void __cpu_invalidate_fpregs_state(void) { - per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 25a45dd..30f11ab 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -106,7 +106,7 @@ void __kernel_fpu_begin(void) */ copy_fpregs_to_fpstate(fpu); } else { - this_cpu_write(fpu_fpregs_owner_ctx, NULL); + __cpu_invalidate_fpregs_state(); } } EXPORT_SYMBOL(__kernel_fpu_begin); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ca4c4ca..5cb801a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -,7 +,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) return err; /* the FPU context is blank, nobody can own it */ - __cpu_invalidate_fpregs_state(cpu); + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; common_cpu_up(cpu, tidle);
[tip:x86/fpu] x86/fpu: Split old & new FPU code paths
Commit-ID: 9ad93fe35aff616fca4e2b9581fdeed498605f9e Gitweb: http://git.kernel.org/tip/9ad93fe35aff616fca4e2b9581fdeed498605f9e Author: Rik van Riel AuthorDate: Tue, 4 Oct 2016 20:34:38 -0400 Committer: Ingo Molnar CommitDate: Fri, 7 Oct 2016 11:14:43 +0200 x86/fpu: Split old & new FPU code paths Now that CR0.TS is no longer being manipulated, we can simplify switch_fpu_prepare() by no longer nesting the handling of new_fpu inside the two branches for the old_fpu. Signed-off-by: Rik van Riel Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-10-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 22 -- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d0324bc..1dcb29e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -581,23 +581,17 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* But leave fpu_fpregs_owner_ctx! */ old_fpu->fpregs_active = 0; trace_x86_fpu_regs_deactivated(old_fpu); + } else + old_fpu->last_cpu = -1; - /* Don't change CR0.TS if we just switch! */ - if (fpu.preload) { - fpregs_activate(new_fpu); - trace_x86_fpu_regs_activated(new_fpu); + if (fpu.preload) { + if (fpregs_state_valid(new_fpu, cpu)) + fpu.preload = 0; + else prefetch(&new_fpu->state); - } - } else { - old_fpu->last_cpu = -1; - if (fpu.preload) { - if (fpregs_state_valid(new_fpu, cpu)) - fpu.preload = 0; - else - prefetch(&new_fpu->state); - fpregs_activate(new_fpu); - } + fpregs_activate(new_fpu); } + return fpu; }
[tip:x86/fpu] x86/fpu: Rename lazy restore functions to "register state valid"
Commit-ID: 25d83b531c1aa4fca5b4e24ed10f493268f162bc Gitweb: http://git.kernel.org/tip/25d83b531c1aa4fca5b4e24ed10f493268f162bc Author: Rik van Riel AuthorDate: Tue, 4 Oct 2016 20:34:36 -0400 Committer: Ingo Molnar CommitDate: Fri, 7 Oct 2016 11:14:41 +0200 x86/fpu: Rename lazy restore functions to "register state valid" Name the functions after the state they track, rather than the function they currently enable. This should make it more obvious when we use the fpu_register_state_valid() function for something else in the future. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-8-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 26 -- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/smpboot.c | 2 +- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 499d6ed..d2cfe16 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -479,18 +479,32 @@ extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size) DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. + * + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. 
Calling either of + * these two invalidate functions is enough, use whichever is convenient. + * * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, * on this CPU. - * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) +static inline void __cpu_invalidate_fpregs_state(unsigned int cpu) { per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; } -static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) +{ + fpu->last_cpu = -1; +} + +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } @@ -588,7 +602,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) } else { old_fpu->last_cpu = -1; if (fpu.preload) { - if (fpu_want_lazy_restore(new_fpu, cpu)) + if (fpregs_state_valid(new_fpu, cpu)) fpu.preload = 0; else prefetch(&new_fpu->state); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 6a37d52..25a45dd 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -336,7 +336,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) if (fpu->fpstate_active) { /* Invalidate any lazy state: */ - fpu->last_cpu = -1; + __fpu_invalidate_fpregs_state(fpu); } else { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); @@ -379,7 +379,7 @@ void fpu__current_fpstate_write_begin(void) * ensures we will not be lazy and skip a XRSTOR in the * future. 
*/ - fpu->last_cpu = -1; + __fpu_invalidate_fpregs_state(fpu); } /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 42a9362..ca4c4ca 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -,7 +,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) return err; /* the FPU context is blank, nobody can own it */ - __cpu_disable_lazy_restore(cpu); + __cpu_invalidate_fpregs_state(cpu); common_cpu_up(cpu, tidle);
[tip:x86/fpu] x86/fpu: Remove __fpregs_(de)activate()
Commit-ID: 66f314efca3843a8874405ab015e354d041f86dd Gitweb: http://git.kernel.org/tip/66f314efca3843a8874405ab015e354d041f86dd Author: Rik van Riel AuthorDate: Tue, 4 Oct 2016 20:34:37 -0400 Committer: Ingo Molnar CommitDate: Fri, 7 Oct 2016 11:14:42 +0200 x86/fpu: Remove __fpregs_(de)activate() Now that fpregs_activate() and fpregs_deactivate() do nothing except call the double underscored versions of themselves, we can get rid of the double underscore version. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-9-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 25 +++-- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d2cfe16..d0324bc 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -509,8 +509,11 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } - -static inline void __fpregs_deactivate(struct fpu *fpu) +/* + * These generally need preemption protection to work, + * do try to avoid using these on their own: + */ +static inline void fpregs_deactivate(struct fpu *fpu) { WARN_ON_FPU(!fpu->fpregs_active); @@ -519,7 +522,7 @@ static inline void __fpregs_deactivate(struct fpu *fpu) trace_x86_fpu_regs_deactivated(fpu); } -static inline void __fpregs_activate(struct fpu *fpu) +static inline void fpregs_activate(struct fpu *fpu) { WARN_ON_FPU(fpu->fpregs_active); @@ -544,20 +547,6 @@ static inline int fpregs_active(void) } /* - * These generally need preemption protection to work, - * 
do try to avoid using these on their own. - */ -static inline void fpregs_activate(struct fpu *fpu) -{ - __fpregs_activate(fpu); -} - -static inline void fpregs_deactivate(struct fpu *fpu) -{ - __fpregs_deactivate(fpu); -} - -/* * FPU state switching for scheduling. * * This is a two-stage process: @@ -595,7 +584,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - __fpregs_activate(new_fpu); + fpregs_activate(new_fpu); trace_x86_fpu_regs_activated(new_fpu); prefetch(&new_fpu->state); }
[tip:x86/fpu] x86/fpu, kvm: Remove KVM vcpu->fpu_counter
Commit-ID: 3d42de25d290fdfe604835d1b389845b8cba5bff Gitweb: http://git.kernel.org/tip/3d42de25d290fdfe604835d1b389845b8cba5bff Author: Rik van Riel AuthorDate: Tue, 4 Oct 2016 20:34:35 -0400 Committer: Ingo Molnar CommitDate: Fri, 7 Oct 2016 11:14:41 +0200 x86/fpu, kvm: Remove KVM vcpu->fpu_counter With the removal of the lazy FPU code, this field is no longer used. Get rid of it. Signed-off-by: Rik van Riel Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-7-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kvm/x86.c | 4 +--- include/linux/kvm_host.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59d7761..2c7e775 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7348,10 +7348,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->guest_fpu_loaded) { - vcpu->fpu_counter = 0; + if (!vcpu->guest_fpu_loaded) return; - } vcpu->guest_fpu_loaded = 0; copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9c28b4d..4e6905c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -224,7 +224,6 @@ struct kvm_vcpu { int fpu_active; int guest_fpu_loaded, guest_xcr0_loaded; - unsigned char fpu_counter; struct swait_queue_head wq; struct pid *pid; int sigset_active;
[tip:x86/fpu] x86/fpu: Remove struct fpu::counter
Commit-ID: 3913cc3507575273beb165a5e027a081913ed507 Gitweb: http://git.kernel.org/tip/3913cc3507575273beb165a5e027a081913ed507 Author: Rik van Riel AuthorDate: Tue, 4 Oct 2016 20:34:34 -0400 Committer: Ingo Molnar CommitDate: Fri, 7 Oct 2016 11:14:40 +0200 x86/fpu: Remove struct fpu::counter With the lazy FPU code gone, we no longer use the counter field in struct fpu for anything. Get rid of it. Signed-off-by: Rik van Riel Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: pbonz...@redhat.com Link: http://lkml.kernel.org/r/1475627678-20788-6-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 3 --- arch/x86/include/asm/fpu/types.h| 11 --- arch/x86/include/asm/trace/fpu.h| 5 + arch/x86/kernel/fpu/core.c | 3 --- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7801d32..499d6ed 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -581,16 +581,13 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! 
*/ if (fpu.preload) { - new_fpu->counter++; __fpregs_activate(new_fpu); trace_x86_fpu_regs_activated(new_fpu); prefetch(&new_fpu->state); } } else { - old_fpu->counter = 0; old_fpu->last_cpu = -1; if (fpu.preload) { - new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 48df486..e31332d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -322,17 +322,6 @@ struct fpu { unsigned char fpregs_active; /* -* @counter: -* -* This counter contains the number of consecutive context switches -* during which the FPU stays used. If this is over a threshold, the -* lazy FPU restore logic becomes eager, to save the trap overhead. -* This is an unsigned char so that after 256 iterations the counter -* wraps and the context switch behavior turns lazy again; this is to -* deal with bursty apps that only use the FPU for a short time: -*/ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 9217ab1..342e597 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -14,7 +14,6 @@ DECLARE_EVENT_CLASS(x86_fpu, __field(struct fpu *, fpu) __field(bool, fpregs_active) __field(bool, fpstate_active) - __field(int, counter) __field(u64, xfeatures) __field(u64, xcomp_bv) ), @@ -23,17 +22,15 @@ DECLARE_EVENT_CLASS(x86_fpu, __entry->fpu= fpu; __entry->fpregs_active = fpu->fpregs_active; __entry->fpstate_active = fpu->fpstate_active; - __entry->counter= fpu->counter; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d counter: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: 
%d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->fpregs_active, __entry->fpstate_active, - __entry->counter, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 036e14f..6a37d52 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -222,7 +222,6 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -430,7 +429,6 @@ void fpu__restore(struct fpu *fpu) trace_x86_fpu_before_restore(fpu); fpregs_activate(fpu)
[tip:sched/core] sched/numa, mm: Revert to checking pmd/pte_write instead of VMA flags
Commit-ID: d59dc7bcfa649ef2128a76b6487b16f4b3f14d23 Gitweb: http://git.kernel.org/tip/d59dc7bcfa649ef2128a76b6487b16f4b3f14d23 Author: Rik van Riel AuthorDate: Thu, 8 Sep 2016 21:30:53 -0400 Committer: Ingo Molnar CommitDate: Tue, 13 Sep 2016 20:31:33 +0200 sched/numa, mm: Revert to checking pmd/pte_write instead of VMA flags Commit: 4d9424669946 ("mm: convert p[te|md]_mknonnuma and remaining page table manipulations") changed NUMA balancing from _PAGE_NUMA to using PROT_NONE, and was quickly found to introduce a regression with NUMA grouping. It was followed up by these commits: 53da3bc2ba9e ("mm: fix up numa read-only thread grouping logic") bea66fbd11af ("mm: numa: group related processes based on VMA flags instead of page table flags") b191f9b106ea ("mm: numa: preserve PTE write permissions across a NUMA hinting fault") The first two of those commits try alternate approaches to NUMA grouping, which apparently do not work as well as looking at the PTE write permissions. The latter patch preserves the PTE write permissions across a NUMA protection fault. However, it forgets to revert the condition for whether or not to group tasks together back to what it was before v3.19, even though the information is now preserved in the page tables once again. This patch brings the NUMA grouping heuristic back to what it was before commit 4d9424669946, which the changelogs of subsequent commits suggest worked best. We have all the information again. We should probably use it. 
Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: aarca...@redhat.com Cc: linux...@kvack.org Cc: mgor...@suse.de Link: http://lkml.kernel.org/r/20160908213053.07c99...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 2 +- mm/memory.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2db2112..c8bde27 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1168,7 +1168,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) } /* See similar comment in do_numa_page for explanation */ - if (!(vma->vm_flags & VM_WRITE)) + if (!pmd_write(pmd)) flags |= TNF_NO_GROUP; /* diff --git a/mm/memory.c b/mm/memory.c index 83be99d..558c852 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3398,7 +3398,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ - if (!(vma->vm_flags & VM_WRITE)) + if (!pte_write(pte)) flags |= TNF_NO_GROUP; /*
[tip:sched/core] sched: Remove struct rq::nohz_stamp
Commit-ID: 1fc770d5899c995db8e22d35eb918a2cb79559d9 Gitweb: http://git.kernel.org/tip/1fc770d5899c995db8e22d35eb918a2cb79559d9 Author: Rik van Riel AuthorDate: Mon, 15 Aug 2016 12:14:10 -0400 Committer: Ingo Molnar CommitDate: Thu, 18 Aug 2016 10:55:39 +0200 sched: Remove struct rq::nohz_stamp The nohz_stamp member of struct rq has been unused since 2010, when this commit removed the code that referenced it: 396e894d289d ("sched: Revert nohz_ratelimit() for now") Signed-off-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160815121410.5ea1c...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc51..afe76d0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -597,7 +597,6 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_load_update_tick; #endif /* CONFIG_SMP */ - u64 nohz_stamp; unsigned long nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL
[tip:timers/nohz] sched/cputime: Drop local_irq_save/restore from irqtime_account_irq()
Commit-ID: 553bf6bbfd8a540c70aee28eb50e24caff456a03 Gitweb: http://git.kernel.org/tip/553bf6bbfd8a540c70aee28eb50e24caff456a03 Author: Rik van Riel AuthorDate: Wed, 13 Jul 2016 16:50:05 +0200 Committer: Ingo Molnar CommitDate: Thu, 14 Jul 2016 10:42:35 +0200 sched/cputime: Drop local_irq_save/restore from irqtime_account_irq() Paolo pointed out that irqs are already blocked when irqtime_account_irq() is called. That means there is no reason to call local_irq_save/restore() again. Suggested-by: Paolo Bonzini Signed-off-by: Rik van Riel Signed-off-by: Frederic Weisbecker Reviewed-by: Paolo Bonzini Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-6-git-send-email-fweis...@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 4 1 file changed, 4 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 16a873c..ea0f6f3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); */ void irqtime_account_irq(struct task_struct *curr) { - unsigned long flags; s64 delta; int cpu; if (!sched_clock_irqtime) return; - local_irq_save(flags); - cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -75,7 +72,6 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq);
[tip:timers/nohz] sched/cputime: Replace VTIME_GEN irq time code with IRQ_TIME_ACCOUNTING code
Commit-ID: b58c35840521bb02b150e1d0d34ca9197f8b7145 Gitweb: http://git.kernel.org/tip/b58c35840521bb02b150e1d0d34ca9197f8b7145 Author: Rik van Riel AuthorDate: Wed, 13 Jul 2016 16:50:02 +0200 Committer: Ingo Molnar CommitDate: Thu, 14 Jul 2016 10:42:34 +0200 sched/cputime: Replace VTIME_GEN irq time code with IRQ_TIME_ACCOUNTING code The CONFIG_VIRT_CPU_ACCOUNTING_GEN irq time tracking code does not appear to currently work right. On CPUs without nohz_full=, only tick based irq time sampling is done, which breaks down when dealing with a nohz_idle CPU. On firewalls and similar systems, no ticks may happen on a CPU for a while, and the irq time spent may never get accounted properly. This can cause issues with capacity planning and power saving, which use the CPU statistics as inputs in decision making. Remove the VTIME_GEN vtime irq time code, and replace it with the IRQ_TIME_ACCOUNTING code, when selected as a config option by the user. Signed-off-by: Rik van Riel Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-3-git-send-email-fweis...@gmail.com Signed-off-by: Ingo Molnar --- include/linux/vtime.h | 32 ++-- init/Kconfig | 6 +++--- kernel/sched/cputime.c | 16 +++- 3 files changed, 20 insertions(+), 34 deletions(-) diff --git a/include/linux/vtime.h b/include/linux/vtime.h index fa21969..d1977d84 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -14,6 +14,18 @@ struct task_struct; */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline bool vtime_accounting_cpu_enabled(void) { return true; } + +#ifdef __ARCH_HAS_VTIME_ACCOUNT +extern void vtime_account_irq_enter(struct task_struct *tsk); +#else +extern void vtime_common_account_irq_enter(struct task_struct *tsk); +static inline void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (vtime_accounting_cpu_enabled()) + 
vtime_common_account_irq_enter(tsk); +} +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -64,17 +76,6 @@ extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); extern void vtime_account_user(struct task_struct *tsk); -#ifdef __ARCH_HAS_VTIME_ACCOUNT -extern void vtime_account_irq_enter(struct task_struct *tsk); -#else -extern void vtime_common_account_irq_enter(struct task_struct *tsk); -static inline void vtime_account_irq_enter(struct task_struct *tsk) -{ - if (vtime_accounting_cpu_enabled()) - vtime_common_account_irq_enter(tsk); -} -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ - #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ static inline void vtime_task_switch(struct task_struct *prev) { } @@ -85,13 +86,8 @@ static inline void vtime_account_irq_enter(struct task_struct *tsk) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void arch_vtime_task_switch(struct task_struct *tsk); -extern void vtime_gen_account_irq_exit(struct task_struct *tsk); - -static inline void vtime_account_irq_exit(struct task_struct *tsk) -{ - if (vtime_accounting_cpu_enabled()) - vtime_gen_account_irq_exit(tsk); -} +static inline void vtime_account_irq_enter(struct task_struct *tsk) { } +static inline void vtime_account_irq_exit(struct task_struct *tsk) { } extern void vtime_user_enter(struct task_struct *tsk); diff --git a/init/Kconfig b/init/Kconfig index c02d897..787dd76 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -375,9 +375,11 @@ config VIRT_CPU_ACCOUNTING_GEN If unsure, say N. +endchoice + config IRQ_TIME_ACCOUNTING bool "Fine granularity task level IRQ time accounting" - depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL + depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE help Select this option to enable fine granularity task irq time accounting. 
This is done by reading a timestamp on each @@ -386,8 +388,6 @@ config IRQ_TIME_ACCOUNTING If in doubt, say N here. -endchoice - config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index db82ae1..ca7e33c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -711,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - cputime_t delta, steal; + cputime_t delta, other; delta = jiffies_to_cputime(now - tsk->vtime_snap); - steal = steal_account_process_time(delta); + other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime
[tip:timers/nohz] sched/cputime: Count actually elapsed irq & softirq time
Commit-ID: 57430218317e5b280a80582a139b26029c25de6c Gitweb: http://git.kernel.org/tip/57430218317e5b280a80582a139b26029c25de6c Author: Rik van Riel AuthorDate: Wed, 13 Jul 2016 16:50:01 +0200 Committer: Ingo Molnar CommitDate: Thu, 14 Jul 2016 10:42:34 +0200 sched/cputime: Count actually elapsed irq & softirq time Currently, if there was any irq or softirq time during 'ticks' jiffies, the entire period will be accounted as irq or softirq time. This is inaccurate if only a subset of the time was actually spent handling irqs, and could conceivably mis-count all of the ticks during a period as irq time, when there was some irq and some softirq time. This can actually happen when irqtime_account_process_tick is called from account_idle_ticks, which can pass a larger number of ticks down all at once. Fix this by changing irqtime_account_hi_update(), irqtime_account_si_update(), and steal_account_process_ticks() to work with cputime_t time units, and return the amount of time spent in each mode. Rename steal_account_process_ticks() to steal_account_process_time(), to reflect that time is now accounted in cputime_t, instead of ticks. Additionally, have irqtime_account_process_tick() take into account how much time was spent in each of steal, irq, and softirq time. The latter could help improve the accuracy of cputime accounting when returning from idle on a NO_HZ_IDLE CPU. Properly accounting how much time was spent in hardirq and softirq time will also allow the NO_HZ_FULL code to re-use these same functions for hardirq and softirq accounting. Signed-off-by: Rik van Riel [ Make nsecs_to_cputime64() actually return cputime64_t. 
] Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-2-git-send-email-fweis...@gmail.com Signed-off-by: Ingo Molnar --- include/asm-generic/cputime_nsecs.h | 2 + kernel/sched/cputime.c | 124 ++-- 2 files changed, 79 insertions(+), 47 deletions(-) diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 0f1c6f3..a84e28e 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -50,6 +50,8 @@ typedef u64 __nocast cputime64_t; (__force u64)(__ct) #define nsecs_to_cputime(__nsecs) \ (__force cputime_t)(__nsecs) +#define nsecs_to_cputime64(__nsecs)\ + (__force cputime64_t)(__nsecs) /* diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3d60e5d..db82ae1 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -79,40 +79,50 @@ void irqtime_account_irq(struct task_struct *curr) } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static int irqtime_account_hi_update(void) +static cputime_t irqtime_account_hi_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t irq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; + irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - + cpustat[CPUTIME_IRQ]; + irq_cputime = min(irq_cputime, maxtime); + cpustat[CPUTIME_IRQ] += irq_cputime; local_irq_restore(flags); - return ret; + return irq_cputime; } -static int irqtime_account_si_update(void) +static cputime_t irqtime_account_si_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t softirq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if 
(nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; + softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - + cpustat[CPUTIME_SOFTIRQ]; + softirq_cputime = min(softirq_cputime, maxtime); + cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; local_irq_restore(flags); - return ret; + return softirq_cputime; } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ #define sched_clock_irqtime(0) +static cputime_t irqtime_account_hi_update(cputime_t dummy) +{ + return 0; +} + +static cputime_t irqtime_account_si_update(cputime_t dummy) +{ + return 0; +} + #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ static inline void task_group_account_field(struct task_struct *p, int index, @@ -257,32 +267,45 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } -static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies) +static __alway
[tip:sched/core] time, acct: Drop irq save & restore from __acct_update_integrals()
Commit-ID: 9344c92c2e72e495f695caef8364b3dd73af0eab Gitweb: http://git.kernel.org/tip/9344c92c2e72e495f695caef8364b3dd73af0eab Author: Rik van Riel AuthorDate: Wed, 10 Feb 2016 20:08:26 -0500 Committer: Ingo Molnar CommitDate: Mon, 29 Feb 2016 09:53:09 +0100 time, acct: Drop irq save & restore from __acct_update_integrals() It looks like all the call paths that lead to __acct_update_integrals() already have irqs disabled, and __acct_update_integrals() does not need to disable irqs itself. This is very convenient since about half the CPU time left in this function was spent in local_irq_save alone. Performance of a microbenchmark that calls an invalid syscall ten million times in a row on a nohz_full CPU improves 21% vs. 4.5-rc1 with both the removal of divisions from __acct_update_integrals() and this patch, with runtime dropping from 3.7 to 2.9 seconds. With these patches applied, the highest remaining cpu user in the trace is native_sched_clock, which is addressed in the next patch. For testing purposes I stuck a WARN_ON(!irqs_disabled()) test in __acct_update_integrals(). It did not trigger. 
Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Mike Galbraith Cc: cl...@redhat.com Cc: eric.duma...@gmail.com Cc: fweis...@gmail.com Cc: l...@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-4-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/tsacct.c | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index d12e815..f8e26ab 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -126,20 +126,18 @@ static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { cputime_t time, dtime; - unsigned long flags; u64 delta; if (!likely(tsk->mm)) return; - local_irq_save(flags); time = stime + utime; dtime = time - tsk->acct_timexpd; /* Avoid division: cputime_t is often in nanoseconds already. */ delta = cputime_to_nsecs(dtime); if (delta < TICK_NSEC) - goto out; + return; tsk->acct_timexpd = time; /* @@ -149,8 +147,6 @@ static void __acct_update_integrals(struct task_struct *tsk, */ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; -out: - local_irq_restore(flags); } /** @@ -160,9 +156,12 @@ out: void acct_update_integrals(struct task_struct *tsk) { cputime_t utime, stime; + unsigned long flags; + local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); + local_irq_restore(flags); } /**
[tip:sched/core] sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity
Commit-ID: ff9a9b4c4334b53b52ee9279f30bd5dd92ea9bdd Gitweb: http://git.kernel.org/tip/ff9a9b4c4334b53b52ee9279f30bd5dd92ea9bdd Author: Rik van Riel AuthorDate: Wed, 10 Feb 2016 20:08:27 -0500 Committer: Ingo Molnar CommitDate: Mon, 29 Feb 2016 09:53:10 +0100 sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity When profiling syscall overhead on nohz-full kernels, after removing __acct_update_integrals() from the profile, native_sched_clock() remains as the top CPU user. This can be reduced by moving VIRT_CPU_ACCOUNTING_GEN to jiffy granularity. This will reduce timing accuracy on nohz_full CPUs to jiffy based sampling, just like on normal CPUs. It results in totally removing native_sched_clock from the profile, and significantly speeding up the syscall entry and exit path, as well as irq entry and exit, and KVM guest entry & exit. Additionally, only call the more expensive functions (and advance the seqlock) when jiffies actually changed. This code relies on another CPU advancing jiffies when the system is busy. On a nohz_full system, this is done by a housekeeping CPU. A microbenchmark calling an invalid syscall number 10 million times in a row speeds up an additional 30% over the numbers with just the previous patches, for a total speedup of about 40% over 4.4 and 4.5-rc1. 
Run times for the microbenchmark: 4.4: 3.8 seconds; 4.5-rc1: 3.7 seconds; 4.5-rc1 + first patch: 3.3 seconds; 4.5-rc1 + first 3 patches: 3.1 seconds; 4.5-rc1 + all patches: 2.3 seconds. A non-NOHZ_FULL cpu (not the housekeeping CPU), all kernels: 1.86 seconds. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: cl...@redhat.com Cc: eric.duma...@gmail.com Cc: fweis...@gmail.com Cc: l...@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-5-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 39 +++ 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b2ab2ff..01d9898 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static unsigned long long vtime_delta(struct task_struct *tsk) +static cputime_t vtime_delta(struct task_struct *tsk) { - unsigned long long clock; + unsigned long now = READ_ONCE(jiffies); - clock = local_clock(); - if (clock < tsk->vtime_snap) + if (time_before(now, (unsigned long)tsk->vtime_snap)) return 0; - return clock - tsk->vtime_snap; + return jiffies_to_cputime(now - tsk->vtime_snap); } static cputime_t get_vtime_delta(struct task_struct *tsk) { - unsigned long long delta = vtime_delta(tsk); + unsigned long now = READ_ONCE(jiffies); + unsigned long delta = now - tsk->vtime_snap; WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); - tsk->vtime_snap += delta; + tsk->vtime_snap = now; - /* CHECKME: always safe to convert nsecs to cputime? 
*/ - return nsecs_to_cputime(delta); + return jiffies_to_cputime(delta); } static void __vtime_account_system(struct task_struct *tsk) @@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { + if (!vtime_delta(tsk)) + return; + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); write_seqcount_end(&tsk->vtime_seqcount); @@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk) void vtime_gen_account_irq_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); @@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk) cputime_t delta_cpu; write_seqcount_begin(&tsk->vtime_seqcount); - delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + if (vtime_delta(tsk)) { + delta_cpu = get_vtime_delta(tsk); + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + } write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if
[tip:sched/core] acct, time: Change indentation in __acct_update_integrals()
Commit-ID: b2add86edd3bc050af350515e6ba26f4622c38f3 Gitweb: http://git.kernel.org/tip/b2add86edd3bc050af350515e6ba26f4622c38f3 Author: Rik van Riel AuthorDate: Wed, 10 Feb 2016 20:08:25 -0500 Committer: Ingo Molnar CommitDate: Mon, 29 Feb 2016 09:53:09 +0100 acct, time: Change indentation in __acct_update_integrals() Change the indentation in __acct_update_integrals() to make the function a little easier to read. Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Acked-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: cl...@redhat.com Cc: eric.duma...@gmail.com Cc: fweis...@gmail.com Cc: l...@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-3-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/tsacct.c | 51 ++- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 460ee2b..d12e815 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -125,31 +125,32 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { - if (likely(tsk->mm)) { - cputime_t time, dtime; - unsigned long flags; - u64 delta; - - local_irq_save(flags); - time = stime + utime; - dtime = time - tsk->acct_timexpd; - /* Avoid division: cputime_t is often in nanoseconds already. */ - delta = cputime_to_nsecs(dtime); - - if (delta < TICK_NSEC) - goto out; - - tsk->acct_timexpd = time; - /* -* Divide by 1024 to avoid overflow, and to avoid division. -* The final unit reported to userspace is Mbyte-usecs, -* the rest of the math is done in xacct_add_tsk. 
-*/ - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; - out: - local_irq_restore(flags); - } + cputime_t time, dtime; + unsigned long flags; + u64 delta; + + if (!likely(tsk->mm)) + return; + + local_irq_save(flags); + time = stime + utime; + dtime = time - tsk->acct_timexpd; + /* Avoid division: cputime_t is often in nanoseconds already. */ + delta = cputime_to_nsecs(dtime); + + if (delta < TICK_NSEC) + goto out; + + tsk->acct_timexpd = time; + /* +* Divide by 1024 to avoid overflow, and to avoid division. +* The final unit reported to userspace is Mbyte-usecs, +* the rest of the math is done in xacct_add_tsk. +*/ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; +out: + local_irq_restore(flags); } /**
[tip:sched/core] sched, time: Remove non-power-of-two divides from __acct_update_integrals()
Commit-ID: 382c2fe994321d503647ce8ee329b9420dc7c1f9 Gitweb: http://git.kernel.org/tip/382c2fe994321d503647ce8ee329b9420dc7c1f9 Author: Rik van Riel AuthorDate: Wed, 10 Feb 2016 20:08:24 -0500 Committer: Ingo Molnar CommitDate: Mon, 29 Feb 2016 09:53:08 +0100 sched, time: Remove non-power-of-two divides from __acct_update_integrals() When running a microbenchmark calling an invalid syscall number in a loop, on a nohz_full CPU, we spend a full 9% of our CPU time in __acct_update_integrals(). This function converts cputime_t to jiffies, to a timeval, only to convert the timeval back to microseconds before discarding it. This patch leaves __acct_update_integrals() functionally equivalent, but speeds things up by about 12%, with 10 million calls to an invalid syscall number dropping from 3.7 to 3.25 seconds. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: cl...@redhat.com Cc: eric.duma...@gmail.com Cc: fweis...@gmail.com Cc: l...@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/tsacct.c | 26 -- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49..460ee2b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-usec to Mbyte-usec */ - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; + /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; + do_div(stats->coremem, 1000 * KB); + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; + do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -125,22 +127,26 @@ static void __acct_update_integrals(struct 
task_struct *tsk, { if (likely(tsk->mm)) { cputime_t time, dtime; - struct timeval value; unsigned long flags; u64 delta; local_irq_save(flags); time = stime + utime; dtime = time - tsk->acct_timexpd; - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); - delta = value.tv_sec; - delta = delta * USEC_PER_SEC + value.tv_usec; + /* Avoid division: cputime_t is often in nanoseconds already. */ + delta = cputime_to_nsecs(dtime); - if (delta == 0) + if (delta < TICK_NSEC) goto out; + tsk->acct_timexpd = time; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + /* +* Divide by 1024 to avoid overflow, and to avoid division. +* The final unit reported to userspace is Mbyte-usecs, +* the rest of the math is done in xacct_add_tsk. +*/ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; out: local_irq_restore(flags); }
[tip:sched/core] sched/numa: Spread memory according to CPU and memory use
Commit-ID: 4142c3ebb685bb338b7d96090d8f90ff49065ff6 Gitweb: http://git.kernel.org/tip/4142c3ebb685bb338b7d96090d8f90ff49065ff6 Author: Rik van Riel AuthorDate: Mon, 25 Jan 2016 17:07:39 -0500 Committer: Ingo Molnar CommitDate: Tue, 9 Feb 2016 14:47:18 +0100 sched/numa: Spread memory according to CPU and memory use The pseudo-interleaving in NUMA placement has a fundamental problem: using hard usage thresholds to spread memory equally between nodes can prevent workloads from converging, or keep memory "trapped" on nodes where the workload is barely running any more. In order for workloads to properly converge, the memory migration should not be stopped when nodes reach parity, but instead be distributed according to how heavily memory is used from each node. This way memory migration and task migration reinforce each other, instead of one putting the brakes on the other. Remove the hard thresholds from the pseudo-interleaving code, and instead use a more gradual policy on memory placement. This also seems to improve convergence of workloads that do not run flat out, but sleep in between bursts of activity. We still want to slow down NUMA scanning and migration once a workload has settled on a few actively used nodes, so keep the 3/4 hysteresis in place. Keep track of whether a workload is actively running on multiple nodes, so task_numa_migrate does a full scan of the system for better task placement. In the case of running 3 SPECjbb2005 instances on a 4 node system, this code seems to result in fairer distribution of memory between nodes, with more memory bandwidth for each instance. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgor...@suse.de Link: http://lkml.kernel.org/r/20160125170739.2fc9a...@annuminas.surriel.com [ Minor readability tweaks. 
] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 87 + 1 file changed, 47 insertions(+), 40 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51a4550..7ce24a4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -932,10 +932,11 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; + int active_nodes; struct rcu_head rcu; - nodemask_t active_nodes; unsigned long total_faults; + unsigned long max_faults_cpu; /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -994,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +/* + * A node triggering more than 1/3 as many NUMA faults as the maximum is + * considered part of a numa group's pseudo-interleaving set. Migrations + * between these nodes are slowed down, to allow things to settle down. + */ +#define ACTIVE_NODE_FRACTION 3 + +static bool numa_is_active_node(int nid, struct numa_group *ng) +{ + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; +} + /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, int maxdist, bool task) @@ -1143,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, return true; /* -* Do not migrate if the destination is not a node that -* is actively used by this numa group. +* Destination node is much more heavily used than the source +* node? Allow migration. */ - if (!node_isset(dst_nid, ng->active_nodes)) - return false; - - /* -* Source is a node that is not actively used by this -* numa group, while the destination is. Migrate. 
-*/ - if (!node_isset(src_nid, ng->active_nodes)) + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * + ACTIVE_NODE_FRACTION) return true; /* -* Both source and destination are nodes in active -* use by this numa group. Maximize memory bandwidth -* by migrating from more heavily used groups, to less -* heavily used ones, spreading the load around. -* Use a 1/4 hysteresis to avoid spurious page movement. +* Distribute memory according to CPU & memory use on each node, +* with 3/4 hysteresis to avoid unnecessary memory migrations: +* +* faults_cpu(dst) 3 faults_cpu(src) +* --- * - > --- +* faults_mem(dst) 4 faults_mem(src) */ - return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); + return group_faults_cpu(ng, dst_nid)
[tip:sched/core] sched/numa: Cap PTE scanning overhead to 3% of run time
Commit-ID: 51170840fe91dfca10fd533b303ea39b2524782a Gitweb: http://git.kernel.org/tip/51170840fe91dfca10fd533b303ea39b2524782a Author: Rik van Riel AuthorDate: Thu, 5 Nov 2015 15:56:23 -0500 Committer: Ingo Molnar CommitDate: Mon, 23 Nov 2015 09:37:54 +0100 sched/numa: Cap PTE scanning overhead to 3% of run time There is a fundamental mismatch between the runtime based NUMA scanning at the task level, and the wall clock time NUMA scanning at the mm level. On a severely overloaded system, with very large processes, this mismatch can cause the system to spend all of its time in change_prot_numa(). This can happen if the task spends at least two ticks in change_prot_numa(), and only gets two ticks of CPU time in the real time between two scan intervals of the mm. This patch ensures that a task never spends more than 3% of run time scanning PTEs. It does that by ensuring that in-between task_numa_work() runs, the task spends at least 32x as much time on other things than it did on task_numa_work(). This is done stochastically: if a timer tick happens, or the task gets rescheduled during task_numa_work(), we delay a future run of task_numa_work() until the task has spent at least 32x the amount of CPU time doing something else, as it spent inside task_numa_work(). The longer task_numa_work() takes, the more likely it is this happens. If task_numa_work() takes very little time, chances are low that that code will do anything, but we will not care. 
Reported-and-tested-by: Jan Stancek Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgor...@suse.de Link: http://lkml.kernel.org/r/1446756983-28173-3-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 1 file changed, 12 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 309b1d5..95b944e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2155,6 +2155,7 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + u64 runtime = p->se.sum_exec_runtime; struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2277,6 +2278,17 @@ out: else reset_ptenuma_scan(p); up_read(&mm->mmap_sem); + + /* +* Make sure tasks use at least 32x as much time to run other code +* than they used here, to limit NUMA PTE scanning overhead to 3% max. +* Usually update_task_scan_period slows down scanning enough; on an +* overloaded system we need to limit overhead on a per task basis. +*/ + if (unlikely(p->se.sum_exec_runtime != runtime)) { + u64 diff = p->se.sum_exec_runtime - runtime; + p->node_stamp += 32 * diff; + } } /* -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/urgent] sched/numa: Fix math underflow in task_tick_numa()
Commit-ID: 25b3e5a3344e1f700c1efec5b6f0199f04707fb1 Gitweb: http://git.kernel.org/tip/25b3e5a3344e1f700c1efec5b6f0199f04707fb1 Author: Rik van Riel AuthorDate: Thu, 5 Nov 2015 15:56:22 -0500 Committer: Ingo Molnar CommitDate: Mon, 9 Nov 2015 16:13:27 +0100 sched/numa: Fix math underflow in task_tick_numa() The NUMA balancing code implements delays in scanning by advancing curr->node_stamp beyond curr->se.sum_exec_runtime. With unsigned math, that creates an underflow, which results in task_numa_work being queued all the time, even when we don't want to. Avoiding the math underflow makes it possible to reduce CPU overhead in the NUMA balancing code. Reported-and-tested-by: Jan Stancek Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgor...@suse.de Link: http://lkml.kernel.org/r/1446756983-28173-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 824aa9f..f04fda8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2302,7 +2302,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) now = curr->se.sum_exec_runtime; period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; - if (now - curr->node_stamp > period) { + if (now > curr->node_stamp + period) { if (!curr->node_stamp) curr->numa_scan_period = task_scan_min(curr); curr->node_stamp += period; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Limit the amount of virtual memory scanned in task_numa_work()
Commit-ID: 4620f8c1fda2af4ccbd11e194e2dd785f7d7f279 Gitweb: http://git.kernel.org/tip/4620f8c1fda2af4ccbd11e194e2dd785f7d7f279 Author: Rik van Riel AuthorDate: Fri, 11 Sep 2015 09:00:27 -0400 Committer: Ingo Molnar CommitDate: Fri, 18 Sep 2015 09:23:14 +0200 sched/numa: Limit the amount of virtual memory scanned in task_numa_work() Currently task_numa_work() scans up to numa_balancing_scan_size_mb worth of memory per invocation, but only counts memory areas that have at least one PTE that is still present and not marked for numa hint faulting. It will skip over arbitrarily large amounts of memory that are either unused, full of swap ptes, or full of PTEs that were already marked for NUMA hint faults but have not been faulted on yet. This can cause excessive amounts of CPU use, due to there being essentially no upper limit on the scan rate of very large processes that are not yet in a phase where they are actively accessing old memory pages (e.g. they are still initializing their data). Avoid that problem by placing an upper limit on the amount of virtual memory that task_numa_work() scans in each invocation. This can be a higher limit than "pages", to ensure the task still skips over unused areas fairly quickly. While we are here, also fix the "nr_pte_updates" logic, so it only counts page ranges with ptes in them. 
Reported-by: Andrea Arcangeli Reported-by: Jan Stancek Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150911090027.4a798...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 -- 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9176f7c..1bfad9f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2157,7 +2157,7 @@ void task_numa_work(struct callback_head *work) struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; - long pages; + long pages, virtpages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -2203,9 +2203,11 @@ void task_numa_work(struct callback_head *work) start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + virtpages = pages * 8; /* Scan up to this much virtual space */ if (!pages) return; + down_read(&mm->mmap_sem); vma = find_vma(mm, start); if (!vma) { @@ -2240,18 +2242,22 @@ void task_numa_work(struct callback_head *work) start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); end = min(end, vma->vm_end); - nr_pte_updates += change_prot_numa(vma, start, end); + nr_pte_updates = change_prot_numa(vma, start, end); /* -* Scan sysctl_numa_balancing_scan_size but ensure that -* at least one PTE is updated so that unused virtual -* address space is quickly skipped. +* Try to scan sysctl_numa_balancing_size worth of +* hpages that have at least one present PTE that +* is not already pte-numa. If the VMA contains +* areas that are unused or already full of prot_numa +* PTEs, scan up to virtpages, to skip through those +* areas faster. 
*/ if (nr_pte_updates) pages -= (end - start) >> PAGE_SHIFT; + virtpages -= (end - start) >> PAGE_SHIFT; start = end; - if (pages <= 0) + if (pages <= 0 || virtpages <= 0) goto out; cond_resched(); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Only consider less busy nodes as numa balancing destinations
Commit-ID: 6f9aad0bc37286c0441b57f0ba8cffee50715426 Gitweb: http://git.kernel.org/tip/6f9aad0bc37286c0441b57f0ba8cffee50715426 Author: Rik van Riel AuthorDate: Thu, 28 May 2015 09:52:49 -0400 Committer: Ingo Molnar CommitDate: Sun, 7 Jun 2015 15:57:45 +0200 sched/numa: Only consider less busy nodes as numa balancing destinations Changeset a43455a1d572 ("sched/numa: Ensure task_numa_migrate() checks the preferred node") fixes an issue where workloads would never converge on a fully loaded (or overloaded) system. However, it introduces a regression on less than fully loaded systems, where workloads converge on a few NUMA nodes, instead of properly staying spread out across the whole system. This leads to a reduction in available memory bandwidth, and usable CPU cache, with predictable performance problems. The root cause appears to be an interaction between the load balancer and NUMA balancing, where the short term load represented by the load balancer differs from the long term load the NUMA balancing code would like to base its decisions on. Simply reverting a43455a1d572 would re-introduce the non-convergence of workloads on fully loaded systems, so that is not a good option. As an aside, the check done before a43455a1d572 only applied to a task's preferred node, not to other candidate nodes in the system, so the converge-on-too-few-nodes problem still happens, just to a lesser degree. Instead, try to compensate for the impedance mismatch between the load balancer and NUMA balancing by only ever considering a lesser loaded node as a destination for NUMA balancing, regardless of whether the task is trying to move to the preferred node, or to another node. This patch also addresses the issue that a system with a single runnable thread would never migrate that thread to near its memory, introduced by 095bebf61a46 ("sched/numa: Do not move past the balance point if unbalanced"). 
A test where the main thread creates a large memory area, and spawns a worker thread to iterate over the memory (placed on another node by select_task_rq_fair), after which the main thread goes to sleep and waits for the worker thread to loop over all the memory now sees the worker thread migrated to where the memory is, instead of having all the memory migrated over like before. Jirka has run a number of performance tests on several systems: single instance SpecJBB 2005 performance is 7-15% higher on a 4 node system, with higher gains on systems with more cores per socket. Multi-instance SpecJBB 2005 (one per node), linpack, and stream see little or no changes with the revert of 095bebf61a46 and this patch. Reported-by: Artem Bityutski Reported-by: Jirka Hladky Tested-by: Jirka Hladky Tested-by: Artem Bityutskiy Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150528095249.3083a...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 30 -- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 723d69e..4b6e5f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1398,6 +1398,30 @@ static void task_numa_find_cpu(struct task_numa_env *env, } } +/* Only move tasks to a NUMA node less busy than the current node. */ +static bool numa_has_capacity(struct task_numa_env *env) +{ + struct numa_stats *src = &env->src_stats; + struct numa_stats *dst = &env->dst_stats; + + if (src->has_free_capacity && !dst->has_free_capacity) + return false; + + /* +* Only consider a task move if the source has a higher load +* than the destination, corrected for CPU capacity on each node. 
+* +* src->loaddst->load +* - vs - +* src->compute_capacitydst->compute_capacity +*/ + if (src->load * dst->compute_capacity > + dst->load * src->compute_capacity) + return true; + + return false; +} + static int task_numa_migrate(struct task_struct *p) { struct task_numa_env env = { @@ -1452,7 +1476,8 @@ static int task_numa_migrate(struct task_struct *p) update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ - task_numa_find_cpu(&env, taskimp, groupimp); + if (numa_has_capacity(&env)) + task_numa_find_cpu(&env, taskimp, groupimp); /* * Look at other nodes in these cases: @@ -1483,7 +1508,8 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); - task_n
[tip:sched/core] Revert 095bebf61a46 ("sched/numa: Do not move past the balance point if unbalanced")
Commit-ID: e4991b240c622f0441c21f4869e13209abc08c5e Gitweb: http://git.kernel.org/tip/e4991b240c622f0441c21f4869e13209abc08c5e Author: Rik van Riel AuthorDate: Wed, 27 May 2015 15:04:27 -0400 Committer: Ingo Molnar CommitDate: Sun, 7 Jun 2015 15:57:44 +0200 Revert 095bebf61a46 ("sched/numa: Do not move past the balance point if unbalanced") Commit 095bebf61a46 ("sched/numa: Do not move past the balance point if unbalanced") broke convergence of workloads with just one runnable thread, by making it impossible for the one runnable thread on the system to move from one NUMA node to another. Instead, the thread would remain where it was, and pull all the memory across to its location, which is much slower than just migrating the thread to where the memory is. The next patch has a better fix for the issue that 095bebf61a46 tried to address. Reported-by: Jirka Hladky Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dedeki...@gmail.com Cc: mgor...@suse.de Link: http://lkml.kernel.org/r/1432753468-7785-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 41 +++-- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 84ada05..723d69e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1198,11 +1198,9 @@ static void task_numa_assign(struct task_numa_env *env, static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { + long imb, old_imb; + long orig_src_load, orig_dst_load; long src_capacity, dst_capacity; - long orig_src_load; - long load_a, load_b; - long moved_load; - long imb; /* * The load is corrected for the CPU capacity available on each node. 
@@ -1215,39 +1213,30 @@ static bool load_too_imbalanced(long src_load, long dst_load, dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ - load_a = dst_load; - load_b = src_load; - if (load_a < load_b) - swap(load_a, load_b); + if (dst_load < src_load) + swap(dst_load, src_load); /* Is the difference below the threshold? */ - imb = load_a * src_capacity * 100 - - load_b * dst_capacity * env->imbalance_pct; + imb = dst_load * src_capacity * 100 - + src_load * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; /* * The imbalance is above the allowed threshold. -* Allow a move that brings us closer to a balanced situation, -* without moving things past the point of balance. +* Compare it with the old imbalance. */ orig_src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; - /* -* In a task swap, there will be one load moving from src to dst, -* and another moving back. This is the net sum of both moves. -* A simple task move will always have a positive value. -* Allow the move if it brings the system closer to a balanced -* situation, without crossing over the balance point. -*/ - moved_load = orig_src_load - src_load; + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); - if (moved_load > 0) - /* Moving src -> dst. Did we overshoot balance? */ - return src_load * dst_capacity < dst_load * src_capacity; - else - /* Moving dst -> src. Did we overshoot balance? */ - return dst_load * src_capacity < src_load * dst_capacity; + old_imb = orig_dst_load * src_capacity * 100 - + orig_src_load * dst_capacity * env->imbalance_pct; + + /* Would this change make things worse? */ + return (imb > old_imb); } /* -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Reduce conflict between fbq_classify_rq() and migration
Commit-ID: c1ceac6276e4ee12e4129afd380db10fae0db7df Gitweb: http://git.kernel.org/tip/c1ceac6276e4ee12e4129afd380db10fae0db7df Author: Rik van Riel AuthorDate: Thu, 14 May 2015 22:59:36 -0400 Committer: Ingo Molnar CommitDate: Tue, 19 May 2015 08:39:19 +0200 sched/numa: Reduce conflict between fbq_classify_rq() and migration It is possible for fbq_classify_rq() to indicate that a CPU has tasks that should be moved to another NUMA node, but for migrate_improves_locality and migrate_degrades_locality to not identify those tasks. This patch always gives preference to preferred node evaluations, and only checks the number of faults when evaluating moves between two non-preferred nodes on a larger NUMA system. On a two node system, the number of faults is never evaluated. Either a task is about to be pulled off its preferred node, or migrated onto it. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgor...@suse.de Link: http://lkml.kernel.org/r/20150514225936.35b91...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 60 + 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a27d988..0d4632f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5663,10 +5663,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env) } #ifdef CONFIG_NUMA_BALANCING -/* Returns true if the destination node has incurred more faults */ +/* + * Returns true if the destination node is the preferred node. + * Needs to match fbq_classify_rq(): if there is a runnable task + * that is not on its preferred node, we should identify it. 
+ */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_faults, dst_faults; int src_nid, dst_nid; if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || @@ -5680,29 +5685,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - if (numa_group) { - /* Task is already in the group's interleave set. */ - if (node_isset(src_nid, numa_group->active_nodes)) - return false; - - /* Task is moving into the group's interleave set. */ - if (node_isset(dst_nid, numa_group->active_nodes)) - return true; - - return group_faults(p, dst_nid) > group_faults(p, src_nid); - } - /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) return true; - return task_faults(p, dst_nid) > task_faults(p, src_nid); + /* Migrating away from the preferred node is bad. */ + if (src_nid == p->numa_preferred_nid) + return false; + + if (numa_group) { + src_faults = group_faults(p, src_nid); + dst_faults = group_faults(p, dst_nid); + } else { + src_faults = task_faults(p, src_nid); + dst_faults = task_faults(p, dst_nid); + } + + return dst_faults > src_faults; } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_faults, dst_faults; int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5717,23 +5723,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - if (numa_group) { - /* Task is moving within/into the group's interleave set. */ - if (node_isset(dst_nid, numa_group->active_nodes)) - return false; + /* Migrating away from the preferred node is bad. */ + if (src_nid == p->numa_preferred_nid) + return true; - /* Task is moving out of the group's interleave set. 
*/ - if (node_isset(src_nid, numa_group->active_nodes)) - return true; + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) + return false; - return group_faults(p, dst_nid) < group_faults(p, src_nid); + if (numa_group) { + src_faults = group_faults(p, src_nid); + dst_faults = group_faults(p, dst_nid); + } else { + src_faults = task_faults(p, src_nid); + dst_faults = task_faults(p, dst_nid); } - /* Migrating away from the preferred node is always bad. */ - if (src_nid == p->numa_preferred_nid) -
[tip:x86/fpu] x86/fpu: Use an explicit if/else in switch_fpu_prepare()
Commit-ID: 1361ef29c7e49ae7cf37220c25fac1904b77f71a Gitweb: http://git.kernel.org/tip/1361ef29c7e49ae7cf37220c25fac1904b77f71a Author: Rik van Riel AuthorDate: Fri, 6 Feb 2015 15:02:03 -0500 Committer: Borislav Petkov CommitDate: Thu, 19 Feb 2015 11:15:54 +0100 x86/fpu: Use an explicit if/else in switch_fpu_prepare() Use an explicit if/else branch after __save_init_fpu(old) in switch_fpu_prepare(). This makes substituting the assignment with a call in task_disable_lazy_fpu_restore() in the next patch easier to review. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/1423252925-14451-7-git-send-email-r...@redhat.com [ Space out stuff for more readability. ] Signed-off-by: Borislav Petkov --- arch/x86/include/asm/fpu-internal.h | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 9c27f44..04c2807 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -434,13 +434,17 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. */ - fpu.preload = tsk_used_math(new) && (use_eager_fpu() || -new->thread.fpu_counter > 5); + fpu.preload = tsk_used_math(new) && + (use_eager_fpu() || new->thread.fpu_counter > 5); + if (__thread_has_fpu(old)) { if (!__save_init_fpu(old)) - cpu = ~0; - old->thread.fpu.last_cpu = cpu; - old->thread.fpu.has_fpu = 0;/* But leave fpu_owner_task! */ + old->thread.fpu.last_cpu = ~0; + else + old->thread.fpu.last_cpu = cpu; + + /* But leave fpu_owner_task! */ + old->thread.fpu.has_fpu = 0; /* Don't change CR0.TS if we just switch! 
*/ if (fpu.preload) { -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:x86/fpu] x86/fpu: Introduce task_disable_lazy_fpu_restore() helper
Commit-ID: 33e03dedd759cc9396252d9641b25d01909a26bb Gitweb: http://git.kernel.org/tip/33e03dedd759cc9396252d9641b25d01909a26bb Author: Rik van Riel AuthorDate: Fri, 6 Feb 2015 15:02:02 -0500 Committer: Borislav Petkov CommitDate: Thu, 19 Feb 2015 11:15:53 +0100 x86/fpu: Introduce task_disable_lazy_fpu_restore() helper Currently there are a few magic assignments sprinkled through the code that disable lazy FPU state restoring, some more effective than others, and all equally mystifying. It would be easier to have a helper to explicitly disable lazy FPU state restoring for a task. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/1423252925-14451-6-git-send-email-r...@redhat.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/fpu-internal.h | 10 ++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 217d6d7..9c27f44 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -79,6 +79,16 @@ static inline void __cpu_disable_lazy_restore(unsigned int cpu) per_cpu(fpu_owner_task, cpu) = NULL; } +/* + * Used to indicate that the FPU state in memory is newer than the FPU + * state in registers, and the FPU state should be reloaded next time the + * task is run. Only safe on the current task, or non-running tasks. + */ +static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk) +{ + tsk->thread.fpu.last_cpu = ~0; +} + static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) { return new == this_cpu_read_stable(fpu_owner_task) && -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:x86/fpu] x86/fpu: Use task_disable_lazy_fpu_restore() helper
Commit-ID: 6a5fe8952bd676baf382d14df21e7b32b5d8943e Gitweb: http://git.kernel.org/tip/6a5fe8952bd676baf382d14df21e7b32b5d8943e Author: Rik van Riel AuthorDate: Fri, 6 Feb 2015 15:02:04 -0500 Committer: Borislav Petkov CommitDate: Thu, 19 Feb 2015 11:15:55 +0100 x86/fpu: Use task_disable_lazy_fpu_restore() helper Replace magic assignments of fpu.last_cpu = ~0 with more explicit task_disable_lazy_fpu_restore() calls. Signed-off-by: Rik van Riel Cc: Oleg Nesterov Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1423252925-14451-8-git-send-email-r...@redhat.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/fpu-internal.h | 4 ++-- arch/x86/kernel/i387.c | 2 +- arch/x86/kernel/process.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 04c2807..e5f8f8e 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -439,7 +439,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta if (__thread_has_fpu(old)) { if (!__save_init_fpu(old)) - old->thread.fpu.last_cpu = ~0; + task_disable_lazy_fpu_restore(old); else old->thread.fpu.last_cpu = cpu; @@ -455,7 +455,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta stts(); } else { old->thread.fpu_counter = 0; - old->thread.fpu.last_cpu = ~0; + task_disable_lazy_fpu_restore(old); if (fpu.preload) { new->thread.fpu_counter++; if (!use_eager_fpu() && fpu_lazy_restore(new, cpu)) diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f3ced6f..5722ab6 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -236,7 +236,7 @@ int init_fpu(struct task_struct *tsk) if (tsk_used_math(tsk)) { if (cpu_has_fpu && tsk == current) unlazy_fpu(tsk); - tsk->thread.fpu.last_cpu = ~0; + task_disable_lazy_fpu_restore(tsk); return 0; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e127dda..ce8b103 
100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -68,8 +68,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.fpu_counter = 0; dst->thread.fpu.has_fpu = 0; - dst->thread.fpu.last_cpu = ~0; dst->thread.fpu.state = NULL; + task_disable_lazy_fpu_restore(dst); if (tsk_used_math(src)) { int err = fpu_alloc(&dst->thread.fpu); if (err) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:x86/fpu] x86/fpu: Also check fpu_lazy_restore() when use_eager_fpu()
Commit-ID: 728e53fef429a0f3c9dda3587c3ccc57ad268b70 Gitweb: http://git.kernel.org/tip/728e53fef429a0f3c9dda3587c3ccc57ad268b70 Author: Rik van Riel AuthorDate: Fri, 6 Feb 2015 15:02:05 -0500 Committer: Borislav Petkov CommitDate: Thu, 19 Feb 2015 11:15:55 +0100 x86/fpu: Also check fpu_lazy_restore() when use_eager_fpu() With Oleg's patch: 33a3ebdc077f ("x86, fpu: Don't abuse has_fpu in __kernel_fpu_begin/end()") kernel threads no longer have an FPU state even on systems with use_eager_fpu(). That in turn means that a task may still have its FPU state loaded in the FPU registers, if the task only got interrupted by kernel threads from when it went to sleep, to when it woke up again. In that case, there is no need to restore the FPU state for this task, since it is still in the registers. The kernel can simply use the same logic to determine this as is used for !use_eager_fpu() systems. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/1423252925-14451-9-git-send-email-r...@redhat.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/fpu-internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index e5f8f8e..19fb41c 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -458,7 +458,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta task_disable_lazy_fpu_restore(old); if (fpu.preload) { new->thread.fpu_counter++; - if (!use_eager_fpu() && fpu_lazy_restore(new, cpu)) + if (fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:x86/fpu] x86/fpu: Move lazy restore functions up a few lines
Commit-ID: 1c927eea4cad83c439cb51e9c96ad19cb005157d Gitweb: http://git.kernel.org/tip/1c927eea4cad83c439cb51e9c96ad19cb005157d Author: Rik van Riel AuthorDate: Fri, 6 Feb 2015 15:02:01 -0500 Committer: Borislav Petkov CommitDate: Thu, 19 Feb 2015 11:15:53 +0100 x86/fpu: Move lazy restore functions up a few lines We need another lazy restore related function, that will be called from a function that is above where the lazy restore functions are now. It would be nice to keep all three functions grouped together. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/1423252925-14451-5-git-send-email-r...@redhat.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/fpu-internal.h | 36 ++-- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 02f2e08..217d6d7 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -67,6 +67,24 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft); static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif +/* + * Must be run with preemption disabled: this clears the fpu_owner_task, + * on this CPU. + * + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. 
+ */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_owner_task, cpu) = NULL; +} + +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == this_cpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} + static inline int is_ia32_compat_frame(void) { return config_enabled(CONFIG_IA32_EMULATION) && @@ -398,24 +416,6 @@ static inline void drop_init_fpu(struct task_struct *tsk) */ typedef struct { int preload; } fpu_switch_t; -/* - * Must be run with preemption disabled: this clears the fpu_owner_task, - * on this CPU. - * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. - */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) -{ - per_cpu(fpu_owner_task, cpu) = NULL; -} - -static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) -{ - return new == this_cpu_read_stable(fpu_owner_task) && - cpu == new->thread.fpu.last_cpu; -} - static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) { fpu_switch_t fpu; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/rt/nohz: Stop scheduler tick if running realtime task
Commit-ID: 1e78cdbd9b2266503339accafe0ebdd99b93a531 Gitweb: http://git.kernel.org/tip/1e78cdbd9b2266503339accafe0ebdd99b93a531 Author: Rik van Riel AuthorDate: Mon, 16 Feb 2015 15:23:49 -0500 Committer: Ingo Molnar CommitDate: Wed, 18 Feb 2015 18:21:19 +0100 sched/rt/nohz: Stop scheduler tick if running realtime task If the CPU is running a realtime task that does not round-robin with another realtime task of equal priority, there is no point in keeping the scheduler tick going. After all, whenever the scheduler tick runs, the kernel will just decide not to reschedule. Extend sched_can_stop_tick() to recognize these situations, and inform the rest of the kernel that the scheduler tick can be stopped. Tested-by: Luiz Capitulino Signed-off-by: Rik van Riel Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: fweis...@redhat.com Cc: mtosa...@redhat.com Link: http://lkml.kernel.org/r/20150216152349.6a8ed...@annuminas.surriel.com [ Small cleanliness tweak. ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 17 + 1 file changed, 17 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a4869bd..97fe79c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -690,6 +690,23 @@ static inline bool got_nohz_idle_kick(void) bool sched_can_stop_tick(void) { /* +* FIFO realtime policy runs the highest priority task. Other runnable +* tasks are of a lower priority. The scheduler tick does nothing. +*/ + if (current->policy == SCHED_FIFO) + return true; + + /* +* Round-robin realtime tasks time slice with other tasks at the same +* realtime priority. Is this task the only one at this priority? +*/ + if (current->policy == SCHED_RR) { + struct sched_rt_entity *rt_se = &current->rt; + + return rt_se->run_list.prev == rt_se->run_list.next; + } + + /* * More than one running task need preemption. * nr_running update is assumed to be visible * after IPI is sent from wakers.
-- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Do not move past the balance point if unbalanced
Commit-ID: 095bebf61a460ad7f6a45bb17ddbf3a9df2b4397 Gitweb: http://git.kernel.org/tip/095bebf61a460ad7f6a45bb17ddbf3a9df2b4397 Author: Rik van Riel AuthorDate: Tue, 3 Feb 2015 16:56:48 -0500 Committer: Ingo Molnar CommitDate: Wed, 18 Feb 2015 16:18:00 +0100 sched/numa: Do not move past the balance point if unbalanced There is a subtle interaction between the logic introduced in commit e63da03639cc ("sched/numa: Allow task switch if load imbalance improves"), the way the load balancer counts the load on each NUMA node, and the way NUMA hinting faults are done. Specifically, the load balancer only counts currently running tasks in the load, while NUMA hinting faults may cause tasks to stop, if the page is locked by another task. This could cause all of the threads of a large single instance workload, like SPECjbb2005, to migrate to the same NUMA node. This was possible because occasionally they all fault on the same few pages, and only one of the threads remains runnable. That thread can move to the process's preferred NUMA node without making the imbalance worse, because nothing else is running at that time. The fix is to check the direction of the net moving of load, and to refuse a NUMA move if it would cause the system to move past the point of balance. In an unbalanced state, only moves that bring us closer to the balance point are allowed. 
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: mgor...@suse.de Link: http://lkml.kernel.org/r/20150203165648.0e9ac...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 41 ++--- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3..28cbaca 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1196,9 +1196,11 @@ static void task_numa_assign(struct task_numa_env *env, static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { - long imb, old_imb; - long orig_src_load, orig_dst_load; long src_capacity, dst_capacity; + long orig_src_load; + long load_a, load_b; + long moved_load; + long imb; /* * The load is corrected for the CPU capacity available on each node. @@ -1211,30 +1213,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); + load_a = dst_load; + load_b = src_load; + if (load_a < load_b) + swap(load_a, load_b); /* Is the difference below the threshold? */ - imb = dst_load * src_capacity * 100 - - src_load * dst_capacity * env->imbalance_pct; + imb = load_a * src_capacity * 100 - + load_b * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; /* * The imbalance is above the allowed threshold. -* Compare it with the old imbalance. +* Allow a move that brings us closer to a balanced situation, +* without moving things past the point of balance. */ orig_src_load = env->src_stats.load; - orig_dst_load = env->dst_stats.load; - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); - - old_imb = orig_dst_load * src_capacity * 100 - - orig_src_load * dst_capacity * env->imbalance_pct; + /* +* In a task swap, there will be one load moving from src to dst, +* and another moving back. 
This is the net sum of both moves. +* A simple task move will always have a positive value. +* Allow the move if it brings the system closer to a balanced +* situation, without crossing over the balance point. +*/ + moved_load = orig_src_load - src_load; - /* Would this change make things worse? */ - return (imb > old_imb); + if (moved_load > 0) + /* Moving src -> dst. Did we overshoot balance? */ + return src_load * dst_capacity < dst_load * src_capacity; + else + /* Moving dst -> src. Did we overshoot balance? */ + return dst_load * src_capacity < src_load * dst_capacity; } /* -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Check all nodes when placing a pseudo-interleaved group
Commit-ID: 9de05d48711cd5314920ed05f873d84eaf66ccf1 Gitweb: http://git.kernel.org/tip/9de05d48711cd5314920ed05f873d84eaf66ccf1 Author: Rik van Riel AuthorDate: Thu, 9 Oct 2014 17:27:47 -0400 Committer: Ingo Molnar CommitDate: Tue, 28 Oct 2014 10:47:52 +0100 sched/numa: Check all nodes when placing a pseudo-interleaved group In pseudo-interleaved numa_groups, all tasks try to relocate to the group's preferred_nid. When a group is spread across multiple NUMA nodes, this can lead to tasks swapping their location with other tasks inside the same group, instead of swapping location with tasks from other NUMA groups. This can keep NUMA groups from converging. Examining all nodes, when dealing with a task in a pseudo-interleaved NUMA group, avoids this problem. Note that only CPUs in nodes that improve the task or group score are examined, so the loop isn't too bad. Tested-by: Vinod Chegu Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: "Vinod Chegu" Cc: mgor...@suse.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20141009172747.0d97c...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7760c2a..ec32c26d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1436,8 +1436,15 @@ static int task_numa_migrate(struct task_struct *p) /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); - /* No space available on the preferred nid. Look elsewhere. */ - if (env.best_cpu == -1) { + /* +* Look at other nodes in these cases: +* - there is no space available on the preferred_nid +* - the task is part of a numa_group that is interleaved across +* multiple NUMA nodes; in order to better consolidate the group, +* we need to check other locations. 
+*/ + if (env.best_cpu == -1 || (p->numa_group && + nodes_weight(p->numa_group->active_nodes) > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Prepare for complex topology placement
Commit-ID: 7bd953206b0b5e0a3aded871982367410b42e1b1 Gitweb: http://git.kernel.org/tip/7bd953206b0b5e0a3aded871982367410b42e1b1 Author: Rik van Riel AuthorDate: Fri, 17 Oct 2014 03:29:51 -0400 Committer: Ingo Molnar CommitDate: Tue, 28 Oct 2014 10:47:49 +0100 sched/numa: Prepare for complex topology placement Preparatory patch for adding NUMA placement on systems with complex NUMA topology. Also fix a potential divide by zero in group_weight() Signed-off-by: Rik van Riel Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra (Intel) Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413530994-9732-4-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 57 ++--- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 34baa60..0af3bed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -931,9 +931,10 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes. 
*/ -static inline unsigned long task_weight(struct task_struct *p, int nid) +static inline unsigned long task_weight(struct task_struct *p, int nid, + int dist) { - unsigned long total_faults; + unsigned long faults, total_faults; if (!p->numa_faults_memory) return 0; @@ -943,15 +944,25 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) if (!total_faults) return 0; - return 1000 * task_faults(p, nid) / total_faults; + faults = task_faults(p, nid); + return 1000 * faults / total_faults; } -static inline unsigned long group_weight(struct task_struct *p, int nid) +static inline unsigned long group_weight(struct task_struct *p, int nid, +int dist) { - if (!p->numa_group || !p->numa_group->total_faults) + unsigned long faults, total_faults; + + if (!p->numa_group) + return 0; + + total_faults = p->numa_group->total_faults; + + if (!total_faults) return 0; - return 1000 * group_faults(p, nid) / p->numa_group->total_faults; + faults = group_faults(p, nid); + return 1000 * faults / total_faults; } bool should_numa_migrate_memory(struct task_struct *p, struct page * page, @@ -1084,6 +1095,7 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; int imbalance_pct; + int dist; struct task_struct *best_task; long best_imp; @@ -1163,6 +1175,7 @@ static void task_numa_compare(struct task_numa_env *env, long load; long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; + int dist = env->dist; rcu_read_lock(); @@ -1196,8 +1209,8 @@ static void task_numa_compare(struct task_numa_env *env, * in any group then look only at task weights. */ if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. 
@@ -1211,11 +1224,11 @@ static void task_numa_compare(struct task_numa_env *env, * instead. */ if (cur->numa_group) - imp += group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); else - imp += task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } } @@ -1314,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p) }; struct sched_domain *sd; unsigned long taskweight, groupweight; - int nid, ret; + int nid, ret, dist; long taskimp, groupimp; /* @@ -1342,12 +1355,13 @@ static int task_numa_migrate(struct task_struct *p) return -EINVAL; } - taskweight = task_weight(p, env.src_nid); - groupweight = grou
[tip:sched/core] sched/numa: Calculate node scores in complex NUMA topologies
Commit-ID: 6c6b1193e71fed1a58dc3fab9d967d245177f87b Gitweb: http://git.kernel.org/tip/6c6b1193e71fed1a58dc3fab9d967d245177f87b Author: Rik van Riel AuthorDate: Fri, 17 Oct 2014 03:29:52 -0400 Committer: Ingo Molnar CommitDate: Tue, 28 Oct 2014 10:47:50 +0100 sched/numa: Calculate node scores in complex NUMA topologies In order to do task placement on systems with complex NUMA topologies, it is necessary to count the faults on nodes nearby the node that is being examined for a potential move. In case of a system with a backplane interconnect, we are dealing with groups of NUMA nodes; each of the nodes within a group is the same number of hops away from nodes in other groups in the system. Optimal placement on this topology is achieved by counting all nearby nodes equally. When comparing nodes A and B at distance N, nearby nodes are those at distances smaller than N from nodes A or B. Placement strategy on a system with a glueless mesh NUMA topology needs to be different, because there are no natural groups of nodes determined by the hardware. Instead, when dealing with two nodes A and B at distance N, N >= 2, there will be intermediate nodes at distance < N from both nodes A and B. Good placement can be achieved by right shifting the faults on nearby nodes by the number of hops from the node being scored. In this context, a nearby node is any node less than the maximum distance in the system away from the node. Nodes at the maximum distance are skipped for efficiency reasons; there is no real policy reason to do so. Placement policy on directly connected NUMA systems is not affected. 
Signed-off-by: Rik van Riel Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Link: http://lkml.kernel.org/r/1413530994-9732-5-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 74 + 1 file changed, 74 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0af3bed..7e5712a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -925,6 +925,71 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(nid, 1)]; } +/* Handle placement on systems where not all nodes are directly connected. */ +static unsigned long score_nearby_nodes(struct task_struct *p, int nid, + int maxdist, bool task) +{ + unsigned long score = 0; + int node; + + /* +* All nodes are directly connected, and the same distance +* from each other. No need for fancy placement algorithms. +*/ + if (sched_numa_topology_type == NUMA_DIRECT) + return 0; + + /* +* This code is called for each node, introducing N^2 complexity, +* which should be ok given the number of nodes rarely exceeds 8. +*/ + for_each_online_node(node) { + unsigned long faults; + int dist = node_distance(nid, node); + + /* +* The furthest away nodes in the system are not interesting +* for placement; nid was already counted. +*/ + if (dist == sched_max_numa_distance || node == nid) + continue; + + /* +* On systems with a backplane NUMA topology, compare groups +* of nodes, and move tasks towards the group with the most +* memory accesses. When comparing two nodes at distance +* "hoplimit", only nodes closer by than "hoplimit" are part +* of each group. Skip other nodes. +*/ + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist > maxdist) + continue; + + /* Add up the faults from nearby nodes. 
*/ + if (task) + faults = task_faults(p, node); + else + faults = group_faults(p, node); + + /* +* On systems with a glueless mesh NUMA topology, there are +* no fixed "groups of nodes". Instead, nodes that are not +* directly connected bounce traffic through intermediate +* nodes; a numa_group can occupy any set of nodes. +* The further away a node is, the less the faults count. +* This seems to result in good task placement. +*/ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + faults *= (sched_max_numa_distance - dist); + faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + } + + score += faults; + } + + return score; +} + /* * These return the fraction of accesses done by a particu
[tip:sched/core] sched/numa: Classify the NUMA topology of a system
Commit-ID: e3fe70b1f72e3f83a00d9c332ec09ab347a981e2 Gitweb: http://git.kernel.org/tip/e3fe70b1f72e3f83a00d9c332ec09ab347a981e2 Author: Rik van Riel AuthorDate: Fri, 17 Oct 2014 03:29:50 -0400 Committer: Ingo Molnar CommitDate: Tue, 28 Oct 2014 10:47:48 +0100 sched/numa: Classify the NUMA topology of a system Smaller NUMA systems tend to have all NUMA nodes directly connected to each other. This includes the degenerate case of a system with just one node, ie. a non-NUMA system. Larger systems can have two kinds of NUMA topology, which affects how tasks and memory should be placed on the system. On glueless mesh systems, nodes that are not directly connected to each other will bounce traffic through intermediary nodes. Task groups can be run closer to each other by moving tasks from a node to an intermediary node between it and the task's preferred node. On NUMA systems with backplane controllers, the intermediary hops are incapable of running programs. This creates "islands" of nodes that are at an equal distance to anywhere else in the system. Each kind of topology requires a slightly different placement algorithm; this patch provides the mechanism to detect the kind of NUMA topology of a system. 
Signed-off-by: Rik van Riel Tested-by: Chegu Vinod [ Changed to use kernel/sched/sched.h ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Link: http://lkml.kernel.org/r/1413530994-9732-3-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 53 kernel/sched/sched.h | 6 ++ 2 files changed, 59 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4007595..cde8481 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6128,6 +6128,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA static int sched_domains_numa_levels; +enum numa_topology_type sched_numa_topology_type; static int *sched_domains_numa_distance; int sched_max_numa_distance; static struct cpumask ***sched_domains_numa_masks; @@ -6316,6 +6317,56 @@ bool find_numa_distance(int distance) return false; } +/* + * A system can have three types of NUMA topology: + * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system + * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes + * NUMA_BACKPLANE: nodes can reach other nodes through a backplane + * + * The difference between a glueless mesh topology and a backplane + * topology lies in whether communication between not directly + * connected nodes goes through intermediary nodes (where programs + * could run), or through backplane controllers. This affects + * placement of programs. + * + * The type of topology can be discerned with the following tests: + * - If the maximum distance between any nodes is 1 hop, the system + * is directly connected. + * - If for two nodes A and B, located N > 1 hops away from each other, + * there is an intermediary node C, which is < N hops away from both + * nodes A and B, the system is a glueless mesh. 
+ */ +static void init_numa_topology_type(void) +{ + int a, b, c, n; + + n = sched_max_numa_distance; + + if (n <= 1) + sched_numa_topology_type = NUMA_DIRECT; + + for_each_online_node(a) { + for_each_online_node(b) { + /* Find two nodes furthest removed from each other. */ + if (node_distance(a, b) < n) + continue; + + /* Is there an intermediary node between a and b? */ + for_each_online_node(c) { + if (node_distance(a, c) < n && + node_distance(b, c) < n) { + sched_numa_topology_type = + NUMA_GLUELESS_MESH; + return; + } + } + + sched_numa_topology_type = NUMA_BACKPLANE; + return; + } + } +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6449,6 +6500,8 @@ static void sched_init_numa(void) sched_domains_numa_levels = level; sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + + init_numa_topology_type(); } static void sched_domains_numa_masks_set(int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 443d6e1..57aacea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -679,6 +679,12 @@ static inline u64 rq_clock_task(struct rq *rq) } #ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_
[tip:sched/core] sched/numa: Find the preferred nid with complex NUMA topology
Commit-ID: 54009416ac3b5f219c0df68559ce534287ae97b1 Gitweb: http://git.kernel.org/tip/54009416ac3b5f219c0df68559ce534287ae97b1 Author: Rik van Riel AuthorDate: Fri, 17 Oct 2014 03:29:53 -0400 Committer: Ingo Molnar CommitDate: Tue, 28 Oct 2014 10:47:51 +0100 sched/numa: Find the preferred nid with complex NUMA topology On systems with complex NUMA topologies, the node scoring is adjusted to allow workloads to converge on nodes that are near each other. The way a task group's preferred nid is determined needs to be adjusted, in order for the preferred_nid to be consistent with group_weight scoring. This ensures that we actually try to converge workloads on adjacent nodes. Signed-off-by: Rik van Riel Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra (Intel) Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413530994-9732-6-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 88 - 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e5712a..7760c2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1659,6 +1659,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) return delta; } +/* + * Determine the preferred nid for a task in a numa_group. This needs to + * be done in a way that produces consistent results with group_weight, + * otherwise workloads might not converge. + */ +static int preferred_group_nid(struct task_struct *p, int nid) +{ + nodemask_t nodes; + int dist; + + /* Direct connections between all NUMA nodes. */ + if (sched_numa_topology_type == NUMA_DIRECT) + return nid; + + /* +* On a system with glueless mesh NUMA topology, group_weight +* scores nodes according to the number of NUMA hinting faults on +* both the node itself, and on nearby nodes. 
+*/ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + unsigned long score, max_score = 0; + int node, max_node = nid; + + dist = sched_max_numa_distance; + + for_each_online_node(node) { + score = group_weight(p, node, dist); + if (score > max_score) { + max_score = score; + max_node = node; + } + } + return max_node; + } + + /* +* Finding the preferred nid in a system with NUMA backplane +* interconnect topology is more involved. The goal is to locate +* tasks from numa_groups near each other in the system, and +* untangle workloads from different sides of the system. This requires +* searching down the hierarchy of node groups, recursively searching +* inside the highest scoring group of nodes. The nodemask tricks +* keep the complexity of the search down. +*/ + nodes = node_online_map; + for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { + unsigned long max_faults = 0; + nodemask_t max_group; + int a, b; + + /* Are there nodes at this distance from each other? */ + if (!find_numa_distance(dist)) + continue; + + for_each_node_mask(a, nodes) { + unsigned long faults = 0; + nodemask_t this_group; + nodes_clear(this_group); + + /* Sum group's NUMA faults; includes a==b case. */ + for_each_node_mask(b, nodes) { + if (node_distance(a, b) < dist) { + faults += group_faults(p, b); + node_set(b, this_group); + node_clear(b, nodes); + } + } + + /* Remember the top group. */ + if (faults > max_faults) { + max_faults = faults; + max_group = this_group; + /* +* subtle: at the smallest distance there is +* just one node left in each "group", the +* winner is the preferred nid. +*/ + nid = a; + } + } + /* Next round, evaluate the nodes within max_group. */ + nodes = max_group; + } + return nid; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; @@ -1741,7 +1827,7 @@ static void tas
[tip:sched/core] sched/numa: Export info needed for NUMA balancing on complex topologies
Commit-ID: 9942f79baaaf111d63ebf0862a819278d84fccc4 Gitweb: http://git.kernel.org/tip/9942f79baaaf111d63ebf0862a819278d84fccc4 Author: Rik van Riel AuthorDate: Fri, 17 Oct 2014 03:29:49 -0400 Committer: Ingo Molnar CommitDate: Tue, 28 Oct 2014 10:47:47 +0100 sched/numa: Export info needed for NUMA balancing on complex topologies Export some information that is necessary to do placement of tasks on systems with multi-level NUMA topologies. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413530994-9732-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- kernel/sched/sched.h | 5 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 240157c..4007595 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6129,6 +6129,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA static int sched_domains_numa_levels; static int *sched_domains_numa_distance; +int sched_max_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; #endif @@ -6300,7 +6301,7 @@ static void sched_numa_warn(const char *str) printk(KERN_WARNING "\n"); } -static bool find_numa_distance(int distance) +bool find_numa_distance(int distance) { int i; @@ -6447,6 +6448,7 @@ static void sched_init_numa(void) sched_domain_topology = tl; sched_domains_numa_levels = level; + sched_max_numa_distance = sched_domains_numa_distance[level - 1]; } static void sched_domains_numa_masks_set(int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 24156c84..443d6e1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -678,6 +678,11 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +#ifdef CONFIG_NUMA +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); 
+#endif + #ifdef CONFIG_NUMA_BALANCING extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched, time: Fix build error with 64 bit cputime_t on 32 bit systems
Commit-ID: 347abad981c1ef815ea5ba861adba6a8c6aa1580 Gitweb: http://git.kernel.org/tip/347abad981c1ef815ea5ba861adba6a8c6aa1580 Author: Rik van Riel AuthorDate: Tue, 30 Sep 2014 15:59:47 -0400 Committer: Ingo Molnar CommitDate: Fri, 3 Oct 2014 05:46:55 +0200 sched, time: Fix build error with 64 bit cputime_t on 32 bit systems On 32 bit systems cmpxchg cannot handle 64 bit values, so some additional magic is required to allow a 32 bit system with CONFIG_VIRT_CPU_ACCOUNTING_GEN=y enabled to build. Make sure the correct cmpxchg function is used when doing an atomic swap of a cputime_t. Reported-by: Arnd Bergmann Signed-off-by: Rik van Riel Acked-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Cc: umgwanakikb...@gmail.com Cc: fweis...@gmail.com Cc: s...@redhat.com Cc: lwood...@redhat.com Cc: atheu...@redhat.com Cc: o...@redhat.com Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: linux...@de.ibm.com Cc: linux-a...@vger.kernel.org Cc: linuxppc-...@lists.ozlabs.org Cc: linux-s...@vger.kernel.org Link: http://lkml.kernel.org/r/20140930155947.070cd...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/cputime.h| 2 ++ arch/s390/include/asm/cputime.h | 2 ++ include/asm-generic/cputime_jiffies.h | 2 ++ include/asm-generic/cputime_nsecs.h | 2 ++ kernel/sched/cputime.c| 29 +++-- 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 607559a..6c840ce 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -32,6 +32,8 @@ static inline void setup_cputime_one_jiffy(void) { } typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) + #ifdef __KERNEL__ /* diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index f65bd36..3001887 
100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -18,6 +18,8 @@ typedef unsigned long long __nocast cputime_t; typedef unsigned long long __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) + static inline unsigned long __div(unsigned long long n, unsigned long base) { #ifndef CONFIG_64BIT diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h index d5cb78f5..fe386fc 100644 --- a/include/asm-generic/cputime_jiffies.h +++ b/include/asm-generic/cputime_jiffies.h @@ -3,6 +3,8 @@ typedef unsigned long __nocast cputime_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) + #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) #define cputime_to_scaled(__ct)(__ct) diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 4e81760..0419485 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -21,6 +21,8 @@ typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) + #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 64492df..8394b1e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -555,6 +555,23 @@ drop_precision: } /* + * Atomically advance counter to the new value. Interrupts, vcpu + * scheduling, and scaling inaccuracies can cause cputime_advance + * to be occasionally called with a new value smaller than counter. + * Let's enforce atomicity. + * + * Normally a caller will only go through this loop once, or not + * at all in case a previous caller updated counter the same jiffy. 
+ */ +static void cputime_advance(cputime_t *counter, cputime_t new) +{ + cputime_t old; + + while (new > (old = ACCESS_ONCE(*counter))) + cmpxchg_cputime(counter, old, new); +} + +/* * Adjust tick based cputime random precision against scheduler * runtime accounting. */ @@ -599,16 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime - stime; } - /* -* If the tick based count grows faster than the scheduler one, -* the result of the scaling may go backward. -* Let's enforce monotonicity. -* Atomic exchange protects against concurrent cputime_adjust(). -*/ - while (stime > (rtime = ACCESS_ONCE(prev->stime))) - cmpxchg(&prev->stime, rtime, stime); - while (utime > (rtime = ACCESS_ONCE(prev->utime))) - cmpxchg(&prev->utime, rtime, utime); + cputime_advance(&prev->stim
[tip:sched/core] sched, time: Fix lock inversion in thread_group_cputime()
Commit-ID: 9c368b5b6eccce1cbd7f68142106b3b4ddb1c5b5 Gitweb: http://git.kernel.org/tip/9c368b5b6eccce1cbd7f68142106b3b4ddb1c5b5 Author: Rik van Riel AuthorDate: Fri, 12 Sep 2014 09:12:15 -0400 Committer: Ingo Molnar CommitDate: Fri, 19 Sep 2014 12:35:17 +0200 sched, time: Fix lock inversion in thread_group_cputime() The sig->stats_lock nests inside the tasklist_lock and the sighand->siglock in __exit_signal and wait_task_zombie. However, both of those locks can be taken from irq context, which means we need to use the interrupt safe variant of read_seqbegin_or_lock. This blocks interrupts when the "lock" branch is taken (seq is odd), preventing the lock inversion. On the first (lockless) pass through the loop, irqs are not blocked. Reported-by: Stanislaw Gruszka Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: pra...@redhat.com Cc: o...@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410527535-9814-3-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2b57031..64492df 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -289,13 +289,14 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) cputime_t utime, stime; struct task_struct *t; unsigned int seq, nextseq; + unsigned long flags; rcu_read_lock(); /* Attempt a lockless read on the first round. */ nextseq = 0; do { seq = nextseq; - read_seqbegin_or_lock(&sig->stats_lock, &seq); + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); times->utime = sig->utime; times->stime = sig->stime; times->sum_exec_runtime = sig->sum_sched_runtime; @@ -309,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) /* If lockless access failed, take the lock. 
*/ nextseq = 1; } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry(&sig->stats_lock, seq); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); rcu_read_unlock(); } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] seqlock: Add irqsave variant of read_seqbegin_or_lock()
Commit-ID: ef8ac06359ddf95431cf6bb04ad2b36fff562328 Gitweb: http://git.kernel.org/tip/ef8ac06359ddf95431cf6bb04ad2b36fff562328 Author: Rik van Riel AuthorDate: Fri, 12 Sep 2014 09:12:14 -0400 Committer: Ingo Molnar CommitDate: Fri, 19 Sep 2014 12:35:16 +0200 seqlock: Add irqsave variant of read_seqbegin_or_lock() There are cases where read_seqbegin_or_lock() needs to block irqs, because the seqlock in question nests inside a lock that is also taken from irq context. Add read_seqbegin_or_lock_irqsave() and done_seqretry_irqrestore(), which are almost identical to read_seqbegin_or_lock() and done_seqretry(). Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: pra...@redhat.com Cc: o...@redhat.com Cc: sgrus...@redhat.com Cc: Al Viro Cc: John Stultz Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Stephen Boyd Cc: Trond Myklebust Link: http://lkml.kernel.org/r/1410527535-9814-2-git-send-email-r...@redhat.com [ Improved the readability of the code a bit. ] Signed-off-by: Ingo Molnar --- include/linux/seqlock.h | 19 +++ 1 file changed, 19 insertions(+) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index cc35963..f5df8f6 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -456,4 +456,23 @@ read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) spin_unlock_irqrestore(&sl->lock, flags); } +static inline unsigned long +read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) +{ + unsigned long flags = 0; + + if (!(*seq & 1))/* Even */ + *seq = read_seqbegin(lock); + else/* Odd */ + read_seqlock_excl_irqsave(lock, flags); + + return flags; +} + +static inline void +done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) +{ + if (seq & 1) + read_sequnlock_excl_irqrestore(lock, flags); +} #endif /* __LINUX_SEQLOCK_H */ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Use select_idle_sibling() to select a destination for task_numa_move()
Commit-ID: ba7e5a279e72f4b246dc7a419ac707e1936ede3e Gitweb: http://git.kernel.org/tip/ba7e5a279e72f4b246dc7a419ac707e1936ede3e Author: Rik van Riel AuthorDate: Thu, 4 Sep 2014 16:35:30 -0400 Committer: Ingo Molnar CommitDate: Fri, 19 Sep 2014 12:35:14 +0200 sched/numa: Use select_idle_sibling() to select a destination for task_numa_move() The code in task_numa_compare() will only examine at most one idle CPU per node, because they all have the same score. However, some idle CPUs are better candidates than others, due to busy or idle SMT siblings, etc... The scheduler has logic to find the best CPU within an LLC to place a task. The NUMA code should probably use it. This seems to reduce the standard deviation for single instance SPECjbb2005 with a low warehouse count on my 4 node test system. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: mgor...@suse.de Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140904163530.189d4...@cuia.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 1 file changed, 8 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be9e97b..96e7147 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -665,6 +665,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP +static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); @@ -1257,6 +1258,13 @@ balance: if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; + /* +* One idle CPU per node is evaluated for a task numa move. +* Call select_idle_sibling to maybe find a better one. 
+*/ + if (!cur) + env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + assign: task_numa_assign(env, cur, imp); unlock: -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] time, signal: Protect resource use statistics with seqlock
Commit-ID: e78c3496790ee8a36522a838b59b388e8a709e65 Gitweb: http://git.kernel.org/tip/e78c3496790ee8a36522a838b59b388e8a709e65 Author: Rik van Riel AuthorDate: Sat, 16 Aug 2014 13:40:10 -0400 Committer: Ingo Molnar CommitDate: Mon, 8 Sep 2014 08:17:01 +0200 time, signal: Protect resource use statistics with seqlock Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock. The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all). Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems. This can be fixed by using a seqlock around the events that gather and propagate statistics. As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions. In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime(). This way the statistics reporting code can run lockless. 
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Alex Thorlton Cc: Andrew Morton Cc: Daeseok Youn Cc: David Rientjes Cc: Dongsheng Yang Cc: Geert Uytterhoeven Cc: Guillaume Morin Cc: Ionut Alexa Cc: Kees Cook Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Vladimir Davydov Cc: umgwanakikb...@gmail.com Cc: fweis...@gmail.com Cc: s...@redhat.com Cc: lwood...@redhat.com Cc: atheu...@redhat.com Link: http://lkml.kernel.org/r/20140816134010.26a9b...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/exit.c | 4 kernel/fork.c | 1 + kernel/sched/cputime.c | 33 - kernel/sys.c | 2 -- kernel/time/posix-cpu-timers.c | 14 -- 6 files changed, 26 insertions(+), 29 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..dd9eb48 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -645,6 +645,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ + seqlock_t stats_lock; cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; diff --git a/kernel/exit.c b/kernel/exit.c index b93d46d..fa09b86 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk) * the signal_struct. 
*/ task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); @@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk) sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); + write_sequnlock(&psig->stats_lock); spin_unlock_irq(&p->real_parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 0cf9cdb..9387ae8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3e52836..49b7cfe 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,18 +288,28 @@ void thread_group_cputime(struct tas
[tip:sched/core] exit: Always reap resource stats in __exit_signal()
Commit-ID: 90ed9cbe765ad358b3151a12b8bf889a3cbcd573 Gitweb: http://git.kernel.org/tip/90ed9cbe765ad358b3151a12b8bf889a3cbcd573 Author: Rik van Riel AuthorDate: Fri, 15 Aug 2014 16:05:36 -0400 Committer: Ingo Molnar CommitDate: Mon, 8 Sep 2014 08:17:00 +0200 exit: Always reap resource stats in __exit_signal() Oleg pointed out that wait_task_zombie adds a task's usage statistics to the parent's signal struct, but the task's own signal struct should also propagate the statistics at exit time. This allows thread_group_cputime(reaped_zombie) to get the statistics after __unhash_process() has made the task invisible to for_each_thread, but before the thread has actually been rcu freed, making sure no non-monotonic results are returned inside that window. Suggested-by: Oleg Nesterov Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: David Rientjes Cc: Guillaume Morin Cc: Ionut Alexa Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: umgwanakikb...@gmail.com Cc: fweis...@gmail.com Cc: s...@redhat.com Cc: lwood...@redhat.com Cc: atheu...@redhat.com Link: http://lkml.kernel.org/r/1408133138-22048-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/exit.c | 43 +-- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7..b93d46d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -115,30 +115,29 @@ static void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - /* -* Accumulate here the counters for all threads but the -* group leader as they die, so they can be added into -* the process-wide totals when those are taken. -* The group leader stays around as a zombie as long -* as there are other threads. When it gets reaped, -* the exit.c code will add its counts into these totals. 
-* We won't ever get here for the group leader, since it -* will have been the last reference on the signal_struct. -*/ - task_cputime(tsk, &utime, &stime); - sig->utime += utime; - sig->stime += stime; - sig->gtime += task_gtime(tsk); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } + /* +* Accumulate here the counters for all threads but the group leader +* as they die, so they can be added into the process-wide totals +* when those are taken. The group leader stays around as a zombie as +* long as there are other threads. When it gets reaped, the exit.c +* code will add its counts into these totals. We won't ever get here +* for the group leader, since it will have been the last reference on +* the signal_struct. +*/ + task_cputime(tsk, &utime, &stime); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched, time: Atomically increment stime & utime
Commit-ID: eb1b4af0a64ac7bb0ee36f579c1c7cefcbc3ac2c Gitweb: http://git.kernel.org/tip/eb1b4af0a64ac7bb0ee36f579c1c7cefcbc3ac2c Author: Rik van Riel AuthorDate: Fri, 15 Aug 2014 16:05:38 -0400 Committer: Ingo Molnar CommitDate: Mon, 8 Sep 2014 08:17:02 +0200 sched, time: Atomically increment stime & utime The functions task_cputime_adjusted and thread_group_cputime_adjusted() can be called locklessly, as well as concurrently on many different CPUs. This can occasionally lead to the utime and stime reported by times(), and other syscalls like it, going backward. The cause for this appears to be multiple threads racing in cputime_adjust(), both with values for utime or stime that is larger than the original, but each with a different value. Sometimes the larger value gets saved first, only to be immediately overwritten with a smaller value by another thread. Using atomic exchange prevents that problem, and ensures time progresses monotonically. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: umgwanakikb...@gmail.com Cc: fweis...@gmail.com Cc: a...@linux-foundation.org Cc: s...@redhat.com Cc: lwood...@redhat.com Cc: atheu...@redhat.com Cc: o...@redhat.com Link: http://lkml.kernel.org/r/1408133138-22048-4-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 49b7cfe..2b57031 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -602,9 +602,12 @@ static void cputime_adjust(struct task_cputime *curr, * If the tick based count grows faster than the scheduler one, * the result of the scaling may go backward. * Let's enforce monotonicity. +* Atomic exchange protects against concurrent cputime_adjust(). 
*/ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); + while (stime > (rtime = ACCESS_ONCE(prev->stime))) + cmpxchg(&prev->stime, rtime, stime); + while (utime > (rtime = ACCESS_ONCE(prev->utime))) + cmpxchg(&prev->utime, rtime, utime); out: *ut = prev->utime; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Fix off-by-one in capacity check
Commit-ID: b932c03c34f3b03c7364c06aa8cae5b74609fc41 Gitweb: http://git.kernel.org/tip/b932c03c34f3b03c7364c06aa8cae5b74609fc41 Author: Rik van Riel AuthorDate: Mon, 4 Aug 2014 13:23:27 -0400 Committer: Ingo Molnar CommitDate: Tue, 12 Aug 2014 12:48:22 +0200 sched/numa: Fix off-by-one in capacity check Commit a43455a1d572daf7b730fe12eb747d1e17411365 ensures that task_numa_migrate will call task_numa_compare on the preferred node all the time, even when the preferred node has no free capacity. This could lead to a performance regression if nr_running == capacity on both the source and the destination node. This can be avoided by also checking for nr_running == capacity on the source node, which is one stricter than checking .has_free_capacity. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgor...@suse.de Cc: vincent.guit...@linaro.org Cc: morten.rasmus...@arm.com Cc: nicolas.pi...@linaro.org Cc: efa...@gmx.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407173008-9334-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df1ed17..e1cf419 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1206,7 +1206,7 @@ static void task_numa_compare(struct task_numa_env *env, if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_free_capacity && + if (env->src_stats.nr_running <= env->src_stats.task_capacity && !env->dst_stats.has_free_capacity) goto unlock; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/fair: Make update_sd_pick_busiest() return 'true' on a busier sd
Commit-ID: caeb178c60f4f93f1b45c0bc056b5cf6d217b67f Gitweb: http://git.kernel.org/tip/caeb178c60f4f93f1b45c0bc056b5cf6d217b67f Author: Rik van Riel AuthorDate: Mon, 28 Jul 2014 14:16:28 -0400 Committer: Ingo Molnar CommitDate: Tue, 12 Aug 2014 12:48:19 +0200 sched/fair: Make update_sd_pick_busiest() return 'true' on a busier sd Currently update_sd_pick_busiest only identifies the busiest sd that is either overloaded, or has a group imbalance. When no sd is imbalanced or overloaded, the load balancer fails to find the busiest domain. This breaks load balancing between domains that are not overloaded, in the !SD_ASYM_PACKING case. This patch makes update_sd_pick_busiest return true when the busiest sd yet is encountered. Groups are ranked in the order overloaded > imbalanced > other, with higher ranked groups getting priority even when their load is lower. This is necessary due to the possibility of unequal capacities and cpumasks between domains within a sched group. Behaviour for SD_ASYM_PACKING does not seem to match the comment, but I have no hardware to test that so I have left the behaviour of that code unchanged. Enum for group classification suggested by Peter Zijlstra. 
Signed-off-by: Rik van Riel [peterz: replaced sg_lb_stats::group_imb with the new enum group_type in an attempt to avoid endless recalculation] Signed-off-by: Peter Zijlstra Acked-by: Vincent Guittot Acked-by: Michael Neuling Cc: ktk...@parallels.com Cc: tim.c.c...@linux.intel.com Cc: nicolas.pi...@linaro.org Cc: jhla...@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140729152743.GI3935@laptop Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 49 + 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9477e6..9437725 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5559,6 +5559,13 @@ static unsigned long task_h_load(struct task_struct *p) #endif /** Helpers for find_busiest_group / + +enum group_type { + group_other = 0, + group_imbalanced, + group_overloaded, +}; + /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -5572,7 +5579,7 @@ struct sg_lb_stats { unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; - int group_imb; /* Is there an imbalance in the group ? */ + enum group_type group_type; int group_has_free_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -5610,6 +5617,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, + .sum_nr_running = 0, + .group_type = group_other, }, }; } @@ -5891,6 +5900,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro return capacity_factor; } +static enum group_type +group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running > sgs->group_capacity_factor) + return group_overloaded; + + if (sg_imbalanced(group)) + return group_imbalanced; + + return group_other; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. 
@@ -5942,9 +5963,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - - sgs->group_imb = sg_imbalanced(group); sgs->group_capacity_factor = sg_capacity_factor(env, group); + sgs->group_type = group_classify(group, sgs); if (sgs->group_capacity_factor > sgs->sum_nr_running) sgs->group_has_free_capacity = 1; @@ -5968,13 +5988,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, struct sched_group *sg, struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->busiest_stat.avg_load) - return false; + struct sg_lb_stats *busiest = &sds->busiest_stat; - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_type > busiest->group_type) return true; - if (sgs->group_imb) + if (sgs->group_type < busiest->group_type) + return false; + + if (sgs->avg_load <= busiest->avg_load) + return false; + + /* This is the busiest node in its class. */ + if (!(env->sd->flags & SD_ASYM_PACKING)) return true; /* @@ -5982,8 +6008,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * numbered CPUs in the group, therefore ma
[tip:sched/core] sched/numa: Fix numa capacity computation
Commit-ID: 83d7f2424741c9dc76c21377c9d00d47abaf88df Gitweb: http://git.kernel.org/tip/83d7f2424741c9dc76c21377c9d00d47abaf88df Author: Rik van Riel AuthorDate: Mon, 4 Aug 2014 13:23:28 -0400 Committer: Ingo Molnar CommitDate: Tue, 12 Aug 2014 12:48:23 +0200 sched/numa: Fix numa capacity computation Commit c61037e9 fixes the phenomenon of 'fantom' cores due to N*frac(smt_power) >= 1 by limiting the capacity to the actual number of cores in the load balancing code. This patch applies the same correction to the NUMA balancing code. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgor...@suse.de Cc: vincent.guit...@linaro.org Cc: morten.rasmus...@arm.com Cc: nicolas.pi...@linaro.org Cc: efa...@gmx.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407173008-9334-3-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e1cf419..1413c44 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1038,7 +1038,8 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int cpu, cpus = 0; + int smt, cpu, cpus = 0; + unsigned long capacity; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1062,8 +1063,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); + capacity = cpus / smt; /* cores */ + + ns->task_capacity = min_t(unsigned, capacity, + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More 
majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Rework best node setting in task_numa_migrate()
Commit-ID: db015daedb56251b73f956f70b3b8813f80d8ee1 Gitweb: http://git.kernel.org/tip/db015daedb56251b73f956f70b3b8813f80d8ee1 Author: Rik van Riel AuthorDate: Mon, 23 Jun 2014 11:41:34 -0400 Committer: Ingo Molnar CommitDate: Sat, 5 Jul 2014 11:17:39 +0200 sched/numa: Rework best node setting in task_numa_migrate() Fix up the best node setting in task_numa_migrate() to deal with a task in a pseudo-interleaved NUMA group, which is already running in the best location. Set the task's preferred nid to the current nid, so task migration is not retried at a high rate. Signed-off-by: Rik van Riel Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538095-31256-7-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 19 +-- 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9d1734a..7bb2f46 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1354,10 +1354,6 @@ static int task_numa_migrate(struct task_struct *p) } } - /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) - return -EAGAIN; - /* * If the task is part of a workload that spans multiple NUMA nodes, * and is migrating into one of the workload's active nodes, remember @@ -1366,8 +1362,19 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. */ - if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) - sched_setnuma(p, env.dst_nid); + if (p->numa_group) { + if (env.best_cpu == -1) + nid = env.src_nid; + else + nid = env.dst_nid; + + if (node_isset(nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); + } + + /* No better CPU than the current one was found. 
*/ + if (env.best_cpu == -1) + return -EAGAIN; /* * Reset the scan period if the task is being rescheduled on an -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Change scan period code to match intent
Commit-ID: a22b4b012340b988dbe7a58461d6fcc582f34aa0 Gitweb: http://git.kernel.org/tip/a22b4b012340b988dbe7a58461d6fcc582f34aa0 Author: Rik van Riel AuthorDate: Mon, 23 Jun 2014 11:41:35 -0400 Committer: Ingo Molnar CommitDate: Sat, 5 Jul 2014 11:17:40 +0200 sched/numa: Change scan period code to match intent Reading through the scan period code and comment, it appears the intent was to slow down NUMA scanning when a majority of accesses are on the local node, specifically a local:remote ratio of 3:1. However, the code actually tests local / (local + remote), and the actual cut-off point was around 30% local accesses, well before a task has actually converged on a node. Changing the threshold to 7 means scanning slows down when a task has around 70% of its accesses local, which appears to match the intent of the code more closely. Signed-off-by: Rik van Riel Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538095-31256-8-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7bb2f46..a140c6a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1452,12 +1452,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) /* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan - * period will be for the next scan window. If local/remote ratio is below - * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the - * scan period will decrease + * period will be for the next scan window. If local/(local+remote) ratio is + * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) + * the scan period will decrease. Aim for 70% local accesses. 
*/ #define NUMA_PERIOD_SLOTS 10 -#define NUMA_PERIOD_THRESHOLD 3 +#define NUMA_PERIOD_THRESHOLD 7 /* * Increase the scan period (slow down scanning) if the majority of -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Use effective_load() to balance NUMA loads
Commit-ID: 6dc1a672ab15604947361dcd02e459effa09bad5 Gitweb: http://git.kernel.org/tip/6dc1a672ab15604947361dcd02e459effa09bad5 Author: Rik van Riel AuthorDate: Mon, 23 Jun 2014 11:46:14 -0400 Committer: Ingo Molnar CommitDate: Sat, 5 Jul 2014 11:17:35 +0200 sched/numa: Use effective_load() to balance NUMA loads When CONFIG_FAIR_GROUP_SCHED is enabled, the load that a task places on a CPU is determined by the group the task is in. The active groups on the source and destination CPU can be different, resulting in a different load contribution by the same task at its source and at its destination. As a result, the load needs to be calculated separately for each CPU, instead of estimated once with task_h_load(). Getting this calculation right allows some workloads to converge, where previously the last thread could get stuck on another node, without being able to migrate to its final destination. Signed-off-by: Rik van Riel Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538378-31571-3-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f287d0b..d6526d2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1151,6 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; + struct task_group *tg; long src_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1225,14 +1226,21 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. 
*/ balance: - load = task_h_load(env->p); - dst_load = env->dst_stats.load + load; - src_load = env->src_stats.load - load; + src_load = env->src_stats.load; + dst_load = env->dst_stats.load; + + /* Calculate the effect of moving env->p from src to dst. */ + load = env->p->se.load.weight; + tg = task_group(env->p); + src_load += effective_load(tg, env->src_cpu, -load, -load); + dst_load += effective_load(tg, env->dst_cpu, load, load); if (cur) { - load = task_h_load(cur); - dst_load -= load; - src_load += load; + /* Cur moves in the opposite direction. */ + load = cur->se.load.weight; + tg = task_group(cur); + src_load += effective_load(tg, env->src_cpu, load, load); + dst_load += effective_load(tg, env->dst_cpu, -load, -load); } if (load_too_imbalanced(src_load, dst_load, env)) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Examine a task move when examining a task swap
Commit-ID: 0132c3e1777ceabc24c7d209b7cbe78c28c03c09 Gitweb: http://git.kernel.org/tip/0132c3e1777ceabc24c7d209b7cbe78c28c03c09 Author: Rik van Riel AuthorDate: Mon, 23 Jun 2014 11:46:16 -0400 Committer: Ingo Molnar CommitDate: Sat, 5 Jul 2014 11:17:38 +0200 sched/numa: Examine a task move when examining a task swap Running "perf bench numa mem -0 -m -P 1000 -p 8 -t 20" on a 4 node system results in 160 runnable threads on a system with 80 CPU threads. Once a process has nearly converged, with 39 threads on one node and 1 thread on another node, the remaining thread will be unable to migrate to its preferred node through a task swap. However, a simple task move would make the workload converge, without causing an imbalance. Test for this unlikely occurrence, and attempt a task move to the preferred nid when it happens. # Running main, "perf bench numa mem -p 8 -t 20 -0 -m -P 1000" ### # 160 tasks will execute (on 4 nodes, 80 CPUs): # -1x 0MB global shared mem operations # -1x 1000MB process shared mem operations # -1x 0MB thread local mem operations ### ### # #0.0% [0.2 mins] 0/0 1/1 36/2 0/0 [36/3 ] l: 0-0 ( 0) {0-2} #0.0% [0.3 mins] 43/3 37/2 39/2 41/3 [ 6/10] l: 0-1 ( 1) {1-2} #0.0% [0.4 mins] 42/3 38/2 40/2 40/2 [ 4/9 ] l: 1-2 ( 1) [50.0%] {1-2} #0.0% [0.6 mins] 41/3 39/2 40/2 40/2 [ 2/9 ] l: 2-4 ( 2) [50.0%] {1-2} #0.0% [0.7 mins] 40/2 40/2 40/2 40/2 [ 0/8 ] l: 3-5 ( 2) [40.0%] ( 41.8s converged) Without this patch, this same perf bench numa mem run had to rely on the scheduler load balancer to first balance out the load (moving a random task), before a task swap could complete the NUMA convergence. The load balancer does not normally take action unless the load difference exceeds 25%. Convergence times of over half an hour have been observed without this patch. With this patch, the NUMA balancing code will simply migrate the task, if that does not cause an imbalance. 
Also skip examining a CPU in detail if the improvement on that CPU is no more than the best we already have. Signed-off-by: Rik van Riel Cc: chegu_vi...@hp.com Cc: mgor...@suse.de Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ggthh0rnh0yua6o5o3p6c...@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 23 +-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cebb312..9d1734a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1155,6 +1155,7 @@ static void task_numa_compare(struct task_numa_env *env, long src_load, dst_load; long load; long imp = env->p->numa_group ? groupimp : taskimp; + long moveimp = imp; rcu_read_lock(); cur = ACCESS_ONCE(dst_rq->curr); @@ -1201,7 +1202,7 @@ static void task_numa_compare(struct task_numa_env *env, } } - if (imp < env->best_imp) + if (imp <= env->best_imp && moveimp <= env->best_imp) goto unlock; if (!cur) { @@ -1214,7 +1215,8 @@ static void task_numa_compare(struct task_numa_env *env, } /* Balance doesn't matter much if we're running a task per cpu */ - if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + if (imp > env->best_imp && src_rq->nr_running == 1 && + dst_rq->nr_running == 1) goto assign; /* @@ -1230,6 +1232,23 @@ balance: src_load += effective_load(tg, env->src_cpu, -load, -load); dst_load += effective_load(tg, env->dst_cpu, load, load); + if (moveimp > imp && moveimp > env->best_imp) { + /* +* If the improvement from just moving env->p direction is +* better than swapping tasks around, check if a move is +* possible. Store a slightly smaller score than moveimp, +* so an actually idle CPU will win. +*/ + if (!load_too_imbalanced(src_load, dst_load, env)) { + imp = moveimp - 1; + cur = NULL; + goto assign; + } + } + + if (imp <= env->best_imp) + goto unlock; + if (cur) { /* Cur moves in the opposite direction. 
*/ load = cur->se.load.weight; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Simplify task_numa_compare()
Commit-ID: 1c5d3eb3759013bc7ee4197aa0a9f245bdb6eb90 Gitweb: http://git.kernel.org/tip/1c5d3eb3759013bc7ee4197aa0a9f245bdb6eb90 Author: Rik van Riel AuthorDate: Mon, 23 Jun 2014 11:46:15 -0400 Committer: Ingo Molnar CommitDate: Sat, 5 Jul 2014 11:17:37 +0200 sched/numa: Simplify task_numa_compare() When a task is part of a numa_group, the comparison should always use the group weight, in order to make workloads converge. Signed-off-by: Rik van Riel Cc: chegu_vi...@hp.com Cc: mgor...@suse.de Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538378-31571-4-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +-- 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d6526d2..cebb312 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1154,7 +1154,7 @@ static void task_numa_compare(struct task_numa_env *env, struct task_group *tg; long src_load, dst_load; long load; - long imp = (groupimp > 0) ? groupimp : taskimp; + long imp = env->p->numa_group ? groupimp : taskimp; rcu_read_lock(); cur = ACCESS_ONCE(dst_rq->curr); @@ -1192,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env, * itself (not part of a group), use the task weight * instead. */ - if (env->p->numa_group) - imp = groupimp; - else - imp = taskimp; - if (cur->numa_group) imp += group_weight(cur, env->src_nid) - group_weight(cur, env->dst_nid); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Move power adjustment into load_too_imbalanced()
Commit-ID: 28a21745190a0ca613cab817bfe3dc65373158bf Gitweb: http://git.kernel.org/tip/28a21745190a0ca613cab817bfe3dc65373158bf Author: Rik van Riel AuthorDate: Mon, 23 Jun 2014 11:46:13 -0400 Committer: Ingo Molnar CommitDate: Sat, 5 Jul 2014 11:17:34 +0200 sched/numa: Move power adjustment into load_too_imbalanced() Currently the NUMA code scales the load on each node with the amount of CPU power available on that node, but it does not apply any adjustment to the load of the task that is being moved over. On systems with SMT/HT, this results in a task being weighed much more heavily than a CPU core, and a task move that would even out the load between nodes being disallowed. The correct thing is to apply the power correction to the numbers after we have first applied the move of the tasks' loads to them. This also allows us to do the power correction with a multiplication, rather than a division. Also drop two function arguments for load_too_imbalanced(), since it takes various factors from env already. 
Signed-off-by: Rik van Riel Cc: chegu_vi...@hp.com Cc: mgor...@suse.de Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538378-31571-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 39 --- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96b2d39..f287d0b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; ns->task_capacity = DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); @@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } -static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, - long src_load, long dst_load, +static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { long imb, old_imb; + long orig_src_load, orig_dst_load; + long src_capacity, dst_capacity; + + /* +* The load is corrected for the CPU capacity available on each node. +* +* src_loaddst_load +* vs - +* src_capacitydst_capacity +*/ + src_capacity = env->src_stats.compute_capacity; + dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ if (dst_load < src_load) swap(dst_load, src_load); /* Is the difference below the threshold? */ - imb = dst_load * 100 - src_load * env->imbalance_pct; + imb = dst_load * src_capacity * 100 - + src_load * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; @@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, * The imbalance is above the allowed threshold. * Compare it with the old imbalance. 
*/ + orig_src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + if (orig_dst_load < orig_src_load) swap(orig_dst_load, orig_src_load); - old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + old_imb = orig_dst_load * src_capacity * 100 - + orig_src_load * dst_capacity * env->imbalance_pct; /* Would this change make things worse? */ return (imb > old_imb); @@ -1136,8 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long orig_src_load, src_load; - long orig_dst_load, dst_load; + long src_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1211,13 +1225,9 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - orig_dst_load = env->dst_stats.load; - orig_src_load = env->src_stats.load; - - /* XXX missing capacity terms */ load = task_h_load(env->p); - dst_load = orig_dst_load + load; - src_load = orig_src_load - load; + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; if (cur) { load = task_h_load(cur); @@ -1225,8 +1235,7 @@ balance: src_load += load; } - if (load_too_imbalanced(orig_src_load, orig_dst
[tip:sched/core] sched/numa: Use group's max nid as task's preferred nid
Commit-ID: f0b8a4afd6a8c500161e45065a91738b490bf5ae Gitweb: http://git.kernel.org/tip/f0b8a4afd6a8c500161e45065a91738b490bf5ae Author: Rik van Riel AuthorDate: Mon, 23 Jun 2014 11:41:29 -0400 Committer: Ingo Molnar CommitDate: Sat, 5 Jul 2014 11:17:33 +0200 sched/numa: Use group's max nid as task's preferred nid From task_numa_placement, always try to consolidate the tasks in a group on the group's top nid. In case this task is part of a group that is interleaved over multiple nodes, task_numa_migrate will set the task's preferred nid to the best node it could find for the task, so this patch will cause at most one run through task_numa_migrate. Signed-off-by: Rik van Riel Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538095-31256-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 + 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e3ff3d1..96b2d39 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1594,23 +1594,8 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); - /* -* If the preferred task and group nids are different, -* iterate over the nodes again to find the best place. -*/ - if (max_nid != max_group_nid) { - unsigned long weight, max_weight = 0; - - for_each_online_node(nid) { - weight = task_weight(p, nid) + group_weight(p, nid); - if (weight > max_weight) { - max_weight = weight; - max_nid = nid; - } - } - } - spin_unlock_irq(group_lock); + max_nid = max_group_nid; } if (max_faults) { -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Always try to migrate to preferred node at task_numa_placement() time
Commit-ID: bb97fc31647539f1f102eed646a95e200160a150 Gitweb: http://git.kernel.org/tip/bb97fc31647539f1f102eed646a95e200160a150 Author: Rik van Riel AuthorDate: Wed, 4 Jun 2014 16:33:15 -0400 Committer: Ingo Molnar CommitDate: Wed, 18 Jun 2014 18:29:58 +0200 sched/numa: Always try to migrate to preferred node at task_numa_placement() time It is possible that at task_numa_placement() time, the task's numa_preferred_nid does not change, but the task is not actually running on the preferred node at the time. In that case, we still want to attempt migration to the preferred node. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgor...@suse.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140604163315.1dbc7...@cuia.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8fbb011..3fa3e18 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1613,11 +1613,13 @@ static void task_numa_placement(struct task_struct *p) spin_unlock_irq(group_lock); } - /* Preferred node as the node with the most faults */ - if (max_faults && max_nid != p->numa_preferred_nid) { - /* Update the preferred nid and migrate task if possible */ - sched_setnuma(p, max_nid); - numa_migrate_preferred(p); + if (max_faults) { + /* Set the new preferred node */ + if (max_nid != p->numa_preferred_nid) + sched_setnuma(p, max_nid); + + if (task_node(p) != p->numa_preferred_nid) + numa_migrate_preferred(p); } } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Ensure task_numa_migrate() checks the preferred node
Commit-ID: a43455a1d572daf7b730fe12eb747d1e17411365 Gitweb: http://git.kernel.org/tip/a43455a1d572daf7b730fe12eb747d1e17411365 Author: Rik van Riel AuthorDate: Wed, 4 Jun 2014 16:09:42 -0400 Committer: Ingo Molnar CommitDate: Wed, 18 Jun 2014 18:29:57 +0200 sched/numa: Ensure task_numa_migrate() checks the preferred node The first thing task_numa_migrate() does is check to see if there is CPU capacity available on the preferred node, in order to move the task there. However, if the preferred node is all busy, we would skip considering that node for tasks swaps in the subsequent loop. This prevents NUMA convergence of tasks on busy systems. However, swapping locations with a task on our preferred nid, when the preferred nid is busy, is perfectly fine. The fix is to also look for a CPU on our preferred nid when it is totally busy. This changes "perf bench numa mem -p 4 -t 20 -m -0 -P 1000" from not converging in 15 minutes on my 4 node system, to converging in 10-20 seconds. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgor...@suse.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140604160942.6969b...@cuia.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fea7d33..8fbb011 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1302,9 +1302,8 @@ static int task_numa_migrate(struct task_struct *p) groupimp = group_weight(p, env.dst_nid) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has free capacity, try to use it. */ - if (env.dst_stats.has_free_capacity) - task_numa_find_cpu(&env, taskimp, groupimp); + /* Try to find a spot on the preferred nid. */ + task_numa_find_cpu(&env, taskimp, groupimp); /* No space available on the preferred nid. Look elsewhere. 
*/ if (env.best_cpu == -1) { -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Decay ->wakee_flips instead of zeroing
Commit-ID: 096aa33863a5e48de52d2ff30e0801b7487944f4 Gitweb: http://git.kernel.org/tip/096aa33863a5e48de52d2ff30e0801b7487944f4 Author: Rik van Riel AuthorDate: Fri, 16 May 2014 00:13:32 -0400 Committer: Ingo Molnar CommitDate: Thu, 22 May 2014 11:16:41 +0200 sched/numa: Decay ->wakee_flips instead of zeroing Affine wakeups have the potential to interfere with NUMA placement. If a task wakes up too many other tasks, affine wakeups will get disabled. However, regardless of how many other tasks it wakes up, it gets re-enabled once a second, potentially interfering with NUMA placement of other tasks. By decaying wakee_wakes in half instead of zeroing it, we can avoid that problem for some workloads. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: chegu_vi...@hp.com Cc: umgwanakikb...@gmail.com Link: http://lkml.kernel.org/r/20140516001332.67f91...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 503f750..c9617b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4065,7 +4065,7 @@ static void record_wakee(struct task_struct *p) * about the loss. */ if (jiffies > current->wakee_flip_decay_ts + HZ) { - current->wakee_flips = 0; + current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Update migrate_improves/degrades_locality()
Commit-ID: b1ad065e65f56103db8b97edbd218a271ff5b1bb Gitweb: http://git.kernel.org/tip/b1ad065e65f56103db8b97edbd218a271ff5b1bb Author: Rik van Riel AuthorDate: Thu, 15 May 2014 13:03:06 -0400 Committer: Ingo Molnar CommitDate: Thu, 22 May 2014 11:16:39 +0200 sched/numa: Update migrate_improves/degrades_locality() Update the migrate_improves/degrades_locality() functions with knowledge of pseudo-interleaving. Do not consider moving tasks around within the set of group's active nodes as improving or degrading locality. Instead, leave the load balancer free to balance the load between a numa_group's active nodes. Also, switch from the group/task_weight functions to the group/task_fault functions. The "weight" functions involve a division, but both calls use the same divisor, so there's no point in doing that from these functions. On a 4 node (x10 core) system, performance of SPECjbb2005 seems unaffected, though the number of migrations with 2 8-warehouse wide instances seems to have almost halved, due to the scheduler running each instance on a single node. 
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Link: http://lkml.kernel.org/r/20140515130306.61aae...@cuia.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 +- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b899613..503f750 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5123,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now) /* Returns true if the destination node has incurred more faults */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || @@ -5136,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - /* Always encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; + if (numa_group) { + /* Task is already in the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return false; + + /* Task is moving into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return true; - /* If both task and group weight improve, this move is a winner. */ - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && - group_weight(p, dst_nid) > group_weight(p, src_nid)) + return group_faults(p, dst_nid) > group_faults(p, src_nid); + } + + /* Encourage migration to the preferred node. 
*/ + if (dst_nid == p->numa_preferred_nid) return true; - return false; + return task_faults(p, dst_nid) > task_faults(p, src_nid); } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5165,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; + if (numa_group) { + /* Task is moving within/into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return false; + + /* Task is moving out of the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) < group_faults(p, src_nid); + } + /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) return true; - /* If either task or group weight get worse, don't do it. */ - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || - group_weight(p, dst_nid) < group_weight(p, src_nid)) - return true; - - return false; + return task_faults(p, dst_nid) < task_faults(p, src_nid); } #else -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Allow task switch if load imbalance improves
Commit-ID: e63da03639cc9e6e83b62e7ef8ffdbb92421416a Gitweb: http://git.kernel.org/tip/e63da03639cc9e6e83b62e7ef8ffdbb92421416a Author: Rik van Riel AuthorDate: Wed, 14 May 2014 13:22:21 -0400 Committer: Ingo Molnar CommitDate: Thu, 22 May 2014 11:16:38 +0200 sched/numa: Allow task switch if load imbalance improves Currently the NUMA balancing code only allows moving tasks between NUMA nodes when the load on both nodes is in balance. This breaks down when the load was imbalanced to begin with. Allow tasks to be moved between NUMA nodes if the imbalance is small, or if the new imbalance is be smaller than the original one. Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/20140514132221.274b3...@annuminas.surriel.com --- kernel/sched/fair.c | 46 -- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f7cac2b..b899613 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, + long src_load, long dst_load, + struct task_numa_env *env) +{ + long imb, old_imb; + + /* We care about the slope of the imbalance, not the direction. */ + if (dst_load < src_load) + swap(dst_load, src_load); + + /* Is the difference below the threshold? */ + imb = dst_load * 100 - src_load * env->imbalance_pct; + if (imb <= 0) + return false; + + /* +* The imbalance is above the allowed threshold. +* Compare it with the old imbalance. +*/ + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); + + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + + /* Would this change make things worse? 
*/ + return (old_imb > imb); +} + /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long dst_load, src_load; + long orig_src_load, src_load; + long orig_dst_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - dst_load = env->dst_stats.load; - src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + orig_src_load = env->src_stats.load; /* XXX missing power terms */ load = task_h_load(env->p); - dst_load += load; - src_load -= load; + dst_load = orig_dst_load + load; + src_load = orig_src_load - load; if (cur) { load = task_h_load(cur); @@ -1195,11 +1224,8 @@ balance: src_load += load; } - /* make src_load the smaller */ - if (dst_load < src_load) - swap(dst_load, src_load); - - if (src_load * env->imbalance_pct < dst_load * 100) + if (load_too_imbalanced(orig_src_load, orig_dst_load, + src_load, dst_load, env)) goto unlock; assign: -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched: Call select_idle_sibling() when not affine_sd
Commit-ID: 8bf21433f38b020c3d8a3805d1d7fb73d7b40c01 Gitweb: http://git.kernel.org/tip/8bf21433f38b020c3d8a3805d1d7fb73d7b40c01 Author: Rik van Riel AuthorDate: Wed, 14 May 2014 11:40:37 -0400 Committer: Ingo Molnar CommitDate: Thu, 22 May 2014 11:16:28 +0200 sched: Call select_idle_sibling() when not affine_sd On smaller systems, the top level sched domain will be an affine domain, and select_idle_sibling is invoked for every SD_WAKE_AFFINE wakeup. This seems to be working well. On larger systems, with the node distance between far away NUMA nodes being > RECLAIM_DISTANCE, select_idle_sibling is only called if the waker and the wakee are on nodes less than RECLAIM_DISTANCE apart. This patch leaves in place the policy of not pulling the task across nodes on such systems, while fixing the issue that select_idle_sibling is not called at all in certain circumstances. The code will look for an idle CPU in the same CPU package as the CPU where the task ran previously. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: morten.rasmus...@arm.com Cc: george.mccollis...@gmail.com Cc: ktk...@parallels.com Cc: Mel Gorman Cc: Mike Galbraith Link: http://lkml.kernel.org/r/20140514114037.2d932...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd3fa14..429164d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4473,10 +4473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f sd = tmp; } - if (affine_sd) { - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + if (sd_flag & SD_BALANCE_WAKE) { new_cpu = select_idle_sibling(p, prev_cpu); goto unlock; } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched,numa: Update migrate_improves/degrades_locality
Commit-ID: f5c1e1af91b2a4238d7c2a6dc4aa0076908b5864 Gitweb: http://git.kernel.org/tip/f5c1e1af91b2a4238d7c2a6dc4aa0076908b5864 Author: Rik van Riel AuthorDate: Thu, 15 May 2014 13:03:06 -0400 Committer: Thomas Gleixner CommitDate: Mon, 19 May 2014 22:02:43 +0900 sched,numa: Update migrate_improves/degrades_locality Update the migrate_improves/degrades_locality functions with knowledge of pseudo-interleaving. Do not consider moving tasks around within the set of group's active nodes as improving or degrading locality. Instead, leave the load balancer free to balance the load between a numa_group's active nodes. Also, switch from the group/task_weight functions to the group/task_fault functions. The "weight" functions involve a division, but both calls use the same divisor, so there's no point in doing that from these functions. On a 4 node (x10 core) system, performance of SPECjbb2005 seems unaffected, though the number of migrations with 2 8-warehouse wide instances seems to have almost halved, due to the scheduler running each instance on a single node. 
Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Cc: mi...@kernel.org Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140515130306.61aae...@cuia.bos.redhat.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 42 +- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b899613..503f750 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5123,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now) /* Returns true if the destination node has incurred more faults */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || @@ -5136,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - /* Always encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; + if (numa_group) { + /* Task is already in the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return false; + + /* Task is moving into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return true; - /* If both task and group weight improve, this move is a winner. */ - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && - group_weight(p, dst_nid) > group_weight(p, src_nid)) + return group_faults(p, dst_nid) > group_faults(p, src_nid); + } + + /* Encourage migration to the preferred node. 
*/ + if (dst_nid == p->numa_preferred_nid) return true; - return false; + return task_faults(p, dst_nid) > task_faults(p, src_nid); } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5165,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; + if (numa_group) { + /* Task is moving within/into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return false; + + /* Task is moving out of the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) < group_faults(p, src_nid); + } + /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) return true; - /* If either task or group weight get worse, don't do it. */ - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || - group_weight(p, dst_nid) < group_weight(p, src_nid)) - return true; - - return false; + return task_faults(p, dst_nid) < task_faults(p, src_nid); } #else -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched,numa: Decay wakee_flips instead of zeroing
Commit-ID: 5658b4f43e63f8c7b4a27995dcb2cf43a52ee398 Gitweb: http://git.kernel.org/tip/5658b4f43e63f8c7b4a27995dcb2cf43a52ee398 Author: Rik van Riel AuthorDate: Fri, 16 May 2014 00:13:32 -0400 Committer: Thomas Gleixner CommitDate: Mon, 19 May 2014 22:02:43 +0900 sched,numa: Decay wakee_flips instead of zeroing Affine wakeups have the potential to interfere with NUMA placement. If a task wakes up too many other tasks, affine wakeups will get disabled. However, regardless of how many other tasks it wakes up, it gets re-enabled once a second, potentially interfering with NUMA placement of other tasks. By decaying wakee_wakes in half instead of zeroing it, we can avoid that problem for some workloads. Cc: chegu_vi...@hp.com Cc: mi...@kernel.org Cc: umgwanakikb...@gmail.com Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140516001332.67f91...@annuminas.surriel.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 503f750..c9617b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4065,7 +4065,7 @@ static void record_wakee(struct task_struct *p) * about the loss. */ if (jiffies > current->wakee_flip_decay_ts + HZ) { - current->wakee_flips = 0; + current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched,numa: Allow task switch if load imbalance improves
Commit-ID: b1fda183e09d70ea75d478ea055e2b6059476eff Gitweb: http://git.kernel.org/tip/b1fda183e09d70ea75d478ea055e2b6059476eff Author: Rik van Riel AuthorDate: Wed, 14 May 2014 13:22:21 -0400 Committer: Thomas Gleixner CommitDate: Mon, 19 May 2014 22:02:42 +0900 sched,numa: Allow task switch if load imbalance improves Currently the NUMA balancing code only allows moving tasks between NUMA nodes when the load on both nodes is in balance. This breaks down when the load was imbalanced to begin with. Allow tasks to be moved between NUMA nodes if the imbalance is small, or if the new imbalance is be smaller than the original one. Cc: mi...@kernel.org Cc: mgor...@suse.de Cc: chegu_vi...@hp.com Signed-off-by: Rik van Riel Suggested-by: Peter Zijlstra Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140514132221.274b3...@annuminas.surriel.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 46 -- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f7cac2b..b899613 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, + long src_load, long dst_load, + struct task_numa_env *env) +{ + long imb, old_imb; + + /* We care about the slope of the imbalance, not the direction. */ + if (dst_load < src_load) + swap(dst_load, src_load); + + /* Is the difference below the threshold? */ + imb = dst_load * 100 - src_load * env->imbalance_pct; + if (imb <= 0) + return false; + + /* +* The imbalance is above the allowed threshold. +* Compare it with the old imbalance. +*/ + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); + + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + + /* Would this change make things worse? 
*/ + return (old_imb > imb); +} + /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long dst_load, src_load; + long orig_src_load, src_load; + long orig_dst_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - dst_load = env->dst_stats.load; - src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + orig_src_load = env->src_stats.load; /* XXX missing power terms */ load = task_h_load(env->p); - dst_load += load; - src_load -= load; + dst_load = orig_dst_load + load; + src_load = orig_src_load - load; if (cur) { load = task_h_load(cur); @@ -1195,11 +1224,8 @@ balance: src_load += load; } - /* make src_load the smaller */ - if (dst_load < src_load) - swap(dst_load, src_load); - - if (src_load * env->imbalance_pct < dst_load * 100) + if (load_too_imbalanced(orig_src_load, orig_dst_load, + src_load, dst_load, env)) goto unlock; assign: -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched: call select_idle_sibling when not affine_sd
Commit-ID: b45cf72cf7e1dd3b4a95947f85659cfdc01dbdad Gitweb: http://git.kernel.org/tip/b45cf72cf7e1dd3b4a95947f85659cfdc01dbdad Author: Rik van Riel AuthorDate: Wed, 14 May 2014 11:40:37 -0400 Committer: Thomas Gleixner CommitDate: Mon, 19 May 2014 22:02:40 +0900 sched: call select_idle_sibling when not affine_sd On smaller systems, the top level sched domain will be an affine domain, and select_idle_sibling is invoked for every SD_WAKE_AFFINE wakeup. This seems to be working well. On larger systems, with the node distance between far away NUMA nodes being > RECLAIM_DISTANCE, select_idle_sibling is only called if the waker and the wakee are on nodes less than RECLAIM_DISTANCE apart. This patch leaves in place the policy of not pulling the task across nodes on such systems, while fixing the issue that select_idle_sibling is not called at all in certain circumstances. The code will look for an idle CPU in the same CPU package as the CPU where the task ran previously. Cc: morten.rasmus...@arm.com Cc: mi...@kernel.org Cc: george.mccollis...@gmail.com Cc: ktk...@parallels.com Cc: Mel Gorman Cc: Mike Galbraith Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140514114037.2d932...@annuminas.surriel.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd3fa14..429164d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4473,10 +4473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f sd = tmp; } - if (affine_sd) { - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + if (sd_flag & SD_BALANCE_WAKE) { new_cpu = select_idle_sibling(p, prev_cpu); goto unlock; } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a 
message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] mm/numa: Remove BUG_ON() in __handle_mm_fault()
Commit-ID: 107437febd495a50e2cd09c81bbaa84d30e57b07 Gitweb: http://git.kernel.org/tip/107437febd495a50e2cd09c81bbaa84d30e57b07 Author: Rik van Riel AuthorDate: Tue, 29 Apr 2014 15:36:15 -0400 Committer: Ingo Molnar CommitDate: Wed, 7 May 2014 13:33:48 +0200 mm/numa: Remove BUG_ON() in __handle_mm_fault() Changing PTEs and PMDs to pte_numa & pmd_numa is done with the mmap_sem held for reading, which means a pmd can be instantiated and turned into a numa one while __handle_mm_fault() is examining the value of old_pmd. If that happens, __handle_mm_fault() should just return and let the page fault retry, instead of throwing an oops. This is handled by the test for pmd_trans_huge(*pmd) below. Signed-off-by: Rik van Riel Reviewed-by: Naoya Horiguchi Reported-by: Sunil Pandey Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Mel Gorman Cc: linux...@kvack.org Cc: lwood...@redhat.com Cc: dave.han...@intel.com Link: http://lkml.kernel.org/r/20140429153615.2d720...@annuminas.surriel.com Signed-off-by: Ingo Molnar --- mm/memory.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d0f0bef..9c2dc65 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3900,9 +3900,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } } - /* THP should already have been handled */ - BUG_ON(pmd_numa(*pmd)); - /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Do not set preferred_node on migration to a second choice node
Commit-ID: 68d1b02a58f5d9f584c1fb2923ed60ec68cbbd9b Gitweb: http://git.kernel.org/tip/68d1b02a58f5d9f584c1fb2923ed60ec68cbbd9b Author: Rik van Riel AuthorDate: Fri, 11 Apr 2014 13:00:29 -0400 Committer: Ingo Molnar CommitDate: Wed, 7 May 2014 13:33:47 +0200 sched/numa: Do not set preferred_node on migration to a second choice node Setting the numa_preferred_node for a task in task_numa_migrate does nothing on a 2-node system. Either we migrate to the node that already was our preferred node, or we stay where we were. On a 4-node system, it can slightly decrease overhead, by not calling the NUMA code as much. Since every node tends to be directly connected to every other node, running on the wrong node for a while does not do much damage. However, on an 8 node system, there are far more bad nodes than there are good ones, and pretending that a second choice is actually the preferred node can greatly delay, or even prevent, a workload from converging. The only time we can safely pretend that a second choice node is the preferred node is when the task is part of a workload that spans multiple NUMA nodes. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-4-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ecea8d9..051903f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1301,7 +1301,16 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - sched_setnuma(p, env.dst_nid); + /* +* If the task is part of a workload that spans multiple NUMA nodes, +* and is migrating into one of the workload's active nodes, remember +* this node as the task's preferred numa node, so the workload can +* settle down. 
+* A task that migrated to a second choice node will be better off +* trying for a better one later. Do not set the preferred node here. +*/ + if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); /* * Reset the scan period if the task is being rescheduled on an -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Retry placement more frequently when misplaced
Commit-ID: 5085e2a328849bdee6650b32d52c87c3788ab01c Gitweb: http://git.kernel.org/tip/5085e2a328849bdee6650b32d52c87c3788ab01c Author: Rik van Riel AuthorDate: Fri, 11 Apr 2014 13:00:28 -0400 Committer: Ingo Molnar CommitDate: Wed, 7 May 2014 13:33:46 +0200 sched/numa: Retry placement more frequently when misplaced When tasks have not converged on their preferred nodes yet, we want to retry fairly often, to make sure we do not migrate a task's memory to an undesirable location, only to have to move it again later. This patch reduces the interval at which migration is retried, when the task's numa_scan_period is small. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-3-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6457b6..ecea8d9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1326,12 +1326,15 @@ static int task_numa_migrate(struct task_struct *p) /* Attempt to migrate a task to a CPU on the preferred node. */ static void numa_migrate_preferred(struct task_struct *p) { + unsigned long interval = HZ; + /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ - p->numa_migrate_retry = jiffies + HZ; + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/core] sched/numa: Count pages on active node as local
Commit-ID: 792568ec6a31ca560ca4d528782cbc6cd2cea8b0 Gitweb: http://git.kernel.org/tip/792568ec6a31ca560ca4d528782cbc6cd2cea8b0 Author: Rik van Riel AuthorDate: Fri, 11 Apr 2014 13:00:27 -0400 Committer: Ingo Molnar CommitDate: Wed, 7 May 2014 13:33:45 +0200 sched/numa: Count pages on active node as local The NUMA code is smart enough to distribute the memory of workloads that span multiple NUMA nodes across those NUMA nodes. However, it still has a pretty high scan rate for such workloads, because any memory that is left on a node other than the node of the CPU that faulted on the memory is counted as non-local, which causes the scan rate to go up. Counting the memory on any node where the task's numa group is actively running as local, allows the scan rate to slow down once the application is settled in. This should reduce the overhead of the automatic NUMA placement code, when a workload spans multiple NUMA nodes. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5d859ec..f6457b6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1738,6 +1738,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); + int local = !!(flags & TNF_FAULT_LOCAL); int priv; if (!numabalancing_enabled) @@ -1786,6 +1787,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) task_numa_group(p, last_cpupid, flags, &priv); } + /* +* If a workload spans multiple NUMA nodes, a shared fault that +* occurs wholly within the set of nodes that the workload is +* actively using should be counted as local. 
This allows the +* scan rate to slow down when a workload has settled down. +*/ + if (!priv && !local && p->numa_group && + node_isset(cpu_node, p->numa_group->active_nodes) && + node_isset(mem_node, p->numa_group->active_nodes)) + local = 1; + task_numa_placement(p); /* @@ -1800,7 +1812,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; - p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; + p->numa_faults_locality[local] += pages; } static void reset_ptenuma_scan(struct task_struct *p) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/urgent] sched,numa: add cond_resched to task_numa_work
Commit-ID: 3cf1962cdbf6b3a9e3ef21116d215bbab350ea37 Gitweb: http://git.kernel.org/tip/3cf1962cdbf6b3a9e3ef21116d215bbab350ea37 Author: Rik van Riel AuthorDate: Tue, 18 Feb 2014 17:12:44 -0500 Committer: Thomas Gleixner CommitDate: Fri, 21 Feb 2014 21:27:10 +0100 sched,numa: add cond_resched to task_numa_work Normally task_numa_work scans over a fairly small amount of memory, but it is possible to run into a large unpopulated part of virtual memory, with no pages mapped. In that case, task_numa_work can run for a while, and it may make sense to reschedule as required. Cc: a...@linux-foundation.org Cc: Andrea Arcangeli Signed-off-by: Rik van Riel Reported-by: Xing Gang Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392761566-24834-2-git-send-email-r...@redhat.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 966cc2b..7815709 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1757,6 +1757,8 @@ void task_numa_work(struct callback_head *work) start = end; if (pages <= 0) goto out; + + cond_resched(); } while (end != vma->vm_end); } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/numa] sched/numa, mm: Remove p->numa_migrate_deferred
Commit-ID: 52bf84aa206cd2c2516dfa3e03b578edf8a3242f Gitweb: http://git.kernel.org/tip/52bf84aa206cd2c2516dfa3e03b578edf8a3242f Author: Rik van Riel AuthorDate: Mon, 27 Jan 2014 17:03:40 -0500 Committer: Ingo Molnar CommitDate: Tue, 28 Jan 2014 13:17:04 +0100 sched/numa, mm: Remove p->numa_migrate_deferred Excessive migration of pages can hurt the performance of workloads that span multiple NUMA nodes. However, it turns out that the p->numa_migrate_deferred knob is a really big hammer, which does reduce migration rates, but does not actually help performance. Now that the second stage of the automatic numa balancing code has stabilized, it is time to replace the simplistic migration deferral code with something smarter. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-2-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 10 + include/linux/sched.h | 1 - kernel/sched/fair.c | 8 kernel/sysctl.c | 7 --- mm/mempolicy.c | 45 - 5 files changed, 1 insertion(+), 70 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6d48640..760f6e6a 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -386,8 +386,7 @@ feature should be disabled. Otherwise, if the system overhead from the feature is too high then the rate the kernel samples for NUMA hinting faults may be controlled by the numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, -numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and -numa_balancing_migrate_deferred. +numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls. == @@ -428,13 +427,6 @@ rate for each task. numa_balancing_scan_size_mb is how many megabytes worth of pages are scanned for a given scan. 
-numa_balancing_migrate_deferred is how many page migrations get skipped -unconditionally, after a page migration is skipped because a page is shared -with other tasks. This reduces page migration overhead, and determines -how much stronger the "move task near its memory" policy scheduler becomes, -versus the "move memory near its task" memory management policy, for workloads -with shared memory. - == osrelease, ostype & version: diff --git a/include/linux/sched.h b/include/linux/sched.h index ffccdad..d572d5b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1457,7 +1457,6 @@ struct task_struct { unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; - int numa_migrate_deferred; unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efe6457..7cdde91 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -819,14 +819,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -/* - * After skipping a page migration on a shared page, skip N more numa page - * migrations unconditionally. This reduces the number of NUMA migrations - * in shared memory workloads, and has the effect of pulling tasks towards - * where their memory lives, over pulling the memory towards the task. 
- */ -unsigned int sysctl_numa_balancing_migrate_deferred = 16; - static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8da99f..b41d61d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -384,13 +384,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "numa_balancing_migrate_deferred", - .data = &sysctl_numa_balancing_migrate_deferred, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0cd2c4d..68d5c7f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2304,35 +2304,6 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } -#ifdef CONFIG_NUMA_BALANCING -static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - /* Never defer a private fault */ - if (cpupid_match
[tip:sched/numa] sched/numa: Rename p->numa_faults to numa_faults_memory
Commit-ID: ff1df896aef8e0ec1556a5c44f424bd45bfa2cbe Gitweb: http://git.kernel.org/tip/ff1df896aef8e0ec1556a5c44f424bd45bfa2cbe Author: Rik van Riel AuthorDate: Mon, 27 Jan 2014 17:03:41 -0500 Committer: Ingo Molnar CommitDate: Tue, 28 Jan 2014 13:17:05 +0100 sched/numa: Rename p->numa_faults to numa_faults_memory In order to get a more consistent naming scheme, making it clear which fault statistics track memory locality, and which track CPU locality, rename the memory fault statistics. Suggested-by: Mel Gorman Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-3-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 kernel/sched/core.c | 4 ++-- kernel/sched/debug.c | 6 +++--- kernel/sched/fair.c | 56 +-- 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d572d5b..144d509 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1469,15 +1469,15 @@ struct task_struct { * Scheduling placement decisions are made based on the these counts. * The values remain static for the duration of a PTE scan */ - unsigned long *numa_faults; + unsigned long *numa_faults_memory; unsigned long total_numa_faults; /* * numa_faults_buffer records faults per node during the current -* scan window. When the scan completes, the counts in numa_faults -* decay and these values are copied. +* scan window. When the scan completes, the counts in +* numa_faults_memory decay and these values are copied. 
*/ - unsigned long *numa_faults_buffer; + unsigned long *numa_faults_buffer_memory; /* * numa_faults_locality tracks if faults recorded during the last diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 81343d6..bc708c5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1744,8 +1744,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dd52e7f..31b908d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; + if (p->numa_faults_memory) + nr_faults = p->numa_faults_memory[2*node + i]; cpu_current = !i ? 
(task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); home_node = (p->numa_preferred_nid == node); - SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", + SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", i, node, cpu_current, home_node, nr_faults); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7cdde91..3e616d7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -901,11 +901,11 @@ static inline int task_faults_idx(int nid, int priv) static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; - return p->numa_faults[task_faults_idx(nid, 0)] + - p->numa_faults[task_faults_idx(nid, 1)]; + return p->numa_faults_memory[task_faults_idx(nid, 0)] + + p->numa_faults_memory[task_faults_idx(nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -927,7 +927,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) { unsigned long total_faults; - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; total_faults = p->total_numa_faults; @@ -1255,7 +1255,7 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
[tip:sched/numa] sched/numa: Turn some magic numbers into #defines
Commit-ID: be1e4e760d940c14d119bffef5eb007dfdf29046 Gitweb: http://git.kernel.org/tip/be1e4e760d940c14d119bffef5eb007dfdf29046 Author: Rik van Riel AuthorDate: Mon, 27 Jan 2014 17:03:48 -0500 Committer: Ingo Molnar CommitDate: Tue, 28 Jan 2014 15:03:21 +0100 sched/numa: Turn some magic numbers into #defines Cleanup suggested by Mel Gorman. Now the code contains some more hints on what statistics go where. Suggested-by: Mel Gorman Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-10-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 +- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d5832c3..1f41b12 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -896,6 +896,15 @@ struct numa_group { unsigned long faults[0]; }; +/* Shared or private faults. */ +#define NR_NUMA_HINT_FAULT_TYPES 2 + +/* Memory and CPU locality */ +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) + +/* Averaged statistics, and temporary buffers. */ +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) + pid_t task_numa_group_id(struct task_struct *p) { return p->numa_group ? 
p->numa_group->gid : 0; @@ -903,7 +912,7 @@ pid_t task_numa_group_id(struct task_struct *p) static inline int task_faults_idx(int nid, int priv) { - return 2 * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * nid + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) @@ -1509,7 +1518,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long faults = 0, group_faults = 0; int priv, i; - for (priv = 0; priv < 2; priv++) { + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); @@ -1620,11 +1629,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ - grp->faults_cpu = grp->faults + 2 * nr_node_ids; + grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * + nr_node_ids; node_set(task_node(current), grp->active_nodes); - for (i = 0; i < 4*nr_node_ids; i++) + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1682,7 +1692,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock(&my_grp->lock, &grp->lock); - for (i = 0; i < 4*nr_node_ids; i++) { + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { my_grp->faults[i] -= p->numa_faults_memory[i]; grp->faults[i] += p->numa_faults_memory[i]; } @@ -1714,7 +1724,7 @@ void task_numa_free(struct task_struct *p) if (grp) { spin_lock(&grp->lock); - for (i = 0; i < 4*nr_node_ids; i++) + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; @@ -1755,14 +1765,20 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults_memory)) { - int size = 
sizeof(*p->numa_faults_memory) * 4 * nr_node_ids; + int size = sizeof(*p->numa_faults_memory) * + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); + p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); if (!p->numa_faults_memory) return; BUG_ON(p->numa_faults_buffer_memory); + /* +* The averaged statistics, shared & private, memory & cpu, +* occupy the first half of the array. The second half of the +* array is for current counters, which are averaged into the +* first set by task_numa_placement. +*/ p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); p->numa_faults_buffer_cpu = p->numa_f
[tip:sched/numa] sched/numa: Rename variables in task_numa_fault()
Commit-ID: 58b46da336a9312b2e21bb576d1c2c484dbf6257 Gitweb: http://git.kernel.org/tip/58b46da336a9312b2e21bb576d1c2c484dbf6257 Author: Rik van Riel AuthorDate: Mon, 27 Jan 2014 17:03:47 -0500 Committer: Ingo Molnar CommitDate: Tue, 28 Jan 2014 15:03:19 +0100 sched/numa: Rename variables in task_numa_fault() We track both the node of the memory after a NUMA fault, and the node of the CPU on which the fault happened. Rename the local variables in task_numa_fault to make things more explicit. Suggested-by: Mel Gorman Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-9-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c44990..d5832c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1735,11 +1735,11 @@ void task_numa_free(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. 
*/ -void task_numa_fault(int last_cpupid, int node, int pages, int flags) +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) { struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; - int this_node = task_node(current); + int cpu_node = task_node(current); int priv; if (!numabalancing_enabled) @@ -1794,8 +1794,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; - p->numa_faults_buffer_cpu[task_faults_idx(this_node, priv)] += pages; + p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:sched/numa] sched/numa: Normalize faults_cpu stats and weigh by CPU use
Commit-ID: 7e2703e6099609adc93679c4d45cd6247f565971 Gitweb: http://git.kernel.org/tip/7e2703e6099609adc93679c4d45cd6247f565971 Author: Rik van Riel AuthorDate: Mon, 27 Jan 2014 17:03:45 -0500 Committer: Ingo Molnar CommitDate: Tue, 28 Jan 2014 15:03:10 +0100 sched/numa: Normalize faults_cpu stats and weigh by CPU use Tracing the code that decides the active nodes has made it abundantly clear that the naive implementation of the faults_from code has issues. Specifically, the garbage collector in some workloads will access orders of magnitude more memory than the threads that do all the active work. This resulted in the node with the garbage collector being marked the only active node in the group. This issue is avoided if we weigh the statistics by CPU use of each task in the numa group, instead of by how many faults each thread has incurred. To achieve this, we normalize the number of faults to the fraction of faults that occurred on each node, and then multiply that fraction by the fraction of CPU time the task has used since the last time task_numa_placement was invoked. This way the nodes in the active node mask will be the ones where the tasks from the numa group are most actively running, and the influence of e.g. the garbage collector and other do-little threads is properly minimized. On a 4 node system, using CPU use statistics calculated over a longer interval results in about 1% fewer page migrations with two 32-warehouse specjbb runs on a 4 node system, and about 5% fewer page migrations, as well as 1% better throughput, with two 8-warehouse specjbb runs, as compared with the shorter term statistics kept by the scheduler.
Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-7-git-send-email-r...@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 53 +-- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5ab3b89..ef92953 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1459,6 +1459,8 @@ struct task_struct { int numa_preferred_nid; unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ + u64 last_task_numa_placement; + u64 last_sum_exec_runtime; struct callback_head numa_work; struct list_head numa_entry; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc708c5..a561c9e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1746,6 +1746,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_work.next = &p->numa_work; p->numa_faults_memory = NULL; p->numa_faults_buffer_memory = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eeabb33..8fc3a82 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -887,6 +887,11 @@ struct numa_group { struct rcu_head rcu; nodemask_t active_nodes; unsigned long total_faults; + /* +* Faults_cpu is used to decide whether memory should move +* towards the CPU. As a consequence, these stats are weighted +* more by CPU use than by memory faults. +*/ unsigned long *faults_cpu; unsigned long faults[0]; }; @@ -1446,11 +1451,41 @@ static void update_task_scan_period(struct task_struct *p, memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } +/* + * Get the fraction of time the task has been running since the last + * NUMA placement cycle. 
The scheduler keeps similar statistics, but + * decays those on a 32ms period, which is orders of magnitude off + * from the dozens-of-seconds NUMA balancing period. Use the scheduler + * stats only if the task is so new there are no NUMA statistics yet. + */ +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) +{ + u64 runtime, delta, now; + /* Use the start of this time slice to avoid calculations. */ + now = p->se.exec_start; + runtime = p->se.sum_exec_runtime; + + if (p->last_task_numa_placement) { + delta = runtime - p->last_sum_exec_runtime; + *period = now - p->last_task_numa_placement; + } else { + delta = p->se.avg.runnable_avg_sum; + *period = p->se.avg.runnable_avg_period; + } + + p->last_sum_exec_runtime = runtime; + p->last_task_numa_placement = now; + + return delta; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults