[tip:x86/fpu] x86/fpu: Defer FPU state load until return to userspace

2019-04-13 Thread tip-bot for Rik van Riel
Commit-ID:  5f409e20b794565e2d60ad333e79334630a6c798
Gitweb: https://git.kernel.org/tip/5f409e20b794565e2d60ad333e79334630a6c798
Author: Rik van Riel 
AuthorDate: Wed, 3 Apr 2019 18:41:52 +0200
Committer:  Borislav Petkov 
CommitDate: Fri, 12 Apr 2019 19:34:47 +0200

x86/fpu: Defer FPU state load until return to userspace

Defer loading of FPU state until return to userspace. This gives
the kernel the potential to skip loading FPU state for tasks that
stay in kernel mode, or for tasks that end up with repeated
invocations of kernel_fpu_begin() & kernel_fpu_end().

The fpregs_lock/unlock() section ensures that the registers remain
unchanged. Otherwise a context switch or a bottom half could save the
registers to its FPU context, and the processor's FPU registers would
become random if modified at the same time.

KVM swaps the host/guest registers on the entry/exit path. This flow has
been kept as is: first it ensures that the registers are loaded, and then
it saves the current (host) state before it loads the guest's registers.
The swap is done at the very end with interrupts disabled, so the state
should not change anymore before the guest is entered. The read/save
version seems to be cheaper than a memcpy() in a micro benchmark.

Each thread gets TIF_NEED_FPU_LOAD set as part of fork() / fpu__copy().
For kernel threads, this flag is never cleared, which avoids saving /
restoring the FPU state for kernel threads and during in-kernel usage of
the FPU registers.
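
For readers following the series: the deferral protocol has two halves. Kernel
code that wants to touch a task's FPU state edits the in-memory copy under
fpregs_lock() and marks the live registers stale; the exit-to-usermode path
then reloads the registers only if that mark is set. A minimal sketch of the
two halves, using only helpers introduced by this series; the wrapper function
names below are illustrative, not part of the patch:

	/* Kernel side: save once, then edit fpu->state, not the live registers. */
	static void edit_task_fpstate_sketch(struct fpu *fpu)
	{
		fpregs_lock();				/* preemption + softirqs off */
		if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
			copy_fpregs_to_fpstate(fpu);	/* registers -> fpu->state */
			set_thread_flag(TIF_NEED_FPU_LOAD);
		}
		/* ... modify fpu->state here ... */
		fpregs_unlock();
	}

	/* Exit side: reload only if some kernel code marked the registers stale. */
	static void exit_to_usermode_fpu_sketch(struct thread_info *ti)
	{
		unsigned long cached_flags = READ_ONCE(ti->flags);

		fpregs_assert_state_consistent();
		if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
			switch_fpu_return();
	}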

 [
   bp: Correct and update commit message and fix checkpatch warnings.
   s/register/registers/ where it is used in plural.
   minor comment corrections.
   remove unused trace_x86_fpu_activate_state() TP.
 ]

Signed-off-by: Rik van Riel 
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Borislav Petkov 
Reviewed-by: Dave Hansen 
Reviewed-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Aubrey Li 
Cc: Babu Moger 
Cc: "Chang S. Bae" 
Cc: Dmitry Safonov 
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: Jann Horn 
Cc: "Jason A. Donenfeld" 
Cc: Joerg Roedel 
Cc: Konrad Rzeszutek Wilk 
Cc: kvm ML 
Cc: Nicolai Stange 
Cc: Paolo Bonzini 
Cc: "Radim Krčmář" 
Cc: Tim Chen 
Cc: Waiman Long 
Cc: x86-ml 
Cc: Yi Wang 
Link: https://lkml.kernel.org/r/20190403164156.19645-24-bige...@linutronix.de
---
 arch/x86/entry/common.c |  10 +++-
 arch/x86/include/asm/fpu/api.h  |  22 +++-
 arch/x86/include/asm/fpu/internal.h |  27 +
 arch/x86/include/asm/trace/fpu.h|  10 ++--
 arch/x86/kernel/fpu/core.c  | 106 +++-
 arch/x86/kernel/fpu/signal.c|  49 ++---
 arch/x86/kernel/process.c   |   2 +-
 arch/x86/kernel/process_32.c|   5 +-
 arch/x86/kernel/process_64.c|   5 +-
 arch/x86/kvm/x86.c  |  20 +--
 10 files changed, 184 insertions(+), 72 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 7bc105f47d21..51beb8d29123 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -25,12 +25,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
-#include 
 #include 
+#include 
 
 #define CREATE_TRACE_POINTS
 #include 
@@ -196,6 +197,13 @@ __visible inline void prepare_exit_to_usermode(struct 
pt_regs *regs)
if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
exit_to_usermode_loop(regs, cached_flags);
 
+   /* Reload ti->flags; we may have rescheduled above. */
+   cached_flags = READ_ONCE(ti->flags);
+
+   fpregs_assert_state_consistent();
+   if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
+   switch_fpu_return();
+
 #ifdef CONFIG_COMPAT
/*
 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 73e684160f35..b774c52e5411 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -10,7 +10,7 @@
 
 #ifndef _ASM_X86_FPU_API_H
 #define _ASM_X86_FPU_API_H
-#include 
+#include 
 
 /*
  * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
@@ -22,17 +22,37 @@
 extern void kernel_fpu_begin(void);
 extern void kernel_fpu_end(void);
 extern bool irq_fpu_usable(void);
+extern void fpregs_mark_activate(void);
 
+/*
+ * Use fpregs_lock() while editing CPU's FPU registers or fpu->state.
+ * A context switch will (and softirq might) save CPU's FPU registers to
+ * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in
+ * a random state.
+ */
 static inline void fpregs_lock(void)
 {
preempt_disable();
+   local_bh_disable();
 }
 
 static inline void fpregs_unlock(void)
 {
+   local_bh_enable();
preempt_enable();
 }
 
+#ifdef CONFIG_X86_DEBUG_FPU
+extern void fpregs_assert_state_consistent(void);
+#else
+static inline void fpregs_assert_state_consistent(void) { }
+#endif
+
+/*
+ * Load the task FPU state before returning to userspace.
+ */
+extern void switch_fpu_return(void);

[tip:x86/fpu] x86/fpu: Prepare copy_fpstate_to_sigframe() for TIF_NEED_FPU_LOAD

2019-04-13 Thread tip-bot for Rik van Riel
Commit-ID:  a352a3b7b7920212ee4c45a41500c66826318e92
Gitweb: https://git.kernel.org/tip/a352a3b7b7920212ee4c45a41500c66826318e92
Author: Rik van Riel 
AuthorDate: Wed, 3 Apr 2019 18:41:47 +0200
Committer:  Borislav Petkov 
CommitDate: Thu, 11 Apr 2019 18:20:04 +0200

x86/fpu: Prepare copy_fpstate_to_sigframe() for TIF_NEED_FPU_LOAD

The FPU registers need to be saved only if TIF_NEED_FPU_LOAD is not set.
Otherwise this has already been done and can be skipped.

 [ bp: Massage a bit. ]

Signed-off-by: Rik van Riel 
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Borislav Petkov 
Reviewed-by: Dave Hansen 
Reviewed-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: Jann Horn 
Cc: "Jason A. Donenfeld" 
Cc: kvm ML 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Rik van Riel 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190403164156.19645-19-bige...@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 8f23f5237218..9b9dfdc96285 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -171,7 +171,17 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user 
*buf_fx, int size)
sizeof(struct user_i387_ia32_struct), NULL,
(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
-   copy_fpregs_to_fpstate(fpu);
+   /*
+* If we do not need to load the FPU registers at return to userspace
+* then the CPU has the current state and we need to save it. Otherwise,
+* it has already been done and we can skip it.
+*/
+   fpregs_lock();
+   if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+   copy_fpregs_to_fpstate(fpu);
+   set_thread_flag(TIF_NEED_FPU_LOAD);
+   }
+   fpregs_unlock();
 
if (using_compacted_format()) {
if (copy_xstate_to_user(buf_fx, xsave, 0, size))


[tip:x86/fpu] x86/fpu: Always store the registers in copy_fpstate_to_sigframe()

2019-04-13 Thread tip-bot for Rik van Riel
Commit-ID:  69277c98f5eef0d9839699b7825923c3985f665f
Gitweb: https://git.kernel.org/tip/69277c98f5eef0d9839699b7825923c3985f665f
Author: Rik van Riel 
AuthorDate: Wed, 3 Apr 2019 18:41:46 +0200
Committer:  Borislav Petkov 
CommitDate: Thu, 11 Apr 2019 18:08:57 +0200

x86/fpu: Always store the registers in copy_fpstate_to_sigframe()

copy_fpstate_to_sigframe() stores the registers directly to user space.
This is okay because the FPU registers are valid, and storing them
directly avoids first saving them into kernel memory and then copying
them out.

However, this cannot be done anymore once the FPU registers are going
to be restored on the return to userland: the FPU registers may be
invalidated in the middle of the save operation, so the save has to be
done with preemption and softirqs (BH) disabled.

Save the FPU registers to the task's FPU struct and copy them to the
user memory later on.

Signed-off-by: Rik van Riel 
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Borislav Petkov 
Reviewed-by: Dave Hansen 
Reviewed-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: Jann Horn 
Cc: "Jason A. Donenfeld" 
Cc: kvm ML 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190403164156.19645-18-bige...@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 155f4552413e..8f23f5237218 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -144,8 +144,8 @@ static inline int copy_fpregs_to_sigframe(struct 
xregs_state __user *buf)
  * buf == buf_fx for 64-bit frames and 32-bit fsave frame.
  * buf != buf_fx for 32-bit frames with fxstate.
  *
- * Save the state directly to the user frame pointed by the aligned pointer
- * 'buf_fx'.
+ * Save the state to task's fpu->state and then copy it to the user frame
+ * pointed to by the aligned pointer 'buf_fx'.
  *
  * If this is a 32-bit frame with fxstate, put a fsave header before
  * the aligned state at 'buf_fx'.
@@ -155,6 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct 
xregs_state __user *buf)
  */
 int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 {
+   struct fpu *fpu = &current->thread.fpu;
+   struct xregs_state *xsave = &fpu->state.xsave;
struct task_struct *tsk = current;
int ia32_fxstate = (buf != buf_fx);
 
@@ -169,9 +171,16 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user 
*buf_fx, int size)
sizeof(struct user_i387_ia32_struct), NULL,
(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
-   /* Save the live registers state to the user frame directly. */
-   if (copy_fpregs_to_sigframe(buf_fx))
-   return -1;
+   copy_fpregs_to_fpstate(fpu);
+
+   if (using_compacted_format()) {
+   if (copy_xstate_to_user(buf_fx, xsave, 0, size))
+   return -1;
+   } else {
+   fpstate_sanitize_xstate(fpu);
+   if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
+   return -1;
+   }
 
/* Save the fsave header for the 32-bit frames. */
if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))


[tip:x86/fpu] x86/fpu: Eager switch PKRU state

2019-04-13 Thread tip-bot for Rik van Riel
Commit-ID:  0cecca9d03c964abbd2b7927d0670eb70db4ebf2
Gitweb: https://git.kernel.org/tip/0cecca9d03c964abbd2b7927d0670eb70db4ebf2
Author: Rik van Riel 
AuthorDate: Wed, 3 Apr 2019 18:41:44 +0200
Committer:  Borislav Petkov 
CommitDate: Thu, 11 Apr 2019 15:57:10 +0200

x86/fpu: Eager switch PKRU state

While most of a task's FPU state is only needed in user space, the
protection keys need to be in place immediately after a context switch.

The reason is that any access to userspace memory while running in
kernel mode also needs to abide by the memory permissions specified in
the protection keys.

The "eager switch" is a preparation for loading the FPU state on return
to userland. Instead of decoupling PKRU state from xstate, update PKRU
within xstate on write operations by the kernel.

For user tasks, PKRU is always read from the xsave area; this should not
change anything, because the PKRU value was loaded as part of the FPU
restore.

For kernel threads, the default "init_pkru_value" is written. Before
this commit, a kernel thread would end up with a random PKRU value
inherited from the previous user task.
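
Condensed, the OSPKE part of the new switch_fpu_finish() (full hunk in the
diff below) boils down to the following; this is a paraphrase of that hunk,
not additional code:

	/* Paraphrase of the PKRU branch added to switch_fpu_finish(). */
	static void switch_pkru_sketch(struct fpu *new_fpu)
	{
		u32 pkru_val = init_pkru_value;	/* default for kernel threads */
		struct pkru_state *pk;

		if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
			return;

		/* User task: take PKRU from the task's xstate. */
		if (current->mm) {
			pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU);
			if (pk)
				pkru_val = pk->pkru;
		}
		/* Must be valid before any copy_to_user() on behalf of the task. */
		__write_pkru(pkru_val);
	}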

 [ bigeasy: save pkru to xstate, no cache, don't use __raw_xsave_addr() ]

 [ bp: update commit message, sort headers properly in asm/fpu/xstate.h ]

Signed-off-by: Rik van Riel 
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Borislav Petkov 
Reviewed-by: Dave Hansen 
Reviewed-by: Thomas Gleixner 
Cc: Andi Kleen 
Cc: Andy Lutomirski 
Cc: Aubrey Li 
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: Jann Horn 
Cc: "Jason A. Donenfeld" 
Cc: Joerg Roedel 
Cc: Juergen Gross 
Cc: "Kirill A. Shutemov" 
Cc: kvm ML 
Cc: Michal Hocko 
Cc: Paolo Bonzini 
Cc: Peter Zijlstra 
Cc: Radim Krčmář 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190403164156.19645-16-bige...@linutronix.de
---
 arch/x86/include/asm/fpu/internal.h | 24 ++--
 arch/x86/include/asm/fpu/xstate.h   |  4 +++-
 arch/x86/include/asm/pgtable.h  |  6 ++
 arch/x86/mm/pkeys.c |  1 -
 4 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 3e0c2c496f2d..6eb4a0b1ad0e 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -534,8 +535,27 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
  */
 static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
 {
-   if (static_cpu_has(X86_FEATURE_FPU))
-   __fpregs_load_activate(new_fpu, cpu);
+   u32 pkru_val = init_pkru_value;
+   struct pkru_state *pk;
+
+   if (!static_cpu_has(X86_FEATURE_FPU))
+   return;
+
+   __fpregs_load_activate(new_fpu, cpu);
+
+   if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+   return;
+
+   /*
+* PKRU state is switched eagerly because it needs to be valid before we
+* return to userland e.g. for a copy_to_user() operation.
+*/
+   if (current->mm) {
+   pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU);
+   if (pk)
+   pkru_val = pk->pkru;
+   }
+   __write_pkru(pkru_val);
 }
 
 /*
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index fbe41f808e5d..7e42b285c856 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -2,9 +2,11 @@
 #ifndef __ASM_X86_XSAVE_H
 #define __ASM_X86_XSAVE_H
 
+#include 
 #include 
+
 #include 
-#include 
+#include 
 
 /* Bit 63 of XCR0 is reserved for future expansion */
 #define XFEATURE_MASK_EXTEND   (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e8875ca75623..9beb371b1adf 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1355,6 +1355,12 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
 #define PKRU_WD_BIT 0x2
 #define PKRU_BITS_PER_PKEY 2
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+extern u32 init_pkru_value;
+#else
+#define init_pkru_value0
+#endif
+
 static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
 {
int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 50f65fc1b9a3..2ecbf4155f98 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -126,7 +126,6 @@ int __arch_override_mprotect_pkey(struct vm_area_struct 
*vma, int prot, int pkey
  * in the process's lifetime will not accidentally get access
  * to data which is pkey-protected later on.
  */
-static
 u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
  PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
  PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |


[tip:x86/fpu] x86/fpu: Add an __fpregs_load_activate() internal helper

2019-04-13 Thread tip-bot for Rik van Riel
Commit-ID:  4ee91519e1dccc175665fe24bb20a47c6053575c
Gitweb: https://git.kernel.org/tip/4ee91519e1dccc175665fe24bb20a47c6053575c
Author: Rik van Riel 
AuthorDate: Wed, 3 Apr 2019 18:41:38 +0200
Committer:  Borislav Petkov 
CommitDate: Wed, 10 Apr 2019 16:23:14 +0200

x86/fpu: Add an __fpregs_load_activate() internal helper

Add a helper function that ensures the floating point registers for the
current task are active. Use with preemption disabled.

While at it, add fpregs_lock/unlock() helpers too, to be used in later
patches.
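
As a usage note (an assumption on my part, the patch itself only says "Use
with preemption disabled"): a caller that needs the current task's registers
live would bracket the helper with the new fpregs_lock()/fpregs_unlock() pair,
roughly:

	struct fpu *fpu = &current->thread.fpu;

	fpregs_lock();			/* at this point just preempt_disable() */
	__fpregs_load_activate(fpu, smp_processor_id());
	/* ... use the live FPU registers ... */
	fpregs_unlock();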

 [ bp: Add a comment about its intended usage. ]

Signed-off-by: Rik van Riel 
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Borislav Petkov 
Reviewed-by: Dave Hansen 
Reviewed-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Aubrey Li 
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: Jann Horn 
Cc: "Jason A. Donenfeld" 
Cc: kvm ML 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Rik van Riel 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190403164156.19645-10-bige...@linutronix.de
---
 arch/x86/include/asm/fpu/api.h  | 11 +++
 arch/x86/include/asm/fpu/internal.h | 22 ++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index b56d504af654..73e684160f35 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -10,6 +10,7 @@
 
 #ifndef _ASM_X86_FPU_API_H
 #define _ASM_X86_FPU_API_H
+#include 
 
 /*
  * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
@@ -22,6 +23,16 @@ extern void kernel_fpu_begin(void);
 extern void kernel_fpu_end(void);
 extern bool irq_fpu_usable(void);
 
+static inline void fpregs_lock(void)
+{
+   preempt_disable();
+}
+
+static inline void fpregs_unlock(void)
+{
+   preempt_enable();
+}
+
 /*
  * Query the presence of one or more xfeatures. Works on any legacy CPU as 
well.
  *
diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 54f70cae2f15..3e0c2c496f2d 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -484,6 +484,18 @@ static inline void fpregs_activate(struct fpu *fpu)
trace_x86_fpu_regs_activated(fpu);
 }
 
+/*
+ * Internal helper, do not use directly. Use switch_fpu_return() instead.
+ */
+static inline void __fpregs_load_activate(struct fpu *fpu, int cpu)
+{
+   if (!fpregs_state_valid(fpu, cpu)) {
+   if (current->mm)
+   copy_kernel_to_fpregs(&fpu->state);
+   fpregs_activate(fpu);
+   }
+}
+
 /*
  * FPU state switching for scheduling.
  *
@@ -522,14 +534,8 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
  */
 static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
 {
-   if (static_cpu_has(X86_FEATURE_FPU)) {
-   if (!fpregs_state_valid(new_fpu, cpu)) {
-   if (current->mm)
-   copy_kernel_to_fpregs(&new_fpu->state);
-   }
-
-   fpregs_activate(new_fpu);
-   }
+   if (static_cpu_has(X86_FEATURE_FPU))
+   __fpregs_load_activate(new_fpu, cpu);
 }
 
 /*


[tip:x86/mm] x86/mm/tlb: Make lazy TLB mode lazier

2018-10-09 Thread tip-bot for Rik van Riel
Commit-ID:  145f573b89a62bf53cfc0144fa9b1c56b0f70b45
Gitweb: https://git.kernel.org/tip/145f573b89a62bf53cfc0144fa9b1c56b0f70b45
Author: Rik van Riel 
AuthorDate: Tue, 25 Sep 2018 23:58:44 -0400
Committer:  Peter Zijlstra 
CommitDate: Tue, 9 Oct 2018 16:51:12 +0200

x86/mm/tlb: Make lazy TLB mode lazier

Lazy TLB mode can result in an idle CPU being woken up by a TLB flush,
when all it really needs to do is reload %CR3 at the next context switch,
assuming no page table pages got freed.

Memory ordering is used to prevent race conditions between switch_mm_irqs_off,
which checks whether .tlb_gen changed, and the TLB invalidation code, which
increments .tlb_gen whenever page table entries get invalidated.

The atomic increment in inc_mm_tlb_gen is its own barrier; the context
switch code adds an explicit barrier between reading tlbstate.is_lazy and
next->context.tlb_gen.

CPUs in lazy TLB mode remain part of the mm_cpumask(mm), both because
that allows TLB flush IPIs to be sent at page table freeing time, and
because the cache line bouncing on the mm_cpumask(mm) was responsible
for about half the CPU use in switch_mm_irqs_off().

We can change native_flush_tlb_others() without touching other
(paravirt) implementations of flush_tlb_others() because we'll be
flushing less. The existing implementations flush more and are
therefore still correct.
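
Condensed from the switch_mm_irqs_off() hunk below, the new prev == next
fast path for a CPU that was in lazy TLB mode amounts to the following
(a paraphrase of the diff, not standalone code):

	/* Lazy CPU switching back to its already-loaded mm. */
	if (!was_lazy)
		return;			/* plain thread switch, TLB untouched */

	/* Pairs with the atomic increment in inc_mm_tlb_gen(). */
	smp_mb();
	next_tlb_gen = atomic64_read(&next->context.tlb_gen);
	if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == next_tlb_gen)
		return;			/* TLB still up to date, keep using it */

	/* TLB went stale while lazy: fall through and flush this ASID. */
	new_asid = prev_asid;
	need_flush = true;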

Cc: npig...@gmail.com
Cc: mi...@kernel.org
Cc: will.dea...@arm.com
Cc: kernel-t...@fb.com
Cc: l...@kernel.org
Cc: h...@zytor.com
Tested-by: Song Liu 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Link: http://lkml.kernel.org/r/20180926035844.1420-8-r...@surriel.com
---
 arch/x86/mm/tlb.c | 67 +++
 1 file changed, 58 insertions(+), 9 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 92e46f4c058c..7d68489cfdb1 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -185,6 +185,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
 {
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+   bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
@@ -242,17 +243,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
   next->context.ctx_id);
 
/*
-* We don't currently support having a real mm loaded without
-* our cpu set in mm_cpumask().  We have all the bookkeeping
-* in place to figure out whether we would need to flush
-* if our cpu were cleared in mm_cpumask(), but we don't
-* currently use it.
+* Even in lazy TLB mode, the CPU should stay set in the
+* mm_cpumask. The TLB shootdown code can figure out from
+* from cpu_tlbstate.is_lazy whether or not to send an IPI.
 */
if (WARN_ON_ONCE(real_prev != &init_mm &&
 !cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
 
-   return;
+   /*
+* If the CPU is not in lazy TLB mode, we are just switching
+* from one thread in a process to another thread in the same
+* process. No TLB flush required.
+*/
+   if (!was_lazy)
+   return;
+
+   /*
+* Read the tlb_gen to check whether a flush is needed.
+* If the TLB is up to date, just use it.
+* The barrier synchronizes with the tlb_gen increment in
+* the TLB shootdown code.
+*/
+   smp_mb();
+   next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+   if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+   next_tlb_gen)
+   return;
+
+   /*
+* TLB contents went out of date while we were in lazy
+* mode. Fall through to the TLB switching code below.
+*/
+   new_asid = prev_asid;
+   need_flush = true;
} else {
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
@@ -346,8 +370,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 
-   load_mm_cr4(next);
-   switch_ldt(real_prev, next);
+   if (next != real_prev) {
+   load_mm_cr4(next);
+   switch_ldt(real_prev, next);
+   }
 }
 
 /*
@@ -455,6 +481,9 @@ static void flush_tlb_func_common(const struct 
flush_tlb_info *f,

[tip:x86/mm] x86/mm/tlb: Add freed_tables element to flush_tlb_info

2018-10-09 Thread tip-bot for Rik van Riel
Commit-ID:  97807813fe7074ee865d6bc1df1d0f8fb878ee9d
Gitweb: https://git.kernel.org/tip/97807813fe7074ee865d6bc1df1d0f8fb878ee9d
Author: Rik van Riel 
AuthorDate: Tue, 25 Sep 2018 23:58:43 -0400
Committer:  Peter Zijlstra 
CommitDate: Tue, 9 Oct 2018 16:51:12 +0200

x86/mm/tlb: Add freed_tables element to flush_tlb_info

Pass the information on to native_flush_tlb_others.

No functional changes.

Cc: npig...@gmail.com
Cc: mi...@kernel.org
Cc: will.dea...@arm.com
Cc: songliubrav...@fb.com
Cc: kernel-t...@fb.com
Cc: h...@zytor.com
Cc: l...@kernel.org
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Link: http://lkml.kernel.org/r/20180926035844.1420-7-r...@surriel.com
---
 arch/x86/include/asm/tlbflush.h | 1 +
 arch/x86/mm/tlb.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 1dea9860ce5b..323a313947e0 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -532,6 +532,7 @@ struct flush_tlb_info {
unsigned long   end;
u64 new_tlb_gen;
unsigned intstride_shift;
+   boolfreed_tables;
 };
 
 #define local_flush_tlb() __flush_tlb()
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 14bf39fc0447..92e46f4c058c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -617,6 +617,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long 
start,
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
.stride_shift = stride_shift,
+   .freed_tables = freed_tables,
};
 
cpu = get_cpu();


[tip:x86/mm] x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range

2018-10-09 Thread tip-bot for Rik van Riel
Commit-ID:  016c4d92cd16f569c6485ae62b076c1a4b779536
Gitweb: https://git.kernel.org/tip/016c4d92cd16f569c6485ae62b076c1a4b779536
Author: Rik van Riel 
AuthorDate: Tue, 25 Sep 2018 23:58:42 -0400
Committer:  Peter Zijlstra 
CommitDate: Tue, 9 Oct 2018 16:51:12 +0200

x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range

Add an argument to flush_tlb_mm_range to indicate whether page tables
are about to be freed after this TLB flush. This allows for an
optimization of flush_tlb_mm_range to skip CPUs in lazy TLB mode.

No functional changes.
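
The intended semantics of the new argument, paraphrasing the call sites
updated in the diff below: pass true only when page table pages are about to
be freed after the flush, false for ordinary flushes:

	/* munmap/mmu_gather path: page tables may be freed, tell lazy CPUs too. */
	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);

	/* single page flush, no page table freeing: lazy CPUs can be skipped. */
	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);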

Cc: npig...@gmail.com
Cc: mi...@kernel.org
Cc: will.dea...@arm.com
Cc: songliubrav...@fb.com
Cc: kernel-t...@fb.com
Cc: l...@kernel.org
Cc: h...@zytor.com
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Link: http://lkml.kernel.org/r/20180926035844.1420-6-r...@surriel.com
---
 arch/x86/include/asm/tlb.h  |  2 +-
 arch/x86/include/asm/tlbflush.h | 10 ++
 arch/x86/kernel/ldt.c   |  2 +-
 arch/x86/kernel/vm86_32.c   |  2 +-
 arch/x86/mm/tlb.c   |  3 ++-
 5 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index afbe7d1e68cf..404b8b1d44f5 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -20,7 +20,7 @@ static inline void tlb_flush(struct mmu_gather *tlb)
end = tlb->end;
}
 
-   flush_tlb_mm_range(tlb->mm, start, end, stride_shift);
+   flush_tlb_mm_range(tlb->mm, start, end, stride_shift, 
tlb->freed_tables);
 }
 
 /*
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d6c0cd9e9591..1dea9860ce5b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -536,22 +536,24 @@ struct flush_tlb_info {
 
 #define local_flush_tlb() __flush_tlb()
 
-#define flush_tlb_mm(mm)   flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
+#define flush_tlb_mm(mm)   \
+   flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
 
 #define flush_tlb_range(vma, start, end)   \
flush_tlb_mm_range((vma)->vm_mm, start, end,\
   ((vma)->vm_flags & VM_HUGETLB)   \
? huge_page_shift(hstate_vma(vma))  \
-   : PAGE_SHIFT)
+   : PAGE_SHIFT, false)
 
 extern void flush_tlb_all(void);
 extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-   unsigned long end, unsigned int stride_shift);
+   unsigned long end, unsigned int stride_shift,
+   bool freed_tables);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 {
-   flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT);
+   flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
 }
 
 void native_flush_tlb_others(const struct cpumask *cpumask,
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 7fdb2414ca65..ab18e0884dc6 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct 
*ldt, int slot)
map_ldt_struct_to_user(mm);
 
va = (unsigned long)ldt_slot_va(slot);
-   flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT);
+   flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
 
ldt->slot = slot;
return 0;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 52fed70f671e..c2fd39752da8 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_unmap_unlock(pte, ptl);
 out:
up_write(&mm->mmap_sem);
-   flush_tlb_mm_range(mm, 0xA, 0xA + 32*PAGE_SIZE, PAGE_SHIFT);
+   flush_tlb_mm_range(mm, 0xA, 0xA + 32*PAGE_SIZE, PAGE_SHIFT, 
false);
 }
 
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9fb30d27854b..14bf39fc0447 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -609,7 +609,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-   unsigned long end, unsigned int stride_shift)
+   unsigned long end, unsigned int stride_shift,
+   bool freed_tables)
 {
int cpu;
 


[tip:x86/mm] smp,cpumask: introduce on_each_cpu_cond_mask

2018-10-09 Thread tip-bot for Rik van Riel
Commit-ID:  7d49b28a80b830c3ca876d33bedc58d62a78e16f
Gitweb: https://git.kernel.org/tip/7d49b28a80b830c3ca876d33bedc58d62a78e16f
Author: Rik van Riel 
AuthorDate: Tue, 25 Sep 2018 23:58:41 -0400
Committer:  Peter Zijlstra 
CommitDate: Tue, 9 Oct 2018 16:51:11 +0200

smp,cpumask: introduce on_each_cpu_cond_mask

Introduce a variant of on_each_cpu_cond that iterates only over the
CPUs in a cpumask, in order to avoid making callbacks for every single
CPU in the system when we only need to test a subset.
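
A usage sketch (the callback names here are made up for illustration; the
signature matches the declaration added below): func is invoked, via IPI, only
on those CPUs in mask for which cond_func returns true:

	static bool cpu_needs_work(int cpu, void *info)
	{
		/* illustrative predicate; real callers test per-CPU state here */
		return true;
	}

	static void do_work(void *info)
	{
		/* illustrative IPI callback */
	}

	static void kick_some_cpus(const struct cpumask *mask)
	{
		/* CPUs outside 'mask' are never even passed to cond_func(). */
		on_each_cpu_cond_mask(cpu_needs_work, do_work, NULL, true,
				      GFP_KERNEL, mask);
	}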

Cc: npig...@gmail.com
Cc: mi...@kernel.org
Cc: will.dea...@arm.com
Cc: songliubrav...@fb.com
Cc: kernel-t...@fb.com
Cc: h...@zytor.com
Cc: l...@kernel.org
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Link: http://lkml.kernel.org/r/20180926035844.1420-5-r...@surriel.com
---
 include/linux/smp.h |  4 
 kernel/smp.c| 17 +
 kernel/up.c | 14 +++---
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9fb239e12b82..a56f08ff3097 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags);
 
+void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
+   smp_call_func_t func, void *info, bool wait,
+   gfp_t gfp_flags, const struct cpumask *mask);
+
 int smp_call_function_single_async(int cpu, call_single_data_t *csd);
 
 #ifdef CONFIG_SMP
diff --git a/kernel/smp.c b/kernel/smp.c
index a7d4f9f50a49..163c451af42e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
  * You must not call this function with disabled interrupts or
  * from a hardware interrupt handler or from a bottom half handler.
  */
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
-   gfp_t gfp_flags)
+   gfp_t gfp_flags, const struct cpumask *mask)
 {
cpumask_var_t cpus;
int cpu, ret;
@@ -680,7 +680,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void 
*info),
 
if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN {
preempt_disable();
-   for_each_online_cpu(cpu)
+   for_each_cpu(cpu, mask)
if (cond_func(cpu, info))
__cpumask_set_cpu(cpu, cpus);
on_each_cpu_mask(cpus, func, info, wait);
@@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void 
*info),
 * just have to IPI them one by one.
 */
preempt_disable();
-   for_each_online_cpu(cpu)
+   for_each_cpu(cpu, mask)
if (cond_func(cpu, info)) {
ret = smp_call_function_single(cpu, func,
info, wait);
@@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void 
*info),
preempt_enable();
}
 }
+EXPORT_SYMBOL(on_each_cpu_cond_mask);
+
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+   smp_call_func_t func, void *info, bool wait,
+   gfp_t gfp_flags)
+{
+   on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
+   cpu_online_mask);
+}
 EXPORT_SYMBOL(on_each_cpu_cond);
 
 static void do_nothing(void *unused)
diff --git a/kernel/up.c b/kernel/up.c
index 42c46bf3e0a5..ff536f9cc8a2 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
  * Preemption is disabled here to make sure the cond_func is called under the
  * same condtions in UP and SMP.
  */
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags)
+void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
+  smp_call_func_t func, void *info, bool wait,
+  gfp_t gfp_flags, const struct cpumask *mask)
 {
unsigned long flags;
 
@@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
}
preempt_enable();
 }
+EXPORT_SYMBOL(on_each_cpu_cond_mask);
+
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+ smp_call_func_t func, void *info, bool wait,
+ gfp_t gfp_flags)
+{
+   on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
+}
 EXPORT_SYMBOL(on_each_cpu_cond);
 
 int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool 
phys)


[tip:x86/mm] smp: use __cpumask_set_cpu in on_each_cpu_cond

2018-10-09 Thread tip-bot for Rik van Riel
Commit-ID:  c3f7f2c7eba1a53d2e5ffbc2dcc9a20c5f094890
Gitweb: https://git.kernel.org/tip/c3f7f2c7eba1a53d2e5ffbc2dcc9a20c5f094890
Author: Rik van Riel 
AuthorDate: Tue, 25 Sep 2018 23:58:40 -0400
Committer:  Peter Zijlstra 
CommitDate: Tue, 9 Oct 2018 16:51:11 +0200

smp: use __cpumask_set_cpu in on_each_cpu_cond

The code in on_each_cpu_cond sets CPUs in a locally allocated bitmask,
which should never be used by other CPUs simultaneously. There is no
need to use locked memory accesses to set the bits in this bitmap.

Switch to __cpumask_set_cpu.
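
The distinction in short: cpumask_set_cpu() uses an atomic (locked) set_bit(),
which is only needed when other CPUs may modify the same mask concurrently;
for a freshly allocated, CPU-local mask the non-atomic variant suffices.
Sketch of the pattern, mirroring the on_each_cpu_cond() code changed below:

	cpumask_var_t cpus;
	int cpu;

	if (zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
		for_each_online_cpu(cpu)
			if (cond_func(cpu, info))
				__cpumask_set_cpu(cpu, cpus);	/* no lock prefix needed */
		on_each_cpu_mask(cpus, func, info, wait);
		free_cpumask_var(cpus);
	}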

Cc: npig...@gmail.com
Cc: mi...@kernel.org
Cc: will.dea...@arm.com
Cc: songliubrav...@fb.com
Cc: kernel-t...@fb.com
Cc: h...@zytor.com
Suggested-by: Peter Zijlstra 
Signed-off-by: Rik van Riel 
Reviewed-by: Andy Lutomirski 
Signed-off-by: Peter Zijlstra (Intel) 
Link: http://lkml.kernel.org/r/20180926035844.1420-4-r...@surriel.com
---
 kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index d86eec5f51c1..a7d4f9f50a49 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -682,7 +682,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void 
*info),
preempt_disable();
for_each_online_cpu(cpu)
if (cond_func(cpu, info))
-   cpumask_set_cpu(cpu, cpus);
+   __cpumask_set_cpu(cpu, cpus);
on_each_cpu_mask(cpus, func, info, wait);
preempt_enable();
free_cpumask_var(cpus);


[tip:x86/mm] x86/mm/tlb: Restructure switch_mm_irqs_off()

2018-10-09 Thread tip-bot for Rik van Riel
Commit-ID:  12c4d978fd170ccdd7260ec11f93b11e46904228
Gitweb: https://git.kernel.org/tip/12c4d978fd170ccdd7260ec11f93b11e46904228
Author: Rik van Riel 
AuthorDate: Tue, 25 Sep 2018 23:58:39 -0400
Committer:  Peter Zijlstra 
CommitDate: Tue, 9 Oct 2018 16:51:11 +0200

x86/mm/tlb: Restructure switch_mm_irqs_off()

Move some code that will be needed for the lazy -> !lazy state
transition when a lazy TLB CPU has gotten out of date.

No functional changes, since the if (real_prev == next) branch
always returns.

(cherry picked from commit 61d0beb5796ab11f7f3bf38cb2eccc6579aaa70b)
Cc: npig...@gmail.com
Cc: efa...@gmx.de
Cc: will.dea...@arm.com
Cc: Linus Torvalds 
Cc: Thomas Gleixner 
Cc: songliubrav...@fb.com
Cc: kernel-t...@fb.com
Cc: h...@zytor.com
Suggested-by: Andy Lutomirski 
Signed-off-by: Rik van Riel 
Acked-by: Dave Hansen 
Signed-off-by: Ingo Molnar 
Signed-off-by: Peter Zijlstra (Intel) 
Link: http://lkml.kernel.org/r/20180716190337.26133-4-r...@surriel.com
---
 arch/x86/mm/tlb.c | 66 +++
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 54a5870190a6..9fb30d27854b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -187,6 +187,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
+   bool need_flush;
+   u16 new_asid;
 
/*
 * NB: The scheduler will call us with prev == next when switching
@@ -252,8 +254,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
 
return;
} else {
-   u16 new_asid;
-   bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
/*
@@ -308,44 +308,44 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
/* Let nmi_uaccess_okay() know that we're changing CR3. */
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
+   }
 
-   if (need_flush) {
-   this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, 
next->context.ctx_id);
-   this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, 
next_tlb_gen);
-   load_new_mm_cr3(next->pgd, new_asid, true);
-
-   /*
-* NB: This gets called via leave_mm() in the idle path
-* where RCU functions differently.  Tracing normally
-* uses RCU, so we need to use the _rcuidle variant.
-*
-* (There is no good reason for this.  The idle code 
should
-*  be rearranged to call this before rcu_idle_enter().)
-*/
-   trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 
TLB_FLUSH_ALL);
-   } else {
-   /* The new ASID is already up to date. */
-   load_new_mm_cr3(next->pgd, new_asid, false);
-
-   /* See above wrt _rcuidle. */
-   trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-   }
+   if (need_flush) {
+   this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, 
next->context.ctx_id);
+   this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, 
next_tlb_gen);
+   load_new_mm_cr3(next->pgd, new_asid, true);
 
/*
-* Record last user mm's context id, so we can avoid
-* flushing branch buffer with IBPB if we switch back
-* to the same user.
+* NB: This gets called via leave_mm() in the idle path
+* where RCU functions differently.  Tracing normally
+* uses RCU, so we need to use the _rcuidle variant.
+*
+* (There is no good reason for this.  The idle code should
+*  be rearranged to call this before rcu_idle_enter().)
 */
-   if (next != &init_mm)
-   this_cpu_write(cpu_tlbstate.last_ctx_id, 
next->context.ctx_id);
-
-   /* Make sure we write CR3 before loaded_mm. */
-   barrier();
+   trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 
TLB_FLUSH_ALL);
+   } else {
+   /* The new ASID is already up to date. */
+   load_new_mm_cr3(next->pgd, new_asid, false);
 
-   this_cpu_write(cpu_tlbstate.loaded_mm, next);
-   this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+   /* See above wrt _rcuidle. */
+   trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
 
+   /*
+* Record last user mm's context id, 

[tip:x86/mm] x86/mm/tlb: Always use lazy TLB mode

2018-10-09 Thread tip-bot for Rik van Riel
Commit-ID:  5462bc3a9a3c38328bbbd276d51164c7cf21d6a8
Gitweb: https://git.kernel.org/tip/5462bc3a9a3c38328bbbd276d51164c7cf21d6a8
Author: Rik van Riel 
AuthorDate: Tue, 25 Sep 2018 23:58:38 -0400
Committer:  Peter Zijlstra 
CommitDate: Tue, 9 Oct 2018 16:51:11 +0200

x86/mm/tlb: Always use lazy TLB mode

On most workloads, the number of context switches far exceeds the
number of TLB flushes sent. Optimizing the context switches, by always
using lazy TLB mode, speeds up those workloads.

This patch results in about a 1% reduction in CPU use on a two socket
Broadwell system running a memcache like workload.

Cc: npig...@gmail.com
Cc: efa...@gmx.de
Cc: will.dea...@arm.com
Cc: Linus Torvalds 
Cc: Thomas Gleixner 
Cc: kernel-t...@fb.com
Cc: h...@zytor.com
Cc: l...@kernel.org
Tested-by: Song Liu 
Signed-off-by: Rik van Riel 
(cherry picked from commit 95b0e6357d3e4e05349668940d7ff8f3b7e7e11e)
Acked-by: Dave Hansen 
Signed-off-by: Ingo Molnar 
Signed-off-by: Peter Zijlstra (Intel) 
Link: http://lkml.kernel.org/r/20180716190337.26133-7-r...@surriel.com
---
 arch/x86/include/asm/tlbflush.h | 16 
 arch/x86/mm/tlb.c   | 15 +--
 2 files changed, 1 insertion(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 671f65309ce7..d6c0cd9e9591 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, 
u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
-   /*
-* If we have PCID, then switching to init_mm is reasonably
-* fast.  If we don't have PCID, then switching to init_mm is
-* quite slow, so we try to defer it in the hopes that we can
-* avoid it entirely.  The latter approach runs the risk of
-* receiving otherwise unnecessary IPIs.
-*
-* This choice is just a heuristic.  The tlb code can handle this
-* function returning true or false regardless of whether we have
-* PCID.
-*/
-   return !static_cpu_has(X86_FEATURE_PCID);
-}
-
 struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6aa195796dec..54a5870190a6 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -368,20 +368,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct 
task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
 
-   if (tlb_defer_switch_to_init_mm()) {
-   /*
-* There's a significant optimization that may be possible
-* here.  We have accurate enough TLB flush tracking that we
-* don't need to maintain coherence of TLB per se when we're
-* lazy.  We do, however, need to maintain coherence of
-* paging-structure caches.  We could, in principle, leave our
-* old mm loaded and only switch to init_mm when
-* tlb_remove_page() happens.
-*/
-   this_cpu_write(cpu_tlbstate.is_lazy, true);
-   } else {
-   switch_mm(NULL, &init_mm, NULL);
-   }
+   this_cpu_write(cpu_tlbstate.is_lazy, true);
 }
 
 /*


[tip:x86/mm] x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off()

2018-07-17 Thread tip-bot for Rik van Riel
Commit-ID:  e9d8c61557687b7126101e9550bdf243223f0d8f
Gitweb: https://git.kernel.org/tip/e9d8c61557687b7126101e9550bdf243223f0d8f
Author: Rik van Riel 
AuthorDate: Mon, 16 Jul 2018 15:03:37 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 17 Jul 2018 09:35:34 +0200

x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off()

Song Liu noticed switch_mm_irqs_off() taking a lot of CPU time in recent
kernels, using 1.8% of a 48 CPU system during a netperf to localhost run.
Digging into the profile, we noticed that cpumask_clear_cpu and
cpumask_set_cpu together take about half of the CPU time taken by
switch_mm_irqs_off().

However, the CPUs running netperf end up switching back and forth
between netperf and the idle task, which does not require changes
to the mm_cpumask. Furthermore, the init_mm cpumask ends up being
the most heavily contended one in the system.

Simply skipping changes to mm_cpumask(&init_mm) reduces overhead.

Reported-and-tested-by: Song Liu 
Signed-off-by: Rik van Riel 
Acked-by: Dave Hansen 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: efa...@gmx.de
Cc: kernel-t...@fb.com
Cc: l...@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-8-r...@surriel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/tlb.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 493559cae2d5..f086195f644c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -310,15 +310,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
sync_current_stack_to_mm(next);
}
 
-   /* Stop remote flushes for the previous mm */
-   VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-   real_prev != &init_mm);
-   cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+   /*
+* Stop remote flushes for the previous mm.
+* Skip kernel threads; we never send init_mm TLB flushing IPIs,
+* but the bitmap manipulation can cause cache line contention.
+*/
+   if (real_prev != &init_mm) {
+   VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+   mm_cpumask(real_prev)));
+   cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+   }
 
/*
 * Start remote flushes and then read tlb_gen.
 */
-   cpumask_set_cpu(cpu, mm_cpumask(next));
+   if (next != &init_mm)
+   cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);


[tip:x86/mm] x86/mm/tlb: Always use lazy TLB mode

2018-07-17 Thread tip-bot for Rik van Riel
Commit-ID:  95b0e6357d3e4e05349668940d7ff8f3b7e7e11e
Gitweb: https://git.kernel.org/tip/95b0e6357d3e4e05349668940d7ff8f3b7e7e11e
Author: Rik van Riel 
AuthorDate: Mon, 16 Jul 2018 15:03:36 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 17 Jul 2018 09:35:34 +0200

x86/mm/tlb: Always use lazy TLB mode

Now that CPUs in lazy TLB mode no longer receive TLB shootdown IPIs, except
at page table freeing time, and idle CPUs will no longer get shootdown IPIs
for things like mprotect and madvise, we can always use lazy TLB mode.

Tested-by: Song Liu 
Signed-off-by: Rik van Riel 
Acked-by: Dave Hansen 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: efa...@gmx.de
Cc: kernel-t...@fb.com
Cc: l...@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-7-r...@surriel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/tlbflush.h | 16 
 arch/x86/mm/tlb.c   | 15 +--
 2 files changed, 1 insertion(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 3aa3204b5dc0..511bf5fae8b8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, 
u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
-   /*
-* If we have PCID, then switching to init_mm is reasonably
-* fast.  If we don't have PCID, then switching to init_mm is
-* quite slow, so we try to defer it in the hopes that we can
-* avoid it entirely.  The latter approach runs the risk of
-* receiving otherwise unnecessary IPIs.
-*
-* This choice is just a heuristic.  The tlb code can handle this
-* function returning true or false regardless of whether we have
-* PCID.
-*/
-   return !static_cpu_has(X86_FEATURE_PCID);
-}
-
 struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e4156e37aa71..493559cae2d5 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -379,20 +379,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct 
task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
 
-   if (tlb_defer_switch_to_init_mm()) {
-   /*
-* There's a significant optimization that may be possible
-* here.  We have accurate enough TLB flush tracking that we
-* don't need to maintain coherence of TLB per se when we're
-* lazy.  We do, however, need to maintain coherence of
-* paging-structure caches.  We could, in principle, leave our
-* old mm loaded and only switch to init_mm when
-* tlb_remove_page() happens.
-*/
-   this_cpu_write(cpu_tlbstate.is_lazy, true);
-   } else {
-   switch_mm(NULL, &init_mm, NULL);
-   }
+   this_cpu_write(cpu_tlbstate.is_lazy, true);
 }
 
 /*


[tip:x86/mm] x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs

2018-07-17 Thread tip-bot for Rik van Riel
Commit-ID:  64482aafe55fc7e84d0741c356f8176ee7bde357
Gitweb: https://git.kernel.org/tip/64482aafe55fc7e84d0741c356f8176ee7bde357
Author: Rik van Riel 
AuthorDate: Mon, 16 Jul 2018 15:03:35 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 17 Jul 2018 09:35:33 +0200

x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs

CPUs in !is_lazy have either received TLB flush IPIs earlier on during
the munmap (when the user memory was unmapped), or have context switched
and reloaded during that stage of the munmap.

Page table free TLB flushes only need to be sent to CPUs in lazy TLB
mode, whose TLB contents might not be up to date yet.

Tested-by: Song Liu 
Signed-off-by: Rik van Riel 
Acked-by: Dave Hansen 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: efa...@gmx.de
Cc: kernel-t...@fb.com
Cc: l...@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-6-r...@surriel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/tlb.c | 43 +++
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 26542cc17043..e4156e37aa71 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -712,15 +712,50 @@ void tlb_flush_remove_tables_local(void *arg)
}
 }
 
+static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
+ struct cpumask *lazy_cpus)
+{
+   int cpu;
+
+   for_each_cpu(cpu, mm_cpumask(mm)) {
+   if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
+   cpumask_set_cpu(cpu, lazy_cpus);
+   }
+}
+
 void tlb_flush_remove_tables(struct mm_struct *mm)
 {
int cpu = get_cpu();
+   cpumask_var_t lazy_cpus;
+
+   if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
+   put_cpu();
+   return;
+   }
+
+   if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
+   /*
+* If the cpumask allocation fails, do a brute force flush
+* on all the CPUs that have this mm loaded.
+*/
+   smp_call_function_many(mm_cpumask(mm),
+   tlb_flush_remove_tables_local, (void *)mm, 1);
+   put_cpu();
+   return;
+   }
+
/*
-* XXX: this really only needs to be called for CPUs in lazy TLB mode.
+* CPUs with !is_lazy either received a TLB flush IPI while the user
+* pages in this address range were unmapped, or have context switched
+* and reloaded %CR3 since then.
+*
+* Shootdown IPIs at page table freeing time only need to be sent to
+* CPUs that may have out of date TLB contents.
 */
-   if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
-   smp_call_function_many(mm_cpumask(mm), 
tlb_flush_remove_tables_local, (void *)mm, 1);
-
+   mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
+   smp_call_function_many(lazy_cpus,
+   tlb_flush_remove_tables_local, (void *)mm, 1);
+   free_cpumask_var(lazy_cpus);
put_cpu();
 }
 


[tip:x86/mm] x86/mm/tlb: Make lazy TLB mode lazier

2018-07-17 Thread tip-bot for Rik van Riel
Commit-ID:  ac0315896970d8589291e9d8a1569fc65967b7f1
Gitweb: https://git.kernel.org/tip/ac0315896970d8589291e9d8a1569fc65967b7f1
Author: Rik van Riel 
AuthorDate: Mon, 16 Jul 2018 15:03:34 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 17 Jul 2018 09:35:33 +0200

x86/mm/tlb: Make lazy TLB mode lazier

Lazy TLB mode can result in an idle CPU being woken up by a TLB flush,
when all it really needs to do is reload %CR3 at the next context switch,
assuming no page table pages got freed.

Memory ordering is used to prevent race conditions between switch_mm_irqs_off,
which checks whether .tlb_gen changed, and the TLB invalidation code, which
increments .tlb_gen whenever page table entries get invalidated.

The atomic increment in inc_mm_tlb_gen is its own barrier; the context
switch code adds an explicit barrier between reading tlbstate.is_lazy and
next->context.tlb_gen.

Unlike the 2016 version of this patch, CPUs with cpu_tlbstate.is_lazy set
are not removed from the mm_cpumask(mm), since that would prevent the TLB
flush IPIs at page table free time from being sent to all the CPUs
that need them.

This patch reduces total CPU use in the system by about 1-2% for a
memcache workload on two socket systems, and by about 1% for a heavily
multi-process netperf between two systems.

Tested-by: Song Liu 
Signed-off-by: Rik van Riel 
Acked-by: Dave Hansen 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: efa...@gmx.de
Cc: kernel-t...@fb.com
Cc: l...@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-5-r...@surriel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/tlb.c | 68 +++
 1 file changed, 59 insertions(+), 9 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4b73fe835c95..26542cc17043 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -185,6 +186,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
 {
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+   bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
@@ -242,17 +244,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
   next->context.ctx_id);
 
/*
-* We don't currently support having a real mm loaded without
-* our cpu set in mm_cpumask().  We have all the bookkeeping
-* in place to figure out whether we would need to flush
-* if our cpu were cleared in mm_cpumask(), but we don't
-* currently use it.
+* Even in lazy TLB mode, the CPU should stay set in the
+* mm_cpumask. The TLB shootdown code can figure out from
+* from cpu_tlbstate.is_lazy whether or not to send an IPI.
 */
if (WARN_ON_ONCE(real_prev != &init_mm &&
 !cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
 
-   return;
+   /*
+* If the CPU is not in lazy TLB mode, we are just switching
+* from one thread in a process to another thread in the same
+* process. No TLB flush required.
+*/
+   if (!was_lazy)
+   return;
+
+   /*
+* Read the tlb_gen to check whether a flush is needed.
+* If the TLB is up to date, just use it.
+* The barrier synchronizes with the tlb_gen increment in
+* the TLB shootdown code.
+*/
+   smp_mb();
+   next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+   if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+   next_tlb_gen)
+   return;
+
+   /*
+* TLB contents went out of date while we were in lazy
+* mode. Fall through to the TLB switching code below.
+*/
+   new_asid = prev_asid;
+   need_flush = true;
} else {
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
@@ -454,6 +479,9 @@ static void flush_tlb_func_common(const struct 
flush_tlb_info *f,
 * paging-structure cache to avoid speculatively reading
 * garbage into our TLB.  Since switching to init_mm is barely
 * slower than a minimal flush, just switch to init_mm.
+*
+* This should be rare, with native_flush_tlb_others skipping
+* IPIs to lazy TLB mode CPUs.
 */

[tip:x86/mm] x86/mm/tlb: Restructure switch_mm_irqs_off()

2018-07-17 Thread tip-bot for Rik van Riel
Commit-ID:  61d0beb5796ab11f7f3bf38cb2eccc6579aaa70b
Gitweb: https://git.kernel.org/tip/61d0beb5796ab11f7f3bf38cb2eccc6579aaa70b
Author: Rik van Riel 
AuthorDate: Mon, 16 Jul 2018 15:03:33 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 17 Jul 2018 09:35:32 +0200

x86/mm/tlb: Restructure switch_mm_irqs_off()

Move some code that will be needed for the lazy -> !lazy state
transition when a lazy TLB CPU has gotten out of date.

No functional changes, since the if (real_prev == next) branch
always returns.

Suggested-by: Andy Lutomirski 
Signed-off-by: Rik van Riel 
Acked-by: Dave Hansen 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: efa...@gmx.de
Cc: kernel-t...@fb.com
Link: http://lkml.kernel.org/r/20180716190337.26133-4-r...@surriel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/tlb.c | 60 +++
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9a893673c56b..4b73fe835c95 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -187,6 +187,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
+   bool need_flush;
+   u16 new_asid;
 
/*
 * NB: The scheduler will call us with prev == next when switching
@@ -252,8 +254,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
 
return;
} else {
-   u16 new_asid;
-   bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
/*
@@ -297,41 +297,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+   }
 
-   if (need_flush) {
-   this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, 
next->context.ctx_id);
-   this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, 
next_tlb_gen);
-   load_new_mm_cr3(next->pgd, new_asid, true);
-
-   /*
-* NB: This gets called via leave_mm() in the idle path
-* where RCU functions differently.  Tracing normally
-* uses RCU, so we need to use the _rcuidle variant.
-*
-* (There is no good reason for this.  The idle code 
should
-*  be rearranged to call this before rcu_idle_enter().)
-*/
-   trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 
TLB_FLUSH_ALL);
-   } else {
-   /* The new ASID is already up to date. */
-   load_new_mm_cr3(next->pgd, new_asid, false);
-
-   /* See above wrt _rcuidle. */
-   trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-   }
+   if (need_flush) {
+   this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, 
next->context.ctx_id);
+   this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, 
next_tlb_gen);
+   load_new_mm_cr3(next->pgd, new_asid, true);
 
/*
-* Record last user mm's context id, so we can avoid
-* flushing branch buffer with IBPB if we switch back
-* to the same user.
+* NB: This gets called via leave_mm() in the idle path
+* where RCU functions differently.  Tracing normally
+* uses RCU, so we need to use the _rcuidle variant.
+*
+* (There is no good reason for this.  The idle code should
+*  be rearranged to call this before rcu_idle_enter().)
 */
-   if (next != &init_mm)
-   this_cpu_write(cpu_tlbstate.last_ctx_id, 
next->context.ctx_id);
+   trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 
TLB_FLUSH_ALL);
+   } else {
+   /* The new ASID is already up to date. */
+   load_new_mm_cr3(next->pgd, new_asid, false);
 
-   this_cpu_write(cpu_tlbstate.loaded_mm, next);
-   this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+   /* See above wrt _rcuidle. */
+   trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
 
+   /*
+* Record last user mm's context id, so we can avoid
+* flushing branch buffer with IBPB if we switch back
+* to the same user.
+*/
+   if (next != &init_mm)
+   this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+   this_cpu_write(cpu_tlbstate.loaded_mm, next);
+   this_cpu_w

[tip:x86/mm] x86/mm/tlb: Leave lazy TLB mode at page table free time

2018-07-17 Thread tip-bot for Rik van Riel
Commit-ID:  2ff6ddf19c0ec40633bd14d8fe28a289816bd98d
Gitweb: https://git.kernel.org/tip/2ff6ddf19c0ec40633bd14d8fe28a289816bd98d
Author: Rik van Riel 
AuthorDate: Mon, 16 Jul 2018 15:03:32 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 17 Jul 2018 09:35:31 +0200

x86/mm/tlb: Leave lazy TLB mode at page table free time

Andy discovered that speculative memory accesses while in lazy
TLB mode can crash a system, when a CPU speculatively dereferences
memory contents that used to be valid page table memory, but have
since been reused for something else and now point into la-la land.

This problem can be prevented in two ways. The first is to
always send a TLB shootdown IPI to CPUs in lazy TLB mode, while
the second one is to only send the TLB shootdown at page table
freeing time.

The second should result in fewer IPIs, since operations like
mprotect and madvise are very common with some workloads, but
do not involve page table freeing. Also, on munmap, batching
of page table freeing covers much larger ranges of virtual
memory than the batching of unmapped user pages.
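
As an illustration only (not part of this patch), a minimal sketch of
how a page table freeing path is expected to use the new hook; the
helper name example_free_page_tables() is made up, and the real caller
is the mmu_gather code in mm/memory.c shown at the end of this patch:

/*
 * Sketch only: flush lazy TLB CPUs once, when page table pages are
 * about to be freed, rather than on every mprotect()/madvise().
 */
static void example_free_page_tables(struct mmu_gather *tlb)
{
        /*
         * Kick CPUs that are lazily using tlb->mm over to init_mm, so
         * they cannot keep speculating through page tables that are
         * about to be handed back to the page allocator.
         */
        tlb_flush_remove_tables(tlb->mm);

        /* ... actually free the batched page table pages here ... */
}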

Tested-by: Song Liu 
Signed-off-by: Rik van Riel 
Acked-by: Dave Hansen 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: efa...@gmx.de
Cc: kernel-t...@fb.com
Cc: l...@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-3-r...@surriel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/tlbflush.h |  5 +
 arch/x86/mm/tlb.c   | 27 +++
 include/asm-generic/tlb.h   | 10 ++
 mm/memory.c | 22 ++
 4 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6690cd3fc8b1..3aa3204b5dc0 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -554,4 +554,9 @@ extern void arch_tlbbatch_flush(struct 
arch_tlbflush_unmap_batch *batch);
native_flush_tlb_others(mask, info)
 #endif
 
+extern void tlb_flush_remove_tables(struct mm_struct *mm);
+extern void tlb_flush_remove_tables_local(void *arg);
+
+#define HAVE_TLB_FLUSH_REMOVE_TABLES
+
 #endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb1f34c3c85..9a893673c56b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -646,6 +646,33 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned 
long start,
put_cpu();
 }
 
+void tlb_flush_remove_tables_local(void *arg)
+{
+   struct mm_struct *mm = arg;
+
+   if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
+   this_cpu_read(cpu_tlbstate.is_lazy)) {
+   /*
+* We're in lazy mode.  We need to at least flush our
+* paging-structure cache to avoid speculatively reading
+* garbage into our TLB.  Since switching to init_mm is barely
+* slower than a minimal flush, just switch to init_mm.
+*/
+   switch_mm_irqs_off(NULL, &init_mm, NULL);
+   }
+}
+
+void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+   int cpu = get_cpu();
+   /*
+* XXX: this really only needs to be called for CPUs in lazy TLB mode.
+*/
+   if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
+   smp_call_function_many(mm_cpumask(mm), 
tlb_flush_remove_tables_local, (void *)mm, 1);
+
+   put_cpu();
+}
 
 static void do_flush_tlb_all(void *info)
 {
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 3063125197ad..e811ef7b8350 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -303,4 +303,14 @@ static inline void 
tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 
 #define tlb_migrate_finish(mm) do {} while (0)
 
+/*
+ * Used to flush the TLB when page tables are removed, when lazy
+ * TLB mode may cause a CPU to retain intermediate translations
+ * pointing to about-to-be-freed page table memory.
+ */
+#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES
+#define tlb_flush_remove_tables(mm) do {} while (0)
+#define tlb_flush_remove_tables_local(mm) do {} while (0)
+#endif
+
 #endif /* _ASM_GENERIC__TLB_H */
diff --git a/mm/memory.c b/mm/memory.c
index 7206a634270b..18355e0b971a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, 
struct page *page, int page_
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 
-/*
- * See the comment near struct mmu_table_batch.
- */
-
 static void tlb_remove_table_smp_sync(void *arg)
 {
-   /* Simply deliver the interrupt */
+   struct mm_struct __maybe_unused *mm = arg;
+   /*
+* On most architectures this does nothing. Simply delivering the
+* interrupt is enough to prevent races with software page table
+* walking like that done in get_user_pages_fast.
+*
+* See the comment near struct mmu_table_batch.
+*/
+   tlb_flush_re

[tip:x86/mm] mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids

2018-07-17 Thread tip-bot for Rik van Riel
Commit-ID:  c1a2f7f0c06454387c2cd7b93ff1491c715a8c69
Gitweb: https://git.kernel.org/tip/c1a2f7f0c06454387c2cd7b93ff1491c715a8c69
Author: Rik van Riel 
AuthorDate: Mon, 16 Jul 2018 15:03:31 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 17 Jul 2018 09:35:30 +0200

mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids

The mm_struct always contains a cpumask bitmap, regardless of
CONFIG_CPUMASK_OFFSTACK. That means the first step can be to
simplify things by always having one bitmap at the end of the
mm_struct for the mm_cpumask.

This does necessitate moving everything else in mm_struct into
an anonymous sub-structure, which can be randomized when struct
randomization is enabled.

The second step is to determine the correct size for the
mm_struct slab object from the size of the mm_struct
(excluding the CPU bitmap) and the size of the cpumask.

For init_mm we can simply allocate the maximum size this
kernel is compiled for, since we only have one init_mm
in the system, anyway.

Pointer magic by Mike Galbraith, to evade -Wstringop-overflow
getting confused by the dynamically sized array.
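
For illustration (not from the patch), a rough sketch of the size
computation described above, assuming the bitmap is the trailing
cpu_bitmap[] member set up in the hunks below; the helper name is
made up:

/*
 * Sketch only: everything before the CPU bitmap, plus a bitmap
 * sized for nr_cpu_ids.
 */
static unsigned long example_mm_struct_size(void)
{
        return offsetof(struct mm_struct, cpu_bitmap) + cpumask_size();
}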

Tested-by: Song Liu 
Signed-off-by: Rik van Riel 
Signed-off-by: Mike Galbraith 
Signed-off-by: Rik van Riel 
Acked-by: Dave Hansen 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: kernel-t...@fb.com
Cc: l...@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-2-r...@surriel.com
Signed-off-by: Ingo Molnar 
---
 drivers/firmware/efi/efi.c |   1 +
 include/linux/mm_types.h   | 241 +++--
 kernel/fork.c  |  15 +--
 mm/init-mm.c   |  11 +++
 4 files changed, 145 insertions(+), 123 deletions(-)

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 232f4915223b..7f0b19410a95 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -82,6 +82,7 @@ struct mm_struct efi_mm = {
.mmap_sem   = __RWSEM_INITIALIZER(efi_mm.mmap_sem),
.page_table_lock= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(efi_mm.mmlist),
+   .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
 };
 
 static bool disable_runtime;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 99ce070e7dcb..efdc24dd9e97 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,176 +335,183 @@ struct core_state {
 
 struct kioctx_table;
 struct mm_struct {
-   struct vm_area_struct *mmap;/* list of VMAs */
-   struct rb_root mm_rb;
-   u32 vmacache_seqnum;   /* per-thread vmacache */
+   struct {
+   struct vm_area_struct *mmap;/* list of VMAs */
+   struct rb_root mm_rb;
+   u32 vmacache_seqnum;   /* per-thread vmacache */
 #ifdef CONFIG_MMU
-   unsigned long (*get_unmapped_area) (struct file *filp,
+   unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
 #endif
-   unsigned long mmap_base;/* base of mmap area */
-   unsigned long mmap_legacy_base; /* base of mmap area in 
bottom-up allocations */
+   unsigned long mmap_base;/* base of mmap area */
+   unsigned long mmap_legacy_base; /* base of mmap area in 
bottom-up allocations */
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
-   /* Base adresses for compatible mmap() */
-   unsigned long mmap_compat_base;
-   unsigned long mmap_compat_legacy_base;
+   /* Base adresses for compatible mmap() */
+   unsigned long mmap_compat_base;
+   unsigned long mmap_compat_legacy_base;
 #endif
-   unsigned long task_size;/* size of task vm space */
-   unsigned long highest_vm_end;   /* highest vma end address */
-   pgd_t * pgd;
-
-   /**
-* @mm_users: The number of users including userspace.
-*
-* Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
-* to 0 (i.e. when the task exits and there are no other temporary
-* reference holders), we also release a reference on @mm_count
-* (which may then free the &struct mm_struct if @mm_count also
-* drops to 0).
-*/
-   atomic_t mm_users;
-
-   /**
-* @mm_count: The number of references to &struct mm_struct
-* (@mm_users count as 1).
-*
-* Use mmgrab()/mmdrop() to modify. When this drops to 0, the
-* &struct mm_struct is freed.
-*/
-   atomic_t mm_count;
+   unsigned long task_size;/* size of task vm space */
+   unsigned long highest_vm_end;   /* highest vma end address */
+   pgd_t * pgd;
+
+   /**
+ 

[tip:x86/fpu] x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs

2017-09-26 Thread tip-bot for Rik van Riel
Commit-ID:  0852b374173bb57f870d78e6c6839c77b339be5f
Gitweb: http://git.kernel.org/tip/0852b374173bb57f870d78e6c6839c77b339be5f
Author: Rik van Riel 
AuthorDate: Sat, 23 Sep 2017 15:00:04 +0200
Committer:  Ingo Molnar 
CommitDate: Sun, 24 Sep 2017 13:04:34 +0200

x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake 
CPUs

On Skylake CPUs I noticed that XRSTOR is unable to deal with states
created by copyout_from_xsaves() if the xstate has only SSE/YMM state, and
no FP state. That is, xfeatures had XFEATURE_MASK_SSE set, but not
XFEATURE_MASK_FP.

The reason is that part of the SSE/YMM state lives in the MXCSR and
MXCSR_FLAGS fields of the FP state.

Ensure that whenever we copy SSE or YMM state around, the MXCSR and
MXCSR_FLAGS fields are also copied around.
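
Concretely, per the xfeatures_mxcsr_quirk() helper added below: an
xstate with only XFEATURE_MASK_SSE (or YMM) set triggers the extra
MXCSR/MXCSR_FLAGS copy; one that also has XFEATURE_MASK_FP set does
not, because the FP area containing those fields is copied anyway;
and one with neither SSE nor YMM state has nothing extra to copy.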

Signed-off-by: Rik van Riel 
Cc: Andrew Morton 
Cc: Andy Lutomirski 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: Eric Biggers 
Cc: Fenghua Yu 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Yu-cheng Yu 
Link: http://lkml.kernel.org/r/20170210085445.0f1cc...@annuminas.surriel.com
Link: http://lkml.kernel.org/r/20170923130016.21448-22-mi...@kernel.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/fpu/types.h |  3 +++
 arch/x86/kernel/fpu/xstate.c | 42 
 2 files changed, 45 insertions(+)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0c314a3..71db45c 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -68,6 +68,9 @@ struct fxregs_state {
 /* Default value for fxregs_state.mxcsr: */
 #define MXCSR_DEFAULT  0x1f80
 
+/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */
+#define MXCSR_AND_FLAGS_SIZE sizeof(u64)
+
 /*
  * Software based FPU emulation state. This is arbitrary really,
  * it matches the x87 format to make it easier to understand:
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 0ef3504..41c5225 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -921,6 +921,23 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int 
pkey,
 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
 /*
+ * Weird legacy quirk: SSE and YMM states store information in the
+ * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP
+ * area is marked as unused in the xfeatures header, we need to copy
+ * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use.
+ */
+static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
+{
+   if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM)))
+   return 0;
+
+   if (xfeatures & XFEATURE_MASK_FP)
+   return 0;
+
+   return 1;
+}
+
+/*
  * This is similar to user_regset_copyout(), but will not add offset to
  * the source data pointer or increment pos, count, kbuf, and ubuf.
  */
@@ -988,6 +1005,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state 
*xsave, unsigned int of
 
}
 
+   if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+   offset = offsetof(struct fxregs_state, mxcsr);
+   size = MXCSR_AND_FLAGS_SIZE;
+   __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, 
size_total);
+   }
+
/*
 * Fill xsave->i387.sw_reserved value for ptrace frame:
 */
@@ -1070,6 +1093,12 @@ int copy_xstate_to_user(void __user *ubuf, struct 
xregs_state *xsave, unsigned i
 
}
 
+   if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+   offset = offsetof(struct fxregs_state, mxcsr);
+   size = MXCSR_AND_FLAGS_SIZE;
+   __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, 
size_total);
+   }
+
/*
 * Fill xsave->i387.sw_reserved value for ptrace frame:
 */
@@ -1122,6 +1151,12 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, 
const void *kbuf)
}
}
 
+   if (xfeatures_mxcsr_quirk(xfeatures)) {
+   offset = offsetof(struct fxregs_state, mxcsr);
+   size = MXCSR_AND_FLAGS_SIZE;
+   memcpy(&xsave->i387.mxcsr, kbuf + offset, size);
+   }
+
/*
 * The state that came in from userspace was user-state only.
 * Mask all the user states out of 'xfeatures':
@@ -1177,6 +1212,13 @@ int copy_user_to_xstate(struct xregs_state *xsave, const 
void __user *ubuf)
}
}
 
+   if (xfeatures_mxcsr_quirk(xfeatures)) {
+   offset = offsetof(struct fxregs_state, mxcsr);
+   size = MXCSR_AND_FLAGS_SIZE;
+   if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size))
+   return -EFAULT;
+   }
+
/*
 * The state that came in from userspace was user-state only.
 * Mask all the user states out of 'xfeatures':


[tip:sched/core] sched/numa: Slow down scan rate if shared faults dominate

2017-08-10 Thread tip-bot for Rik van Riel
Commit-ID:  37ec97deb3a8c68a7adfab61beb261ffeab19d09
Gitweb: http://git.kernel.org/tip/37ec97deb3a8c68a7adfab61beb261ffeab19d09
Author: Rik van Riel 
AuthorDate: Mon, 31 Jul 2017 15:28:46 -0400
Committer:  Ingo Molnar 
CommitDate: Thu, 10 Aug 2017 12:18:16 +0200

sched/numa: Slow down scan rate if shared faults dominate

The comment above update_task_scan_period() says the scan period should
be increased (scanning slows down) if the majority of memory accesses
are on the local node, or if the majority of the page accesses are
shared with other tasks.

However, with the current code, a high ratio of shared accesses only
dampens the rate at which scanning is sped up; it never slows
scanning down on its own.

This patch changes things so either lots of shared accesses or
lots of local accesses will slow down scanning, and numa scanning
is sped up only when there are lots of private faults on remote
memory pages.
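
As a worked example of the code below (NUMA_PERIOD_SLOTS = 10 and
NUMA_PERIOD_THRESHOLD = 7 are assumed here): with a 1000ms scan
period, period_slot is 100; a ps_ratio of 9 takes the first branch
and gives diff = (9 - 7) * 100 = +200ms, lengthening the scan period,
while if both ratios stay below the threshold, say
max(lr_ratio, ps_ratio) = 5, the last branch gives
diff = -(7 - 5) * 100 = -200ms, shortening it.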

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Mel Gorman 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: jhla...@redhat.com
Cc: lvena...@redhat.com
Link: http://lkml.kernel.org/r/20170731192847.23050-2-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 39 +--
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef5b66b..cb6b7c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1892,7 +1892,7 @@ static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private)
 {
unsigned int period_slot;
-   int ratio;
+   int lr_ratio, ps_ratio;
int diff;
 
unsigned long remote = p->numa_faults_locality[0];
@@ -1922,25 +1922,36 @@ static void update_task_scan_period(struct task_struct 
*p,
 *   >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
 */
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
-   ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
-   if (ratio >= NUMA_PERIOD_THRESHOLD) {
-   int slot = ratio - NUMA_PERIOD_THRESHOLD;
+   lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+   ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+   if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+   /*
+* Most memory accesses are local. There is no need to
+* do fast NUMA scanning, since memory is already local.
+*/
+   int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+   if (!slot)
+   slot = 1;
+   diff = slot * period_slot;
+   } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+   /*
+* Most memory accesses are shared with other tasks.
+* There is no point in continuing fast NUMA scanning,
+* since other tasks may just move the memory elsewhere.
+*/
+   int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else {
-   diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
/*
-* Scale scan rate increases based on sharing. There is an
-* inverse relationship between the degree of sharing and
-* the adjustment made to the scanning period. Broadly
-* speaking the intent is that there is little point
-* scanning faster if shared accesses dominate as it may
-* simply bounce migrations uselessly
+* Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+* yet they are not on the local NUMA node. Speed up
+* NUMA scanning to get the memory moved over.
 */
-   ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + 
shared + 1));
-   diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+   int ratio = max(lr_ratio, ps_ratio);
+   diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
 
p->numa_scan_period = clamp(p->numa_scan_period + diff,


[tip:sched/core] sched/numa: Scale scan period with tasks in group and shared/private

2017-08-10 Thread tip-bot for Rik van Riel
Commit-ID:  b5dd77c8bdada7b6262d0cba02a6ed525bf4e6e1
Gitweb: http://git.kernel.org/tip/b5dd77c8bdada7b6262d0cba02a6ed525bf4e6e1
Author: Rik van Riel 
AuthorDate: Mon, 31 Jul 2017 15:28:47 -0400
Committer:  Ingo Molnar 
CommitDate: Thu, 10 Aug 2017 12:18:16 +0200

sched/numa: Scale scan period with tasks in group and shared/private

Running 80 tasks in the same group, or as threads of the same process,
results in the memory getting scanned 80x as fast as it would be if a
single task was using the memory.

This really hurts some workloads.

Scale the scan period by the number of tasks in the numa group, and
the shared / private ratio, so the average rate at which memory in
the group is scanned corresponds roughly to the rate at which a single
task would scan its memory.
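
Plugging numbers into the scaling in task_scan_start() and
task_scan_max() below: with 80 tasks in the numa_group and nearly all
faults shared, the period is multiplied by the refcount (80) while
the (shared + 1) / (private + shared + 1) factor stays close to 1, so
each task's scan period grows roughly 80x and the group as a whole
scans at about the single-task rate. With mostly private faults the
combined factor shrinks toward 80 / (private + 1), and the
max(smin, period) clamp keeps the result from dropping below the
per-task minimum.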

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Mel Gorman 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: jhla...@redhat.com
Cc: lvena...@redhat.com
Link: http://lkml.kernel.org/r/20170731192847.23050-3-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 111 
 1 file changed, 86 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cb6b7c8..a7f1c3b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+struct numa_group {
+   atomic_t refcount;
+
+   spinlock_t lock; /* nr_tasks, tasks */
+   int nr_tasks;
+   pid_t gid;
+   int active_nodes;
+
+   struct rcu_head rcu;
+   unsigned long total_faults;
+   unsigned long max_faults_cpu;
+   /*
+* Faults_cpu is used to decide whether memory should move
+* towards the CPU. As a consequence, these stats are weighted
+* more by CPU use than by memory faults.
+*/
+   unsigned long *faults_cpu;
+   unsigned long faults[0];
+};
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
unsigned long rss = 0;
@@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p)
return max_t(unsigned int, floor, scan);
 }
 
+static unsigned int task_scan_start(struct task_struct *p)
+{
+   unsigned long smin = task_scan_min(p);
+   unsigned long period = smin;
+
+   /* Scale the maximum scan period with the amount of shared memory. */
+   if (p->numa_group) {
+   struct numa_group *ng = p->numa_group;
+   unsigned long shared = group_faults_shared(ng);
+   unsigned long private = group_faults_priv(ng);
+
+   period *= atomic_read(&ng->refcount);
+   period *= shared + 1;
+   period /= private + shared + 1;
+   }
+
+   return max(smin, period);
+}
+
 static unsigned int task_scan_max(struct task_struct *p)
 {
-   unsigned int smin = task_scan_min(p);
-   unsigned int smax;
+   unsigned long smin = task_scan_min(p);
+   unsigned long smax;
 
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+   /* Scale the maximum scan period with the amount of shared memory. */
+   if (p->numa_group) {
+   struct numa_group *ng = p->numa_group;
+   unsigned long shared = group_faults_shared(ng);
+   unsigned long private = group_faults_priv(ng);
+   unsigned long period = smax;
+
+   period *= atomic_read(&ng->refcount);
+   period *= shared + 1;
+   period /= private + shared + 1;
+
+   smax = max(smax, period);
+   }
+
return max(smin, smax);
 }
 
@@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct 
task_struct *p)
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
 }
 
-struct numa_group {
-   atomic_t refcount;
-
-   spinlock_t lock; /* nr_tasks, tasks */
-   int nr_tasks;
-   pid_t gid;
-   int active_nodes;
-
-   struct rcu_head rcu;
-   unsigned long total_faults;
-   unsigned long max_faults_cpu;
-   /*
-* Faults_cpu is used to decide whether memory should move
-* towards the CPU. As a consequence, these stats are weighted
-* more by CPU use than by memory faults.
-*/
-   unsigned long *faults_cpu;
-   unsigned long faults[0];
-};
-
 /* Shared or private faults. */
 #define NR_NUMA_HINT_FAULT_TYPES 2
 
@@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct 
numa_group *group, int nid)
group-

[tip:sched/core] sched/fair: Remove effective_load()

2017-06-24 Thread tip-bot for Rik van Riel
Commit-ID:  815abf5af45f04f759f12f3172afd15226fd7f71
Gitweb: http://git.kernel.org/tip/815abf5af45f04f759f12f3172afd15226fd7f71
Author: Rik van Riel 
AuthorDate: Fri, 23 Jun 2017 12:55:30 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 24 Jun 2017 08:57:53 +0200

sched/fair: Remove effective_load()

The effective_load() function was only used by the NUMA balancing
code, and not by the regular load balancing code. Now that the
NUMA balancing code no longer uses it either, get rid of it.

Signed-off-by: Rik van Riel 
Cc: Linus Torvalds 
Cc: Mel Gorman 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: jhla...@redhat.com
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-5-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 124 +---
 1 file changed, 1 insertion(+), 123 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79ac078..6f4f155 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1382,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long capacity_of(int cpu);
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
@@ -3045,8 +3044,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq 
*cfs_rq)
  * differential update where we store the last value we propagated. This in
  * turn allows skipping updates if the differential is 'small'.
  *
- * Updating tg's load_avg is necessary before update_cfs_share() (which is
- * done) and effective_load() (which is not done because it is too costly).
+ * Updating tg's load_avg is necessary before update_cfs_share().
  */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 {
@@ -5298,126 +5296,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return 0;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * effective_load() calculates the load change as seen from the root_task_group
- *
- * Adding load to a group doesn't make a group heavier, but can cause movement
- * of group shares between cpus. Assuming the shares were perfectly aligned one
- * can calculate the shift in shares.
- *
- * Calculate the effective load difference if @wl is added (subtracted) to @tg
- * on this @cpu and results in a total addition (subtraction) of @wg to the
- * total group weight.
- *
- * Given a runqueue weight distribution (rw_i) we can compute a shares
- * distribution (s_i) using:
- *
- *   s_i = rw_i / \Sum rw_j(1)
- *
- * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
- * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
- * shares distribution (s_i):
- *
- *   rw_i = {   2,   4,   1,   0 }
- *   s_i  = { 2/7, 4/7, 1/7,   0 }
- *
- * As per wake_affine() we're interested in the load of two CPUs (the CPU the
- * task used to run on and the CPU the waker is running on), we need to
- * compute the effect of waking a task on either CPU and, in case of a sync
- * wakeup, compute the effect of the current task going to sleep.
- *
- * So for a change of @wl to the local @cpu with an overall group weight change
- * of @wl we can compute the new shares distribution (s'_i) using:
- *
- *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)   (2)
- *
- * Suppose we're interested in CPUs 0 and 1, and want to compute the load
- * differences in waking a task to CPU 0. The additional task changes the
- * weight and shares distributions like:
- *
- *   rw'_i = {   3,   4,   1,   0 }
- *   s'_i  = { 3/8, 4/8, 1/8,   0 }
- *
- * We can then compute the difference in effective weight by using:
- *
- *   dw_i = S * (s'_i - s_i)   (3)
- *
- * Where 'S' is the group weight as seen by its parent.
- *
- * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
- * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
- * 4/7) times the weight of the group.
- */
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
-   struct sched_entity *se = tg->se[cpu];
-
-   if (!tg->parent)/* the trivial, non-cgroup case */
-   return wl;
-
-   for_each_sched_entity(se) {
-   struct cfs_rq *cfs_rq = se->my_q;
-   long W, w = cfs_rq_load_avg(cfs_rq);
-
-   tg = cfs_rq->tg;
-
-   /*
-* W = @wg + \Sum rw_j
-*/
-   W = wg + atomic_long_read(&tg->load_avg);
-
-   /* Ensure \Sum rw_j >= rw_i */
-   W -= cfs_rq->tg_load_avg_contrib;
-   W += w;
-
-   /*
-* w = rw_i + @wl
-*/
-   

[tip:sched/core] sched/numa: Implement NUMA node level wake_affine()

2017-06-24 Thread tip-bot for Rik van Riel
Commit-ID:  3fed382b46baac83703130fe4cd3d9147f427fb9
Gitweb: http://git.kernel.org/tip/3fed382b46baac83703130fe4cd3d9147f427fb9
Author: Rik van Riel 
AuthorDate: Fri, 23 Jun 2017 12:55:29 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 24 Jun 2017 08:57:52 +0200

sched/numa: Implement NUMA node level wake_affine()

Since select_idle_sibling() can place a task anywhere on a socket,
comparing loads between individual CPU cores makes no real sense
for deciding whether to do an affine wakeup across sockets, either.

Instead, compare the load between the sockets in a similar way the
load balancer and the numa balancing code do.
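
For a rough feel of the comparison done in numa_wake_affine() below
(imbalance_pct = 125 and equal compute capacity on both nodes are
assumptions for this example): with the previous node at load 10,
this node at load 8 and task_h_load(p) = 2, this_eff_load works out
to 100 * (8 + 2) = 1000 and prev_eff_load to
(100 + (125 - 100) / 2) * (10 - 2) = 112 * 8 = 896 (the capacity
terms cancel), so this_eff_load > prev_eff_load and the task is not
pulled over to the waking CPU's node.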

Signed-off-by: Rik van Riel 
Cc: Linus Torvalds 
Cc: Mel Gorman 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: jhla...@redhat.com
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-4-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 130 
 1 file changed, 71 insertions(+), 59 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe19016..79ac078 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2586,6 +2586,60 @@ void task_tick_numa(struct rq *rq, struct task_struct 
*curr)
}
}
 }
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ */
+static inline bool numa_wake_affine(struct sched_domain *sd,
+   struct task_struct *p, int this_cpu,
+   int prev_cpu, int sync)
+{
+   struct numa_stats prev_load, this_load;
+   s64 this_eff_load, prev_eff_load;
+
+   update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
+   update_numa_stats(&this_load, cpu_to_node(this_cpu));
+
+   /*
+* If sync wakeup then subtract the (maximum possible)
+* effect of the currently running task from the load
+* of the current CPU:
+*/
+   if (sync) {
+   unsigned long current_load = task_h_load(current);
+
+   if (this_load.load > current_load)
+   this_load.load -= current_load;
+   else
+   this_load.load = 0;
+   }
+
+   /*
+* In low-load situations, where this_cpu's node is idle due to the
+* sync cause above having dropped this_load.load to 0, move the task.
+* Moving to an idle socket will not create a bad imbalance.
+*
+* Otherwise check if the nodes are near enough in load to allow this
+* task to be woken on this_cpu's node.
+*/
+   if (this_load.load > 0) {
+   unsigned long task_load = task_h_load(p);
+
+   this_eff_load = 100;
+   this_eff_load *= prev_load.compute_capacity;
+
+   prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+   prev_eff_load *= this_load.compute_capacity;
+
+   this_eff_load *= this_load.load + task_load;
+   prev_eff_load *= prev_load.load - task_load;
+
+   return this_eff_load <= prev_eff_load;
+   }
+
+   return true;
+}
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
@@ -2598,6 +2652,13 @@ static inline void account_numa_enqueue(struct rq *rq, 
struct task_struct *p)
 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
+
+static inline bool numa_wake_affine(struct sched_domain *sd,
+   struct task_struct *p, int this_cpu,
+   int prev_cpu, int sync)
+{
+   return true;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -5407,74 +5468,25 @@ static int wake_wide(struct task_struct *p)
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
   int prev_cpu, int sync)
 {
-   s64 this_load, load;
-   s64 this_eff_load, prev_eff_load;
-   int idx, this_cpu;
-   struct task_group *tg;
-   unsigned long weight;
-   int balanced;
-
-   idx   = sd->wake_idx;
-   this_cpu  = smp_processor_id();
-   load  = source_load(prev_cpu, idx);
-   this_load = target_load(this_cpu, idx);
+   int this_cpu = smp_processor_id();
+   bool affine = false;
 
/*
 * Common case: CPUs are in the same socket, and select_idle_sibling()
 * will do its thing regardless of what we return:
 */
if (cpus_share_cache(prev_cpu, this_cpu))
-   return true;
-
-   /*
-* If sync wakeup then subtract the (maximum possible)
-* effect of the currently running task from the load
-* of the current CPU:
-*/
-   if (sync) {
-   tg = task_group(current);
-   weight = current->se.avg.load_avg;
-
-   this_load += effective_load(tg, this_cpu, -weight, 

[tip:sched/core] sched/fair: Simplify wake_affine() for the single socket case

2017-06-24 Thread tip-bot for Rik van Riel
Commit-ID:  7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf
Gitweb: http://git.kernel.org/tip/7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf
Author: Rik van Riel 
AuthorDate: Fri, 23 Jun 2017 12:55:28 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 24 Jun 2017 08:57:52 +0200

sched/fair: Simplify wake_affine() for the single socket case

When 'this_cpu' and 'prev_cpu' are in the same socket, select_idle_sibling()
will do its thing regardless of the return value of wake_affine().

Just return true and don't look at all the other things.

Signed-off-by: Rik van Riel 
Cc: Linus Torvalds 
Cc: Mel Gorman 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: jhla...@redhat.com
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-3-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e0c052..fe19016 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5420,6 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct 
task_struct *p,
this_load = target_load(this_cpu, idx);
 
/*
+* Common case: CPUs are in the same socket, and select_idle_sibling()
+* will do its thing regardless of what we return:
+*/
+   if (cpus_share_cache(prev_cpu, this_cpu))
+   return true;
+
+   /*
 * If sync wakeup then subtract the (maximum possible)
 * effect of the currently running task from the load
 * of the current CPU:
@@ -6007,11 +6014,15 @@ select_task_rq_fair(struct task_struct *p, int 
prev_cpu, int sd_flag, int wake_f
 
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
-   if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, 
sync))
+   if (cpu == prev_cpu)
+   goto pick_cpu;
+
+   if (wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
 
if (!sd) {
+ pick_cpu:
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 


[tip:sched/core] sched/numa: Override part of migrate_degrades_locality() when idle balancing

2017-06-24 Thread tip-bot for Rik van Riel
Commit-ID:  739294fb03f590401bbd7faa6d31a507e3ffada5
Gitweb: http://git.kernel.org/tip/739294fb03f590401bbd7faa6d31a507e3ffada5
Author: Rik van Riel 
AuthorDate: Fri, 23 Jun 2017 12:55:27 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 24 Jun 2017 08:57:46 +0200

sched/numa: Override part of migrate_degrades_locality() when idle balancing

Several tests in the NAS benchmark seem to run a lot slower with
NUMA balancing enabled than with NUMA balancing disabled. The
slower run time corresponds with increased idle time.

Overriding the final test in migrate_degrades_locality() (but still
doing the other NUMA tests first) seems to improve the performance
of those benchmarks.

Reported-by: Jirka Hladky 
Signed-off-by: Rik van Riel 
Cc: Linus Torvalds 
Cc: Mel Gorman 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-2-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 694c258..6e0c052 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6688,6 +6688,10 @@ static int migrate_degrades_locality(struct task_struct 
*p, struct lb_env *env)
if (dst_nid == p->numa_preferred_nid)
return 0;
 
+   /* Leaving a core idle is often worse than degrading locality. */
+   if (env->idle != CPU_NOT_IDLE)
+   return -1;
+
if (numa_group) {
src_faults = group_faults(p, src_nid);
dst_faults = group_faults(p, dst_nid);


[tip:perf/core] x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs

2017-04-24 Thread tip-bot for Rik van Riel
Commit-ID:  85fb989d3a58cb9c7904bb7dd8264be61e18b185
Gitweb: http://git.kernel.org/tip/85fb989d3a58cb9c7904bb7dd8264be61e18b185
Author: Rik van Riel 
AuthorDate: Fri, 10 Feb 2017 08:54:45 -0500
Committer:  Ingo Molnar 
CommitDate: Sat, 11 Feb 2017 11:00:22 +0100

x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake 
CPUs

On Skylake CPUs I noticed that XRSTOR is unable to deal with states
created by copyout_from_xsaves() if the xstate has only SSE/YMM state, and
no FP state. That is, xfeatures had XFEATURE_MASK_SSE set, but not
XFEATURE_MASK_FP.

The reason is that part of the SSE/YMM state lives in the MXCSR and
MXCSR_FLAGS fields of the FP state.

Ensure that whenever we copy SSE or YMM state around, the MXCSR and
MXCSR_FLAGS fields are also copied around.

Signed-off-by: Rik van Riel 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: Fenghua Yu 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Yu-cheng Yu 
Link: http://lkml.kernel.org/r/20170210085445.0f1cc...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/fpu/types.h |  3 +++
 arch/x86/kernel/fpu/xstate.c | 42 
 2 files changed, 45 insertions(+)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index d15cbfe..ea65ab2 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -68,6 +68,9 @@ struct fxregs_state {
 /* Default value for fxregs_state.mxcsr: */
 #define MXCSR_DEFAULT  0x1f80
 
+/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */
+#define MXCSR_AND_FLAGS_SIZE sizeof(u64)
+
 /*
  * Software based FPU emulation state. This is arbitrary really,
  * it matches the x87 format to make it easier to understand:
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 772a069..2e89383 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -920,6 +920,23 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int 
pkey,
 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
 /*
+ * Weird legacy quirk: SSE and YMM states store information in the
+ * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP
+ * area is marked as unused in the xfeatures header, we need to copy
+ * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use.
+ */
+static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
+{
+   if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM)))
+   return 0;
+
+   if (xfeatures & XFEATURE_MASK_FP)
+   return 0;
+
+   return 1;
+}
+
+/*
  * This is similar to user_regset_copyout(), but will not add offset to
  * the source data pointer or increment pos, count, kbuf, and ubuf.
  */
@@ -987,6 +1004,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state 
*xsave, unsigned int of
 
}
 
+   if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+   offset = offsetof(struct fxregs_state, mxcsr);
+   size = MXCSR_AND_FLAGS_SIZE;
+   __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, 
size_total);
+   }
+
/*
 * Fill xsave->i387.sw_reserved value for ptrace frame:
 */
@@ -1069,6 +1092,12 @@ int copy_xstate_to_user(void __user *ubuf, struct 
xregs_state *xsave, unsigned i
 
}
 
+   if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+   offset = offsetof(struct fxregs_state, mxcsr);
+   size = MXCSR_AND_FLAGS_SIZE;
+   __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, 
size_total);
+   }
+
/*
 * Fill xsave->i387.sw_reserved value for ptrace frame:
 */
@@ -1121,6 +1150,12 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, 
const void *kbuf)
}
}
 
+   if (xfeatures_mxcsr_quirk(xfeatures)) {
+   offset = offsetof(struct fxregs_state, mxcsr);
+   size = MXCSR_AND_FLAGS_SIZE;
+   memcpy(&xsave->i387.mxcsr, kbuf + offset, size);
+   }
+
/*
 * The state that came in from userspace was user-state only.
 * Mask all the user states out of 'xfeatures':
@@ -1176,6 +1211,13 @@ int copy_user_to_xstate(struct xregs_state *xsave, const 
void __user *ubuf)
}
}
 
+   if (xfeatures_mxcsr_quirk(xfeatures)) {
+   offset = offsetof(struct fxregs_state, mxcsr);
+   size = MXCSR_AND_FLAGS_SIZE;
+   if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size))
+   return -EFAULT;
+   }
+
/*
 * The state that came in from userspace was user-state only.
 * Mask all the user states out of 'xfeatures':


[tip:x86/fpu] x86/fpu: Split old_fpu & new_fpu handling into separate functions

2016-10-16 Thread tip-bot for Rik van Riel
Commit-ID:  c474e50711aa79b7bd0ea30b44744baca5650375
Gitweb: http://git.kernel.org/tip/c474e50711aa79b7bd0ea30b44744baca5650375
Author: Rik van Riel 
AuthorDate: Fri, 14 Oct 2016 08:15:31 -0400
Committer:  Ingo Molnar 
CommitDate: Sun, 16 Oct 2016 11:38:41 +0200

x86/fpu: Split old_fpu & new_fpu handling into separate functions

By moving all of the new_fpu state handling into switch_fpu_finish(),
the code can be simplified some more.

This gets rid of the prefetch, but given the size of the FPU register
state on modern CPUs, and the amount of work done by __switch_to()
in between the two functions, the value of a single cache line prefetch
seems somewhat dubious anyway.
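
A minimal sketch (illustrative only, not the real __switch_to()) of
how the two-stage API reads after this change; the real call sites
are the process_32.c/process_64.c hunks below:

/*
 * Sketch only: save the outgoing task's FPU registers early, and
 * load the incoming task's state at the end, but only if it is not
 * already live on this CPU.
 */
static void example_switch_fpu(struct fpu *prev_fpu, struct fpu *next_fpu,
                               int cpu)
{
        switch_fpu_prepare(prev_fpu, cpu);

        /* ... the rest of __switch_to() runs in between ... */

        switch_fpu_finish(next_fpu, cpu);
}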

Signed-off-by: Rik van Riel 
Acked-by: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: Fenghua Yu 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Cc: Peter Zijlstra 
Cc: Quentin Casasnovas 
Cc: Thomas Gleixner 
Link: http://lkml.kernel.org/r/1476447331-21566-3-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/fpu/internal.h | 48 -
 arch/x86/kernel/process_32.c|  5 ++--
 arch/x86/kernel/process_64.c|  5 ++--
 3 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 590f274..d4a6849 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -552,27 +552,15 @@ static inline int fpregs_active(void)
  *
  * This is a two-stage process:
  *
- *  - switch_fpu_prepare() saves the old state and
- *sets the new state of the CR0.TS bit. This is
- *done within the context of the old process.
+ *  - switch_fpu_prepare() saves the old state.
+ *This is done within the context of the old process.
  *
  *  - switch_fpu_finish() restores the new state as
  *necessary.
  */
-typedef struct { int preload; } fpu_switch_t;
-
-static inline fpu_switch_t
-switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu)
+static inline void
+switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 {
-   fpu_switch_t fpu;
-
-   /*
-* If the task has used the math, pre-load the FPU on xsave processors
-* or if the past 5 consecutive context-switches used math.
-*/
-   fpu.preload = static_cpu_has(X86_FEATURE_FPU) &&
- new_fpu->fpstate_active;
-
if (old_fpu->fpregs_active) {
if (!copy_fpregs_to_fpstate(old_fpu))
old_fpu->last_cpu = -1;
@@ -584,16 +572,6 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu 
*new_fpu, int cpu)
trace_x86_fpu_regs_deactivated(old_fpu);
} else
old_fpu->last_cpu = -1;
-
-   if (fpu.preload) {
-   if (fpregs_state_valid(new_fpu, cpu))
-   fpu.preload = 0;
-   else
-   prefetch(&new_fpu->state);
-   fpregs_activate(new_fpu);
-   }
-
-   return fpu;
 }
 
 /*
@@ -601,15 +579,19 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu 
*new_fpu, int cpu)
  */
 
 /*
- * By the time this gets called, we've already cleared CR0.TS and
- * given the process the FPU if we are going to preload the FPU
- * state - all we need to do is to conditionally restore the register
- * state itself.
+ * Set up the userspace FPU context for the new task, if the task
+ * has used the FPU.
  */
-static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t 
fpu_switch)
+static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
 {
-   if (fpu_switch.preload)
-   copy_kernel_to_fpregs(&new_fpu->state);
+   bool preload = static_cpu_has(X86_FEATURE_FPU) &&
+  new_fpu->fpstate_active;
+
+   if (preload) {
+   if (!fpregs_state_valid(new_fpu, cpu))
+   copy_kernel_to_fpregs(&new_fpu->state);
+   fpregs_activate(new_fpu);
+   }
 }
 
 /*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bd7be8e..7dc8c9c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -232,11 +232,10 @@ __switch_to(struct task_struct *prev_p, struct 
task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
-   fpu_switch_t fpu_switch;
 
/* never put a printk in __switch_to... printk() calls wake_up*() 
indirectly */
 
-   fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
+   switch_fpu_prepare(prev_fpu, cpu);
 
/*
 * Save away %gs. No need to save %fs, as it was saved on the
@@ -295,7 +294,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct 
*next_p)
if (prev->gs | next->gs)
lazy_load_gs(next->gs);
 
-   switc

[tip:x86/fpu] x86/fpu: Remove 'cpu' argument from __cpu_invalidate_fpregs_state()

2016-10-16 Thread tip-bot for Rik van Riel
Commit-ID:  317b622cb2fda1812d8646e211cdb23dce2564d0
Gitweb: http://git.kernel.org/tip/317b622cb2fda1812d8646e211cdb23dce2564d0
Author: Rik van Riel 
AuthorDate: Fri, 14 Oct 2016 08:15:30 -0400
Committer:  Ingo Molnar 
CommitDate: Sun, 16 Oct 2016 11:38:31 +0200

x86/fpu: Remove 'cpu' argument from __cpu_invalidate_fpregs_state()

The __{fpu,cpu}_invalidate_fpregs_state() functions can only be used
to invalidate a resource the caller controls. Document that, and change
the API a little bit to reflect it.

Go back to open coding the fpu_fpregs_owner_ctx write in the CPU
hotplug code, which should be the exception, and move __kernel_fpu_begin()
to this API.

This patch has no functional changes to the current code.
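
A small sketch (illustrative, not from the patch) of the "invalidate
the resource you control" rule spelled out in the comment below;
example_invalidate() is a made-up name:

/* Sketch only: the two legitimate ways to use the invalidation API. */
static void example_invalidate(struct task_struct *tsk)
{
        /* This CPU's registers are ours while preemption is off: */
        preempt_disable();
        __cpu_invalidate_fpregs_state();
        /* ... clobber the FPU registers for kernel work ... */
        preempt_enable();

        /* @tsk is not running here, so its saved state is ours: */
        __fpu_invalidate_fpregs_state(&tsk->thread.fpu);
}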

Signed-off-by: Rik van Riel 
Acked-by: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: Fenghua Yu 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Cc: Peter Zijlstra 
Cc: Quentin Casasnovas 
Cc: Thomas Gleixner 
Link: http://lkml.kernel.org/r/1476447331-21566-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/fpu/internal.h | 13 +++--
 arch/x86/kernel/fpu/core.c  |  2 +-
 arch/x86/kernel/smpboot.c   |  2 +-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 1dcb29e..590f274 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -488,15 +488,16 @@ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
  *
  * Any code that clobbers the FPU registers or updates the in-memory
  * FPU state for a task MUST let the rest of the kernel know that the
- * FPU registers are no longer valid for this task. Calling either of
- * these two invalidate functions is enough, use whichever is convenient.
+ * FPU registers are no longer valid for this task.
  *
- * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx,
- * on this CPU.
+ * Either one of these invalidation functions is enough. Invalidate
+ * a resource you control: CPU if using the CPU for something else
+ * (with preemption disabled), FPU for the current task, or a task that
+ * is prevented from running by the current task.
  */
-static inline void __cpu_invalidate_fpregs_state(unsigned int cpu)
+static inline void __cpu_invalidate_fpregs_state(void)
 {
-   per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
+   __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
 }
 
 static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 25a45dd..30f11ab 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -106,7 +106,7 @@ void __kernel_fpu_begin(void)
 */
copy_fpregs_to_fpstate(fpu);
} else {
-   this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+   __cpu_invalidate_fpregs_state();
}
 }
 EXPORT_SYMBOL(__kernel_fpu_begin);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ca4c4ca..5cb801a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -,7 +,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct 
*tidle)
return err;
 
/* the FPU context is blank, nobody can own it */
-   __cpu_invalidate_fpregs_state(cpu);
+   per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
 
common_cpu_up(cpu, tidle);
 


[tip:x86/fpu] x86/fpu: Split old & new FPU code paths

2016-10-07 Thread tip-bot for Rik van Riel
Commit-ID:  9ad93fe35aff616fca4e2b9581fdeed498605f9e
Gitweb: http://git.kernel.org/tip/9ad93fe35aff616fca4e2b9581fdeed498605f9e
Author: Rik van Riel 
AuthorDate: Tue, 4 Oct 2016 20:34:38 -0400
Committer:  Ingo Molnar 
CommitDate: Fri, 7 Oct 2016 11:14:43 +0200

x86/fpu: Split old & new FPU code paths

Now that CR0.TS is no longer being manipulated, we can simplify
switch_fpu_prepare() by no longer nesting the handling of new_fpu
inside the two branches for the old_fpu.

Signed-off-by: Rik van Riel 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: Fenghua Yu 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Cc: Peter Zijlstra 
Cc: Quentin Casasnovas 
Cc: Thomas Gleixner 
Cc: pbonz...@redhat.com
Link: 
http://lkml.kernel.org/r/1475627678-20788-10-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/fpu/internal.h | 22 --
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index d0324bc..1dcb29e 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -581,23 +581,17 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu 
*new_fpu, int cpu)
/* But leave fpu_fpregs_owner_ctx! */
old_fpu->fpregs_active = 0;
trace_x86_fpu_regs_deactivated(old_fpu);
+   } else
+   old_fpu->last_cpu = -1;
 
-   /* Don't change CR0.TS if we just switch! */
-   if (fpu.preload) {
-   fpregs_activate(new_fpu);
-   trace_x86_fpu_regs_activated(new_fpu);
+   if (fpu.preload) {
+   if (fpregs_state_valid(new_fpu, cpu))
+   fpu.preload = 0;
+   else
prefetch(&new_fpu->state);
-   }
-   } else {
-   old_fpu->last_cpu = -1;
-   if (fpu.preload) {
-   if (fpregs_state_valid(new_fpu, cpu))
-   fpu.preload = 0;
-   else
-   prefetch(&new_fpu->state);
-   fpregs_activate(new_fpu);
-   }
+   fpregs_activate(new_fpu);
}
+
return fpu;
 }
 


[tip:x86/fpu] x86/fpu: Rename lazy restore functions to "register state valid"

2016-10-07 Thread tip-bot for Rik van Riel
Commit-ID:  25d83b531c1aa4fca5b4e24ed10f493268f162bc
Gitweb: http://git.kernel.org/tip/25d83b531c1aa4fca5b4e24ed10f493268f162bc
Author: Rik van Riel 
AuthorDate: Tue, 4 Oct 2016 20:34:36 -0400
Committer:  Ingo Molnar 
CommitDate: Fri, 7 Oct 2016 11:14:41 +0200

x86/fpu: Rename lazy restore functions to "register state valid"

Name the functions after the state they track, rather than the function
they currently enable. This should make it more obvious when we use the
fpregs_state_valid() function for something else in the future.

Signed-off-by: Rik van Riel 
Reviewed-by: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: Fenghua Yu 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Cc: Peter Zijlstra 
Cc: Quentin Casasnovas 
Cc: Thomas Gleixner 
Cc: pbonz...@redhat.com
Link: http://lkml.kernel.org/r/1475627678-20788-8-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/fpu/internal.h | 26 --
 arch/x86/kernel/fpu/core.c  |  4 ++--
 arch/x86/kernel/smpboot.c   |  2 +-
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 499d6ed..d2cfe16 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -479,18 +479,32 @@ extern int copy_fpstate_to_sigframe(void __user *buf, 
void __user *fp, int size)
 DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
 
 /*
+ * The in-register FPU state for an FPU context on a CPU is assumed to be
+ * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
+ * matches the FPU.
+ *
+ * If the FPU register state is valid, the kernel can skip restoring the
+ * FPU state from memory.
+ *
+ * Any code that clobbers the FPU registers or updates the in-memory
+ * FPU state for a task MUST let the rest of the kernel know that the
+ * FPU registers are no longer valid for this task. Calling either of
+ * these two invalidate functions is enough, use whichever is convenient.
+ *
  * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx,
  * on this CPU.
- *
- * This will disable any lazy FPU state restore of the current FPU state,
- * but if the current thread owns the FPU, it will still be saved by.
  */
-static inline void __cpu_disable_lazy_restore(unsigned int cpu)
+static inline void __cpu_invalidate_fpregs_state(unsigned int cpu)
 {
per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
 }
 
-static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu)
+static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
+{
+   fpu->last_cpu = -1;
+}
+
+static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
 {
return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == 
fpu->last_cpu;
 }
@@ -588,7 +602,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu 
*new_fpu, int cpu)
} else {
old_fpu->last_cpu = -1;
if (fpu.preload) {
-   if (fpu_want_lazy_restore(new_fpu, cpu))
+   if (fpregs_state_valid(new_fpu, cpu))
fpu.preload = 0;
else
prefetch(&new_fpu->state);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 6a37d52..25a45dd 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -336,7 +336,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
 
if (fpu->fpstate_active) {
/* Invalidate any lazy state: */
-   fpu->last_cpu = -1;
+   __fpu_invalidate_fpregs_state(fpu);
} else {
fpstate_init(&fpu->state);
trace_x86_fpu_init_state(fpu);
@@ -379,7 +379,7 @@ void fpu__current_fpstate_write_begin(void)
 * ensures we will not be lazy and skip a XRSTOR in the
 * future.
 */
-   fpu->last_cpu = -1;
+   __fpu_invalidate_fpregs_state(fpu);
 }
 
 /*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 42a9362..ca4c4ca 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -,7 +,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct 
*tidle)
return err;
 
/* the FPU context is blank, nobody can own it */
-   __cpu_disable_lazy_restore(cpu);
+   __cpu_invalidate_fpregs_state(cpu);
 
common_cpu_up(cpu, tidle);
 


[tip:x86/fpu] x86/fpu: Remove __fpregs_(de)activate()

2016-10-07 Thread tip-bot for Rik van Riel
Commit-ID:  66f314efca3843a8874405ab015e354d041f86dd
Gitweb: http://git.kernel.org/tip/66f314efca3843a8874405ab015e354d041f86dd
Author: Rik van Riel 
AuthorDate: Tue, 4 Oct 2016 20:34:37 -0400
Committer:  Ingo Molnar 
CommitDate: Fri, 7 Oct 2016 11:14:42 +0200

x86/fpu: Remove __fpregs_(de)activate()

Now that fpregs_activate() and fpregs_deactivate() do nothing except
call the double underscored versions of themselves, we can get
rid of the double underscore versions.

Signed-off-by: Rik van Riel 
Reviewed-by: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: Fenghua Yu 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Cc: Peter Zijlstra 
Cc: Quentin Casasnovas 
Cc: Thomas Gleixner 
Cc: pbonz...@redhat.com
Link: http://lkml.kernel.org/r/1475627678-20788-9-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/fpu/internal.h | 25 +++--
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index d2cfe16..d0324bc 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -509,8 +509,11 @@ static inline int fpregs_state_valid(struct fpu *fpu, 
unsigned int cpu)
return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == 
fpu->last_cpu;
 }
 
-
-static inline void __fpregs_deactivate(struct fpu *fpu)
+/*
+ * These generally need preemption protection to work,
+ * do try to avoid using these on their own:
+ */
+static inline void fpregs_deactivate(struct fpu *fpu)
 {
WARN_ON_FPU(!fpu->fpregs_active);
 
@@ -519,7 +522,7 @@ static inline void __fpregs_deactivate(struct fpu *fpu)
trace_x86_fpu_regs_deactivated(fpu);
 }
 
-static inline void __fpregs_activate(struct fpu *fpu)
+static inline void fpregs_activate(struct fpu *fpu)
 {
WARN_ON_FPU(fpu->fpregs_active);
 
@@ -544,20 +547,6 @@ static inline int fpregs_active(void)
 }
 
 /*
- * These generally need preemption protection to work,
- * do try to avoid using these on their own.
- */
-static inline void fpregs_activate(struct fpu *fpu)
-{
-   __fpregs_activate(fpu);
-}
-
-static inline void fpregs_deactivate(struct fpu *fpu)
-{
-   __fpregs_deactivate(fpu);
-}
-
-/*
  * FPU state switching for scheduling.
  *
  * This is a two-stage process:
@@ -595,7 +584,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu 
*new_fpu, int cpu)
 
/* Don't change CR0.TS if we just switch! */
if (fpu.preload) {
-   __fpregs_activate(new_fpu);
+   fpregs_activate(new_fpu);
trace_x86_fpu_regs_activated(new_fpu);
prefetch(&new_fpu->state);
}


[tip:x86/fpu] x86/fpu, kvm: Remove KVM vcpu->fpu_counter

2016-10-07 Thread tip-bot for Rik van Riel
Commit-ID:  3d42de25d290fdfe604835d1b389845b8cba5bff
Gitweb: http://git.kernel.org/tip/3d42de25d290fdfe604835d1b389845b8cba5bff
Author: Rik van Riel 
AuthorDate: Tue, 4 Oct 2016 20:34:35 -0400
Committer:  Ingo Molnar 
CommitDate: Fri, 7 Oct 2016 11:14:41 +0200

x86/fpu, kvm: Remove KVM vcpu->fpu_counter

With the removal of the lazy FPU code, this field is no longer used.
Get rid of it.

Signed-off-by: Rik van Riel 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: Fenghua Yu 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Cc: Peter Zijlstra 
Cc: Quentin Casasnovas 
Cc: Thomas Gleixner 
Cc: pbonz...@redhat.com
Link: http://lkml.kernel.org/r/1475627678-20788-7-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/kvm/x86.c   | 4 +---
 include/linux/kvm_host.h | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 59d7761..2c7e775 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7348,10 +7348,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-   if (!vcpu->guest_fpu_loaded) {
-   vcpu->fpu_counter = 0;
+   if (!vcpu->guest_fpu_loaded)
return;
-   }
 
vcpu->guest_fpu_loaded = 0;
copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9c28b4d..4e6905c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -224,7 +224,6 @@ struct kvm_vcpu {
 
int fpu_active;
int guest_fpu_loaded, guest_xcr0_loaded;
-   unsigned char fpu_counter;
struct swait_queue_head wq;
struct pid *pid;
int sigset_active;


[tip:x86/fpu] x86/fpu: Remove struct fpu::counter

2016-10-07 Thread tip-bot for Rik van Riel
Commit-ID:  3913cc3507575273beb165a5e027a081913ed507
Gitweb: http://git.kernel.org/tip/3913cc3507575273beb165a5e027a081913ed507
Author: Rik van Riel 
AuthorDate: Tue, 4 Oct 2016 20:34:34 -0400
Committer:  Ingo Molnar 
CommitDate: Fri, 7 Oct 2016 11:14:40 +0200

x86/fpu: Remove struct fpu::counter

With the lazy FPU code gone, we no longer use the counter field
in struct fpu for anything. Get rid of it.

Signed-off-by: Rik van Riel 
Reviewed-by: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: Fenghua Yu 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Cc: Peter Zijlstra 
Cc: Quentin Casasnovas 
Cc: Thomas Gleixner 
Cc: pbonz...@redhat.com
Link: http://lkml.kernel.org/r/1475627678-20788-6-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/fpu/internal.h |  3 ---
 arch/x86/include/asm/fpu/types.h| 11 ---
 arch/x86/include/asm/trace/fpu.h|  5 +
 arch/x86/kernel/fpu/core.c  |  3 ---
 4 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 7801d32..499d6ed 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -581,16 +581,13 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu 
*new_fpu, int cpu)
 
/* Don't change CR0.TS if we just switch! */
if (fpu.preload) {
-   new_fpu->counter++;
__fpregs_activate(new_fpu);
trace_x86_fpu_regs_activated(new_fpu);
prefetch(&new_fpu->state);
}
} else {
-   old_fpu->counter = 0;
old_fpu->last_cpu = -1;
if (fpu.preload) {
-   new_fpu->counter++;
if (fpu_want_lazy_restore(new_fpu, cpu))
fpu.preload = 0;
else
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 48df486..e31332d 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -322,17 +322,6 @@ struct fpu {
unsigned char   fpregs_active;
 
/*
-* @counter:
-*
-* This counter contains the number of consecutive context switches
-* during which the FPU stays used. If this is over a threshold, the
-* lazy FPU restore logic becomes eager, to save the trap overhead.
-* This is an unsigned char so that after 256 iterations the counter
-* wraps and the context switch behavior turns lazy again; this is to
-* deal with bursty apps that only use the FPU for a short time:
-*/
-   unsigned char   counter;
-   /*
 * @state:
 *
 * In-memory copy of all FPU registers that we save/restore
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 9217ab1..342e597 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -14,7 +14,6 @@ DECLARE_EVENT_CLASS(x86_fpu,
__field(struct fpu *, fpu)
__field(bool, fpregs_active)
__field(bool, fpstate_active)
-   __field(int, counter)
__field(u64, xfeatures)
__field(u64, xcomp_bv)
),
@@ -23,17 +22,15 @@ DECLARE_EVENT_CLASS(x86_fpu,
__entry->fpu= fpu;
__entry->fpregs_active  = fpu->fpregs_active;
__entry->fpstate_active = fpu->fpstate_active;
-   __entry->counter= fpu->counter;
if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
__entry->xfeatures = fpu->state.xsave.header.xfeatures;
__entry->xcomp_bv  = fpu->state.xsave.header.xcomp_bv;
}
),
-   TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d counter: %d xfeatures: %llx xcomp_bv: %llx",
+   TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx",
__entry->fpu,
__entry->fpregs_active,
__entry->fpstate_active,
-   __entry->counter,
__entry->xfeatures,
__entry->xcomp_bv
)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 036e14f..6a37d52 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -222,7 +222,6 @@ EXPORT_SYMBOL_GPL(fpstate_init);
 
 int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 {
-   dst_fpu->counter = 0;
dst_fpu->fpregs_active = 0;
dst_fpu->last_cpu = -1;
 
@@ -430,7 +429,6 @@ void fpu__restore(struct fpu *fpu)
trace_x86_fpu_before_restore(fpu);
fpregs_activate(fpu)

[tip:sched/core] sched/numa, mm: Revert to checking pmd/pte_write instead of VMA flags

2016-09-13 Thread tip-bot for Rik van Riel
Commit-ID:  d59dc7bcfa649ef2128a76b6487b16f4b3f14d23
Gitweb: http://git.kernel.org/tip/d59dc7bcfa649ef2128a76b6487b16f4b3f14d23
Author: Rik van Riel 
AuthorDate: Thu, 8 Sep 2016 21:30:53 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 13 Sep 2016 20:31:33 +0200

sched/numa, mm: Revert to checking pmd/pte_write instead of VMA flags

Commit:

  4d9424669946 ("mm: convert p[te|md]_mknonnuma and remaining page table 
manipulations")

changed NUMA balancing from _PAGE_NUMA to using PROT_NONE, and was quickly
found to introduce a regression with NUMA grouping.

It was followed up by these commits:

 53da3bc2ba9e ("mm: fix up numa read-only thread grouping logic")
 bea66fbd11af ("mm: numa: group related processes based on VMA flags instead of 
page table flags")
 b191f9b106ea ("mm: numa: preserve PTE write permissions across a NUMA hinting 
fault")

The first two of those commits try alternate approaches to NUMA
grouping, which apparently do not work as well as looking at the PTE
write permissions.

The latter patch preserves the PTE write permissions across a NUMA
protection fault. However, it forgets to revert the condition for
whether or not to group tasks together back to what it was before
v3.19, even though the information is now preserved in the page tables
once again.

This patch brings the NUMA grouping heuristic back to what it was
before commit 4d9424669946, which the changelogs of subsequent
commits suggest worked best.

We have all the information again. We should probably use it.
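
As a minimal illustration of why the per-PTE check is the better signal (the helper and its context are hypothetical): a VMA can be VM_WRITE while the individual PTE is still write-protected, e.g. a private page that has not been copy-on-write faulted yet, and only pte_write() sees that distinction.

/* Hypothetical helper mirroring the do_numa_page() hunk below. */
static int example_numa_group_flags(pte_t pte)
{
	int flags = 0;

	/* Write-protected PTE: do not group tasks on this access. */
	if (!pte_write(pte))
		flags |= TNF_NO_GROUP;

	return flags;
}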

Signed-off-by: Rik van Riel 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: aarca...@redhat.com
Cc: linux...@kvack.org
Cc: mgor...@suse.de
Link: http://lkml.kernel.org/r/20160908213053.07c99...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 mm/huge_memory.c | 2 +-
 mm/memory.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2db2112..c8bde27 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1168,7 +1168,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
}
 
/* See similar comment in do_numa_page for explanation */
-   if (!(vma->vm_flags & VM_WRITE))
+   if (!pmd_write(pmd))
flags |= TNF_NO_GROUP;
 
/*
diff --git a/mm/memory.c b/mm/memory.c
index 83be99d..558c852 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3398,7 +3398,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
 * pte_dirty has unpredictable behaviour between PTE scan updates,
 * background writeback, dirty balancing and application behaviour.
 */
-   if (!(vma->vm_flags & VM_WRITE))
+   if (!pte_write(pte))
flags |= TNF_NO_GROUP;
 
/*


[tip:sched/core] sched: Remove struct rq::nohz_stamp

2016-08-18 Thread tip-bot for Rik van Riel
Commit-ID:  1fc770d5899c995db8e22d35eb918a2cb79559d9
Gitweb: http://git.kernel.org/tip/1fc770d5899c995db8e22d35eb918a2cb79559d9
Author: Rik van Riel 
AuthorDate: Mon, 15 Aug 2016 12:14:10 -0400
Committer:  Ingo Molnar 
CommitDate: Thu, 18 Aug 2016 10:55:39 +0200

sched: Remove struct rq::nohz_stamp

The nohz_stamp member of struct rq has been unused since 2010,
when this commit removed the code that referenced it:

  396e894d289d ("sched: Revert nohz_ratelimit() for now")

Signed-off-by: Rik van Riel 
Cc: Frederic Weisbecker 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Link: http://lkml.kernel.org/r/20160815121410.5ea1c...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/sched.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c64fc51..afe76d0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -597,7 +597,6 @@ struct rq {
 #ifdef CONFIG_SMP
unsigned long last_load_update_tick;
 #endif /* CONFIG_SMP */
-   u64 nohz_stamp;
unsigned long nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL


[tip:timers/nohz] sched/cputime: Drop local_irq_save/restore from irqtime_account_irq()

2016-07-14 Thread tip-bot for Rik van Riel
Commit-ID:  553bf6bbfd8a540c70aee28eb50e24caff456a03
Gitweb: http://git.kernel.org/tip/553bf6bbfd8a540c70aee28eb50e24caff456a03
Author: Rik van Riel 
AuthorDate: Wed, 13 Jul 2016 16:50:05 +0200
Committer:  Ingo Molnar 
CommitDate: Thu, 14 Jul 2016 10:42:35 +0200

sched/cputime: Drop local_irq_save/restore from irqtime_account_irq()

Paolo pointed out that irqs are already blocked when irqtime_account_irq()
is called. That means there is no reason to call local_irq_save/restore()
again.

Suggested-by: Paolo Bonzini 
Signed-off-by: Rik van Riel 
Signed-off-by: Frederic Weisbecker 
Reviewed-by: Paolo Bonzini 
Cc: Linus Torvalds 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: Radim Krcmar 
Cc: Thomas Gleixner 
Cc: Wanpeng Li 
Link: 
http://lkml.kernel.org/r/1468421405-20056-6-git-send-email-fweis...@gmail.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/cputime.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 16a873c..ea0f6f3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
  */
 void irqtime_account_irq(struct task_struct *curr)
 {
-   unsigned long flags;
s64 delta;
int cpu;
 
if (!sched_clock_irqtime)
return;
 
-   local_irq_save(flags);
-
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
@@ -75,7 +72,6 @@ void irqtime_account_irq(struct task_struct *curr)
__this_cpu_add(cpu_softirq_time, delta);
 
irq_time_write_end();
-   local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 


[tip:timers/nohz] sched/cputime: Replace VTIME_GEN irq time code with IRQ_TIME_ACCOUNTING code

2016-07-14 Thread tip-bot for Rik van Riel
Commit-ID:  b58c35840521bb02b150e1d0d34ca9197f8b7145
Gitweb: http://git.kernel.org/tip/b58c35840521bb02b150e1d0d34ca9197f8b7145
Author: Rik van Riel 
AuthorDate: Wed, 13 Jul 2016 16:50:02 +0200
Committer:  Ingo Molnar 
CommitDate: Thu, 14 Jul 2016 10:42:34 +0200

sched/cputime: Replace VTIME_GEN irq time code with IRQ_TIME_ACCOUNTING code

The CONFIG_VIRT_CPU_ACCOUNTING_GEN irq time tracking code does not
appear to currently work right.

On CPUs without nohz_full=, only tick based irq time sampling is
done, which breaks down when dealing with a nohz_idle CPU.

On firewalls and similar systems, no ticks may happen on a CPU for a
while, and the irq time spent may never get accounted properly. This
can cause issues with capacity planning and power saving, which use
the CPU statistics as inputs in decision making.

Remove the VTIME_GEN vtime irq time code, and replace it with the
IRQ_TIME_ACCOUNTING code, when selected as a config option by the user.

Signed-off-by: Rik van Riel 
Signed-off-by: Frederic Weisbecker 
Cc: Linus Torvalds 
Cc: Mike Galbraith 
Cc: Paolo Bonzini 
Cc: Peter Zijlstra 
Cc: Radim Krcmar 
Cc: Thomas Gleixner 
Cc: Wanpeng Li 
Link: 
http://lkml.kernel.org/r/1468421405-20056-3-git-send-email-fweis...@gmail.com
Signed-off-by: Ingo Molnar 
---
 include/linux/vtime.h  | 32 ++--
 init/Kconfig   |  6 +++---
 kernel/sched/cputime.c | 16 +++-
 3 files changed, 20 insertions(+), 34 deletions(-)

diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index fa21969..d1977d84 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -14,6 +14,18 @@ struct task_struct;
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 static inline bool vtime_accounting_cpu_enabled(void) { return true; }
+
+#ifdef __ARCH_HAS_VTIME_ACCOUNT
+extern void vtime_account_irq_enter(struct task_struct *tsk);
+#else
+extern void vtime_common_account_irq_enter(struct task_struct *tsk);
+static inline void vtime_account_irq_enter(struct task_struct *tsk)
+{
+   if (vtime_accounting_cpu_enabled())
+   vtime_common_account_irq_enter(tsk);
+}
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -64,17 +76,6 @@ extern void vtime_account_system(struct task_struct *tsk);
 extern void vtime_account_idle(struct task_struct *tsk);
 extern void vtime_account_user(struct task_struct *tsk);
 
-#ifdef __ARCH_HAS_VTIME_ACCOUNT
-extern void vtime_account_irq_enter(struct task_struct *tsk);
-#else
-extern void vtime_common_account_irq_enter(struct task_struct *tsk);
-static inline void vtime_account_irq_enter(struct task_struct *tsk)
-{
-   if (vtime_accounting_cpu_enabled())
-   vtime_common_account_irq_enter(tsk);
-}
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 static inline void vtime_task_switch(struct task_struct *prev) { }
@@ -85,13 +86,8 @@ static inline void vtime_account_irq_enter(struct 
task_struct *tsk) { }
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_gen_account_irq_exit(struct task_struct *tsk);
-
-static inline void vtime_account_irq_exit(struct task_struct *tsk)
-{
-   if (vtime_accounting_cpu_enabled())
-   vtime_gen_account_irq_exit(tsk);
-}
+static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
+static inline void vtime_account_irq_exit(struct task_struct *tsk) { }
 
 extern void vtime_user_enter(struct task_struct *tsk);
 
diff --git a/init/Kconfig b/init/Kconfig
index c02d897..787dd76 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -375,9 +375,11 @@ config VIRT_CPU_ACCOUNTING_GEN
 
  If unsure, say N.
 
+endchoice
+
 config IRQ_TIME_ACCOUNTING
bool "Fine granularity task level IRQ time accounting"
-   depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL
+   depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE
help
  Select this option to enable fine granularity task irq time
  accounting. This is done by reading a timestamp on each
@@ -386,8 +388,6 @@ config IRQ_TIME_ACCOUNTING
 
  If in doubt, say N here.
 
-endchoice
-
 config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
depends on MULTIUSER
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index db82ae1..ca7e33c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -711,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
unsigned long now = READ_ONCE(jiffies);
-   cputime_t delta, steal;
+   cputime_t delta, other;
 
delta = jiffies_to_cputime(now - tsk->vtime_snap);
-   steal = steal_account_process_time(delta);
+   other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime

[tip:timers/nohz] sched/cputime: Count actually elapsed irq & softirq time

2016-07-14 Thread tip-bot for Rik van Riel
Commit-ID:  57430218317e5b280a80582a139b26029c25de6c
Gitweb: http://git.kernel.org/tip/57430218317e5b280a80582a139b26029c25de6c
Author: Rik van Riel 
AuthorDate: Wed, 13 Jul 2016 16:50:01 +0200
Committer:  Ingo Molnar 
CommitDate: Thu, 14 Jul 2016 10:42:34 +0200

sched/cputime: Count actually elapsed irq & softirq time

Currently, if there was any irq or softirq time during 'ticks'
jiffies, the entire period will be accounted as irq or softirq
time.

This is inaccurate if only a subset of the time was actually spent
handling irqs, and could conceivably mis-count all of the ticks during
a period as irq time, when there was some irq and some softirq time.

This can actually happen when irqtime_account_process_tick() is called
from account_idle_ticks, which can pass a larger number of ticks down
all at once.

Fix this by changing irqtime_account_hi_update(), irqtime_account_si_update(),
and steal_account_process_ticks() to work with cputime_t time units, and
return the amount of time spent in each mode.

Rename steal_account_process_ticks() to steal_account_process_time(), to
reflect that time is now accounted in cputime_t, instead of ticks.

Additionally, have irqtime_account_process_tick() take into account how
much time was spent in each of steal, irq, and softirq time.

The latter could help improve the accuracy of cputime
accounting when returning from idle on a NO_HZ_IDLE CPU.

Properly accounting how much time was spent in hardirq and
softirq time will also allow the NO_HZ_FULL code to re-use
these same functions for hardirq and softirq accounting.
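
A rough sketch of how a caller can combine the clamped return values; account_other_time() shows up in a later patch in this series, and the body below is an assumed shape for illustration, not the committed implementation:

/* Assumed shape, for illustration only. */
static cputime_t example_account_other_time(cputime_t max)
{
	cputime_t accounted;

	/* Each helper accounts at most the time still left in 'max'. */
	accounted = steal_account_process_time(max);

	if (accounted < max)
		accounted += irqtime_account_hi_update(max - accounted);

	if (accounted < max)
		accounted += irqtime_account_si_update(max - accounted);

	return accounted;
}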

Signed-off-by: Rik van Riel 
[ Make nsecs_to_cputime64() actually return cputime64_t. ]
Signed-off-by: Frederic Weisbecker 
Cc: Linus Torvalds 
Cc: Mike Galbraith 
Cc: Paolo Bonzini 
Cc: Peter Zijlstra 
Cc: Radim Krcmar 
Cc: Thomas Gleixner 
Cc: Wanpeng Li 
Link: 
http://lkml.kernel.org/r/1468421405-20056-2-git-send-email-fweis...@gmail.com
Signed-off-by: Ingo Molnar 
---
 include/asm-generic/cputime_nsecs.h |   2 +
 kernel/sched/cputime.c  | 124 ++--
 2 files changed, 79 insertions(+), 47 deletions(-)

diff --git a/include/asm-generic/cputime_nsecs.h 
b/include/asm-generic/cputime_nsecs.h
index 0f1c6f3..a84e28e 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -50,6 +50,8 @@ typedef u64 __nocast cputime64_t;
(__force u64)(__ct)
 #define nsecs_to_cputime(__nsecs)  \
(__force cputime_t)(__nsecs)
+#define nsecs_to_cputime64(__nsecs)\
+   (__force cputime64_t)(__nsecs)
 
 
 /*
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 3d60e5d..db82ae1 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -79,40 +79,50 @@ void irqtime_account_irq(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
-static int irqtime_account_hi_update(void)
+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
 {
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
-   u64 latest_ns;
-   int ret = 0;
+   cputime_t irq_cputime;
 
local_irq_save(flags);
-   latest_ns = this_cpu_read(cpu_hardirq_time);
-   if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
-   ret = 1;
+   irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
+ cpustat[CPUTIME_IRQ];
+   irq_cputime = min(irq_cputime, maxtime);
+   cpustat[CPUTIME_IRQ] += irq_cputime;
local_irq_restore(flags);
-   return ret;
+   return irq_cputime;
 }
 
-static int irqtime_account_si_update(void)
+static cputime_t irqtime_account_si_update(cputime_t maxtime)
 {
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
-   u64 latest_ns;
-   int ret = 0;
+   cputime_t softirq_cputime;
 
local_irq_save(flags);
-   latest_ns = this_cpu_read(cpu_softirq_time);
-   if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
-   ret = 1;
+   softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
+ cpustat[CPUTIME_SOFTIRQ];
+   softirq_cputime = min(softirq_cputime, maxtime);
+   cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
local_irq_restore(flags);
-   return ret;
+   return softirq_cputime;
 }
 
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #define sched_clock_irqtime(0)
 
+static cputime_t irqtime_account_hi_update(cputime_t dummy)
+{
+   return 0;
+}
+
+static cputime_t irqtime_account_si_update(cputime_t dummy)
+{
+   return 0;
+}
+
 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
 
 static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,32 +267,45 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
-static __always_inline unsigned long steal_account_process_tick(unsigned long 
max_jiffies)
+static __alway

[tip:sched/core] time, acct: Drop irq save & restore from __acct_update_integrals()

2016-02-29 Thread tip-bot for Rik van Riel
Commit-ID:  9344c92c2e72e495f695caef8364b3dd73af0eab
Gitweb: http://git.kernel.org/tip/9344c92c2e72e495f695caef8364b3dd73af0eab
Author: Rik van Riel 
AuthorDate: Wed, 10 Feb 2016 20:08:26 -0500
Committer:  Ingo Molnar 
CommitDate: Mon, 29 Feb 2016 09:53:09 +0100

time, acct: Drop irq save & restore from __acct_update_integrals()

It looks like all the call paths that lead to __acct_update_integrals()
already have irqs disabled, and __acct_update_integrals() does not need
to disable irqs itself.

This is very convenient since about half the CPU time left in this
function was spent in local_irq_save alone.

Performance of a microbenchmark that calls an invalid syscall
ten million times in a row on a nohz_full CPU improves 21% vs.
4.5-rc1 with both the removal of divisions from __acct_update_integrals()
and this patch, with runtime dropping from 3.7 to 2.9 seconds.

With these patches applied, the highest remaining cpu user in
the trace is native_sched_clock, which is addressed in the next
patch.

For testing purposes I stuck a WARN_ON(!irqs_disabled()) test
in __acct_update_integrals(). It did not trigger.
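
The temporary check mentioned above is small enough to show; it is not part of the committed diff below:

static void __acct_update_integrals(struct task_struct *tsk,
				    cputime_t utime, cputime_t stime)
{
	/* Temporary sanity check used while validating this change. */
	WARN_ON(!irqs_disabled());

	/* ... body unchanged, see the hunk below ... */
}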

Suggested-by: Peter Zijlstra 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Thomas Gleixner 
Cc: Linus Torvalds 
Cc: Mike Galbraith 
Cc: cl...@redhat.com
Cc: eric.duma...@gmail.com
Cc: fweis...@gmail.com
Cc: l...@amacapital.net
Link: http://lkml.kernel.org/r/1455152907-18495-4-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/tsacct.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index d12e815..f8e26ab 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -126,20 +126,18 @@ static void __acct_update_integrals(struct task_struct 
*tsk,
cputime_t utime, cputime_t stime)
 {
cputime_t time, dtime;
-   unsigned long flags;
u64 delta;
 
if (!likely(tsk->mm))
return;
 
-   local_irq_save(flags);
time = stime + utime;
dtime = time - tsk->acct_timexpd;
/* Avoid division: cputime_t is often in nanoseconds already. */
delta = cputime_to_nsecs(dtime);
 
if (delta < TICK_NSEC)
-   goto out;
+   return;
 
tsk->acct_timexpd = time;
/*
@@ -149,8 +147,6 @@ static void __acct_update_integrals(struct task_struct *tsk,
 */
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
-out:
-   local_irq_restore(flags);
 }
 
 /**
@@ -160,9 +156,12 @@ out:
 void acct_update_integrals(struct task_struct *tsk)
 {
cputime_t utime, stime;
+   unsigned long flags;
 
+   local_irq_save(flags);
task_cputime(tsk, &utime, &stime);
__acct_update_integrals(tsk, utime, stime);
+   local_irq_restore(flags);
 }
 
 /**


[tip:sched/core] sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity

2016-02-29 Thread tip-bot for Rik van Riel
Commit-ID:  ff9a9b4c4334b53b52ee9279f30bd5dd92ea9bdd
Gitweb: http://git.kernel.org/tip/ff9a9b4c4334b53b52ee9279f30bd5dd92ea9bdd
Author: Rik van Riel 
AuthorDate: Wed, 10 Feb 2016 20:08:27 -0500
Committer:  Ingo Molnar 
CommitDate: Mon, 29 Feb 2016 09:53:10 +0100

sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity

When profiling syscall overhead on nohz-full kernels,
after removing __acct_update_integrals() from the profile,
native_sched_clock() remains as the top CPU user. This can be
reduced by moving VIRT_CPU_ACCOUNTING_GEN to jiffy granularity.

This will reduce timing accuracy on nohz_full CPUs to jiffy
based sampling, just like on normal CPUs. It results in
totally removing native_sched_clock from the profile, and
significantly speeding up the syscall entry and exit path,
as well as irq entry and exit, and KVM guest entry & exit.

Additionally, only call the more expensive functions (and
advance the seqlock) when jiffies actually changed.

This code relies on another CPU advancing jiffies when the
system is busy. On a nohz_full system, this is done by a
housekeeping CPU.
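
To make the granularity concrete (illustrative, assuming CONFIG_HZ=1000): one jiffy is 1 ms, so any interval in which jiffies has not advanced reports a zero delta and the expensive seqcount update is skipped, as in the vtime_delta() and vtime_account_system() hunks below.

/* Illustration only; mirrors the vtime_delta() change in the diff below. */
static cputime_t example_vtime_delta(unsigned long vtime_snap)
{
	unsigned long now = READ_ONCE(jiffies);

	/* jiffies not past the snapshot yet: zero delta. */
	if (time_before(now, vtime_snap))
		return 0;

	/* At HZ=1000 this value has 1 ms resolution. */
	return jiffies_to_cputime(now - vtime_snap);
}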

A microbenchmark calling an invalid syscall number 10 million
times in a row speeds up an additional 30% over the numbers
with just the previous patches, for a total speedup of about
40% over 4.4 and 4.5-rc1.

Run times for the microbenchmark:

 4.4                         3.8 seconds
 4.5-rc1                     3.7 seconds
 4.5-rc1 + first patch       3.3 seconds
 4.5-rc1 + first 3 patches   3.1 seconds
 4.5-rc1 + all patches       2.3 seconds

A non-NOHZ_FULL cpu (not the housekeeping CPU):

 all kernels                 1.86 seconds

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Thomas Gleixner 
Cc: Linus Torvalds 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: cl...@redhat.com
Cc: eric.duma...@gmail.com
Cc: fweis...@gmail.com
Cc: l...@amacapital.net
Link: http://lkml.kernel.org/r/1455152907-18495-5-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/cputime.c | 39 +++
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b2ab2ff..01d9898 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, 
cputime_t *ut, cputime
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static unsigned long long vtime_delta(struct task_struct *tsk)
+static cputime_t vtime_delta(struct task_struct *tsk)
 {
-   unsigned long long clock;
+   unsigned long now = READ_ONCE(jiffies);
 
-   clock = local_clock();
-   if (clock < tsk->vtime_snap)
+   if (time_before(now, (unsigned long)tsk->vtime_snap))
return 0;
 
-   return clock - tsk->vtime_snap;
+   return jiffies_to_cputime(now - tsk->vtime_snap);
 }
 
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
-   unsigned long long delta = vtime_delta(tsk);
+   unsigned long now = READ_ONCE(jiffies);
+   unsigned long delta = now - tsk->vtime_snap;
 
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-   tsk->vtime_snap += delta;
+   tsk->vtime_snap = now;
 
-   /* CHECKME: always safe to convert nsecs to cputime? */
-   return nsecs_to_cputime(delta);
+   return jiffies_to_cputime(delta);
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
@@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
+   if (!vtime_delta(tsk))
+   return;
+
write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
write_seqcount_end(&tsk->vtime_seqcount);
@@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk)
 void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
write_seqcount_begin(&tsk->vtime_seqcount);
-   __vtime_account_system(tsk);
+   if (vtime_delta(tsk))
+   __vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
write_seqcount_end(&tsk->vtime_seqcount);
@@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk)
cputime_t delta_cpu;
 
write_seqcount_begin(&tsk->vtime_seqcount);
-   delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS;
-   account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+   if (vtime_delta(tsk)) {
+   delta_cpu = get_vtime_delta(tsk);
+   account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+   }
write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
write_seqcount_begin(&tsk->vtime_seqcount);
-   __vtime_account_system(tsk);
+   if

[tip:sched/core] acct, time: Change indentation in __acct_update_integrals()

2016-02-29 Thread tip-bot for Rik van Riel
Commit-ID:  b2add86edd3bc050af350515e6ba26f4622c38f3
Gitweb: http://git.kernel.org/tip/b2add86edd3bc050af350515e6ba26f4622c38f3
Author: Rik van Riel 
AuthorDate: Wed, 10 Feb 2016 20:08:25 -0500
Committer:  Ingo Molnar 
CommitDate: Mon, 29 Feb 2016 09:53:09 +0100

acct, time: Change indentation in __acct_update_integrals()

Change the indentation in __acct_update_integrals() to make the function
a little easier to read.

Suggested-by: Peter Zijlstra 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Thomas Gleixner 
Acked-by: Frederic Weisbecker 
Cc: Linus Torvalds 
Cc: Mike Galbraith 
Cc: cl...@redhat.com
Cc: eric.duma...@gmail.com
Cc: fweis...@gmail.com
Cc: l...@amacapital.net
Link: http://lkml.kernel.org/r/1455152907-18495-3-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/tsacct.c | 51 ++-
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 460ee2b..d12e815 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -125,31 +125,32 @@ void xacct_add_tsk(struct taskstats *stats, struct 
task_struct *p)
 static void __acct_update_integrals(struct task_struct *tsk,
cputime_t utime, cputime_t stime)
 {
-   if (likely(tsk->mm)) {
-   cputime_t time, dtime;
-   unsigned long flags;
-   u64 delta;
-
-   local_irq_save(flags);
-   time = stime + utime;
-   dtime = time - tsk->acct_timexpd;
-   /* Avoid division: cputime_t is often in nanoseconds already. */
-   delta = cputime_to_nsecs(dtime);
-
-   if (delta < TICK_NSEC)
-   goto out;
-
-   tsk->acct_timexpd = time;
-   /*
-* Divide by 1024 to avoid overflow, and to avoid division.
-* The final unit reported to userspace is Mbyte-usecs,
-* the rest of the math is done in xacct_add_tsk.
-*/
-   tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
-   tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
-   out:
-   local_irq_restore(flags);
-   }
+   cputime_t time, dtime;
+   unsigned long flags;
+   u64 delta;
+
+   if (!likely(tsk->mm))
+   return;
+
+   local_irq_save(flags);
+   time = stime + utime;
+   dtime = time - tsk->acct_timexpd;
+   /* Avoid division: cputime_t is often in nanoseconds already. */
+   delta = cputime_to_nsecs(dtime);
+
+   if (delta < TICK_NSEC)
+   goto out;
+
+   tsk->acct_timexpd = time;
+   /*
+* Divide by 1024 to avoid overflow, and to avoid division.
+* The final unit reported to userspace is Mbyte-usecs,
+* the rest of the math is done in xacct_add_tsk.
+*/
+   tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
+   tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
+out:
+   local_irq_restore(flags);
 }
 
 /**


[tip:sched/core] sched, time: Remove non-power-of-two divides from __acct_update_integrals()

2016-02-29 Thread tip-bot for Rik van Riel
Commit-ID:  382c2fe994321d503647ce8ee329b9420dc7c1f9
Gitweb: http://git.kernel.org/tip/382c2fe994321d503647ce8ee329b9420dc7c1f9
Author: Rik van Riel 
AuthorDate: Wed, 10 Feb 2016 20:08:24 -0500
Committer:  Ingo Molnar 
CommitDate: Mon, 29 Feb 2016 09:53:08 +0100

sched, time: Remove non-power-of-two divides from __acct_update_integrals()

When running a microbenchmark calling an invalid syscall number
in a loop, on a nohz_full CPU, we spend a full 9% of our CPU
time in __acct_update_integrals().

This function converts cputime_t to jiffies, to a timeval, only to
convert the timeval back to microseconds before discarding it.

This patch leaves __acct_update_integrals() functionally equivalent,
but speeds things up by about 12%, with 10 million calls to an
invalid syscall number dropping from 3.7 to 3.25 seconds.
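
A worked check of the new units (KB and MB here are the constants from kernel/tsacct.c; this is only a sanity check of the conversion used in the diff below):

/*
 * acct_rss_mem1 now accumulates:  pages * nsec / 1024
 *
 * coremem = acct_rss_mem1 * PAGE_SIZE / (1000 * KB)
 *         = (pages * nsec / 1024) * (bytes / page) / (1000 * 1024)
 *         = bytes * nsec / (1000 * 1024 * 1024)
 *         = bytes * usec / (1024 * 1024)
 *         = Mbyte-usecs, the unit reported to userspace.
 */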

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Thomas Gleixner 
Cc: Linus Torvalds 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: cl...@redhat.com
Cc: eric.duma...@gmail.com
Cc: fweis...@gmail.com
Cc: l...@amacapital.net
Link: http://lkml.kernel.org/r/1455152907-18495-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/tsacct.c | 26 --
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 975cb49..460ee2b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct 
task_struct *p)
 {
struct mm_struct *mm;
 
-   /* convert pages-usec to Mbyte-usec */
-   stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
-   stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
+   /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
+   stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
+   do_div(stats->coremem, 1000 * KB);
+   stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
+   do_div(stats->virtmem, 1000 * KB);
mm = get_task_mm(p);
if (mm) {
/* adjust to KB unit */
@@ -125,22 +127,26 @@ static void __acct_update_integrals(struct task_struct 
*tsk,
 {
if (likely(tsk->mm)) {
cputime_t time, dtime;
-   struct timeval value;
unsigned long flags;
u64 delta;
 
local_irq_save(flags);
time = stime + utime;
dtime = time - tsk->acct_timexpd;
-   jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
-   delta = value.tv_sec;
-   delta = delta * USEC_PER_SEC + value.tv_usec;
+   /* Avoid division: cputime_t is often in nanoseconds already. */
+   delta = cputime_to_nsecs(dtime);
 
-   if (delta == 0)
+   if (delta < TICK_NSEC)
goto out;
+
tsk->acct_timexpd = time;
-   tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
-   tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+   /*
+* Divide by 1024 to avoid overflow, and to avoid division.
+* The final unit reported to userspace is Mbyte-usecs,
+* the rest of the math is done in xacct_add_tsk.
+*/
+   tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
+   tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
out:
local_irq_restore(flags);
}


[tip:sched/core] sched/numa: Spread memory according to CPU and memory use

2016-02-09 Thread tip-bot for Rik van Riel
Commit-ID:  4142c3ebb685bb338b7d96090d8f90ff49065ff6
Gitweb: http://git.kernel.org/tip/4142c3ebb685bb338b7d96090d8f90ff49065ff6
Author: Rik van Riel 
AuthorDate: Mon, 25 Jan 2016 17:07:39 -0500
Committer:  Ingo Molnar 
CommitDate: Tue, 9 Feb 2016 14:47:18 +0100

sched/numa: Spread memory according to CPU and memory use

The pseudo-interleaving in NUMA placement has a fundamental problem:
using hard usage thresholds to spread memory equally between nodes
can prevent workloads from converging, or keep memory "trapped" on
nodes where the workload is barely running any more.

In order for workloads to properly converge, the memory migration
should not be stopped when nodes reach parity, but instead be
distributed according to how heavily memory is used from each node.
This way memory migration and task migration reinforce each other,
instead of one putting the brakes on the other.

Remove the hard thresholds from the pseudo-interleaving code, and
instead use a more gradual policy on memory placement. This also
seems to improve convergence of workloads that do not run flat out,
but sleep in between bursts of activity.

We still want to slow down NUMA scanning and migration once a workload
has settled on a few actively used nodes, so keep the 3/4 hysteresis
in place. Keep track of whether a workload is actively running on
multiple nodes, so task_numa_migrate does a full scan of the system
for better task placement.

In the case of running 3 SPECjbb2005 instances on a 4 node system,
this code seems to result in fairer distribution of memory between
nodes, with more memory bandwidth for each instance.
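
A worked example of the new comparison (all numbers invented): with faults_cpu(dst) = 400, faults_mem(dst) = 1000, faults_cpu(src) = 200 and faults_mem(src) = 800, the left-hand side is 400/1000 * 3/4 = 0.30 and the right-hand side is 200/800 = 0.25, so migration toward dst is allowed. The kernel keeps this in integer math by cross-multiplying, roughly:

/* Illustrative integer form of the 3/4-hysteresis check. */
static bool example_allow_migration(unsigned long dst_faults_cpu,
				    unsigned long dst_faults_mem,
				    unsigned long src_faults_cpu,
				    unsigned long src_faults_mem)
{
	/* dst_cpu/dst_mem * 3/4 > src_cpu/src_mem, without divisions. */
	return dst_faults_cpu * src_faults_mem * 3 >
	       src_faults_cpu * dst_faults_mem * 4;
}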

Signed-off-by: Rik van Riel 
Cc: Linus Torvalds 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: mgor...@suse.de
Link: http://lkml.kernel.org/r/20160125170739.2fc9a...@annuminas.surriel.com
[ Minor readability tweaks. ]
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 87 +
 1 file changed, 47 insertions(+), 40 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 51a4550..7ce24a4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -932,10 +932,11 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
+   int active_nodes;
 
struct rcu_head rcu;
-   nodemask_t active_nodes;
unsigned long total_faults;
+   unsigned long max_faults_cpu;
/*
 * Faults_cpu is used to decide whether memory should move
 * towards the CPU. As a consequence, these stats are weighted
@@ -994,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct 
numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+   return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > 
ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
int maxdist, bool task)
@@ -1143,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, 
struct page * page,
return true;
 
/*
-* Do not migrate if the destination is not a node that
-* is actively used by this numa group.
+* Destination node is much more heavily used than the source
+* node? Allow migration.
 */
-   if (!node_isset(dst_nid, ng->active_nodes))
-   return false;
-
-   /*
-* Source is a node that is not actively used by this
-* numa group, while the destination is. Migrate.
-*/
-   if (!node_isset(src_nid, ng->active_nodes))
+   if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+   ACTIVE_NODE_FRACTION)
return true;
 
/*
-* Both source and destination are nodes in active
-* use by this numa group. Maximize memory bandwidth
-* by migrating from more heavily used groups, to less
-* heavily used ones, spreading the load around.
-* Use a 1/4 hysteresis to avoid spurious page movement.
+* Distribute memory according to CPU & memory use on each node,
+* with 3/4 hysteresis to avoid unnecessary memory migrations:
+*
+* faults_cpu(dst)   3   faults_cpu(src)
+* --------------- * - > ---------------
+* faults_mem(dst)   4   faults_mem(src)
 */
-   return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+   return group_faults_cpu(ng, dst_nid) 

[tip:sched/core] sched/numa: Cap PTE scanning overhead to 3% of run time

2015-11-23 Thread tip-bot for Rik van Riel
Commit-ID:  51170840fe91dfca10fd533b303ea39b2524782a
Gitweb: http://git.kernel.org/tip/51170840fe91dfca10fd533b303ea39b2524782a
Author: Rik van Riel 
AuthorDate: Thu, 5 Nov 2015 15:56:23 -0500
Committer:  Ingo Molnar 
CommitDate: Mon, 23 Nov 2015 09:37:54 +0100

sched/numa: Cap PTE scanning overhead to 3% of run time

There is a fundamental mismatch between the runtime based NUMA scanning
at the task level, and the wall clock time NUMA scanning at the mm level.
On a severely overloaded system, with very large processes, this mismatch
can cause the system to spend all of its time in change_prot_numa().

This can happen if the task spends at least two ticks in change_prot_numa(),
and only gets two ticks of CPU time in the real time between two scan
intervals of the mm.

This patch ensures that a task never spends more than 3% of run
time scanning PTEs. It does that by ensuring that in-between
task_numa_work() runs, the task spends at least 32x as much time on
other things as it did on task_numa_work().

This is done stochastically: if a timer tick happens, or the task
gets rescheduled during task_numa_work(), we delay a future run of
task_numa_work() until the task has spent at least 32x the amount of
CPU time doing something else, as it spent inside task_numa_work().
The longer task_numa_work() takes, the more likely it is this happens.

If task_numa_work() takes very little time, chances are low that that
code will do anything, but we will not care.
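
To spell out where the 3% bound comes from:

/*
 * If one task_numa_work() invocation consumed t of CPU time, the next
 * invocation is deferred until the task has run for another 32*t, so
 * scanning takes at most t / (t + 32*t) = 1/33, i.e. roughly 3% of the
 * task's run time.
 */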

Reported-and-tested-by: Jan Stancek 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: mgor...@suse.de
Link: http://lkml.kernel.org/r/1446756983-28173-3-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 309b1d5..95b944e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2155,6 +2155,7 @@ void task_numa_work(struct callback_head *work)
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+   u64 runtime = p->se.sum_exec_runtime;
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
@@ -2277,6 +2278,17 @@ out:
else
reset_ptenuma_scan(p);
up_read(&mm->mmap_sem);
+
+   /*
+* Make sure tasks use at least 32x as much time to run other code
+* than they used here, to limit NUMA PTE scanning overhead to 3% max.
+* Usually update_task_scan_period slows down scanning enough; on an
+* overloaded system we need to limit overhead on a per task basis.
+*/
+   if (unlikely(p->se.sum_exec_runtime != runtime)) {
+   u64 diff = p->se.sum_exec_runtime - runtime;
+   p->node_stamp += 32 * diff;
+   }
 }
 
 /*


[tip:sched/urgent] sched/numa: Fix math underflow in task_tick_numa()

2015-11-09 Thread tip-bot for Rik van Riel
Commit-ID:  25b3e5a3344e1f700c1efec5b6f0199f04707fb1
Gitweb: http://git.kernel.org/tip/25b3e5a3344e1f700c1efec5b6f0199f04707fb1
Author: Rik van Riel 
AuthorDate: Thu, 5 Nov 2015 15:56:22 -0500
Committer:  Ingo Molnar 
CommitDate: Mon, 9 Nov 2015 16:13:27 +0100

sched/numa: Fix math underflow in task_tick_numa()

The NUMA balancing code implements delays in scanning by
advancing curr->node_stamp beyond curr->se.sum_exec_runtime.

With unsigned math, that creates an underflow, which results
in task_numa_work being queued all the time, even when we
don't want to.

Avoiding the math underflow makes it possible to reduce CPU
overhead in the NUMA balancing code.
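
A standalone demonstration of the wrap-around (values are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t now = 1000;		/* curr->se.sum_exec_runtime */
	uint64_t node_stamp = 5000;	/* already advanced into the future */
	uint64_t period = 2000;

	/* Old check: now - node_stamp wraps to a huge value, always "true". */
	printf("old check fires: %d\n", (now - node_stamp) > period);

	/* New check: no subtraction that can underflow. */
	printf("new check fires: %d\n", now > node_stamp + period);

	return 0;
}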

Reported-and-tested-by: Jan Stancek 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: mgor...@suse.de
Link: http://lkml.kernel.org/r/1446756983-28173-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 824aa9f..f04fda8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2302,7 +2302,7 @@ void task_tick_numa(struct rq *rq, struct task_struct 
*curr)
now = curr->se.sum_exec_runtime;
period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
 
-   if (now - curr->node_stamp > period) {
+   if (now > curr->node_stamp + period) {
if (!curr->node_stamp)
curr->numa_scan_period = task_scan_min(curr);
curr->node_stamp += period;


[tip:sched/core] sched/numa: Limit the amount of virtual memory scanned in task_numa_work()

2015-09-18 Thread tip-bot for Rik van Riel
Commit-ID:  4620f8c1fda2af4ccbd11e194e2dd785f7d7f279
Gitweb: http://git.kernel.org/tip/4620f8c1fda2af4ccbd11e194e2dd785f7d7f279
Author: Rik van Riel 
AuthorDate: Fri, 11 Sep 2015 09:00:27 -0400
Committer:  Ingo Molnar 
CommitDate: Fri, 18 Sep 2015 09:23:14 +0200

sched/numa: Limit the amount of virtual memory scanned in task_numa_work()

Currently task_numa_work() scans up to numa_balancing_scan_size_mb worth
of memory per invocation, but only counts memory areas that have at
least one PTE that is still present and not marked for numa hint faulting.

It will skip over arbitrarily large amounts of memory that are either
unused, full of swap ptes, or full of PTEs that were already marked
for NUMA hint faults but have not been faulted on yet.

This can cause excessive amounts of CPU use, due to there being
essentially no upper limit on the scan rate of very large processes
that are not yet in a phase where they are actively accessing old
memory pages (e.g. they are still initializing their data).

Avoid that problem by placing an upper limit on the amount of virtual
memory that task_numa_work() scans in each invocation. This can be a
higher limit than "pages", to ensure the task still skips over unused
areas fairly quickly.

While we are here, also fix the "nr_pte_updates" logic, so it only
counts page ranges with ptes in them.
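
For a rough sense of the two budgets (the 256 MB figure is the usual default of numa_balancing_scan_size_mb; actual values come from the sysctl):

/*
 * pages     = 256 MB >> PAGE_SHIFT  ~ 65536 4K pages worth of PTE updates
 * virtpages = pages * 8             ~ 2 GB of virtual address space walked
 *
 * The scan loop now stops when either budget runs out, so a pass over
 * unused or already prot_numa-marked ranges can no longer be unbounded.
 */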

Reported-by: Andrea Arcangeli 
Reported-by: Jan Stancek 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Mel Gorman 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Link: http://lkml.kernel.org/r/20150911090027.4a798...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9176f7c..1bfad9f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2157,7 +2157,7 @@ void task_numa_work(struct callback_head *work)
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
-   long pages;
+   long pages, virtpages;
 
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
 
@@ -2203,9 +2203,11 @@ void task_numa_work(struct callback_head *work)
start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+   virtpages = pages * 8; /* Scan up to this much virtual space */
if (!pages)
return;
 
+
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
if (!vma) {
@@ -2240,18 +2242,22 @@ void task_numa_work(struct callback_head *work)
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
-   nr_pte_updates += change_prot_numa(vma, start, end);
+   nr_pte_updates = change_prot_numa(vma, start, end);
 
/*
-* Scan sysctl_numa_balancing_scan_size but ensure that
-* at least one PTE is updated so that unused virtual
-* address space is quickly skipped.
+* Try to scan sysctl_numa_balancing_size worth of
+* hpages that have at least one present PTE that
+* is not already pte-numa. If the VMA contains
+* areas that are unused or already full of prot_numa
+* PTEs, scan up to virtpages, to skip through those
+* areas faster.
 */
if (nr_pte_updates)
pages -= (end - start) >> PAGE_SHIFT;
+   virtpages -= (end - start) >> PAGE_SHIFT;
 
start = end;
-   if (pages <= 0)
+   if (pages <= 0 || virtpages <= 0)
goto out;
 
cond_resched();


[tip:sched/core] sched/numa: Only consider less busy nodes as numa balancing destinations

2015-06-07 Thread tip-bot for Rik van Riel
Commit-ID:  6f9aad0bc37286c0441b57f0ba8cffee50715426
Gitweb: http://git.kernel.org/tip/6f9aad0bc37286c0441b57f0ba8cffee50715426
Author: Rik van Riel 
AuthorDate: Thu, 28 May 2015 09:52:49 -0400
Committer:  Ingo Molnar 
CommitDate: Sun, 7 Jun 2015 15:57:45 +0200

sched/numa: Only consider less busy nodes as numa balancing destinations

Changeset a43455a1d572 ("sched/numa: Ensure task_numa_migrate() checks
the preferred node") fixes an issue where workloads would never
converge on a fully loaded (or overloaded) system.

However, it introduces a regression on less than fully loaded systems,
where workloads converge on a few NUMA nodes, instead of properly
staying spread out across the whole system. This leads to a reduction
in available memory bandwidth, and usable CPU cache, with predictable
performance problems.

The root cause appears to be an interaction between the load balancer
and NUMA balancing, where the short term load represented by the load
balancer differs from the long term load the NUMA balancing code would
like to base its decisions on.

Simply reverting a43455a1d572 would re-introduce the non-convergence
of workloads on fully loaded systems, so that is not a good option. As
an aside, the check done before a43455a1d572 only applied to a task's
preferred node, not to other candidate nodes in the system, so the
converge-on-too-few-nodes problem still happens, just to a lesser
degree.

Instead, try to compensate for the impedance mismatch between the load
balancer and NUMA balancing by only ever considering a lesser loaded
node as a destination for NUMA balancing, regardless of whether the
task is trying to move to the preferred node, or to another node.

This patch also addresses the issue that a system with a single
runnable thread would never migrate that thread to near its memory,
introduced by 095bebf61a46 ("sched/numa: Do not move past the balance
point if unbalanced").

A test where the main thread creates a large memory area, and spawns a
worker thread to iterate over the memory (placed on another node by
select_task_rq_fair), after which the main thread goes to sleep and
waits for the worker thread to loop over all the memory now sees the
worker thread migrated to where the memory is, instead of having all
the memory migrated over like before.

Jirka has run a number of performance tests on several systems: single
instance SpecJBB 2005 performance is 7-15% higher on a 4 node system,
with higher gains on systems with more cores per socket.
Multi-instance SpecJBB 2005 (one per node), linpack, and stream see
little or no changes with the revert of 095bebf61a46 and this patch.
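
A worked example of the capacity-corrected load check in numa_has_capacity() (numbers invented): with src->load = 1200, dst->load = 1000 and compute_capacity = 1024 on both nodes, 1200 * 1024 > 1000 * 1024 holds and the less busy destination is considered; with the loads swapped the move is rejected. In isolation:

/* Illustrative only; mirrors the comparison in numa_has_capacity(). */
static bool example_dst_less_busy(long src_load, long src_capacity,
				  long dst_load, long dst_capacity)
{
	/* src->load/src->capacity > dst->load/dst->capacity, integer form. */
	return src_load * dst_capacity > dst_load * src_capacity;
}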

Reported-by: Artem Bityutski 
Reported-by: Jirka Hladky 
Tested-by: Jirka Hladky 
Tested-by: Artem Bityutskiy 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Mel Gorman 
Cc: Andrew Morton 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Srikar Dronamraju 
Cc: Thomas Gleixner 
Link: http://lkml.kernel.org/r/20150528095249.3083a...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 30 --
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 723d69e..4b6e5f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1398,6 +1398,30 @@ static void task_numa_find_cpu(struct task_numa_env *env,
}
 }
 
+/* Only move tasks to a NUMA node less busy than the current node. */
+static bool numa_has_capacity(struct task_numa_env *env)
+{
+   struct numa_stats *src = &env->src_stats;
+   struct numa_stats *dst = &env->dst_stats;
+
+   if (src->has_free_capacity && !dst->has_free_capacity)
+   return false;
+
+   /*
+* Only consider a task move if the source has a higher load
+* than the destination, corrected for CPU capacity on each node.
+*
+*  src->loaddst->load
+* - vs -
+* src->compute_capacitydst->compute_capacity
+*/
+   if (src->load * dst->compute_capacity >
+   dst->load * src->compute_capacity)
+   return true;
+
+   return false;
+}
+
 static int task_numa_migrate(struct task_struct *p)
 {
struct task_numa_env env = {
@@ -1452,7 +1476,8 @@ static int task_numa_migrate(struct task_struct *p)
update_numa_stats(&env.dst_stats, env.dst_nid);
 
/* Try to find a spot on the preferred nid. */
-   task_numa_find_cpu(&env, taskimp, groupimp);
+   if (numa_has_capacity(&env))
+   task_numa_find_cpu(&env, taskimp, groupimp);
 
/*
 * Look at other nodes in these cases:
@@ -1483,7 +1508,8 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
-   task_n

[tip:sched/core] Revert 095bebf61a46 ("sched/numa: Do not move past the balance point if unbalanced")

2015-06-07 Thread tip-bot for Rik van Riel
Commit-ID:  e4991b240c622f0441c21f4869e13209abc08c5e
Gitweb: http://git.kernel.org/tip/e4991b240c622f0441c21f4869e13209abc08c5e
Author: Rik van Riel 
AuthorDate: Wed, 27 May 2015 15:04:27 -0400
Committer:  Ingo Molnar 
CommitDate: Sun, 7 Jun 2015 15:57:44 +0200

Revert 095bebf61a46 ("sched/numa: Do not move past the balance point if 
unbalanced")

Commit 095bebf61a46 ("sched/numa: Do not move past the balance point
if unbalanced") broke convergence of workloads with just one runnable
thread, by making it impossible for the one runnable thread on the
system to move from one NUMA node to another.

Instead, the thread would remain where it was, and pull all the memory
across to its location, which is much slower than just migrating the
thread to where the memory is.

The next patch has a better fix for the issue that 095bebf61a46 tried
to address.

Reported-by: Jirka Hladky 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andrew Morton 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: dedeki...@gmail.com
Cc: mgor...@suse.de
Link: http://lkml.kernel.org/r/1432753468-7785-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 41 +++--
 1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 84ada05..723d69e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1198,11 +1198,9 @@ static void task_numa_assign(struct task_numa_env *env,
 static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
 {
+   long imb, old_imb;
+   long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
-   long orig_src_load;
-   long load_a, load_b;
-   long moved_load;
-   long imb;
 
/*
 * The load is corrected for the CPU capacity available on each node.
@@ -1215,39 +1213,30 @@ static bool load_too_imbalanced(long src_load, long 
dst_load,
dst_capacity = env->dst_stats.compute_capacity;
 
/* We care about the slope of the imbalance, not the direction. */
-   load_a = dst_load;
-   load_b = src_load;
-   if (load_a < load_b)
-   swap(load_a, load_b);
+   if (dst_load < src_load)
+   swap(dst_load, src_load);
 
/* Is the difference below the threshold? */
-   imb = load_a * src_capacity * 100 -
-   load_b * dst_capacity * env->imbalance_pct;
+   imb = dst_load * src_capacity * 100 -
+ src_load * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
 
/*
 * The imbalance is above the allowed threshold.
-* Allow a move that brings us closer to a balanced situation,
-* without moving things past the point of balance.
+* Compare it with the old imbalance.
 */
orig_src_load = env->src_stats.load;
+   orig_dst_load = env->dst_stats.load;
 
-   /*
-* In a task swap, there will be one load moving from src to dst,
-* and another moving back. This is the net sum of both moves.
-* A simple task move will always have a positive value.
-* Allow the move if it brings the system closer to a balanced
-* situation, without crossing over the balance point.
-*/
-   moved_load = orig_src_load - src_load;
+   if (orig_dst_load < orig_src_load)
+   swap(orig_dst_load, orig_src_load);
 
-   if (moved_load > 0)
-   /* Moving src -> dst. Did we overshoot balance? */
-   return src_load * dst_capacity < dst_load * src_capacity;
-   else
-   /* Moving dst -> src. Did we overshoot balance? */
-   return dst_load * src_capacity < src_load * dst_capacity;
+   old_imb = orig_dst_load * src_capacity * 100 -
+ orig_src_load * dst_capacity * env->imbalance_pct;
+
+   /* Would this change make things worse? */
+   return (imb > old_imb);
 }
 
 /*


[tip:sched/core] sched/numa: Reduce conflict between fbq_classify_rq() and migration

2015-05-19 Thread tip-bot for Rik van Riel
Commit-ID:  c1ceac6276e4ee12e4129afd380db10fae0db7df
Gitweb: http://git.kernel.org/tip/c1ceac6276e4ee12e4129afd380db10fae0db7df
Author: Rik van Riel 
AuthorDate: Thu, 14 May 2015 22:59:36 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 19 May 2015 08:39:19 +0200

sched/numa: Reduce conflict between fbq_classify_rq() and migration

It is possible for fbq_classify_rq() to indicate that a CPU has tasks that
should be moved to another NUMA node, but for migrate_improves_locality
and migrate_degrades_locality to not identify those tasks.

This patch always gives preference to preferred node evaluations, and
only checks the number of faults when evaluating moves between two
non-preferred nodes on a larger NUMA system.

On a two node system, the number of faults is never evaluated. Either
a task is about to be pulled off its preferred node, or migrated onto
it.
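
The resulting decision order can be sketched in a few lines of
standalone C; struct task_model and its fixed-size fault arrays below
are invented stand-ins for task_struct and its per-node NUMA fault
statistics, not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

struct task_model {
        int preferred_nid;
        bool in_numa_group;
        unsigned long task_faults[8];   /* per-node counts, toy sizing */
        unsigned long group_faults[8];
};

static bool migration_improves_locality(const struct task_model *p,
                                        int src_nid, int dst_nid)
{
        unsigned long src, dst;

        if (src_nid == dst_nid)
                return false;

        /* Moving onto the preferred node always wins ... */
        if (dst_nid == p->preferred_nid)
                return true;

        /* ... and moving off it always loses. */
        if (src_nid == p->preferred_nid)
                return false;

        /* Only between two non-preferred nodes do fault counts matter. */
        if (p->in_numa_group) {
                src = p->group_faults[src_nid];
                dst = p->group_faults[dst_nid];
        } else {
                src = p->task_faults[src_nid];
                dst = p->task_faults[dst_nid];
        }
        return dst > src;
}

int main(void)
{
        struct task_model p = {
                .preferred_nid = 1,
                .in_numa_group = false,
                .task_faults = { 10, 50 },
        };

        /* Pulling the task onto node 1, its preferred node: prints 1. */
        printf("%d\n", migration_improves_locality(&p, 0, 1));
        return 0;
}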

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: mgor...@suse.de
Link: http://lkml.kernel.org/r/20150514225936.35b91...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 60 +
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a27d988..0d4632f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5663,10 +5663,15 @@ static int task_hot(struct task_struct *p, struct 
lb_env *env)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
+/*
+ * Returns true if the destination node is the preferred node.
+ * Needs to match fbq_classify_rq(): if there is a runnable task
+ * that is not on its preferred node, we should identify it.
+ */
 static bool migrate_improves_locality(struct task_struct *p, struct lb_env 
*env)
 {
struct numa_group *numa_group = rcu_dereference(p->numa_group);
+   unsigned long src_faults, dst_faults;
int src_nid, dst_nid;
 
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5680,29 +5685,30 @@ static bool migrate_improves_locality(struct 
task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
 
-   if (numa_group) {
-   /* Task is already in the group's interleave set. */
-   if (node_isset(src_nid, numa_group->active_nodes))
-   return false;
-
-   /* Task is moving into the group's interleave set. */
-   if (node_isset(dst_nid, numa_group->active_nodes))
-   return true;
-
-   return group_faults(p, dst_nid) > group_faults(p, src_nid);
-   }
-
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
return true;
 
-   return task_faults(p, dst_nid) > task_faults(p, src_nid);
+   /* Migrating away from the preferred node is bad. */
+   if (src_nid == p->numa_preferred_nid)
+   return false;
+
+   if (numa_group) {
+   src_faults = group_faults(p, src_nid);
+   dst_faults = group_faults(p, dst_nid);
+   } else {
+   src_faults = task_faults(p, src_nid);
+   dst_faults = task_faults(p, dst_nid);
+   }
+
+   return dst_faults > src_faults;
 }
 
 
 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env 
*env)
 {
struct numa_group *numa_group = rcu_dereference(p->numa_group);
+   unsigned long src_faults, dst_faults;
int src_nid, dst_nid;
 
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5717,23 +5723,23 @@ static bool migrate_degrades_locality(struct 
task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
 
-   if (numa_group) {
-   /* Task is moving within/into the group's interleave set. */
-   if (node_isset(dst_nid, numa_group->active_nodes))
-   return false;
+   /* Migrating away from the preferred node is bad. */
+   if (src_nid == p->numa_preferred_nid)
+   return true;
 
-   /* Task is moving out of the group's interleave set. */
-   if (node_isset(src_nid, numa_group->active_nodes))
-   return true;
+   /* Encourage migration to the preferred node. */
+   if (dst_nid == p->numa_preferred_nid)
+   return false;
 
-   return group_faults(p, dst_nid) < group_faults(p, src_nid);
+   if (numa_group) {
+   src_faults = group_faults(p, src_nid);
+   dst_faults = group_faults(p, dst_nid);
+   } else {
+   src_faults = task_faults(p, src_nid);
+   dst_faults = task_faults(p, dst_nid);
}
 
-   /* Migrating away from the preferred node is always bad. */
-   if (src_nid == p->numa_preferred_nid)
-  

[tip:x86/fpu] x86/fpu: Use an explicit if/else in switch_fpu_prepare()

2015-02-19 Thread tip-bot for Rik van Riel
Commit-ID:  1361ef29c7e49ae7cf37220c25fac1904b77f71a
Gitweb: http://git.kernel.org/tip/1361ef29c7e49ae7cf37220c25fac1904b77f71a
Author: Rik van Riel 
AuthorDate: Fri, 6 Feb 2015 15:02:03 -0500
Committer:  Borislav Petkov 
CommitDate: Thu, 19 Feb 2015 11:15:54 +0100

x86/fpu: Use an explicit if/else in switch_fpu_prepare()

Use an explicit if/else branch after __save_init_fpu(old) in
switch_fpu_prepare(). This makes substituting the assignment with a call
in task_disable_lazy_fpu_restore() in the next patch easier to review.

Signed-off-by: Rik van Riel 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Link: http://lkml.kernel.org/r/1423252925-14451-7-git-send-email-r...@redhat.com
[ Space out stuff for more readability. ]
Signed-off-by: Borislav Petkov 
---
 arch/x86/include/asm/fpu-internal.h | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/fpu-internal.h 
b/arch/x86/include/asm/fpu-internal.h
index 9c27f44..04c2807 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -434,13 +434,17 @@ static inline fpu_switch_t switch_fpu_prepare(struct 
task_struct *old, struct ta
 * If the task has used the math, pre-load the FPU on xsave processors
 * or if the past 5 consecutive context-switches used math.
 */
-   fpu.preload = tsk_used_math(new) && (use_eager_fpu() ||
-new->thread.fpu_counter > 5);
+   fpu.preload = tsk_used_math(new) &&
+ (use_eager_fpu() || new->thread.fpu_counter > 5);
+
if (__thread_has_fpu(old)) {
if (!__save_init_fpu(old))
-   cpu = ~0;
-   old->thread.fpu.last_cpu = cpu;
-   old->thread.fpu.has_fpu = 0;/* But leave fpu_owner_task! */
+   old->thread.fpu.last_cpu = ~0;
+   else
+   old->thread.fpu.last_cpu = cpu;
+
+   /* But leave fpu_owner_task! */
+   old->thread.fpu.has_fpu = 0;
 
/* Don't change CR0.TS if we just switch! */
if (fpu.preload) {


[tip:x86/fpu] x86/fpu: Introduce task_disable_lazy_fpu_restore() helper

2015-02-19 Thread tip-bot for Rik van Riel
Commit-ID:  33e03dedd759cc9396252d9641b25d01909a26bb
Gitweb: http://git.kernel.org/tip/33e03dedd759cc9396252d9641b25d01909a26bb
Author: Rik van Riel 
AuthorDate: Fri, 6 Feb 2015 15:02:02 -0500
Committer:  Borislav Petkov 
CommitDate: Thu, 19 Feb 2015 11:15:53 +0100

x86/fpu: Introduce task_disable_lazy_fpu_restore() helper

Currently there are a few magic assignments sprinkled through the
code that disable lazy FPU state restoring, some more effective than
others, and all equally mystifying.

It would be easier to have a helper to explicitly disable lazy
FPU state restoring for a task.

Signed-off-by: Rik van Riel 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Link: http://lkml.kernel.org/r/1423252925-14451-6-git-send-email-r...@redhat.com
Signed-off-by: Borislav Petkov 
---
 arch/x86/include/asm/fpu-internal.h | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/include/asm/fpu-internal.h 
b/arch/x86/include/asm/fpu-internal.h
index 217d6d7..9c27f44 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -79,6 +79,16 @@ static inline void __cpu_disable_lazy_restore(unsigned int 
cpu)
per_cpu(fpu_owner_task, cpu) = NULL;
 }
 
+/*
+ * Used to indicate that the FPU state in memory is newer than the FPU
+ * state in registers, and the FPU state should be reloaded next time the
+ * task is run. Only safe on the current task, or non-running tasks.
+ */
+static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk)
+{
+   tsk->thread.fpu.last_cpu = ~0;
+}
+
 static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
 {
return new == this_cpu_read_stable(fpu_owner_task) &&


[tip:x86/fpu] x86/fpu: Use task_disable_lazy_fpu_restore() helper

2015-02-19 Thread tip-bot for Rik van Riel
Commit-ID:  6a5fe8952bd676baf382d14df21e7b32b5d8943e
Gitweb: http://git.kernel.org/tip/6a5fe8952bd676baf382d14df21e7b32b5d8943e
Author: Rik van Riel 
AuthorDate: Fri, 6 Feb 2015 15:02:04 -0500
Committer:  Borislav Petkov 
CommitDate: Thu, 19 Feb 2015 11:15:55 +0100

x86/fpu: Use task_disable_lazy_fpu_restore() helper

Replace magic assignments of fpu.last_cpu = ~0 with more explicit
task_disable_lazy_fpu_restore() calls.

Signed-off-by: Rik van Riel 
Cc: Oleg Nesterov 
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/1423252925-14451-8-git-send-email-r...@redhat.com
Signed-off-by: Borislav Petkov 
---
 arch/x86/include/asm/fpu-internal.h | 4 ++--
 arch/x86/kernel/i387.c  | 2 +-
 arch/x86/kernel/process.c   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/fpu-internal.h 
b/arch/x86/include/asm/fpu-internal.h
index 04c2807..e5f8f8e 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -439,7 +439,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct 
task_struct *old, struct ta
 
if (__thread_has_fpu(old)) {
if (!__save_init_fpu(old))
-   old->thread.fpu.last_cpu = ~0;
+   task_disable_lazy_fpu_restore(old);
else
old->thread.fpu.last_cpu = cpu;
 
@@ -455,7 +455,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct 
task_struct *old, struct ta
stts();
} else {
old->thread.fpu_counter = 0;
-   old->thread.fpu.last_cpu = ~0;
+   task_disable_lazy_fpu_restore(old);
if (fpu.preload) {
new->thread.fpu_counter++;
if (!use_eager_fpu() && fpu_lazy_restore(new, cpu))
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f3ced6f..5722ab6 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -236,7 +236,7 @@ int init_fpu(struct task_struct *tsk)
if (tsk_used_math(tsk)) {
if (cpu_has_fpu && tsk == current)
unlazy_fpu(tsk);
-   tsk->thread.fpu.last_cpu = ~0;
+   task_disable_lazy_fpu_restore(tsk);
return 0;
}
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e127dda..ce8b103 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -68,8 +68,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct 
task_struct *src)
 
dst->thread.fpu_counter = 0;
dst->thread.fpu.has_fpu = 0;
-   dst->thread.fpu.last_cpu = ~0;
dst->thread.fpu.state = NULL;
+   task_disable_lazy_fpu_restore(dst);
if (tsk_used_math(src)) {
int err = fpu_alloc(&dst->thread.fpu);
if (err)


[tip:x86/fpu] x86/fpu: Also check fpu_lazy_restore() when use_eager_fpu()

2015-02-19 Thread tip-bot for Rik van Riel
Commit-ID:  728e53fef429a0f3c9dda3587c3ccc57ad268b70
Gitweb: http://git.kernel.org/tip/728e53fef429a0f3c9dda3587c3ccc57ad268b70
Author: Rik van Riel 
AuthorDate: Fri, 6 Feb 2015 15:02:05 -0500
Committer:  Borislav Petkov 
CommitDate: Thu, 19 Feb 2015 11:15:55 +0100

x86/fpu: Also check fpu_lazy_restore() when use_eager_fpu()

With Oleg's patch:

  33a3ebdc077f ("x86, fpu: Don't abuse has_fpu in __kernel_fpu_begin/end()")

kernel threads no longer have an FPU state even on systems with
use_eager_fpu().

That in turn means that a task may still have its FPU state
loaded in the FPU registers, if the task only got interrupted by
kernel threads from when it went to sleep, to when it woke up
again.

In that case, there is no need to restore the FPU state for
this task, since it is still in the registers.

The kernel can simply use the same logic to determine this as
is used for !use_eager_fpu() systems.
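
A toy userspace model of that ownership check; fpu_owner and struct
task below are invented stand-ins for the per-CPU fpu_owner_task
pointer and task_struct, so this is an illustration of the idea rather
than kernel code.

#include <stdbool.h>
#include <stdio.h>

struct task { int last_cpu; };

static struct task *fpu_owner[4];       /* one slot per CPU, toy size */

/* The task's FPU state is still live if this CPU still "owns" it. */
static bool fpu_still_loaded(struct task *t, int cpu)
{
        return t == fpu_owner[cpu] && cpu == t->last_cpu;
}

int main(void)
{
        struct task user_task = { .last_cpu = 2 };

        /* user_task ran last on CPU 2 and left its state there. */
        fpu_owner[2] = &user_task;

        /*
         * A kernel thread then runs on CPU 2; since kernel threads no
         * longer claim the FPU, fpu_owner[2] is untouched. When
         * user_task is scheduled back on CPU 2, no restore is needed
         * and this prints 1.
         */
        printf("%d\n", fpu_still_loaded(&user_task, 2));
        return 0;
}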

Signed-off-by: Rik van Riel 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Link: http://lkml.kernel.org/r/1423252925-14451-9-git-send-email-r...@redhat.com
Signed-off-by: Borislav Petkov 
---
 arch/x86/include/asm/fpu-internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/fpu-internal.h 
b/arch/x86/include/asm/fpu-internal.h
index e5f8f8e..19fb41c 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -458,7 +458,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct 
task_struct *old, struct ta
task_disable_lazy_fpu_restore(old);
if (fpu.preload) {
new->thread.fpu_counter++;
-   if (!use_eager_fpu() && fpu_lazy_restore(new, cpu))
+   if (fpu_lazy_restore(new, cpu))
fpu.preload = 0;
else
prefetch(new->thread.fpu.state);


[tip:x86/fpu] x86/fpu: Move lazy restore functions up a few lines

2015-02-19 Thread tip-bot for Rik van Riel
Commit-ID:  1c927eea4cad83c439cb51e9c96ad19cb005157d
Gitweb: http://git.kernel.org/tip/1c927eea4cad83c439cb51e9c96ad19cb005157d
Author: Rik van Riel 
AuthorDate: Fri, 6 Feb 2015 15:02:01 -0500
Committer:  Borislav Petkov 
CommitDate: Thu, 19 Feb 2015 11:15:53 +0100

x86/fpu: Move lazy restore functions up a few lines

We need another lazy restore related function, that will be called
from a function that is above where the lazy restore functions are
now. It would be nice to keep all three functions grouped together.

Signed-off-by: Rik van Riel 
Cc: Linus Torvalds 
Cc: Oleg Nesterov 
Link: http://lkml.kernel.org/r/1423252925-14451-5-git-send-email-r...@redhat.com
Signed-off-by: Borislav Petkov 
---
 arch/x86/include/asm/fpu-internal.h | 36 ++--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/fpu-internal.h 
b/arch/x86/include/asm/fpu-internal.h
index 02f2e08..217d6d7 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -67,6 +67,24 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft);
 static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
 #endif
 
+/*
+ * Must be run with preemption disabled: this clears the fpu_owner_task,
+ * on this CPU.
+ *
+ * This will disable any lazy FPU state restore of the current FPU state,
+ * but if the current thread owns the FPU, it will still be saved by.
+ */
+static inline void __cpu_disable_lazy_restore(unsigned int cpu)
+{
+   per_cpu(fpu_owner_task, cpu) = NULL;
+}
+
+static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
+{
+   return new == this_cpu_read_stable(fpu_owner_task) &&
+   cpu == new->thread.fpu.last_cpu;
+}
+
 static inline int is_ia32_compat_frame(void)
 {
return config_enabled(CONFIG_IA32_EMULATION) &&
@@ -398,24 +416,6 @@ static inline void drop_init_fpu(struct task_struct *tsk)
  */
 typedef struct { int preload; } fpu_switch_t;
 
-/*
- * Must be run with preemption disabled: this clears the fpu_owner_task,
- * on this CPU.
- *
- * This will disable any lazy FPU state restore of the current FPU state,
- * but if the current thread owns the FPU, it will still be saved by.
- */
-static inline void __cpu_disable_lazy_restore(unsigned int cpu)
-{
-   per_cpu(fpu_owner_task, cpu) = NULL;
-}
-
-static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
-{
-   return new == this_cpu_read_stable(fpu_owner_task) &&
-   cpu == new->thread.fpu.last_cpu;
-}
-
 static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct 
task_struct *new, int cpu)
 {
fpu_switch_t fpu;


[tip:sched/core] sched/rt/nohz: Stop scheduler tick if running realtime task

2015-02-18 Thread tip-bot for Rik van Riel
Commit-ID:  1e78cdbd9b2266503339accafe0ebdd99b93a531
Gitweb: http://git.kernel.org/tip/1e78cdbd9b2266503339accafe0ebdd99b93a531
Author: Rik van Riel 
AuthorDate: Mon, 16 Feb 2015 15:23:49 -0500
Committer:  Ingo Molnar 
CommitDate: Wed, 18 Feb 2015 18:21:19 +0100

sched/rt/nohz: Stop scheduler tick if running realtime task

If the CPU is running a realtime task that does not round-robin
with another realtime task of equal priority, there is no point
in keeping the scheduler tick going. After all, whenever the
scheduler tick runs, the kernel will just decide not to
reschedule.

Extend sched_can_stop_tick() to recognize these situations, and
inform the rest of the kernel that the scheduler tick can be
stopped.
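
For illustration, a small userspace program that creates exactly this
situation: a lone SCHED_FIFO task pinned to one CPU. Whether the tick
really stops also depends on the kernel configuration (NO_HZ_FULL with
that CPU listed in nohz_full=); the program needs root or CAP_SYS_NICE,
and the CPU number and priority are arbitrary choices.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };
        cpu_set_t mask;

        /* Pin ourselves to CPU 3 and switch to SCHED_FIFO. */
        CPU_ZERO(&mask);
        CPU_SET(3, &mask);
        if (sched_setaffinity(0, sizeof(mask), &mask) ||
            sched_setscheduler(0, SCHED_FIFO, &sp)) {
                perror("sched setup");
                return 1;
        }

        /* Sole FIFO task at this priority: the tick has nothing to do. */
        for (;;)
                ;
}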

Tested-by: Luiz Capitulino 
Signed-off-by: Rik van Riel 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Cc: Thomas Gleixner 
Cc: fweis...@redhat.com
Cc: mtosa...@redhat.com
Link: http://lkml.kernel.org/r/20150216152349.6a8ed...@annuminas.surriel.com
[ Small cleanliness tweak. ]
Signed-off-by: Ingo Molnar 
---
 kernel/sched/core.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a4869bd..97fe79c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -690,6 +690,23 @@ static inline bool got_nohz_idle_kick(void)
 bool sched_can_stop_tick(void)
 {
/*
+* FIFO realtime policy runs the highest priority task. Other runnable
+* tasks are of a lower priority. The scheduler tick does nothing.
+*/
+   if (current->policy == SCHED_FIFO)
+   return true;
+
+   /*
+* Round-robin realtime tasks time slice with other tasks at the same
+* realtime priority. Is this task the only one at this priority?
+*/
+   if (current->policy == SCHED_RR) {
+   struct sched_rt_entity *rt_se = &current->rt;
+
+   return rt_se->run_list.prev == rt_se->run_list.next;
+   }
+
+   /*
 * More than one running task need preemption.
 * nr_running update is assumed to be visible
 * after IPI is sent from wakers.


[tip:sched/core] sched/numa: Do not move past the balance point if unbalanced

2015-02-18 Thread tip-bot for Rik van Riel
Commit-ID:  095bebf61a460ad7f6a45bb17ddbf3a9df2b4397
Gitweb: http://git.kernel.org/tip/095bebf61a460ad7f6a45bb17ddbf3a9df2b4397
Author: Rik van Riel 
AuthorDate: Tue, 3 Feb 2015 16:56:48 -0500
Committer:  Ingo Molnar 
CommitDate: Wed, 18 Feb 2015 16:18:00 +0100

sched/numa: Do not move past the balance point if unbalanced

There is a subtle interaction between the logic introduced in commit
e63da03639cc ("sched/numa: Allow task switch if load imbalance improves"),
the way the load balancer counts the load on each NUMA node, and the way
NUMA hinting faults are done.

Specifically, the load balancer only counts currently running tasks
in the load, while NUMA hinting faults may cause tasks to stop, if
the page is locked by another task.

This could cause all of the threads of a large single instance workload,
like SPECjbb2005, to migrate to the same NUMA node. This was possible
because occasionally they all fault on the same few pages, and only one
of the threads remains runnable. That thread can move to the process's
preferred NUMA node without making the imbalance worse, because nothing
else is running at that time.

The fix is to check the direction of the net moving of load, and to
refuse a NUMA move if it would cause the system to move past the point
of balance.  In an unbalanced state, only moves that bring us closer
to the balance point are allowed.
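
A standalone sketch of the overshoot test added below; the helper is a
simplified copy of the new logic and the load/capacity numbers in
main() are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

/*
 * src_load/dst_load are the loads after the proposed move,
 * orig_src_load is the source load before it.
 */
static bool past_balance_point(long src_load, long dst_load,
                               long orig_src_load,
                               long src_capacity, long dst_capacity)
{
        long moved_load = orig_src_load - src_load;

        if (moved_load > 0)     /* net move src -> dst: overshoot? */
                return src_load * dst_capacity < dst_load * src_capacity;
        else                    /* net move dst -> src: overshoot? */
                return dst_load * src_capacity < src_load * dst_capacity;
}

int main(void)
{
        /*
         * Equal capacities. The source drops from 600 to 300 while the
         * destination rises to 500: the move would leave the source
         * less loaded than the destination, i.e. it crosses the
         * balance point and is refused (prints 1).
         */
        printf("%d\n", past_balance_point(300, 500, 600, 1024, 1024));
        return 0;
}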

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: mgor...@suse.de
Link: http://lkml.kernel.org/r/20150203165648.0e9ac...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 41 ++---
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3..28cbaca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1196,9 +1196,11 @@ static void task_numa_assign(struct task_numa_env *env,
 static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
 {
-   long imb, old_imb;
-   long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
+   long orig_src_load;
+   long load_a, load_b;
+   long moved_load;
+   long imb;
 
/*
 * The load is corrected for the CPU capacity available on each node.
@@ -1211,30 +1213,39 @@ static bool load_too_imbalanced(long src_load, long 
dst_load,
dst_capacity = env->dst_stats.compute_capacity;
 
/* We care about the slope of the imbalance, not the direction. */
-   if (dst_load < src_load)
-   swap(dst_load, src_load);
+   load_a = dst_load;
+   load_b = src_load;
+   if (load_a < load_b)
+   swap(load_a, load_b);
 
/* Is the difference below the threshold? */
-   imb = dst_load * src_capacity * 100 -
- src_load * dst_capacity * env->imbalance_pct;
+   imb = load_a * src_capacity * 100 -
+   load_b * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
 
/*
 * The imbalance is above the allowed threshold.
-* Compare it with the old imbalance.
+* Allow a move that brings us closer to a balanced situation,
+* without moving things past the point of balance.
 */
orig_src_load = env->src_stats.load;
-   orig_dst_load = env->dst_stats.load;
 
-   if (orig_dst_load < orig_src_load)
-   swap(orig_dst_load, orig_src_load);
-
-   old_imb = orig_dst_load * src_capacity * 100 -
- orig_src_load * dst_capacity * env->imbalance_pct;
+   /*
+* In a task swap, there will be one load moving from src to dst,
+* and another moving back. This is the net sum of both moves.
+* A simple task move will always have a positive value.
+* Allow the move if it brings the system closer to a balanced
+* situation, without crossing over the balance point.
+*/
+   moved_load = orig_src_load - src_load;
 
-   /* Would this change make things worse? */
-   return (imb > old_imb);
+   if (moved_load > 0)
+   /* Moving src -> dst. Did we overshoot balance? */
+   return src_load * dst_capacity < dst_load * src_capacity;
+   else
+   /* Moving dst -> src. Did we overshoot balance? */
+   return dst_load * src_capacity < src_load * dst_capacity;
 }
 
 /*


[tip:sched/core] sched/numa: Check all nodes when placing a pseudo-interleaved group

2014-10-28 Thread tip-bot for Rik van Riel
Commit-ID:  9de05d48711cd5314920ed05f873d84eaf66ccf1
Gitweb: http://git.kernel.org/tip/9de05d48711cd5314920ed05f873d84eaf66ccf1
Author: Rik van Riel 
AuthorDate: Thu, 9 Oct 2014 17:27:47 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Oct 2014 10:47:52 +0100

sched/numa: Check all nodes when placing a pseudo-interleaved group

In pseudo-interleaved numa_groups, all tasks try to relocate to
the group's preferred_nid.  When a group is spread across multiple
NUMA nodes, this can lead to tasks swapping their location with
other tasks inside the same group, instead of swapping location with
tasks from other NUMA groups. This can keep NUMA groups from converging.

Examining all nodes, when dealing with a task in a pseudo-interleaved
NUMA group, avoids this problem. Note that only CPUs in nodes that
improve the task or group score are examined, so the loop isn't too
bad.

Tested-by: Vinod Chegu 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: "Vinod Chegu" 
Cc: mgor...@suse.de
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/20141009172747.0d97c...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7760c2a..ec32c26d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1436,8 +1436,15 @@ static int task_numa_migrate(struct task_struct *p)
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
 
-   /* No space available on the preferred nid. Look elsewhere. */
-   if (env.best_cpu == -1) {
+   /*
+* Look at other nodes in these cases:
+* - there is no space available on the preferred_nid
+* - the task is part of a numa_group that is interleaved across
+*   multiple NUMA nodes; in order to better consolidate the group,
+*   we need to check other locations.
+*/
+   if (env.best_cpu == -1 || (p->numa_group &&
+   nodes_weight(p->numa_group->active_nodes) > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;


[tip:sched/core] sched/numa: Prepare for complex topology placement

2014-10-28 Thread tip-bot for Rik van Riel
Commit-ID:  7bd953206b0b5e0a3aded871982367410b42e1b1
Gitweb: http://git.kernel.org/tip/7bd953206b0b5e0a3aded871982367410b42e1b1
Author: Rik van Riel 
AuthorDate: Fri, 17 Oct 2014 03:29:51 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Oct 2014 10:47:49 +0100

sched/numa: Prepare for complex topology placement

Preparatory patch for adding NUMA placement on systems with
complex NUMA topology. Also fix a potential divide by zero
in group_weight()

Signed-off-by: Rik van Riel 
Tested-by: Chegu Vinod 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/1413530994-9732-4-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 57 ++---
 1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 34baa60..0af3bed 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -931,9 +931,10 @@ static inline unsigned long group_faults_cpu(struct 
numa_group *group, int nid)
  * larger multiplier, in order to group tasks together that are almost
  * evenly spread out between numa nodes.
  */
-static inline unsigned long task_weight(struct task_struct *p, int nid)
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+   int dist)
 {
-   unsigned long total_faults;
+   unsigned long faults, total_faults;
 
if (!p->numa_faults_memory)
return 0;
@@ -943,15 +944,25 @@ static inline unsigned long task_weight(struct 
task_struct *p, int nid)
if (!total_faults)
return 0;
 
-   return 1000 * task_faults(p, nid) / total_faults;
+   faults = task_faults(p, nid);
+   return 1000 * faults / total_faults;
 }
 
-static inline unsigned long group_weight(struct task_struct *p, int nid)
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+int dist)
 {
-   if (!p->numa_group || !p->numa_group->total_faults)
+   unsigned long faults, total_faults;
+
+   if (!p->numa_group)
+   return 0;
+
+   total_faults = p->numa_group->total_faults;
+
+   if (!total_faults)
return 0;
 
-   return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+   faults = group_faults(p, nid);
+   return 1000 * faults / total_faults;
 }
 
 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1084,6 +1095,7 @@ struct task_numa_env {
struct numa_stats src_stats, dst_stats;
 
int imbalance_pct;
+   int dist;
 
struct task_struct *best_task;
long best_imp;
@@ -1163,6 +1175,7 @@ static void task_numa_compare(struct task_numa_env *env,
long load;
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
+   int dist = env->dist;
 
rcu_read_lock();
 
@@ -1196,8 +1209,8 @@ static void task_numa_compare(struct task_numa_env *env,
 * in any group then look only at task weights.
 */
if (cur->numa_group == env->p->numa_group) {
-   imp = taskimp + task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+   imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
 * Add some hysteresis to prevent swapping the
 * tasks within a group over tiny differences.
@@ -1211,11 +1224,11 @@ static void task_numa_compare(struct task_numa_env *env,
 * instead.
 */
if (cur->numa_group)
-   imp += group_weight(cur, env->src_nid) -
-  group_weight(cur, env->dst_nid);
+   imp += group_weight(cur, env->src_nid, dist) -
+  group_weight(cur, env->dst_nid, dist);
else
-   imp += task_weight(cur, env->src_nid) -
-  task_weight(cur, env->dst_nid);
+   imp += task_weight(cur, env->src_nid, dist) -
+  task_weight(cur, env->dst_nid, dist);
}
}
 
@@ -1314,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
-   int nid, ret;
+   int nid, ret, dist;
long taskimp, groupimp;
 
/*
@@ -1342,12 +1355,13 @@ static int task_numa_migrate(struct task_struct *p)
return -EINVAL;
}
 
-   taskweight = task_weight(p, env.src_nid);
-   groupweight = grou

[tip:sched/core] sched/numa: Calculate node scores in complex NUMA topologies

2014-10-28 Thread tip-bot for Rik van Riel
Commit-ID:  6c6b1193e71fed1a58dc3fab9d967d245177f87b
Gitweb: http://git.kernel.org/tip/6c6b1193e71fed1a58dc3fab9d967d245177f87b
Author: Rik van Riel 
AuthorDate: Fri, 17 Oct 2014 03:29:52 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Oct 2014 10:47:50 +0100

sched/numa: Calculate node scores in complex NUMA topologies

In order to do task placement on systems with complex NUMA topologies,
it is necessary to count the faults on nodes nearby the node that is
being examined for a potential move.

In case of a system with a backplane interconnect, we are dealing with
groups of NUMA nodes; each of the nodes within a group is the same number
of hops away from nodes in other groups in the system. Optimal placement
on this topology is achieved by counting all nearby nodes equally. When
comparing nodes A and B at distance N, nearby nodes are those at distances
smaller than N from nodes A or B.

Placement strategy on a system with a glueless mesh NUMA topology needs
to be different, because there are no natural groups of nodes determined
by the hardware. Instead, when dealing with two nodes A and B at distance
N, N >= 2, there will be intermediate nodes at distance < N from both nodes
A and B. Good placement can be achieved by right shifting the faults on
nearby nodes by the number of hops from the node being scored. In this
context, a nearby node is any node closer than the maximum distance in
the system; nodes at the maximum distance are skipped, but only for
efficiency reasons, not because of any placement policy.

Placement policy on directly connected NUMA systems is not affected.
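
A worked example of the glueless-mesh scaling implemented in
score_nearby_nodes() below. The maximum distance, per-node distances
and fault counts are invented; on real hardware they come from the
SLIT table and the NUMA hinting fault statistics.

#include <stdio.h>

#define LOCAL_DISTANCE 10       /* SLIT convention for "same node" */

int main(void)
{
        int max_dist = 40;
        struct { int dist; unsigned long faults; } nearby[] = {
                { 20, 900 },    /* one hop away */
                { 30, 900 },    /* two hops away */
        };
        unsigned long score = 0;

        for (int i = 0; i < 2; i++) {
                unsigned long f = nearby[i].faults;

                /* The further away a node is, the less its faults count. */
                f = f * (max_dist - nearby[i].dist) /
                        (max_dist - LOCAL_DISTANCE);
                printf("dist %d contributes %lu\n", nearby[i].dist, f);
                score += f;
        }
        /* 900 faults at distance 20 count as 600, at distance 30 as 300. */
        printf("total nearby score %lu\n", score);
        return 0;
}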

Signed-off-by: Rik van Riel 
Tested-by: Chegu Vinod 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Link: http://lkml.kernel.org/r/1413530994-9732-5-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 74 +
 1 file changed, 74 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0af3bed..7e5712a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -925,6 +925,71 @@ static inline unsigned long group_faults_cpu(struct 
numa_group *group, int nid)
group->faults_cpu[task_faults_idx(nid, 1)];
 }
 
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+   int maxdist, bool task)
+{
+   unsigned long score = 0;
+   int node;
+
+   /*
+* All nodes are directly connected, and the same distance
+* from each other. No need for fancy placement algorithms.
+*/
+   if (sched_numa_topology_type == NUMA_DIRECT)
+   return 0;
+
+   /*
+* This code is called for each node, introducing N^2 complexity,
+* which should be ok given the number of nodes rarely exceeds 8.
+*/
+   for_each_online_node(node) {
+   unsigned long faults;
+   int dist = node_distance(nid, node);
+
+   /*
+* The furthest away nodes in the system are not interesting
+* for placement; nid was already counted.
+*/
+   if (dist == sched_max_numa_distance || node == nid)
+   continue;
+
+   /*
+* On systems with a backplane NUMA topology, compare groups
+* of nodes, and move tasks towards the group with the most
+* memory accesses. When comparing two nodes at distance
+* "hoplimit", only nodes closer by than "hoplimit" are part
+* of each group. Skip other nodes.
+*/
+   if (sched_numa_topology_type == NUMA_BACKPLANE &&
+   dist > maxdist)
+   continue;
+
+   /* Add up the faults from nearby nodes. */
+   if (task)
+   faults = task_faults(p, node);
+   else
+   faults = group_faults(p, node);
+
+   /*
+* On systems with a glueless mesh NUMA topology, there are
+* no fixed "groups of nodes". Instead, nodes that are not
+* directly connected bounce traffic through intermediate
+* nodes; a numa_group can occupy any set of nodes.
+* The further away a node is, the less the faults count.
+* This seems to result in good task placement.
+*/
+   if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+   faults *= (sched_max_numa_distance - dist);
+   faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+   }
+
+   score += faults;
+   }
+
+   return score;
+}
+
 /*
  * These return the fraction of accesses done by a particu

[tip:sched/core] sched/numa: Classify the NUMA topology of a system

2014-10-28 Thread tip-bot for Rik van Riel
Commit-ID:  e3fe70b1f72e3f83a00d9c332ec09ab347a981e2
Gitweb: http://git.kernel.org/tip/e3fe70b1f72e3f83a00d9c332ec09ab347a981e2
Author: Rik van Riel 
AuthorDate: Fri, 17 Oct 2014 03:29:50 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Oct 2014 10:47:48 +0100

sched/numa: Classify the NUMA topology of a system

Smaller NUMA systems tend to have all NUMA nodes directly connected
to each other. This includes the degenerate case of a system with just
one node, ie. a non-NUMA system.

Larger systems can have two kinds of NUMA topology, which affects how
tasks and memory should be placed on the system.

On glueless mesh systems, nodes that are not directly connected to
each other will bounce traffic through intermediary nodes. Task groups
can be run closer to each other by moving tasks from a node to an
intermediary node between it and the task's preferred node.

On NUMA systems with backplane controllers, the intermediary hops
are incapable of running programs. This creates "islands" of nodes
that are at an equal distance to anywhere else in the system.

Each kind of topology requires a slightly different placement
algorithm; this patch provides the mechanism to detect the kind
of NUMA topology of a system.
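
A userspace sketch of those classification rules applied to an
invented 4-node distance table (SLIT-style, 10 = local, 20 = one hop);
it mirrors the tests in init_numa_topology_type() below but is only an
illustration, not kernel code.

#include <stdio.h>

enum topo { DIRECT, GLUELESS_MESH, BACKPLANE };

#define NR 4
static const int dist[NR][NR] = {       /* two 2-node "islands" */
        { 10, 20, 40, 40 },
        { 20, 10, 40, 40 },
        { 40, 40, 10, 20 },
        { 40, 40, 20, 10 },
};

static enum topo classify(int max_dist)
{
        /* At most one hop between any two nodes: directly connected. */
        if (max_dist <= 20)
                return DIRECT;

        for (int a = 0; a < NR; a++)
                for (int b = 0; b < NR; b++) {
                        if (dist[a][b] < max_dist)
                                continue;
                        /*
                         * a and b are maximally far apart: is there an
                         * intermediary node closer to both of them?
                         */
                        for (int c = 0; c < NR; c++)
                                if (dist[a][c] < max_dist &&
                                    dist[b][c] < max_dist)
                                        return GLUELESS_MESH;
                        return BACKPLANE;
                }
        return DIRECT;          /* not reached for a valid max_dist */
}

int main(void)
{
        /* The islands can only reach each other via a backplane: prints 2. */
        printf("%d\n", classify(40));
        return 0;
}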

Signed-off-by: Rik van Riel 
Tested-by: Chegu Vinod 
[ Changed to use kernel/sched/sched.h ]
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Link: http://lkml.kernel.org/r/1413530994-9732-3-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/core.c  | 53 
 kernel/sched/sched.h |  6 ++
 2 files changed, 59 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4007595..cde8481 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6128,6 +6128,7 @@ static void claim_allocations(int cpu, struct 
sched_domain *sd)
 
 #ifdef CONFIG_NUMA
 static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
 static int *sched_domains_numa_distance;
 int sched_max_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
@@ -6316,6 +6317,56 @@ bool find_numa_distance(int distance)
return false;
 }
 
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ *   is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ *   there is an intermediary node C, which is < N hops away from both
+ *   nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+   int a, b, c, n;
+
+   n = sched_max_numa_distance;
+
+   if (n <= 1)
+   sched_numa_topology_type = NUMA_DIRECT;
+
+   for_each_online_node(a) {
+   for_each_online_node(b) {
+   /* Find two nodes furthest removed from each other. */
+   if (node_distance(a, b) < n)
+   continue;
+
+   /* Is there an intermediary node between a and b? */
+   for_each_online_node(c) {
+   if (node_distance(a, c) < n &&
+   node_distance(b, c) < n) {
+   sched_numa_topology_type =
+   NUMA_GLUELESS_MESH;
+   return;
+   }
+   }
+
+   sched_numa_topology_type = NUMA_BACKPLANE;
+   return;
+   }
+   }
+}
+
 static void sched_init_numa(void)
 {
int next_distance, curr_distance = node_distance(0, 0);
@@ -6449,6 +6500,8 @@ static void sched_init_numa(void)
 
sched_domains_numa_levels = level;
sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+   init_numa_topology_type();
 }
 
 static void sched_domains_numa_masks_set(int cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 443d6e1..57aacea 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -679,6 +679,12 @@ static inline u64 rq_clock_task(struct rq *rq)
 }
 
 #ifdef CONFIG_NUMA
+enum numa_topology_type {
+   NUMA_DIRECT,
+   NUMA_GLUELESS_MESH,
+   NUMA_BACKPLANE,
+};
+extern enum numa_topology_type sched_

[tip:sched/core] sched/numa: Find the preferred nid with complex NUMA topology

2014-10-28 Thread tip-bot for Rik van Riel
Commit-ID:  54009416ac3b5f219c0df68559ce534287ae97b1
Gitweb: http://git.kernel.org/tip/54009416ac3b5f219c0df68559ce534287ae97b1
Author: Rik van Riel 
AuthorDate: Fri, 17 Oct 2014 03:29:53 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Oct 2014 10:47:51 +0100

sched/numa: Find the preferred nid with complex NUMA topology

On systems with complex NUMA topologies, the node scoring is adjusted
to allow workloads to converge on nodes that are near each other.

The way a task group's preferred nid is determined needs to be adjusted,
in order for the preferred_nid to be consistent with group_weight scoring.
This ensures that we actually try to converge workloads on adjacent nodes.
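
A toy walk-through of the backplane search described above, simplified
to plain arrays (the kernel uses nodemask_t and only visits distances
that actually exist in the system); the 4-node distance table and the
group fault counts are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

#define NR 4
static const int dist[NR][NR] = {       /* two 2-node "islands" */
        { 10, 20, 40, 40 },
        { 20, 10, 40, 40 },
        { 40, 40, 10, 20 },
        { 40, 40, 20, 10 },
};
static const unsigned long faults[NR] = { 50, 30, 400, 100 };

int main(void)
{
        bool nodes[NR] = { true, true, true, true };
        int nid = 0;

        /* Shrink the distance, keeping the group with the most faults. */
        for (int d = 40; d > 10; d -= 10) {
                unsigned long max_faults = 0;
                bool max_group[NR] = { false };

                for (int a = 0; a < NR; a++) {
                        unsigned long f = 0;
                        bool group[NR] = { false };

                        if (!nodes[a])
                                continue;
                        /* Group: surviving nodes within distance d of a. */
                        for (int b = 0; b < NR; b++)
                                if (nodes[b] && dist[a][b] < d) {
                                        f += faults[b];
                                        group[b] = true;
                                }
                        if (f > max_faults) {
                                max_faults = f;
                                for (int b = 0; b < NR; b++)
                                        max_group[b] = group[b];
                                nid = a;
                        }
                }
                for (int b = 0; b < NR; b++)
                        nodes[b] = max_group[b];
        }
        /* The {2,3} island wins on faults, then node 2 within it. */
        printf("preferred nid: %d\n", nid);
        return 0;
}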

Signed-off-by: Rik van Riel 
Tested-by: Chegu Vinod 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/1413530994-9732-6-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 88 -
 1 file changed, 87 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e5712a..7760c2a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1659,6 +1659,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, 
u64 *period)
return delta;
 }
 
+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+   nodemask_t nodes;
+   int dist;
+
+   /* Direct connections between all NUMA nodes. */
+   if (sched_numa_topology_type == NUMA_DIRECT)
+   return nid;
+
+   /*
+* On a system with glueless mesh NUMA topology, group_weight
+* scores nodes according to the number of NUMA hinting faults on
+* both the node itself, and on nearby nodes.
+*/
+   if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+   unsigned long score, max_score = 0;
+   int node, max_node = nid;
+
+   dist = sched_max_numa_distance;
+
+   for_each_online_node(node) {
+   score = group_weight(p, node, dist);
+   if (score > max_score) {
+   max_score = score;
+   max_node = node;
+   }
+   }
+   return max_node;
+   }
+
+   /*
+* Finding the preferred nid in a system with NUMA backplane
+* interconnect topology is more involved. The goal is to locate
+* tasks from numa_groups near each other in the system, and
+* untangle workloads from different sides of the system. This requires
+* searching down the hierarchy of node groups, recursively searching
+* inside the highest scoring group of nodes. The nodemask tricks
+* keep the complexity of the search down.
+*/
+   nodes = node_online_map;
+   for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+   unsigned long max_faults = 0;
+   nodemask_t max_group;
+   int a, b;
+
+   /* Are there nodes at this distance from each other? */
+   if (!find_numa_distance(dist))
+   continue;
+
+   for_each_node_mask(a, nodes) {
+   unsigned long faults = 0;
+   nodemask_t this_group;
+   nodes_clear(this_group);
+
+   /* Sum group's NUMA faults; includes a==b case. */
+   for_each_node_mask(b, nodes) {
+   if (node_distance(a, b) < dist) {
+   faults += group_faults(p, b);
+   node_set(b, this_group);
+   node_clear(b, nodes);
+   }
+   }
+
+   /* Remember the top group. */
+   if (faults > max_faults) {
+   max_faults = faults;
+   max_group = this_group;
+   /*
+* subtle: at the smallest distance there is
+* just one node left in each "group", the
+* winner is the preferred nid.
+*/
+   nid = a;
+   }
+   }
+   /* Next round, evaluate the nodes within max_group. */
+   nodes = max_group;
+   }
+   return nid;
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
int seq, nid, max_nid = -1, max_group_nid = -1;
@@ -1741,7 +1827,7 @@ static void tas

[tip:sched/core] sched/numa: Export info needed for NUMA balancing on complex topologies

2014-10-28 Thread tip-bot for Rik van Riel
Commit-ID:  9942f79baaaf111d63ebf0862a819278d84fccc4
Gitweb: http://git.kernel.org/tip/9942f79baaaf111d63ebf0862a819278d84fccc4
Author: Rik van Riel 
AuthorDate: Fri, 17 Oct 2014 03:29:49 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Oct 2014 10:47:47 +0100

sched/numa: Export info needed for NUMA balancing on complex topologies

Export some information that is necessary to do placement of
tasks on systems with multi-level NUMA topologies.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/1413530994-9732-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/core.c  | 4 +++-
 kernel/sched/sched.h | 5 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 240157c..4007595 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6129,6 +6129,7 @@ static void claim_allocations(int cpu, struct 
sched_domain *sd)
 #ifdef CONFIG_NUMA
 static int sched_domains_numa_levels;
 static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 #endif
@@ -6300,7 +6301,7 @@ static void sched_numa_warn(const char *str)
printk(KERN_WARNING "\n");
 }
 
-static bool find_numa_distance(int distance)
+bool find_numa_distance(int distance)
 {
int i;
 
@@ -6447,6 +6448,7 @@ static void sched_init_numa(void)
sched_domain_topology = tl;
 
sched_domains_numa_levels = level;
+   sched_max_numa_distance = sched_domains_numa_distance[level - 1];
 }
 
 static void sched_domains_numa_masks_set(int cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24156c84..443d6e1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -678,6 +678,11 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
 }
 
+#ifdef CONFIG_NUMA
+extern int sched_max_numa_distance;
+extern bool find_numa_distance(int distance);
+#endif
+
 #ifdef CONFIG_NUMA_BALANCING
 extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);


[tip:sched/core] sched, time: Fix build error with 64 bit cputime_t on 32 bit systems

2014-10-02 Thread tip-bot for Rik van Riel
Commit-ID:  347abad981c1ef815ea5ba861adba6a8c6aa1580
Gitweb: http://git.kernel.org/tip/347abad981c1ef815ea5ba861adba6a8c6aa1580
Author: Rik van Riel 
AuthorDate: Tue, 30 Sep 2014 15:59:47 -0400
Committer:  Ingo Molnar 
CommitDate: Fri, 3 Oct 2014 05:46:55 +0200

sched, time: Fix build error with 64 bit cputime_t on 32 bit systems

On 32 bit systems cmpxchg cannot handle 64 bit values, so
some additional magic is required to allow a 32 bit system
with CONFIG_VIRT_CPU_ACCOUNTING_GEN=y enabled to build.

Make sure the correct cmpxchg function is used when doing
an atomic swap of a cputime_t.
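
A userspace analogue of the monotonic-advance loop in cputime_advance()
below, written with the GCC/Clang __atomic builtins on a uint64_t. On
32-bit x86 this compiles down to cmpxchg8b (build with -march=i586 or
later), which is exactly why the kernel has to route a 64-bit cputime_t
through cmpxchg64() instead of plain cmpxchg().

#include <inttypes.h>
#include <stdio.h>

static void advance(uint64_t *counter, uint64_t new)
{
        uint64_t old = __atomic_load_n(counter, __ATOMIC_RELAXED);

        /* Only ever move the counter forward, even if we race. */
        while (new > old &&
               !__atomic_compare_exchange_n(counter, &old, new, false,
                                            __ATOMIC_RELAXED,
                                            __ATOMIC_RELAXED))
                ;       /* a failed CAS refreshed 'old'; retry */
}

int main(void)
{
        uint64_t stime = 0;

        advance(&stime, 100);
        advance(&stime, 90);    /* stale, smaller value: ignored */
        advance(&stime, 120);
        printf("%" PRIu64 "\n", stime);         /* prints 120 */
        return 0;
}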

Reported-by: Arnd Bergmann 
Signed-off-by: Rik van Riel 
Acked-by: Arnd Bergmann 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: umgwanakikb...@gmail.com
Cc: fweis...@gmail.com
Cc: s...@redhat.com
Cc: lwood...@redhat.com
Cc: atheu...@redhat.com
Cc: o...@redhat.com
Cc: Andrew Morton 
Cc: Benjamin Herrenschmidt 
Cc: Heiko Carstens 
Cc: Linus Torvalds 
Cc: Martin Schwidefsky 
Cc: Michael Ellerman 
Cc: Paul Mackerras 
Cc: linux...@de.ibm.com
Cc: linux-a...@vger.kernel.org
Cc: linuxppc-...@lists.ozlabs.org
Cc: linux-s...@vger.kernel.org
Link: http://lkml.kernel.org/r/20140930155947.070cd...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 arch/powerpc/include/asm/cputime.h|  2 ++
 arch/s390/include/asm/cputime.h   |  2 ++
 include/asm-generic/cputime_jiffies.h |  2 ++
 include/asm-generic/cputime_nsecs.h   |  2 ++
 kernel/sched/cputime.c| 29 +++--
 5 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/cputime.h 
b/arch/powerpc/include/asm/cputime.h
index 607559a..6c840ce 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -32,6 +32,8 @@ static inline void setup_cputime_one_jiffy(void) { }
 typedef u64 __nocast cputime_t;
 typedef u64 __nocast cputime64_t;
 
+#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
+
 #ifdef __KERNEL__
 
 /*
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index f65bd36..3001887 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -18,6 +18,8 @@
 typedef unsigned long long __nocast cputime_t;
 typedef unsigned long long __nocast cputime64_t;
 
+#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
+
 static inline unsigned long __div(unsigned long long n, unsigned long base)
 {
 #ifndef CONFIG_64BIT
diff --git a/include/asm-generic/cputime_jiffies.h 
b/include/asm-generic/cputime_jiffies.h
index d5cb78f5..fe386fc 100644
--- a/include/asm-generic/cputime_jiffies.h
+++ b/include/asm-generic/cputime_jiffies.h
@@ -3,6 +3,8 @@
 
 typedef unsigned long __nocast cputime_t;
 
+#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
+
 #define cputime_one_jiffy  jiffies_to_cputime(1)
 #define cputime_to_jiffies(__ct)   (__force unsigned long)(__ct)
 #define cputime_to_scaled(__ct)(__ct)
diff --git a/include/asm-generic/cputime_nsecs.h 
b/include/asm-generic/cputime_nsecs.h
index 4e81760..0419485 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -21,6 +21,8 @@
 typedef u64 __nocast cputime_t;
 typedef u64 __nocast cputime64_t;
 
+#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
+
 #define cputime_one_jiffy  jiffies_to_cputime(1)
 
 #define cputime_div(__ct, divisor)  div_u64((__force u64)__ct, divisor)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 64492df..8394b1e 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,6 +555,23 @@ drop_precision:
 }
 
 /*
+ * Atomically advance counter to the new value. Interrupts, vcpu
+ * scheduling, and scaling inaccuracies can cause cputime_advance
+ * to be occasionally called with a new value smaller than counter.
+ * Let's enforce atomicity.
+ *
+ * Normally a caller will only go through this loop once, or not
+ * at all in case a previous caller updated counter the same jiffy.
+ */
+static void cputime_advance(cputime_t *counter, cputime_t new)
+{
+   cputime_t old;
+
+   while (new > (old = ACCESS_ONCE(*counter)))
+   cmpxchg_cputime(counter, old, new);
+}
+
+/*
  * Adjust tick based cputime random precision against scheduler
  * runtime accounting.
  */
@@ -599,16 +616,8 @@ static void cputime_adjust(struct task_cputime *curr,
utime = rtime - stime;
}
 
-   /*
-* If the tick based count grows faster than the scheduler one,
-* the result of the scaling may go backward.
-* Let's enforce monotonicity.
-* Atomic exchange protects against concurrent cputime_adjust().
-*/
-   while (stime > (rtime = ACCESS_ONCE(prev->stime)))
-   cmpxchg(&prev->stime, rtime, stime);
-   while (utime > (rtime = ACCESS_ONCE(prev->utime)))
-   cmpxchg(&prev->utime, rtime, utime);
+   cputime_advance(&prev->stim

[tip:sched/core] sched, time: Fix lock inversion in thread_group_cputime()

2014-09-19 Thread tip-bot for Rik van Riel
Commit-ID:  9c368b5b6eccce1cbd7f68142106b3b4ddb1c5b5
Gitweb: http://git.kernel.org/tip/9c368b5b6eccce1cbd7f68142106b3b4ddb1c5b5
Author: Rik van Riel 
AuthorDate: Fri, 12 Sep 2014 09:12:15 -0400
Committer:  Ingo Molnar 
CommitDate: Fri, 19 Sep 2014 12:35:17 +0200

sched, time: Fix lock inversion in thread_group_cputime()

The sig->stats_lock nests inside the tasklist_lock and the
sighand->siglock in __exit_signal and wait_task_zombie.

However, both of those locks can be taken from irq context,
which means we need to use the interrupt safe variant of
read_seqbegin_or_lock. This blocks interrupts when the "lock"
branch is taken (seq is odd), preventing the lock inversion.

On the first (lockless) pass through the loop, irqs are not
blocked.

Reported-by: Stanislaw Gruszka 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: pra...@redhat.com
Cc: o...@redhat.com
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/1410527535-9814-3-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/cputime.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2b57031..64492df 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -289,13 +289,14 @@ void thread_group_cputime(struct task_struct *tsk, struct 
task_cputime *times)
cputime_t utime, stime;
struct task_struct *t;
unsigned int seq, nextseq;
+   unsigned long flags;
 
rcu_read_lock();
/* Attempt a lockless read on the first round. */
nextseq = 0;
do {
seq = nextseq;
-   read_seqbegin_or_lock(&sig->stats_lock, &seq);
+   flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
times->utime = sig->utime;
times->stime = sig->stime;
times->sum_exec_runtime = sig->sum_sched_runtime;
@@ -309,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct 
task_cputime *times)
/* If lockless access failed, take the lock. */
nextseq = 1;
} while (need_seqretry(&sig->stats_lock, seq));
-   done_seqretry(&sig->stats_lock, seq);
+   done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
rcu_read_unlock();
 }
 


[tip:sched/core] seqlock: Add irqsave variant of read_seqbegin_or_lock()

2014-09-19 Thread tip-bot for Rik van Riel
Commit-ID:  ef8ac06359ddf95431cf6bb04ad2b36fff562328
Gitweb: http://git.kernel.org/tip/ef8ac06359ddf95431cf6bb04ad2b36fff562328
Author: Rik van Riel 
AuthorDate: Fri, 12 Sep 2014 09:12:14 -0400
Committer:  Ingo Molnar 
CommitDate: Fri, 19 Sep 2014 12:35:16 +0200

seqlock: Add irqsave variant of read_seqbegin_or_lock()

There are cases where read_seqbegin_or_lock() needs to block irqs,
because the seqlock in question nests inside a lock that is also
taken from irq context.

Add read_seqbegin_or_lock_irqsave() and done_seqretry_irqrestore(), which
are almost identical to read_seqbegin_or_lock() and done_seqretry().

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: pra...@redhat.com
Cc: o...@redhat.com
Cc: sgrus...@redhat.com
Cc: Al Viro 
Cc: John Stultz 
Cc: Linus Torvalds 
Cc: Mathieu Desnoyers 
Cc: Stephen Boyd 
Cc: Trond Myklebust 
Link: http://lkml.kernel.org/r/1410527535-9814-2-git-send-email-r...@redhat.com
[ Improved the readability of the code a bit. ]
Signed-off-by: Ingo Molnar 
---
 include/linux/seqlock.h | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index cc35963..f5df8f6 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -456,4 +456,23 @@ read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned 
long flags)
spin_unlock_irqrestore(&sl->lock, flags);
 }
 
+static inline unsigned long
+read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
+{
+   unsigned long flags = 0;
+
+   if (!(*seq & 1))/* Even */
+   *seq = read_seqbegin(lock);
+   else/* Odd */
+   read_seqlock_excl_irqsave(lock, flags);
+
+   return flags;
+}
+
+static inline void
+done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
+{
+   if (seq & 1)
+   read_sequnlock_excl_irqrestore(lock, flags);
+}
 #endif /* __LINUX_SEQLOCK_H */


[tip:sched/core] sched/numa: Use select_idle_sibling() to select a destination for task_numa_move()

2014-09-19 Thread tip-bot for Rik van Riel
Commit-ID:  ba7e5a279e72f4b246dc7a419ac707e1936ede3e
Gitweb: http://git.kernel.org/tip/ba7e5a279e72f4b246dc7a419ac707e1936ede3e
Author: Rik van Riel 
AuthorDate: Thu, 4 Sep 2014 16:35:30 -0400
Committer:  Ingo Molnar 
CommitDate: Fri, 19 Sep 2014 12:35:14 +0200

sched/numa: Use select_idle_sibling() to select a destination for 
task_numa_move()

The code in task_numa_compare() will only examine at most one idle CPU per node,
because they all have the same score. However, some idle CPUs are better
candidates than others, due to busy or idle SMT siblings, etc...

The scheduler has logic to find the best CPU within an LLC to place a
task. The NUMA code should probably use it.

This seems to reduce the standard deviation for single instance SPECjbb2005
with a low warehouse count on my 4 node test system.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: mgor...@suse.de
Cc: Mike Galbraith 
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/20140904163530.189d4...@cuia.bos.redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index be9e97b..96e7147 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -665,6 +665,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
+static int select_idle_sibling(struct task_struct *p, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
 static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -1257,6 +1258,13 @@ balance:
if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
 
+   /*
+* One idle CPU per node is evaluated for a task numa move.
+* Call select_idle_sibling to maybe find a better one.
+*/
+   if (!cur)
+   env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+
 assign:
task_numa_assign(env, cur, imp);
 unlock:


[tip:sched/core] time, signal: Protect resource use statistics with seqlock

2014-09-07 Thread tip-bot for Rik van Riel
Commit-ID:  e78c3496790ee8a36522a838b59b388e8a709e65
Gitweb: http://git.kernel.org/tip/e78c3496790ee8a36522a838b59b388e8a709e65
Author: Rik van Riel 
AuthorDate: Sat, 16 Aug 2014 13:40:10 -0400
Committer:  Ingo Molnar 
CommitDate: Mon, 8 Sep 2014 08:17:01 +0200

time, signal: Protect resource use statistics with seqlock

Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability
issues on large systems, due to both functions being serialized with a
lock.

The lock protects against reporting a wrong value when a thread in the
task group exits and its statistics are folded into the signal struct,
which could otherwise cause that exited task's statistics to be counted
twice (or not at all).

Protecting that with a lock results in times() and clock_gettime() being
completely serialized on large systems.

This can be fixed by using a seqlock around the events that gather and
propagate statistics. As an additional benefit, the protection code can
be moved into thread_group_cputime(), slightly simplifying the calling
functions.

In the case of posix_cpu_clock_get_task() things can be simplified a
lot, because the calling function already ensures that the task sticks
around, and the rest is now taken care of in thread_group_cputime().

This way the statistics reporting code can run lockless.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Alex Thorlton 
Cc: Andrew Morton 
Cc: Daeseok Youn 
Cc: David Rientjes 
Cc: Dongsheng Yang 
Cc: Geert Uytterhoeven 
Cc: Guillaume Morin 
Cc: Ionut Alexa 
Cc: Kees Cook 
Cc: Linus Torvalds 
Cc: Li Zefan 
Cc: Michal Hocko 
Cc: Michal Schmidt 
Cc: Oleg Nesterov 
Cc: Vladimir Davydov 
Cc: umgwanakikb...@gmail.com
Cc: fweis...@gmail.com
Cc: s...@redhat.com
Cc: lwood...@redhat.com
Cc: atheu...@redhat.com
Link: http://lkml.kernel.org/r/20140816134010.26a9b...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 include/linux/sched.h  |  1 +
 kernel/exit.c  |  4 
 kernel/fork.c  |  1 +
 kernel/sched/cputime.c | 33 -
 kernel/sys.c   |  2 --
 kernel/time/posix-cpu-timers.c | 14 --
 6 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c2c885..dd9eb48 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -645,6 +645,7 @@ struct signal_struct {
 * Live threads maintain their own counters and add to these
 * in __exit_signal, except for the group leader.
 */
+   seqlock_t stats_lock;
cputime_t utime, stime, cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
diff --git a/kernel/exit.c b/kernel/exit.c
index b93d46d..fa09b86 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk)
 * the signal_struct.
 */
task_cputime(tsk, &utime, &stime);
+   write_seqlock(&sig->stats_lock);
sig->utime += utime;
sig->stime += stime;
sig->gtime += task_gtime(tsk);
@@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
__unhash_process(tsk, group_dead);
+   write_sequnlock(&sig->stats_lock);
 
/*
 * Do this under ->siglock, we can race with another thread
@@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct 
task_struct *p)
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
+   write_seqlock(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct 
task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
+   write_sequnlock(&psig->stats_lock);
spin_unlock_irq(&p->real_parent->sighand->siglock);
}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 0cf9cdb..9387ae8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct 
task_struct *tsk)
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
+   seqlock_init(&sig->stats_lock);
 
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 3e52836..49b7cfe 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,18 +288,28 @@ void thread_group_cputime(struct tas

[tip:sched/core] exit: Always reap resource stats in __exit_signal()

2014-09-07 Thread tip-bot for Rik van Riel
Commit-ID:  90ed9cbe765ad358b3151a12b8bf889a3cbcd573
Gitweb: http://git.kernel.org/tip/90ed9cbe765ad358b3151a12b8bf889a3cbcd573
Author: Rik van Riel 
AuthorDate: Fri, 15 Aug 2014 16:05:36 -0400
Committer:  Ingo Molnar 
CommitDate: Mon, 8 Sep 2014 08:17:00 +0200

exit: Always reap resource stats in __exit_signal()

Oleg pointed out that wait_task_zombie adds a task's usage statistics
to the parent's signal struct, but the task's own signal struct should
also propagate the statistics at exit time.

This allows thread_group_cputime(reaped_zombie) to get the statistics
after __unhash_process() has made the task invisible to for_each_thread,
but before the thread has actually been rcu freed, making sure no
non-monotonic results are returned inside that window.

Suggested-by: Oleg Nesterov 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andrew Morton 
Cc: David Rientjes 
Cc: Guillaume Morin 
Cc: Ionut Alexa 
Cc: Linus Torvalds 
Cc: Li Zefan 
Cc: Michal Hocko 
Cc: Michal Schmidt 
Cc: Oleg Nesterov 
Cc: umgwanakikb...@gmail.com
Cc: fweis...@gmail.com
Cc: s...@redhat.com
Cc: lwood...@redhat.com
Cc: atheu...@redhat.com
Link: http://lkml.kernel.org/r/1408133138-22048-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/exit.c | 43 +--
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 32c58f7..b93d46d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,30 +115,29 @@ static void __exit_signal(struct task_struct *tsk)
 
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
-   /*
-* Accumulate here the counters for all threads but the
-* group leader as they die, so they can be added into
-* the process-wide totals when those are taken.
-* The group leader stays around as a zombie as long
-* as there are other threads.  When it gets reaped,
-* the exit.c code will add its counts into these totals.
-* We won't ever get here for the group leader, since it
-* will have been the last reference on the signal_struct.
-*/
-   task_cputime(tsk, &utime, &stime);
-   sig->utime += utime;
-   sig->stime += stime;
-   sig->gtime += task_gtime(tsk);
-   sig->min_flt += tsk->min_flt;
-   sig->maj_flt += tsk->maj_flt;
-   sig->nvcsw += tsk->nvcsw;
-   sig->nivcsw += tsk->nivcsw;
-   sig->inblock += task_io_get_inblock(tsk);
-   sig->oublock += task_io_get_oublock(tsk);
-   task_io_accounting_add(&sig->ioac, &tsk->ioac);
-   sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
}
 
+   /*
+* Accumulate here the counters for all threads but the group leader
+* as they die, so they can be added into the process-wide totals
+* when those are taken.  The group leader stays around as a zombie as
+* long as there are other threads.  When it gets reaped, the exit.c
+* code will add its counts into these totals.  We won't ever get here
+* for the group leader, since it will have been the last reference on
+* the signal_struct.
+*/
+   task_cputime(tsk, &utime, &stime);
+   sig->utime += utime;
+   sig->stime += stime;
+   sig->gtime += task_gtime(tsk);
+   sig->min_flt += tsk->min_flt;
+   sig->maj_flt += tsk->maj_flt;
+   sig->nvcsw += tsk->nvcsw;
+   sig->nivcsw += tsk->nivcsw;
+   sig->inblock += task_io_get_inblock(tsk);
+   sig->oublock += task_io_get_oublock(tsk);
+   task_io_accounting_add(&sig->ioac, &tsk->ioac);
+   sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
__unhash_process(tsk, group_dead);
 


[tip:sched/core] sched, time: Atomically increment stime & utime

2014-09-07 Thread tip-bot for Rik van Riel
Commit-ID:  eb1b4af0a64ac7bb0ee36f579c1c7cefcbc3ac2c
Gitweb: http://git.kernel.org/tip/eb1b4af0a64ac7bb0ee36f579c1c7cefcbc3ac2c
Author: Rik van Riel 
AuthorDate: Fri, 15 Aug 2014 16:05:38 -0400
Committer:  Ingo Molnar 
CommitDate: Mon, 8 Sep 2014 08:17:02 +0200

sched, time: Atomically increment stime & utime

The functions task_cputime_adjusted and thread_group_cputime_adjusted()
can be called locklessly, as well as concurrently on many different CPUs.

This can occasionally lead to the utime and stime reported by times(), and
other syscalls like it, going backward. The cause appears to be multiple
threads racing in cputime_adjust(), each with a utime or stime value that
is larger than the original, but different from the others.

Sometimes the larger value gets saved first, only to be immediately
overwritten with a smaller value by another thread.

Using atomic exchange prevents that problem, and ensures time
progresses monotonically.
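
To make the race concrete, a sketch with illustrative values (the
cmpxchg() loop is the one the patch adds below):

	/*
	 * prev->stime is 10.  Two CPUs race in cputime_adjust():
	 * CPU A computes stime = 30, CPU B computes stime = 20.
	 *
	 * Old code: both evaluate max(prev->stime, stime) against the stale
	 * value 10; if A stores 30 first and B then stores 20, the value
	 * reported to userspace goes backward.
	 *
	 * New code: the loop condition re-reads prev->stime on each
	 * iteration, so once A has stored 30, B sees 20 > 30 fail and never
	 * writes the smaller value.
	 */
	while (stime > (rtime = ACCESS_ONCE(prev->stime)))
		cmpxchg(&prev->stime, rtime, stime);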

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: umgwanakikb...@gmail.com
Cc: fweis...@gmail.com
Cc: a...@linux-foundation.org
Cc: s...@redhat.com
Cc: lwood...@redhat.com
Cc: atheu...@redhat.com
Cc: o...@redhat.com
Link: http://lkml.kernel.org/r/1408133138-22048-4-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/cputime.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 49b7cfe..2b57031 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -602,9 +602,12 @@ static void cputime_adjust(struct task_cputime *curr,
 * If the tick based count grows faster than the scheduler one,
 * the result of the scaling may go backward.
 * Let's enforce monotonicity.
+* Atomic exchange protects against concurrent cputime_adjust().
 */
-   prev->stime = max(prev->stime, stime);
-   prev->utime = max(prev->utime, utime);
+   while (stime > (rtime = ACCESS_ONCE(prev->stime)))
+   cmpxchg(&prev->stime, rtime, stime);
+   while (utime > (rtime = ACCESS_ONCE(prev->utime)))
+   cmpxchg(&prev->utime, rtime, utime);
 
 out:
*ut = prev->utime;


[tip:sched/core] sched/numa: Fix off-by-one in capacity check

2014-08-12 Thread tip-bot for Rik van Riel
Commit-ID:  b932c03c34f3b03c7364c06aa8cae5b74609fc41
Gitweb: http://git.kernel.org/tip/b932c03c34f3b03c7364c06aa8cae5b74609fc41
Author: Rik van Riel 
AuthorDate: Mon, 4 Aug 2014 13:23:27 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 12 Aug 2014 12:48:22 +0200

sched/numa: Fix off-by-one in capacity check

Commit a43455a1d572daf7b730fe12eb747d1e17411365 ensures that
task_numa_migrate will call task_numa_compare on the preferred
node all the time, even when the preferred node has no free capacity.

This could lead to a performance regression if nr_running == capacity
on both the source and the destination node. This can be avoided by
also checking for nr_running == capacity on the source node, which is
one task stricter than checking .has_free_capacity.
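
A worked example with illustrative numbers, task_capacity = 4 on both
nodes:

	/*
	 * nr_running == task_capacity == 4 on both source and destination.
	 *
	 * Old check: src_stats.has_free_capacity is false (4 < 4 fails), so
	 * the bail-out is skipped and a task may be moved onto an already
	 * full destination, overloading it.
	 *
	 * New check: 4 <= 4 is true and the destination has no free
	 * capacity, so the move is rejected.
	 */
	if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
	    !env->dst_stats.has_free_capacity)
		goto unlock;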

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Cc: mgor...@suse.de
Cc: vincent.guit...@linaro.org
Cc: morten.rasmus...@arm.com
Cc: nicolas.pi...@linaro.org
Cc: efa...@gmx.de
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/1407173008-9334-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df1ed17..e1cf419 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1206,7 +1206,7 @@ static void task_numa_compare(struct task_numa_env *env,
 
if (!cur) {
/* Is there capacity at our destination? */
-   if (env->src_stats.has_free_capacity &&
+   if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
!env->dst_stats.has_free_capacity)
goto unlock;
 


[tip:sched/core] sched/fair: Make update_sd_pick_busiest() return 'true' on a busier sd

2014-08-12 Thread tip-bot for Rik van Riel
Commit-ID:  caeb178c60f4f93f1b45c0bc056b5cf6d217b67f
Gitweb: http://git.kernel.org/tip/caeb178c60f4f93f1b45c0bc056b5cf6d217b67f
Author: Rik van Riel 
AuthorDate: Mon, 28 Jul 2014 14:16:28 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 12 Aug 2014 12:48:19 +0200

sched/fair: Make update_sd_pick_busiest() return 'true' on a busier sd

Currently update_sd_pick_busiest only identifies the busiest sd
that is either overloaded, or has a group imbalance. When no
sd is imbalanced or overloaded, the load balancer fails to find
the busiest domain.

This breaks load balancing between domains that are not overloaded,
in the !SD_ASYM_PACKING case. This patch makes update_sd_pick_busiest()
return true whenever the busiest sd seen so far is encountered.

Groups are ranked in the order overloaded > imbalanced > other,
with higher ranked groups getting priority even when their load
is lower. This is necessary due to the possibility of unequal
capacities and cpumasks between domains within a sched group.
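
The ranking maps directly onto the enum values introduced below, so
update_sd_pick_busiest() can compare group classes with a plain integer
comparison before it ever looks at load (excerpted from the patch):

	enum group_type {
		group_other = 0,
		group_imbalanced,
		group_overloaded,
	};

	/* A higher class always wins, regardless of avg_load. */
	if (sgs->group_type > busiest->group_type)
		return true;
	if (sgs->group_type < busiest->group_type)
		return false;
	/* Same class: fall back to comparing avg_load. */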

Behaviour for SD_ASYM_PACKING does not seem to match the comment,
but I have no hardware to test that so I have left the behaviour
of that code unchanged.

Enum for group classification suggested by Peter Zijlstra.

Signed-off-by: Rik van Riel 
[peterz: replaced sg_lb_stats::group_imb with the new enum group_type
 in an attempt to avoid endless recalculation]
Signed-off-by: Peter Zijlstra 
Acked-by: Vincent Guittot 
Acked-by: Michael Neuling 
Cc: ktk...@parallels.com
Cc: tim.c.c...@linux.intel.com
Cc: nicolas.pi...@linaro.org
Cc: jhla...@redhat.com
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/20140729152743.GI3935@laptop
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 49 +
 1 file changed, 37 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9477e6..9437725 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5559,6 +5559,13 @@ static unsigned long task_h_load(struct task_struct *p)
 #endif
 
 /** Helpers for find_busiest_group /
+
+enum group_type {
+   group_other = 0,
+   group_imbalanced,
+   group_overloaded,
+};
+
 /*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
@@ -5572,7 +5579,7 @@ struct sg_lb_stats {
unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
-   int group_imb; /* Is there an imbalance in the group ? */
+   enum group_type group_type;
int group_has_free_capacity;
 #ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -5610,6 +5617,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats 
*sds)
.total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
+   .sum_nr_running = 0,
+   .group_type = group_other,
},
};
 }
@@ -5891,6 +5900,18 @@ static inline int sg_capacity_factor(struct lb_env *env, 
struct sched_group *gro
return capacity_factor;
 }
 
+static enum group_type
+group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+{
+   if (sgs->sum_nr_running > sgs->group_capacity_factor)
+   return group_overloaded;
+
+   if (sg_imbalanced(group))
+   return group_imbalanced;
+
+   return group_other;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -5942,9 +5963,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->load_per_task = sgs->sum_weighted_load / 
sgs->sum_nr_running;
 
sgs->group_weight = group->group_weight;
-
-   sgs->group_imb = sg_imbalanced(group);
sgs->group_capacity_factor = sg_capacity_factor(env, group);
+   sgs->group_type = group_classify(group, sgs);
 
if (sgs->group_capacity_factor > sgs->sum_nr_running)
sgs->group_has_free_capacity = 1;
@@ -5968,13 +5988,19 @@ static bool update_sd_pick_busiest(struct lb_env *env,
   struct sched_group *sg,
   struct sg_lb_stats *sgs)
 {
-   if (sgs->avg_load <= sds->busiest_stat.avg_load)
-   return false;
+   struct sg_lb_stats *busiest = &sds->busiest_stat;
 
-   if (sgs->sum_nr_running > sgs->group_capacity_factor)
+   if (sgs->group_type > busiest->group_type)
return true;
 
-   if (sgs->group_imb)
+   if (sgs->group_type < busiest->group_type)
+   return false;
+
+   if (sgs->avg_load <= busiest->avg_load)
+   return false;
+
+   /* This is the busiest node in its class. */
+   if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
 
/*
@@ -5982,8 +6008,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 * numbered CPUs in the group, therefore ma

[tip:sched/core] sched/numa: Fix numa capacity computation

2014-08-12 Thread tip-bot for Rik van Riel
Commit-ID:  83d7f2424741c9dc76c21377c9d00d47abaf88df
Gitweb: http://git.kernel.org/tip/83d7f2424741c9dc76c21377c9d00d47abaf88df
Author: Rik van Riel 
AuthorDate: Mon, 4 Aug 2014 13:23:28 -0400
Committer:  Ingo Molnar 
CommitDate: Tue, 12 Aug 2014 12:48:23 +0200

sched/numa: Fix numa capacity computation

Commit c61037e9 fixes the phenomenon of 'phantom' cores due to
N*frac(smt_power) >= 1 by limiting the capacity to the actual
number of cores in the load balancing code.

This patch applies the same correction to the NUMA balancing
code.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Cc: mgor...@suse.de
Cc: vincent.guit...@linaro.org
Cc: morten.rasmus...@arm.com
Cc: nicolas.pi...@linaro.org
Cc: efa...@gmx.de
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/1407173008-9334-3-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e1cf419..1413c44 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1038,7 +1038,8 @@ struct numa_stats {
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-   int cpu, cpus = 0;
+   int smt, cpu, cpus = 0;
+   unsigned long capacity;
 
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1063,12 @@ static void update_numa_stats(struct numa_stats *ns, int 
nid)
if (!cpus)
return;
 
-   ns->task_capacity =
-   DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+   /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
+   smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
+   capacity = cpus / smt; /* cores */
+
+   ns->task_capacity = min_t(unsigned, capacity,
+   DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
 }
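
A worked example with hypothetical numbers shows the 'phantom' core effect
and the correction:

	/*
	 * Hypothetical node: 10 cores, 20 SMT siblings, per-thread capacity
	 * 589, so compute_capacity = 11780 and SCHED_CAPACITY_SCALE = 1024.
	 *
	 * Old code:  task_capacity = DIV_ROUND_CLOSEST(11780, 1024) = 12
	 *            -> two phantom cores on a 10-core node.
	 *
	 * New code:  smt           = DIV_ROUND_UP(1024 * 20, 11780) = 2
	 *            capacity      = 20 / 2 = 10 cores
	 *            task_capacity = min(10, 12) = 10
	 */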
 


[tip:sched/core] sched/numa: Rework best node setting in task_numa_migrate()

2014-07-05 Thread tip-bot for Rik van Riel
Commit-ID:  db015daedb56251b73f956f70b3b8813f80d8ee1
Gitweb: http://git.kernel.org/tip/db015daedb56251b73f956f70b3b8813f80d8ee1
Author: Rik van Riel 
AuthorDate: Mon, 23 Jun 2014 11:41:34 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 5 Jul 2014 11:17:39 +0200

sched/numa: Rework best node setting in task_numa_migrate()

Fix up the best node setting in task_numa_migrate() to deal with a task
in a pseudo-interleaved NUMA group, which is already running in the
best location.

Set the task's preferred nid to the current nid, so task migration is
not retried at a high rate.

Signed-off-by: Rik van Riel 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Cc: Linus Torvalds 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/1403538095-31256-7-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9d1734a..7bb2f46 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1354,10 +1354,6 @@ static int task_numa_migrate(struct task_struct *p)
}
}
 
-   /* No better CPU than the current one was found. */
-   if (env.best_cpu == -1)
-   return -EAGAIN;
-
/*
 * If the task is part of a workload that spans multiple NUMA nodes,
 * and is migrating into one of the workload's active nodes, remember
@@ -1366,8 +1362,19 @@ static int task_numa_migrate(struct task_struct *p)
 * A task that migrated to a second choice node will be better off
 * trying for a better one later. Do not set the preferred node here.
 */
-   if (p->numa_group && node_isset(env.dst_nid, 
p->numa_group->active_nodes))
-   sched_setnuma(p, env.dst_nid);
+   if (p->numa_group) {
+   if (env.best_cpu == -1)
+   nid = env.src_nid;
+   else
+   nid = env.dst_nid;
+
+   if (node_isset(nid, p->numa_group->active_nodes))
+   sched_setnuma(p, env.dst_nid);
+   }
+
+   /* No better CPU than the current one was found. */
+   if (env.best_cpu == -1)
+   return -EAGAIN;
 
/*
 * Reset the scan period if the task is being rescheduled on an


[tip:sched/core] sched/numa: Change scan period code to match intent

2014-07-05 Thread tip-bot for Rik van Riel
Commit-ID:  a22b4b012340b988dbe7a58461d6fcc582f34aa0
Gitweb: http://git.kernel.org/tip/a22b4b012340b988dbe7a58461d6fcc582f34aa0
Author: Rik van Riel 
AuthorDate: Mon, 23 Jun 2014 11:41:35 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 5 Jul 2014 11:17:40 +0200

sched/numa: Change scan period code to match intent

Reading through the scan period code and comment, it appears the
intent was to slow down NUMA scanning when a majority of accesses
are on the local node, specifically a local:remote ratio of 3:1.

However, the code actually tests local / (local + remote), and
the actual cut-off point was around 30% local accesses, well before
a task has actually converged on a node.

Changing the threshold to 7 means scanning slows down when a task
has around 70% of its accesses local, which appears to match the
intent of the code more closely.
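
Put as arithmetic (illustrative; the names below are shorthand, not the
exact variables in the code), the adaptive code compares a fault ratio
scaled to NUMA_PERIOD_SLOTS against NUMA_PERIOD_THRESHOLD:

	/*
	 * ratio = local * NUMA_PERIOD_SLOTS / (local + remote)
	 *
	 * Threshold 3: the period grows (scanning slows) once ratio >= 3,
	 *              i.e. at only ~30% local accesses.
	 *
	 * Threshold 7: the period grows once ratio >= 7, i.e. at ~70% local
	 *              accesses, much closer to the intended 3:1
	 *              local:remote ratio (75% local).
	 */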

Signed-off-by: Rik van Riel 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Cc: Linus Torvalds 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/1403538095-31256-8-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7bb2f46..a140c6a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1452,12 +1452,12 @@ static void update_numa_active_node_mask(struct 
numa_group *numa_group)
 /*
  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
  * increments. The more local the fault statistics are, the higher the scan
- * period will be for the next scan window. If local/remote ratio is below
- * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
- * scan period will decrease
+ * period will be for the next scan window. If local/(local+remote) ratio is
+ * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
+ * the scan period will decrease. Aim for 70% local accesses.
  */
 #define NUMA_PERIOD_SLOTS 10
-#define NUMA_PERIOD_THRESHOLD 3
+#define NUMA_PERIOD_THRESHOLD 7
 
 /*
  * Increase the scan period (slow down scanning) if the majority of


[tip:sched/core] sched/numa: Use effective_load() to balance NUMA loads

2014-07-05 Thread tip-bot for Rik van Riel
Commit-ID:  6dc1a672ab15604947361dcd02e459effa09bad5
Gitweb: http://git.kernel.org/tip/6dc1a672ab15604947361dcd02e459effa09bad5
Author: Rik van Riel 
AuthorDate: Mon, 23 Jun 2014 11:46:14 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 5 Jul 2014 11:17:35 +0200

sched/numa: Use effective_load() to balance NUMA loads

When CONFIG_FAIR_GROUP_SCHED is enabled, the load that a task places
on a CPU is determined by the group the task is in. The active groups
on the source and destination CPU can be different, resulting in a
different load contribution by the same task at its source and at its
destination. As a result, the load needs to be calculated separately
for each CPU, instead of estimated once with task_h_load().

Getting this calculation right allows some workloads to converge,
where previously the last thread could get stuck on another node,
without being able to migrate to its final destination.

Signed-off-by: Rik van Riel 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Cc: Linus Torvalds 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/1403538378-31571-3-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f287d0b..d6526d2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1151,6 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env,
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
+   struct task_group *tg;
long src_load, dst_load;
long load;
long imp = (groupimp > 0) ? groupimp : taskimp;
@@ -1225,14 +1226,21 @@ static void task_numa_compare(struct task_numa_env *env,
 * In the overloaded case, try and keep the load balanced.
 */
 balance:
-   load = task_h_load(env->p);
-   dst_load = env->dst_stats.load + load;
-   src_load = env->src_stats.load - load;
+   src_load = env->src_stats.load;
+   dst_load = env->dst_stats.load;
+
+   /* Calculate the effect of moving env->p from src to dst. */
+   load = env->p->se.load.weight;
+   tg = task_group(env->p);
+   src_load += effective_load(tg, env->src_cpu, -load, -load);
+   dst_load += effective_load(tg, env->dst_cpu, load, load);
 
if (cur) {
-   load = task_h_load(cur);
-   dst_load -= load;
-   src_load += load;
+   /* Cur moves in the opposite direction. */
+   load = cur->se.load.weight;
+   tg = task_group(cur);
+   src_load += effective_load(tg, env->src_cpu, load, load);
+   dst_load += effective_load(tg, env->dst_cpu, -load, -load);
}
 
if (load_too_imbalanced(src_load, dst_load, env))


[tip:sched/core] sched/numa: Examine a task move when examining a task swap

2014-07-05 Thread tip-bot for Rik van Riel
Commit-ID:  0132c3e1777ceabc24c7d209b7cbe78c28c03c09
Gitweb: http://git.kernel.org/tip/0132c3e1777ceabc24c7d209b7cbe78c28c03c09
Author: Rik van Riel 
AuthorDate: Mon, 23 Jun 2014 11:46:16 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 5 Jul 2014 11:17:38 +0200

sched/numa: Examine a task move when examining a task swap

Running "perf bench numa mem -0 -m -P 1000 -p 8 -t 20" on a 4
node system results in 160 runnable threads on a system with 80
CPU threads.

Once a process has nearly converged, with 39 threads on one node
and 1 thread on another node, the remaining thread will be unable
to migrate to its preferred node through a task swap.

However, a simple task move would make the workload converge,
without causing an imbalance.

Test for this unlikely occurrence, and attempt a task move to
the preferred nid when it happens.

 # Running main, "perf bench numa mem -p 8 -t 20 -0 -m -P 1000"

 ###
 # 160 tasks will execute (on 4 nodes, 80 CPUs):
 # -1x 0MB global  shared mem operations
 # -1x  1000MB process shared mem operations
 # -1x 0MB thread  local  mem operations
 ###

 ###
 #
 #0.0%  [0.2 mins]  0/0   1/1  36/2   0/0  [36/3 ] l:  0-0   (  0) {0-2}
 #0.0%  [0.3 mins] 43/3  37/2  39/2  41/3  [ 6/10] l:  0-1   (  1) {1-2}
 #0.0%  [0.4 mins] 42/3  38/2  40/2  40/2  [ 4/9 ] l:  1-2   (  1) [50.0%] 
{1-2}
 #0.0%  [0.6 mins] 41/3  39/2  40/2  40/2  [ 2/9 ] l:  2-4   (  2) [50.0%] 
{1-2}
 #0.0%  [0.7 mins] 40/2  40/2  40/2  40/2  [ 0/8 ] l:  3-5   (  2) [40.0%] 
(  41.8s converged)

Without this patch, this same perf bench numa mem run had to
rely on the scheduler load balancer to first balance out the
load (moving a random task), before a task swap could complete
the NUMA convergence.

The load balancer does not normally take action unless the load
difference exceeds 25%. Convergence times of over half an hour
have been observed without this patch.

With this patch, the NUMA balancing code will simply migrate the
task, if that does not cause an imbalance.

Also skip examining a CPU in detail if the improvement on that CPU
is no more than the best we already have.

Signed-off-by: Rik van Riel 
Cc: chegu_vi...@hp.com
Cc: mgor...@suse.de
Cc: Linus Torvalds 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/n/tip-ggthh0rnh0yua6o5o3p6c...@git.kernel.org
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 23 +--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cebb312..9d1734a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1155,6 +1155,7 @@ static void task_numa_compare(struct task_numa_env *env,
long src_load, dst_load;
long load;
long imp = env->p->numa_group ? groupimp : taskimp;
+   long moveimp = imp;
 
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
@@ -1201,7 +1202,7 @@ static void task_numa_compare(struct task_numa_env *env,
}
}
 
-   if (imp < env->best_imp)
+   if (imp <= env->best_imp && moveimp <= env->best_imp)
goto unlock;
 
if (!cur) {
@@ -1214,7 +1215,8 @@ static void task_numa_compare(struct task_numa_env *env,
}
 
/* Balance doesn't matter much if we're running a task per cpu */
-   if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+   if (imp > env->best_imp && src_rq->nr_running == 1 &&
+   dst_rq->nr_running == 1)
goto assign;
 
/*
@@ -1230,6 +1232,23 @@ balance:
src_load += effective_load(tg, env->src_cpu, -load, -load);
dst_load += effective_load(tg, env->dst_cpu, load, load);
 
+   if (moveimp > imp && moveimp > env->best_imp) {
+   /*
+* If the improvement from just moving env->p direction is
+* better than swapping tasks around, check if a move is
+* possible. Store a slightly smaller score than moveimp,
+* so an actually idle CPU will win.
+*/
+   if (!load_too_imbalanced(src_load, dst_load, env)) {
+   imp = moveimp - 1;
+   cur = NULL;
+   goto assign;
+   }
+   }
+
+   if (imp <= env->best_imp)
+   goto unlock;
+
if (cur) {
/* Cur moves in the opposite direction. */
load = cur->se.load.weight;


[tip:sched/core] sched/numa: Simplify task_numa_compare()

2014-07-05 Thread tip-bot for Rik van Riel
Commit-ID:  1c5d3eb3759013bc7ee4197aa0a9f245bdb6eb90
Gitweb: http://git.kernel.org/tip/1c5d3eb3759013bc7ee4197aa0a9f245bdb6eb90
Author: Rik van Riel 
AuthorDate: Mon, 23 Jun 2014 11:46:15 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 5 Jul 2014 11:17:37 +0200

sched/numa: Simplify task_numa_compare()

When a task is part of a numa_group, the comparison should always use
the group weight, in order to make workloads converge.

Signed-off-by: Rik van Riel 
Cc: chegu_vi...@hp.com
Cc: mgor...@suse.de
Cc: Linus Torvalds 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/1403538378-31571-4-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d6526d2..cebb312 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1154,7 +1154,7 @@ static void task_numa_compare(struct task_numa_env *env,
struct task_group *tg;
long src_load, dst_load;
long load;
-   long imp = (groupimp > 0) ? groupimp : taskimp;
+   long imp = env->p->numa_group ? groupimp : taskimp;
 
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
@@ -1192,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
 * itself (not part of a group), use the task weight
 * instead.
 */
-   if (env->p->numa_group)
-   imp = groupimp;
-   else
-   imp = taskimp;
-
if (cur->numa_group)
imp += group_weight(cur, env->src_nid) -
   group_weight(cur, env->dst_nid);


[tip:sched/core] sched/numa: Move power adjustment into load_too_imbalanced()

2014-07-05 Thread tip-bot for Rik van Riel
Commit-ID:  28a21745190a0ca613cab817bfe3dc65373158bf
Gitweb: http://git.kernel.org/tip/28a21745190a0ca613cab817bfe3dc65373158bf
Author: Rik van Riel 
AuthorDate: Mon, 23 Jun 2014 11:46:13 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 5 Jul 2014 11:17:34 +0200

sched/numa: Move power adjustment into load_too_imbalanced()

Currently the NUMA code scales the load on each node with the
amount of CPU power available on that node, but it does not
apply any adjustment to the load of the task that is being
moved over.

On systems with SMT/HT, this results in a task being weighed
much more heavily than a CPU core, and a task move that would
even out the load between nodes being disallowed.

The correct thing is to apply the power correction to the
numbers after we have first applied the move of the tasks'
loads to them.

This also allows us to do the power correction with a multiplication,
rather than a division.
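
The multiplication form falls out of treating the check as a comparison of
two fractions (a sketch of the algebra; the final expression is the one
from the patch):

	/*
	 * Comparing the capacity-corrected loads
	 *
	 *     src_load / src_capacity   vs   dst_load / dst_capacity
	 *
	 * is equivalent, after cross-multiplying, to comparing
	 *
	 *     src_load * dst_capacity   vs   dst_load * src_capacity
	 *
	 * which is why the imbalance can be computed with multiplications:
	 */
	imb = dst_load * src_capacity * 100 -
	      src_load * dst_capacity * env->imbalance_pct;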

Also drop two function arguments from load_too_imbalanced(), since it
takes various factors from env already.

Signed-off-by: Rik van Riel 
Cc: chegu_vi...@hp.com
Cc: mgor...@suse.de
Cc: Linus Torvalds 
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/1403538378-31571-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 39 ---
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 96b2d39..f287d0b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int 
nid)
if (!cpus)
return;
 
-   ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
ns->task_capacity =
DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
env->best_cpu = env->dst_cpu;
 }
 
-static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
-   long src_load, long dst_load,
+static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
 {
long imb, old_imb;
+   long orig_src_load, orig_dst_load;
+   long src_capacity, dst_capacity;
+
+   /*
+* The load is corrected for the CPU capacity available on each node.
+*
+*     src_load          dst_load
+* ------------- vs --------------
+*  src_capacity      dst_capacity
+*/
+   src_capacity = env->src_stats.compute_capacity;
+   dst_capacity = env->dst_stats.compute_capacity;
 
/* We care about the slope of the imbalance, not the direction. */
if (dst_load < src_load)
swap(dst_load, src_load);
 
/* Is the difference below the threshold? */
-   imb = dst_load * 100 - src_load * env->imbalance_pct;
+   imb = dst_load * src_capacity * 100 -
+ src_load * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
 
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, 
long orig_dst_load,
 * The imbalance is above the allowed threshold.
 * Compare it with the old imbalance.
 */
+   orig_src_load = env->src_stats.load;
+   orig_dst_load = env->dst_stats.load;
+
if (orig_dst_load < orig_src_load)
swap(orig_dst_load, orig_src_load);
 
-   old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+   old_imb = orig_dst_load * src_capacity * 100 -
+ orig_src_load * dst_capacity * env->imbalance_pct;
 
/* Would this change make things worse? */
return (imb > old_imb);
@@ -1136,8 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env,
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
-   long orig_src_load, src_load;
-   long orig_dst_load, dst_load;
+   long src_load, dst_load;
long load;
long imp = (groupimp > 0) ? groupimp : taskimp;
 
@@ -1211,13 +1225,9 @@ static void task_numa_compare(struct task_numa_env *env,
 * In the overloaded case, try and keep the load balanced.
 */
 balance:
-   orig_dst_load = env->dst_stats.load;
-   orig_src_load = env->src_stats.load;
-
-   /* XXX missing capacity terms */
load = task_h_load(env->p);
-   dst_load = orig_dst_load + load;
-   src_load = orig_src_load - load;
+   dst_load = env->dst_stats.load + load;
+   src_load = env->src_stats.load - load;
 
if (cur) {
load = task_h_load(cur);
@@ -1225,8 +1235,7 @@ balance:
src_load += load;
}
 
-   if (load_too_imbalanced(orig_src_load, orig_dst

[tip:sched/core] sched/numa: Use group's max nid as task's preferred nid

2014-07-05 Thread tip-bot for Rik van Riel
Commit-ID:  f0b8a4afd6a8c500161e45065a91738b490bf5ae
Gitweb: http://git.kernel.org/tip/f0b8a4afd6a8c500161e45065a91738b490bf5ae
Author: Rik van Riel 
AuthorDate: Mon, 23 Jun 2014 11:41:29 -0400
Committer:  Ingo Molnar 
CommitDate: Sat, 5 Jul 2014 11:17:33 +0200

sched/numa: Use group's max nid as task's preferred nid

From task_numa_placement, always try to consolidate the tasks
in a group on the group's top nid.

In case this task is part of a group that is interleaved over
multiple nodes, task_numa_migrate will set the task's preferred
nid to the best node it could find for the task, so this patch
will cause at most one run through task_numa_migrate.

Signed-off-by: Rik van Riel 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Cc: Linus Torvalds 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/1403538095-31256-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 17 +
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e3ff3d1..96b2d39 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1594,23 +1594,8 @@ static void task_numa_placement(struct task_struct *p)
 
if (p->numa_group) {
update_numa_active_node_mask(p->numa_group);
-   /*
-* If the preferred task and group nids are different,
-* iterate over the nodes again to find the best place.
-*/
-   if (max_nid != max_group_nid) {
-   unsigned long weight, max_weight = 0;
-
-   for_each_online_node(nid) {
-   weight = task_weight(p, nid) + group_weight(p, 
nid);
-   if (weight > max_weight) {
-   max_weight = weight;
-   max_nid = nid;
-   }
-   }
-   }
-
spin_unlock_irq(group_lock);
+   max_nid = max_group_nid;
}
 
if (max_faults) {


[tip:sched/core] sched/numa: Always try to migrate to preferred node at task_numa_placement() time

2014-06-19 Thread tip-bot for Rik van Riel
Commit-ID:  bb97fc31647539f1f102eed646a95e200160a150
Gitweb: http://git.kernel.org/tip/bb97fc31647539f1f102eed646a95e200160a150
Author: Rik van Riel 
AuthorDate: Wed, 4 Jun 2014 16:33:15 -0400
Committer:  Ingo Molnar 
CommitDate: Wed, 18 Jun 2014 18:29:58 +0200

sched/numa: Always try to migrate to preferred node at task_numa_placement() 
time

It is possible that at task_numa_placement() time, the task's
numa_preferred_nid does not change, but the task is not
actually running on the preferred node at the time.

In that case, we still want to attempt migration to the
preferred node.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Cc: mgor...@suse.de
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/20140604163315.1dbc7...@cuia.bos.redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8fbb011..3fa3e18 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1613,11 +1613,13 @@ static void task_numa_placement(struct task_struct *p)
spin_unlock_irq(group_lock);
}
 
-   /* Preferred node as the node with the most faults */
-   if (max_faults && max_nid != p->numa_preferred_nid) {
-   /* Update the preferred nid and migrate task if possible */
-   sched_setnuma(p, max_nid);
-   numa_migrate_preferred(p);
+   if (max_faults) {
+   /* Set the new preferred node */
+   if (max_nid != p->numa_preferred_nid)
+   sched_setnuma(p, max_nid);
+
+   if (task_node(p) != p->numa_preferred_nid)
+   numa_migrate_preferred(p);
}
 }
 


[tip:sched/core] sched/numa: Ensure task_numa_migrate() checks the preferred node

2014-06-19 Thread tip-bot for Rik van Riel
Commit-ID:  a43455a1d572daf7b730fe12eb747d1e17411365
Gitweb: http://git.kernel.org/tip/a43455a1d572daf7b730fe12eb747d1e17411365
Author: Rik van Riel 
AuthorDate: Wed, 4 Jun 2014 16:09:42 -0400
Committer:  Ingo Molnar 
CommitDate: Wed, 18 Jun 2014 18:29:57 +0200

sched/numa: Ensure task_numa_migrate() checks the preferred node

The first thing task_numa_migrate() does is check to see if there is
CPU capacity available on the preferred node, in order to move the
task there.

However, if the preferred node is all busy, we would skip considering
that node for task swaps in the subsequent loop. This prevents NUMA
convergence of tasks on busy systems.

However, swapping locations with a task on our preferred nid, when
the preferred nid is busy, is perfectly fine.

The fix is to also look for a CPU on our preferred nid when it is
totally busy.

This changes "perf bench numa mem -p 4 -t 20 -m -0 -P 1000" from
not converging in 15 minutes on my 4 node system, to converging in
10-20 seconds.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Cc: mgor...@suse.de
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/20140604160942.6969b...@cuia.bos.redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d33..8fbb011 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1302,9 +1302,8 @@ static int task_numa_migrate(struct task_struct *p)
groupimp = group_weight(p, env.dst_nid) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
 
-   /* If the preferred nid has free capacity, try to use it. */
-   if (env.dst_stats.has_free_capacity)
-   task_numa_find_cpu(&env, taskimp, groupimp);
+   /* Try to find a spot on the preferred nid. */
+   task_numa_find_cpu(&env, taskimp, groupimp);
 
/* No space available on the preferred nid. Look elsewhere. */
if (env.best_cpu == -1) {


[tip:sched/core] sched/numa: Decay ->wakee_flips instead of zeroing

2014-05-22 Thread tip-bot for Rik van Riel
Commit-ID:  096aa33863a5e48de52d2ff30e0801b7487944f4
Gitweb: http://git.kernel.org/tip/096aa33863a5e48de52d2ff30e0801b7487944f4
Author: Rik van Riel 
AuthorDate: Fri, 16 May 2014 00:13:32 -0400
Committer:  Ingo Molnar 
CommitDate: Thu, 22 May 2014 11:16:41 +0200

sched/numa: Decay ->wakee_flips instead of zeroing

Affine wakeups have the potential to interfere with NUMA placement.
If a task wakes up too many other tasks, affine wakeups will get
disabled.

However, regardless of how many other tasks it wakes up, it gets
re-enabled once a second, potentially interfering with NUMA
placement of other tasks.

By decaying wakee_flips in half instead of zeroing it, we can avoid
that problem for some workloads.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Cc: chegu_vi...@hp.com
Cc: umgwanakikb...@gmail.com
Link: http://lkml.kernel.org/r/20140516001332.67f91...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 503f750..c9617b7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4065,7 +4065,7 @@ static void record_wakee(struct task_struct *p)
 * about the loss.
 */
if (jiffies > current->wakee_flip_decay_ts + HZ) {
-   current->wakee_flips = 0;
+   current->wakee_flips >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
 


[tip:sched/core] sched/numa: Update migrate_improves/degrades_locality()

2014-05-22 Thread tip-bot for Rik van Riel
Commit-ID:  b1ad065e65f56103db8b97edbd218a271ff5b1bb
Gitweb: http://git.kernel.org/tip/b1ad065e65f56103db8b97edbd218a271ff5b1bb
Author: Rik van Riel 
AuthorDate: Thu, 15 May 2014 13:03:06 -0400
Committer:  Ingo Molnar 
CommitDate: Thu, 22 May 2014 11:16:39 +0200

sched/numa: Update migrate_improves/degrades_locality()

Update the migrate_improves/degrades_locality() functions with
knowledge of pseudo-interleaving.

Do not consider moving tasks around within the set of group's active
nodes as improving or degrading locality. Instead, leave the load
balancer free to balance the load between a numa_group's active nodes.

Also, switch from the group/task_weight functions to the group/task_fault
functions. The "weight" functions involve a division, but both calls use
the same divisor, so there's no point in doing that from these functions.

On a 4 node (x10 core) system, performance of SPECjbb2005 seems
unaffected, though the number of migrations with 2 8-warehouse wide
instances seems to have almost halved, due to the scheduler running
each instance on a single node.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Link: http://lkml.kernel.org/r/20140515130306.61aae...@cuia.bos.redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 42 +-
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b899613..503f750 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5123,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)
 /* Returns true if the destination node has incurred more faults */
 static bool migrate_improves_locality(struct task_struct *p, struct lb_env 
*env)
 {
+   struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
 
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5136,21 +5137,29 @@ static bool migrate_improves_locality(struct 
task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
 
-   /* Always encourage migration to the preferred node. */
-   if (dst_nid == p->numa_preferred_nid)
-   return true;
+   if (numa_group) {
+   /* Task is already in the group's interleave set. */
+   if (node_isset(src_nid, numa_group->active_nodes))
+   return false;
+
+   /* Task is moving into the group's interleave set. */
+   if (node_isset(dst_nid, numa_group->active_nodes))
+   return true;
 
-   /* If both task and group weight improve, this move is a winner. */
-   if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
-   group_weight(p, dst_nid) > group_weight(p, src_nid))
+   return group_faults(p, dst_nid) > group_faults(p, src_nid);
+   }
+
+   /* Encourage migration to the preferred node. */
+   if (dst_nid == p->numa_preferred_nid)
return true;
 
-   return false;
+   return task_faults(p, dst_nid) > task_faults(p, src_nid);
 }
 
 
 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env 
*env)
 {
+   struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
 
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5165,16 +5174,23 @@ static bool migrate_degrades_locality(struct 
task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
 
+   if (numa_group) {
+   /* Task is moving within/into the group's interleave set. */
+   if (node_isset(dst_nid, numa_group->active_nodes))
+   return false;
+
+   /* Task is moving out of the group's interleave set. */
+   if (node_isset(src_nid, numa_group->active_nodes))
+   return true;
+
+   return group_faults(p, dst_nid) < group_faults(p, src_nid);
+   }
+
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid)
return true;
 
-   /* If either task or group weight get worse, don't do it. */
-   if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
-   group_weight(p, dst_nid) < group_weight(p, src_nid))
-   return true;
-
-   return false;
+   return task_faults(p, dst_nid) < task_faults(p, src_nid);
 }
 
 #else


[tip:sched/core] sched/numa: Allow task switch if load imbalance improves

2014-05-22 Thread tip-bot for Rik van Riel
Commit-ID:  e63da03639cc9e6e83b62e7ef8ffdbb92421416a
Gitweb: http://git.kernel.org/tip/e63da03639cc9e6e83b62e7ef8ffdbb92421416a
Author: Rik van Riel 
AuthorDate: Wed, 14 May 2014 13:22:21 -0400
Committer:  Ingo Molnar 
CommitDate: Thu, 22 May 2014 11:16:38 +0200

sched/numa: Allow task switch if load imbalance improves

Currently the NUMA balancing code only allows moving tasks between NUMA
nodes when the load on both nodes is in balance. This breaks down when
the load was imbalanced to begin with.

Allow tasks to be moved between NUMA nodes if the imbalance is small,
or if the new imbalance is smaller than the original one.
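
A worked example with illustrative loads, assuming env->imbalance_pct of
125 (i.e. a 25% allowed imbalance):

	/*
	 * Loads before the move:  busier node 2000, other node 1000
	 *   old_imb = 2000 * 100 - 1000 * 125 = 75000  (imbalanced to start)
	 *
	 * Loads after moving a task of weight 300:  1700 vs 1300
	 *   imb     = 1700 * 100 - 1300 * 125 = 7500   (above the threshold)
	 *
	 * The new imbalance is much smaller than the original one, which is
	 * exactly the kind of move this patch wants to permit.
	 */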

Suggested-by: Peter Zijlstra 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Signed-off-by: Ingo Molnar 
Link: http://lkml.kernel.org/r/20140514132221.274b3...@annuminas.surriel.com
---
 kernel/sched/fair.c | 46 --
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f7cac2b..b899613 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
env->best_cpu = env->dst_cpu;
 }
 
+static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
+   long src_load, long dst_load,
+   struct task_numa_env *env)
+{
+   long imb, old_imb;
+
+   /* We care about the slope of the imbalance, not the direction. */
+   if (dst_load < src_load)
+   swap(dst_load, src_load);
+
+   /* Is the difference below the threshold? */
+   imb = dst_load * 100 - src_load * env->imbalance_pct;
+   if (imb <= 0)
+   return false;
+
+   /*
+* The imbalance is above the allowed threshold.
+* Compare it with the old imbalance.
+*/
+   if (orig_dst_load < orig_src_load)
+   swap(orig_dst_load, orig_src_load);
+
+   old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+
+   /* Would this change make things worse? */
+   return (old_imb > imb);
+}
+
 /*
  * This checks if the overall compute and NUMA accesses of the system would
  * be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
-   long dst_load, src_load;
+   long orig_src_load, src_load;
+   long orig_dst_load, dst_load;
long load;
long imp = (groupimp > 0) ? groupimp : taskimp;
 
@@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
 * In the overloaded case, try and keep the load balanced.
 */
 balance:
-   dst_load = env->dst_stats.load;
-   src_load = env->src_stats.load;
+   orig_dst_load = env->dst_stats.load;
+   orig_src_load = env->src_stats.load;
 
/* XXX missing power terms */
load = task_h_load(env->p);
-   dst_load += load;
-   src_load -= load;
+   dst_load = orig_dst_load + load;
+   src_load = orig_src_load - load;
 
if (cur) {
load = task_h_load(cur);
@@ -1195,11 +1224,8 @@ balance:
src_load += load;
}
 
-   /* make src_load the smaller */
-   if (dst_load < src_load)
-   swap(dst_load, src_load);
-
-   if (src_load * env->imbalance_pct < dst_load * 100)
+   if (load_too_imbalanced(orig_src_load, orig_dst_load,
+   src_load, dst_load, env))
goto unlock;
 
 assign:


[tip:sched/core] sched: Call select_idle_sibling() when not affine_sd

2014-05-22 Thread tip-bot for Rik van Riel
Commit-ID:  8bf21433f38b020c3d8a3805d1d7fb73d7b40c01
Gitweb: http://git.kernel.org/tip/8bf21433f38b020c3d8a3805d1d7fb73d7b40c01
Author: Rik van Riel 
AuthorDate: Wed, 14 May 2014 11:40:37 -0400
Committer:  Ingo Molnar 
CommitDate: Thu, 22 May 2014 11:16:28 +0200

sched: Call select_idle_sibling() when not affine_sd

On smaller systems, the top level sched domain will be an affine
domain, and select_idle_sibling is invoked for every SD_WAKE_AFFINE
wakeup. This seems to be working well.

On larger systems, with the node distance between far away NUMA nodes
being > RECLAIM_DISTANCE, select_idle_sibling is only called if the
waker and the wakee are on nodes less than RECLAIM_DISTANCE apart.

This patch leaves in place the policy of not pulling the task across
nodes on such systems, while fixing the issue that select_idle_sibling
is not called at all in certain circumstances.

The code will look for an idle CPU in the same CPU package as the
CPU where the task ran previously.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Cc: morten.rasmus...@arm.com
Cc: george.mccollis...@gmail.com
Cc: ktk...@parallels.com
Cc: Mel Gorman 
Cc: Mike Galbraith 
Link: http://lkml.kernel.org/r/20140514114037.2d932...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd3fa14..429164d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4473,10 +4473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
sd = tmp;
}
 
-   if (affine_sd) {
-   if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-   prev_cpu = cpu;
+   if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+   prev_cpu = cpu;
 
+   if (sd_flag & SD_BALANCE_WAKE) {
new_cpu = select_idle_sibling(p, prev_cpu);
goto unlock;
}
--


[tip:sched/core] sched,numa: Update migrate_improves/ degrades_locality

2014-05-19 Thread tip-bot for Rik van Riel
Commit-ID:  f5c1e1af91b2a4238d7c2a6dc4aa0076908b5864
Gitweb: http://git.kernel.org/tip/f5c1e1af91b2a4238d7c2a6dc4aa0076908b5864
Author: Rik van Riel 
AuthorDate: Thu, 15 May 2014 13:03:06 -0400
Committer:  Thomas Gleixner 
CommitDate: Mon, 19 May 2014 22:02:43 +0900

sched,numa: Update migrate_improves/degrades_locality

Update the migrate_improves/degrades_locality functions with
knowledge of pseudo-interleaving.

Do not consider moving tasks around within the group's set of active
nodes as improving or degrading locality. Instead, leave the load
balancer free to balance the load between a numa_group's active nodes.

Also, switch from the group/task_weight functions to the group/task_fault
functions. The "weight" functions involve a division, but both calls use
the same divisor, so there's no point in doing that from these functions.

On a 4-node (10 cores per node) system, performance of SPECjbb2005 seems
unaffected, though the number of migrations with two 8-warehouse-wide
instances seems to have almost halved, due to the scheduler running
each instance on a single node.

Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Cc: mi...@kernel.org
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20140515130306.61aae...@cuia.bos.redhat.com
Signed-off-by: Thomas Gleixner 
---
 kernel/sched/fair.c | 42 +-
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b899613..503f750 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5123,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)
 /* Returns true if the destination node has incurred more faults */
static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 {
+   struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
 
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5136,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
 
-   /* Always encourage migration to the preferred node. */
-   if (dst_nid == p->numa_preferred_nid)
-   return true;
+   if (numa_group) {
+   /* Task is already in the group's interleave set. */
+   if (node_isset(src_nid, numa_group->active_nodes))
+   return false;
+
+   /* Task is moving into the group's interleave set. */
+   if (node_isset(dst_nid, numa_group->active_nodes))
+   return true;
 
-   /* If both task and group weight improve, this move is a winner. */
-   if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
-   group_weight(p, dst_nid) > group_weight(p, src_nid))
+   return group_faults(p, dst_nid) > group_faults(p, src_nid);
+   }
+
+   /* Encourage migration to the preferred node. */
+   if (dst_nid == p->numa_preferred_nid)
return true;
 
-   return false;
+   return task_faults(p, dst_nid) > task_faults(p, src_nid);
 }
 
 
static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
+   struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
 
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5165,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
 
+   if (numa_group) {
+   /* Task is moving within/into the group's interleave set. */
+   if (node_isset(dst_nid, numa_group->active_nodes))
+   return false;
+
+   /* Task is moving out of the group's interleave set. */
+   if (node_isset(src_nid, numa_group->active_nodes))
+   return true;
+
+   return group_faults(p, dst_nid) < group_faults(p, src_nid);
+   }
+
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid)
return true;
 
-   /* If either task or group weight get worse, don't do it. */
-   if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
-   group_weight(p, dst_nid) < group_weight(p, src_nid))
-   return true;
-
-   return false;
+   return task_faults(p, dst_nid) < task_faults(p, src_nid);
 }
 
 #else
--


[tip:sched/core] sched,numa: Decay wakee_flips instead of zeroing

2014-05-19 Thread tip-bot for Rik van Riel
Commit-ID:  5658b4f43e63f8c7b4a27995dcb2cf43a52ee398
Gitweb: http://git.kernel.org/tip/5658b4f43e63f8c7b4a27995dcb2cf43a52ee398
Author: Rik van Riel 
AuthorDate: Fri, 16 May 2014 00:13:32 -0400
Committer:  Thomas Gleixner 
CommitDate: Mon, 19 May 2014 22:02:43 +0900

sched,numa: Decay wakee_flips instead of zeroing

Affine wakeups have the potential to interfere with NUMA placement.
If a task wakes up too many other tasks, affine wakeups will get
disabled.

However, regardless of how many other tasks it wakes up, it gets
re-enabled once a second, potentially interfering with NUMA
placement of other tasks.

By decaying wakee_flips in half instead of zeroing it, we can avoid
that problem for some workloads.

Cc: chegu_vi...@hp.com
Cc: mi...@kernel.org
Cc: umgwanakikb...@gmail.com
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20140516001332.67f91...@annuminas.surriel.com
Signed-off-by: Thomas Gleixner 
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 503f750..c9617b7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4065,7 +4065,7 @@ static void record_wakee(struct task_struct *p)
 * about the loss.
 */
if (jiffies > current->wakee_flip_decay_ts + HZ) {
-   current->wakee_flips = 0;
+   current->wakee_flips >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
 
--


[tip:sched/core] sched,numa: Allow task switch if load imbalance improves

2014-05-19 Thread tip-bot for Rik van Riel
Commit-ID:  b1fda183e09d70ea75d478ea055e2b6059476eff
Gitweb: http://git.kernel.org/tip/b1fda183e09d70ea75d478ea055e2b6059476eff
Author: Rik van Riel 
AuthorDate: Wed, 14 May 2014 13:22:21 -0400
Committer:  Thomas Gleixner 
CommitDate: Mon, 19 May 2014 22:02:42 +0900

sched,numa: Allow task switch if load imbalance improves

Currently the NUMA balancing code only allows moving tasks between NUMA
nodes when the load on both nodes is in balance. This breaks down when
the load was imbalanced to begin with.

Allow tasks to be moved between NUMA nodes if the imbalance is small,
or if the new imbalance is smaller than the original one.

Cc: mi...@kernel.org
Cc: mgor...@suse.de
Cc: chegu_vi...@hp.com
Signed-off-by: Rik van Riel 
Suggested-by: Peter Zijlstra 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20140514132221.274b3...@annuminas.surriel.com
Signed-off-by: Thomas Gleixner 
---
 kernel/sched/fair.c | 46 --
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f7cac2b..b899613 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
env->best_cpu = env->dst_cpu;
 }
 
+static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
+   long src_load, long dst_load,
+   struct task_numa_env *env)
+{
+   long imb, old_imb;
+
+   /* We care about the slope of the imbalance, not the direction. */
+   if (dst_load < src_load)
+   swap(dst_load, src_load);
+
+   /* Is the difference below the threshold? */
+   imb = dst_load * 100 - src_load * env->imbalance_pct;
+   if (imb <= 0)
+   return false;
+
+   /*
+* The imbalance is above the allowed threshold.
+* Compare it with the old imbalance.
+*/
+   if (orig_dst_load < orig_src_load)
+   swap(orig_dst_load, orig_src_load);
+
+   old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+
+   /* Would this change make things worse? */
+   return (imb > old_imb);
+}
+
 /*
  * This checks if the overall compute and NUMA accesses of the system would
  * be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
-   long dst_load, src_load;
+   long orig_src_load, src_load;
+   long orig_dst_load, dst_load;
long load;
long imp = (groupimp > 0) ? groupimp : taskimp;
 
@@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
 * In the overloaded case, try and keep the load balanced.
 */
 balance:
-   dst_load = env->dst_stats.load;
-   src_load = env->src_stats.load;
+   orig_dst_load = env->dst_stats.load;
+   orig_src_load = env->src_stats.load;
 
/* XXX missing power terms */
load = task_h_load(env->p);
-   dst_load += load;
-   src_load -= load;
+   dst_load = orig_dst_load + load;
+   src_load = orig_src_load - load;
 
if (cur) {
load = task_h_load(cur);
@@ -1195,11 +1224,8 @@ balance:
src_load += load;
}
 
-   /* make src_load the smaller */
-   if (dst_load < src_load)
-   swap(dst_load, src_load);
-
-   if (src_load * env->imbalance_pct < dst_load * 100)
+   if (load_too_imbalanced(orig_src_load, orig_dst_load,
+   src_load, dst_load, env))
goto unlock;
 
 assign:
--


[tip:sched/core] sched: call select_idle_sibling when not affine_sd

2014-05-19 Thread tip-bot for Rik van Riel
Commit-ID:  b45cf72cf7e1dd3b4a95947f85659cfdc01dbdad
Gitweb: http://git.kernel.org/tip/b45cf72cf7e1dd3b4a95947f85659cfdc01dbdad
Author: Rik van Riel 
AuthorDate: Wed, 14 May 2014 11:40:37 -0400
Committer:  Thomas Gleixner 
CommitDate: Mon, 19 May 2014 22:02:40 +0900

sched: call select_idle_sibling when not affine_sd

On smaller systems, the top level sched domain will be an affine
domain, and select_idle_sibling is invoked for every SD_WAKE_AFFINE
wakeup. This seems to be working well.

On larger systems, with the node distance between far away NUMA nodes
being > RECLAIM_DISTANCE, select_idle_sibling is only called if the
waker and the wakee are on nodes less than RECLAIM_DISTANCE apart.

This patch leaves in place the policy of not pulling the task across
nodes on such systems, while fixing the issue that select_idle_sibling
is not called at all in certain circumstances.

The code will look for an idle CPU in the same CPU package as the
CPU where the task ran previously.

Cc: morten.rasmus...@arm.com
Cc: mi...@kernel.org
Cc: george.mccollis...@gmail.com
Cc: ktk...@parallels.com
Cc: Mel Gorman 
Cc: Mike Galbraith 
Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20140514114037.2d932...@annuminas.surriel.com
Signed-off-by: Thomas Gleixner 
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd3fa14..429164d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4473,10 +4473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
sd = tmp;
}
 
-   if (affine_sd) {
-   if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-   prev_cpu = cpu;
+   if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+   prev_cpu = cpu;
 
+   if (sd_flag & SD_BALANCE_WAKE) {
new_cpu = select_idle_sibling(p, prev_cpu);
goto unlock;
}
--


[tip:sched/core] mm/numa: Remove BUG_ON() in __handle_mm_fault()

2014-05-08 Thread tip-bot for Rik van Riel
Commit-ID:  107437febd495a50e2cd09c81bbaa84d30e57b07
Gitweb: http://git.kernel.org/tip/107437febd495a50e2cd09c81bbaa84d30e57b07
Author: Rik van Riel 
AuthorDate: Tue, 29 Apr 2014 15:36:15 -0400
Committer:  Ingo Molnar 
CommitDate: Wed, 7 May 2014 13:33:48 +0200

mm/numa: Remove BUG_ON() in __handle_mm_fault()

Changing PTEs and PMDs to pte_numa & pmd_numa is done with the
mmap_sem held for reading, which means a pmd can be instantiated
and turned into a numa one while __handle_mm_fault() is examining
the value of old_pmd.

If that happens, __handle_mm_fault() should just return and let
the page fault retry, instead of throwing an oops. This is
handled by the test for pmd_trans_huge(*pmd) below.

Signed-off-by: Rik van Riel 
Reviewed-by: Naoya Horiguchi 
Reported-by: Sunil Pandey 
Signed-off-by: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Johannes Weiner 
Cc: Kirill A. Shutemov 
Cc: Linus Torvalds 
Cc: Mel Gorman 
Cc: linux...@kvack.org
Cc: lwood...@redhat.com
Cc: dave.han...@intel.com
Link: http://lkml.kernel.org/r/20140429153615.2d720...@annuminas.surriel.com
Signed-off-by: Ingo Molnar 
---
 mm/memory.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index d0f0bef..9c2dc65 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3900,9 +3900,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
}
 
-   /* THP should already have been handled */
-   BUG_ON(pmd_numa(*pmd));
-
/*
 * Use __pte_alloc instead of pte_alloc_map, because we can't
 * run pte_offset_map on the pmd, if an huge pmd could
--


[tip:sched/core] sched/numa: Do not set preferred_node on migration to a second choice node

2014-05-08 Thread tip-bot for Rik van Riel
Commit-ID:  68d1b02a58f5d9f584c1fb2923ed60ec68cbbd9b
Gitweb: http://git.kernel.org/tip/68d1b02a58f5d9f584c1fb2923ed60ec68cbbd9b
Author: Rik van Riel 
AuthorDate: Fri, 11 Apr 2014 13:00:29 -0400
Committer:  Ingo Molnar 
CommitDate: Wed, 7 May 2014 13:33:47 +0200

sched/numa: Do not set preferred_node on migration to a second choice node

Setting the numa_preferred_node for a task in task_numa_migrate
does nothing on a 2-node system. Either we migrate to the node
that already was our preferred node, or we stay where we were.

On a 4-node system, it can slightly decrease overhead, by not
calling the NUMA code as much. Since every node tends to be
directly connected to every other node, running on the wrong
node for a while does not do much damage.

However, on an 8 node system, there are far more bad nodes
than there are good ones, and pretending that a second choice
is actually the preferred node can greatly delay, or even
prevent, a workload from converging.

The only time we can safely pretend that a second choice
node is the preferred node is when the task is part of a
workload that spans multiple NUMA nodes.

Signed-off-by: Rik van Riel 
Tested-by: Vinod Chegu 
Acked-by: Mel Gorman 
Signed-off-by: Peter Zijlstra 
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/1397235629-16328-4-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ecea8d9..051903f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1301,7 +1301,16 @@ static int task_numa_migrate(struct task_struct *p)
if (env.best_cpu == -1)
return -EAGAIN;
 
-   sched_setnuma(p, env.dst_nid);
+   /*
+* If the task is part of a workload that spans multiple NUMA nodes,
+* and is migrating into one of the workload's active nodes, remember
+* this node as the task's preferred numa node, so the workload can
+* settle down.
+* A task that migrated to a second choice node will be better off
+* trying for a better one later. Do not set the preferred node here.
+*/
+   if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
+   sched_setnuma(p, env.dst_nid);
 
/*
 * Reset the scan period if the task is being rescheduled on an
--


[tip:sched/core] sched/numa: Retry placement more frequently when misplaced

2014-05-08 Thread tip-bot for Rik van Riel
Commit-ID:  5085e2a328849bdee6650b32d52c87c3788ab01c
Gitweb: http://git.kernel.org/tip/5085e2a328849bdee6650b32d52c87c3788ab01c
Author: Rik van Riel 
AuthorDate: Fri, 11 Apr 2014 13:00:28 -0400
Committer:  Ingo Molnar 
CommitDate: Wed, 7 May 2014 13:33:46 +0200

sched/numa: Retry placement more frequently when misplaced

When tasks have not converged on their preferred nodes yet, we want
to retry fairly often, to make sure we do not migrate a task's memory
to an undesirable location, only to have to move it again later.

This patch reduces the interval at which migration is retried,
when the task's numa_scan_period is small.

Signed-off-by: Rik van Riel 
Tested-by: Vinod Chegu 
Acked-by: Mel Gorman 
Signed-off-by: Peter Zijlstra 
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/1397235629-16328-3-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f6457b6..ecea8d9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1326,12 +1326,15 @@ static int task_numa_migrate(struct task_struct *p)
 /* Attempt to migrate a task to a CPU on the preferred node. */
 static void numa_migrate_preferred(struct task_struct *p)
 {
+   unsigned long interval = HZ;
+
/* This task has no NUMA fault statistics yet */
if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
return;
 
/* Periodically retry migrating the task to the preferred node */
-   p->numa_migrate_retry = jiffies + HZ;
+   interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+   p->numa_migrate_retry = jiffies + interval;
 
/* Success if task is already running on preferred CPU */
if (task_node(p) == p->numa_preferred_nid)
--


[tip:sched/core] sched/numa: Count pages on active node as local

2014-05-08 Thread tip-bot for Rik van Riel
Commit-ID:  792568ec6a31ca560ca4d528782cbc6cd2cea8b0
Gitweb: http://git.kernel.org/tip/792568ec6a31ca560ca4d528782cbc6cd2cea8b0
Author: Rik van Riel 
AuthorDate: Fri, 11 Apr 2014 13:00:27 -0400
Committer:  Ingo Molnar 
CommitDate: Wed, 7 May 2014 13:33:45 +0200

sched/numa: Count pages on active node as local

The NUMA code is smart enough to distribute the memory of workloads
that span multiple NUMA nodes across those NUMA nodes.

However, it still has a pretty high scan rate for such workloads,
because any memory that is left on a node other than the node of
the CPU that faulted on the memory is counted as non-local, which
causes the scan rate to go up.

Counting the memory on any node where the task's numa group is
actively running as local, allows the scan rate to slow down
once the application is settled in.

This should reduce the overhead of the automatic NUMA placement
code, when a workload spans multiple NUMA nodes.

Signed-off-by: Rik van Riel 
Tested-by: Vinod Chegu 
Acked-by: Mel Gorman 
Signed-off-by: Peter Zijlstra 
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/1397235629-16328-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5d859ec..f6457b6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1738,6 +1738,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
struct task_struct *p = current;
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
+   int local = !!(flags & TNF_FAULT_LOCAL);
int priv;
 
if (!numabalancing_enabled)
@@ -1786,6 +1787,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
task_numa_group(p, last_cpupid, flags, &priv);
}
 
+   /*
+* If a workload spans multiple NUMA nodes, a shared fault that
+* occurs wholly within the set of nodes that the workload is
+* actively using should be counted as local. This allows the
+* scan rate to slow down when a workload has settled down.
+*/
+   if (!priv && !local && p->numa_group &&
+   node_isset(cpu_node, p->numa_group->active_nodes) &&
+   node_isset(mem_node, p->numa_group->active_nodes))
+   local = 1;
+
task_numa_placement(p);
 
/*
@@ -1800,7 +1812,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
-   p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
+   p->numa_faults_locality[local] += pages;
 }
 
 static void reset_ptenuma_scan(struct task_struct *p)
--


[tip:sched/urgent] sched,numa: add cond_resched to task_numa_work

2014-02-21 Thread tip-bot for Rik van Riel
Commit-ID:  3cf1962cdbf6b3a9e3ef21116d215bbab350ea37
Gitweb: http://git.kernel.org/tip/3cf1962cdbf6b3a9e3ef21116d215bbab350ea37
Author: Rik van Riel 
AuthorDate: Tue, 18 Feb 2014 17:12:44 -0500
Committer:  Thomas Gleixner 
CommitDate: Fri, 21 Feb 2014 21:27:10 +0100

sched,numa: add cond_resched to task_numa_work

Normally task_numa_work scans over a fairly small amount of memory,
but it is possible to run into a large unpopulated part of virtual
memory, with no pages mapped. In that case, task_numa_work can run
for a while, and it may make sense to reschedule as required.

Cc: a...@linux-foundation.org
Cc: Andrea Arcangeli 
Signed-off-by: Rik van Riel 
Reported-by: Xing Gang 
Tested-by: Chegu Vinod 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/1392761566-24834-2-git-send-email-r...@redhat.com
Signed-off-by: Thomas Gleixner 
---
 kernel/sched/fair.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2b..7815709 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1757,6 +1757,8 @@ void task_numa_work(struct callback_head *work)
start = end;
if (pages <= 0)
goto out;
+
+   cond_resched();
} while (end != vma->vm_end);
}
 
--


[tip:sched/numa] sched/numa, mm: Remove p->numa_migrate_deferred

2014-01-28 Thread tip-bot for Rik van Riel
Commit-ID:  52bf84aa206cd2c2516dfa3e03b578edf8a3242f
Gitweb: http://git.kernel.org/tip/52bf84aa206cd2c2516dfa3e03b578edf8a3242f
Author: Rik van Riel 
AuthorDate: Mon, 27 Jan 2014 17:03:40 -0500
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Jan 2014 13:17:04 +0100

sched/numa, mm: Remove p->numa_migrate_deferred

Excessive migration of pages can hurt the performance of workloads
that span multiple NUMA nodes.  However, it turns out that the
p->numa_migrate_deferred knob is a really big hammer, which does
reduce migration rates, but does not actually help performance.

Now that the second stage of the automatic numa balancing code
has stabilized, it is time to replace the simplistic migration
deferral code with something smarter.

Signed-off-by: Rik van Riel 
Acked-by: Mel Gorman 
Signed-off-by: Peter Zijlstra 
Cc: Chegu Vinod 
Link: http://lkml.kernel.org/r/1390860228-21539-2-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 Documentation/sysctl/kernel.txt | 10 +
 include/linux/sched.h   |  1 -
 kernel/sched/fair.c |  8 
 kernel/sysctl.c |  7 ---
 mm/mempolicy.c  | 45 -
 5 files changed, 1 insertion(+), 70 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 6d48640..760f6e6a 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -386,8 +386,7 @@ feature should be disabled. Otherwise, if the system overhead from the
 feature is too high then the rate the kernel samples for NUMA hinting
 faults may be controlled by the numa_balancing_scan_period_min_ms,
 numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
-numa_balancing_migrate_deferred.
+numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
 
 ==
 
@@ -428,13 +427,6 @@ rate for each task.
 numa_balancing_scan_size_mb is how many megabytes worth of pages are
 scanned for a given scan.
 
-numa_balancing_migrate_deferred is how many page migrations get skipped
-unconditionally, after a page migration is skipped because a page is shared
-with other tasks. This reduces page migration overhead, and determines
-how much stronger the "move task near its memory" policy scheduler becomes,
-versus the "move memory near its task" memory management policy, for workloads
-with shared memory.
-
 ==
 
 osrelease, ostype & version:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffccdad..d572d5b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1457,7 +1457,6 @@ struct task_struct {
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
-   int numa_migrate_deferred;
unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp  */
struct callback_head numa_work;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index efe6457..7cdde91 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -819,14 +819,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
-/*
- * After skipping a page migration on a shared page, skip N more numa page
- * migrations unconditionally. This reduces the number of NUMA migrations
- * in shared memory workloads, and has the effect of pulling tasks towards
- * where their memory lives, over pulling the memory towards the task.
- */
-unsigned int sysctl_numa_balancing_migrate_deferred = 16;
-
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
unsigned long rss = 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8da99f..b41d61d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -384,13 +384,6 @@ static struct ctl_table kern_table[] = {
.mode   = 0644,
.proc_handler   = proc_dointvec,
},
-   {
-   .procname   = "numa_balancing_migrate_deferred",
-   .data   = &sysctl_numa_balancing_migrate_deferred,
-   .maxlen = sizeof(unsigned int),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec,
-   },
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0cd2c4d..68d5c7f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2304,35 +2304,6 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
-   /* Never defer a private fault */
-   if (cpupid_match

[tip:sched/numa] sched/numa: Rename p-> numa_faults to numa_faults_memory

2014-01-28 Thread tip-bot for Rik van Riel
Commit-ID:  ff1df896aef8e0ec1556a5c44f424bd45bfa2cbe
Gitweb: http://git.kernel.org/tip/ff1df896aef8e0ec1556a5c44f424bd45bfa2cbe
Author: Rik van Riel 
AuthorDate: Mon, 27 Jan 2014 17:03:41 -0500
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Jan 2014 13:17:05 +0100

sched/numa: Rename p->numa_faults to numa_faults_memory

In order to get a more consistent naming scheme, making it clear
which fault statistics track memory locality, and which track
CPU locality, rename the memory fault statistics.

Suggested-by: Mel Gorman 
Signed-off-by: Rik van Riel 
Acked-by: Mel Gorman 
Signed-off-by: Peter Zijlstra 
Cc: Chegu Vinod 
Link: http://lkml.kernel.org/r/1390860228-21539-3-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 include/linux/sched.h |  8 
 kernel/sched/core.c   |  4 ++--
 kernel/sched/debug.c  |  6 +++---
 kernel/sched/fair.c   | 56 +--
 4 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d572d5b..144d509 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1469,15 +1469,15 @@ struct task_struct {
 * Scheduling placement decisions are made based on the these counts.
 * The values remain static for the duration of a PTE scan
 */
-   unsigned long *numa_faults;
+   unsigned long *numa_faults_memory;
unsigned long total_numa_faults;
 
/*
 * numa_faults_buffer records faults per node during the current
-* scan window. When the scan completes, the counts in numa_faults
-* decay and these values are copied.
+* scan window. When the scan completes, the counts in
+* numa_faults_memory decay and these values are copied.
 */
-   unsigned long *numa_faults_buffer;
+   unsigned long *numa_faults_buffer_memory;
 
/*
 * numa_faults_locality tracks if faults recorded during the last
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 81343d6..bc708c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1744,8 +1744,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
-   p->numa_faults = NULL;
-   p->numa_faults_buffer = NULL;
+   p->numa_faults_memory = NULL;
+   p->numa_faults_buffer_memory = NULL;
 
INIT_LIST_HEAD(&p->numa_entry);
p->numa_group = NULL;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7f..31b908d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
unsigned long nr_faults = -1;
int cpu_current, home_node;
 
-   if (p->numa_faults)
-   nr_faults = p->numa_faults[2*node + i];
+   if (p->numa_faults_memory)
+   nr_faults = p->numa_faults_memory[2*node + i];
 
cpu_current = !i ? (task_node(p) == node) :
(pol && node_isset(node, pol->v.nodes));
 
home_node = (p->numa_preferred_nid == node);
 
-   SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+   SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, 
%ld\n",
i, node, cpu_current, home_node, nr_faults);
}
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7cdde91..3e616d7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -901,11 +901,11 @@ static inline int task_faults_idx(int nid, int priv)
 
 static inline unsigned long task_faults(struct task_struct *p, int nid)
 {
-   if (!p->numa_faults)
+   if (!p->numa_faults_memory)
return 0;
 
-   return p->numa_faults[task_faults_idx(nid, 0)] +
-   p->numa_faults[task_faults_idx(nid, 1)];
+   return p->numa_faults_memory[task_faults_idx(nid, 0)] +
+   p->numa_faults_memory[task_faults_idx(nid, 1)];
 }
 
 static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -927,7 +927,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
 {
unsigned long total_faults;
 
-   if (!p->numa_faults)
+   if (!p->numa_faults_memory)
return 0;
 
total_faults = p->total_numa_faults;
@@ -1255,7 +1255,7 @@ static int task_numa_migrate(struct task_struct *p)
 static void numa_migrate_preferred(struct task_struct *p)
 {
/* This task has no NUMA fault statistics yet */
-   if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+   if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
 

[tip:sched/numa] sched/numa: Turn some magic numbers into #defines

2014-01-28 Thread tip-bot for Rik van Riel
Commit-ID:  be1e4e760d940c14d119bffef5eb007dfdf29046
Gitweb: http://git.kernel.org/tip/be1e4e760d940c14d119bffef5eb007dfdf29046
Author: Rik van Riel 
AuthorDate: Mon, 27 Jan 2014 17:03:48 -0500
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Jan 2014 15:03:21 +0100

sched/numa: Turn some magic numbers into #defines

Cleanup suggested by Mel Gorman. Now the code contains some more
hints on what statistics go where.

Suggested-by: Mel Gorman 
Signed-off-by: Rik van Riel 
Acked-by: Mel Gorman 
Signed-off-by: Peter Zijlstra 
Cc: Chegu Vinod 
Link: http://lkml.kernel.org/r/1390860228-21539-10-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 34 +-
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d5832c3..1f41b12 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -896,6 +896,15 @@ struct numa_group {
unsigned long faults[0];
 };
 
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
 pid_t task_numa_group_id(struct task_struct *p)
 {
return p->numa_group ? p->numa_group->gid : 0;
@@ -903,7 +912,7 @@ pid_t task_numa_group_id(struct task_struct *p)
 
 static inline int task_faults_idx(int nid, int priv)
 {
-   return 2 * nid + priv;
+   return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
 }
 
 static inline unsigned long task_faults(struct task_struct *p, int nid)
@@ -1509,7 +1518,7 @@ static void task_numa_placement(struct task_struct *p)
unsigned long faults = 0, group_faults = 0;
int priv, i;
 
-   for (priv = 0; priv < 2; priv++) {
+   for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
long diff, f_diff, f_weight;
 
i = task_faults_idx(nid, priv);
@@ -1620,11 +1629,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
INIT_LIST_HEAD(&grp->task_list);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
-   grp->faults_cpu = grp->faults + 2 * nr_node_ids;
+   grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
+   nr_node_ids;
 
node_set(task_node(current), grp->active_nodes);
 
-   for (i = 0; i < 4*nr_node_ids; i++)
+   for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults_memory[i];
 
grp->total_faults = p->total_numa_faults;
@@ -1682,7 +1692,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 
double_lock(&my_grp->lock, &grp->lock);
 
-   for (i = 0; i < 4*nr_node_ids; i++) {
+   for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
my_grp->faults[i] -= p->numa_faults_memory[i];
grp->faults[i] += p->numa_faults_memory[i];
}
@@ -1714,7 +1724,7 @@ void task_numa_free(struct task_struct *p)
 
if (grp) {
spin_lock(&grp->lock);
-   for (i = 0; i < 4*nr_node_ids; i++)
+   for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] -= p->numa_faults_memory[i];
grp->total_faults -= p->total_numa_faults;
 
@@ -1755,14 +1765,20 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults_memory)) {
-   int size = sizeof(*p->numa_faults_memory) * 4 * nr_node_ids;
+   int size = sizeof(*p->numa_faults_memory) *
+  NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
 
-   /* numa_faults and numa_faults_buffer share the allocation */
-   p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
+   p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
if (!p->numa_faults_memory)
return;
 
BUG_ON(p->numa_faults_buffer_memory);
+   /*
+* The averaged statistics, shared & private, memory & cpu,
+* occupy the first half of the array. The second half of the
+* array is for current counters, which are averaged into the
+* first set by task_numa_placement.
+*/
p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
p->numa_faults_buffer_cpu = p->numa_f
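
The diff above is cut short, but the layout that the new #defines describe can be
sketched in a few lines of standalone C (nr_node_ids of 4 is an assumed example
system; the pointer arithmetic mirrors the split described in the added comment):

#include <stdio.h>
#include <stdlib.h>

#define NR_NUMA_HINT_FAULT_TYPES   2  /* shared or private faults */
#define NR_NUMA_HINT_FAULT_STATS   (NR_NUMA_HINT_FAULT_TYPES * 2)  /* memory and CPU */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)  /* plus buffers */

static int task_faults_idx(int nid, int priv)
{
    return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
}

int main(void)
{
    int nr_node_ids = 4;  /* assumed example system */

    /* One allocation backs all four per-node fault arrays. */
    unsigned long *faults_memory =
        calloc(NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids, sizeof(*faults_memory));
    unsigned long *faults_cpu           = faults_memory + 2 * nr_node_ids;
    unsigned long *faults_buffer_memory = faults_memory + 4 * nr_node_ids;
    unsigned long *faults_buffer_cpu    = faults_memory + 6 * nr_node_ids;

    /* Record a private fault on memory node 2, observed from a CPU on node 1. */
    faults_buffer_memory[task_faults_idx(2, 1)] += 1;
    faults_buffer_cpu[task_faults_idx(1, 1)]    += 1;

    printf("total longs allocated: %d\n",
           NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids);           /* 32 */
    printf("buffer_memory offset:  %ld\n",
           (long)(faults_buffer_memory - faults_memory));       /* 16 */

    (void)faults_cpu;  /* averaged CPU stats, unused in this toy example */
    free(faults_memory);
    return 0;
}
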

[tip:sched/numa] sched/numa: Rename variables in task_numa_fault( )

2014-01-28 Thread tip-bot for Rik van Riel
Commit-ID:  58b46da336a9312b2e21bb576d1c2c484dbf6257
Gitweb: http://git.kernel.org/tip/58b46da336a9312b2e21bb576d1c2c484dbf6257
Author: Rik van Riel 
AuthorDate: Mon, 27 Jan 2014 17:03:47 -0500
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Jan 2014 15:03:19 +0100

sched/numa: Rename variables in task_numa_fault()

We track both the node of the memory after a NUMA fault, and the node
of the CPU on which the fault happened. Rename the local variables in
task_numa_fault to make things more explicit.

Suggested-by: Mel Gorman 
Signed-off-by: Rik van Riel 
Acked-by: Mel Gorman 
Signed-off-by: Peter Zijlstra 
Cc: Chegu Vinod 
Link: http://lkml.kernel.org/r/1390860228-21539-9-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4c44990..d5832c3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1735,11 +1735,11 @@ void task_numa_free(struct task_struct *p)
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
-void task_numa_fault(int last_cpupid, int node, int pages, int flags)
+void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 {
struct task_struct *p = current;
bool migrated = flags & TNF_MIGRATED;
-   int this_node = task_node(current);
+   int cpu_node = task_node(current);
int priv;
 
if (!numabalancing_enabled)
@@ -1794,8 +1794,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
 
-   p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages;
-   p->numa_faults_buffer_cpu[task_faults_idx(this_node, priv)] += pages;
+   p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
+   p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
 }
 
--


[tip:sched/numa] sched/numa: Normalize faults_cpu stats and weigh by CPU use

2014-01-28 Thread tip-bot for Rik van Riel
Commit-ID:  7e2703e6099609adc93679c4d45cd6247f565971
Gitweb: http://git.kernel.org/tip/7e2703e6099609adc93679c4d45cd6247f565971
Author: Rik van Riel 
AuthorDate: Mon, 27 Jan 2014 17:03:45 -0500
Committer:  Ingo Molnar 
CommitDate: Tue, 28 Jan 2014 15:03:10 +0100

sched/numa: Normalize faults_cpu stats and weigh by CPU use

Tracing the code that decides the active nodes has made it abundantly clear
that the naive implementation of the faults_from code has issues.

Specifically, the garbage collector in some workloads will access orders
of magnitude more memory than the threads that do all the active work.
This resulted in the node with the garbage collector being marked the only
active node in the group.

This issue is avoided if we weigh the statistics by CPU use of each task in
the numa group, instead of by how many faults each thread has incurred.

To achieve this, we normalize the number of faults to the fraction of faults
that occurred on each node, and then multiply that fraction by the fraction
of CPU time the task has used since the last time task_numa_placement was
invoked.

This way the nodes in the active node mask will be the ones where the tasks
from the numa group are most actively running, and the influence of eg. the
garbage collector and other do-little threads is properly minimized.

On a 4-node system, using CPU use statistics calculated over a longer
interval results in about 1% fewer page migrations with two 32-warehouse
specjbb runs, and about 5% fewer page migrations as well as 1% better
throughput with two 8-warehouse specjbb runs, compared with the shorter
term statistics kept by the scheduler.

Signed-off-by: Rik van Riel 
Acked-by: Mel Gorman 
Signed-off-by: Peter Zijlstra 
Cc: Chegu Vinod 
Link: http://lkml.kernel.org/r/1390860228-21539-7-git-send-email-r...@redhat.com
Signed-off-by: Ingo Molnar 
---
 include/linux/sched.h |  2 ++
 kernel/sched/core.c   |  2 ++
 kernel/sched/fair.c   | 53 +--
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5ab3b89..ef92953 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1459,6 +1459,8 @@ struct task_struct {
int numa_preferred_nid;
unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp  */
+   u64 last_task_numa_placement;
+   u64 last_sum_exec_runtime;
struct callback_head numa_work;
 
struct list_head numa_entry;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bc708c5..a561c9e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1746,6 +1746,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->numa_work.next = &p->numa_work;
p->numa_faults_memory = NULL;
p->numa_faults_buffer_memory = NULL;
+   p->last_task_numa_placement = 0;
+   p->last_sum_exec_runtime = 0;
 
INIT_LIST_HEAD(&p->numa_entry);
p->numa_group = NULL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eeabb33..8fc3a82 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -887,6 +887,11 @@ struct numa_group {
struct rcu_head rcu;
nodemask_t active_nodes;
unsigned long total_faults;
+   /*
+* Faults_cpu is used to decide whether memory should move
+* towards the CPU. As a consequence, these stats are weighted
+* more by CPU use than by memory faults.
+*/
unsigned long *faults_cpu;
unsigned long faults[0];
 };
@@ -1446,11 +1451,41 @@ static void update_task_scan_period(struct task_struct *p,
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 }
 
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+   u64 runtime, delta, now;
+   /* Use the start of this time slice to avoid calculations. */
+   now = p->se.exec_start;
+   runtime = p->se.sum_exec_runtime;
+
+   if (p->last_task_numa_placement) {
+   delta = runtime - p->last_sum_exec_runtime;
+   *period = now - p->last_task_numa_placement;
+   } else {
+   delta = p->se.avg.runnable_avg_sum;
+   *period = p->se.avg.runnable_avg_period;
+   }
+
+   p->last_sum_exec_runtime = runtime;
+   p->last_task_numa_placement = now;
+
+   return delta;
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
int seq, nid, max_nid = -1, max_group_nid = -1;
unsigned long max_faults = 0, max_group_faults 
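
The diff is truncated before the normalization step itself, but the arithmetic
described in the changelog can be sketched as follows. This is a userspace
illustration under assumed numbers; the 16-bit fixed-point scaling is a plausible
kernel-style choice for this sketch, not the patch's exact code:

#include <stdio.h>
#include <stdint.h>

/*
 * Weigh a task's per-node CPU faults by the share of CPU time it used since
 * the last placement cycle: (faults on node / total faults) * (runtime /
 * period), kept in integer math with an assumed 16-bit fixed-point scale.
 */
static uint64_t weighted_cpu_faults(uint64_t node_faults, uint64_t total_faults,
                                    uint64_t runtime, uint64_t period)
{
    uint64_t f_weight;

    if (!total_faults || !period)
        return 0;

    f_weight = (runtime << 16) / period;             /* CPU-use fraction  */
    return (f_weight * node_faults) / total_faults;  /* scaled per node   */
}

int main(void)
{
    /*
     * Two tasks in one numa_group, with assumed numbers over a 10,000 ms
     * placement interval: a garbage collector that faults a lot but barely
     * runs, and a worker that runs almost continuously.
     */
    uint64_t gc     = weighted_cpu_faults(8000, 8000,  500, 10000);
    uint64_t worker = weighted_cpu_faults(1000, 1000, 9500, 10000);

    printf("GC weight:     %llu\n", (unsigned long long)gc);
    printf("worker weight: %llu\n", (unsigned long long)worker);
    /*
     * The worker's node ends up with a far larger weight than the GC's node,
     * so the active_nodes mask follows where the real work runs rather than
     * where the most faults happen.
     */
    return 0;
}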
