Improve the memory layout of 'struct fpu':

 - change ->fpregs_active from 'int' to 'char' - it's just a single flag
   and modern x86 CPUs can do efficient byte accesses.

 - pack related fields closer to each other: often 'fpu->state' will not be
   touched, while the other fields will - so pack them into a group.

Also add comments to each field, describing their purpose, and add
some background information about lazy restores.

Also fix an obsolete, lazy switching related comment in fpu_copy()'s 
description.

Cc: Andy Lutomirski <l...@amacapital.net>
Cc: Borislav Petkov <b...@alien8.de>
Cc: Dave Hansen <dave.han...@linux.intel.com>
Cc: Fenghua Yu <fenghua...@intel.com>
Cc: H. Peter Anvin <h...@zytor.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Oleg Nesterov <o...@redhat.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 arch/x86/include/asm/fpu/types.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 arch/x86/kernel/fpu/core.c       |  6 ++---
 arch/x86/kernel/fpu/xstate.c     |  9 ++++---
 3 files changed, 79 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index fe2ce3276a38..261cfb76065f 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -159,8 +159,44 @@ union fpregs_state {
 
 struct fpu {
        /*
+        * @state:
+        *
+        * In-memory copy of all FPU registers that we save/restore
+        * over context switches. If the task is using the FPU then
+        * the registers in the FPU are more recent than this state
+        * copy. If the task context-switches away then they get
+        * saved here and represent the FPU state.
+        *
+        * After context switches there may be a (short) time period
+        * during which the in-FPU hardware registers are unchanged
+        * and still perfectly match this state, if the tasks
+        * scheduled afterwards are not using the FPU.
+        *
+        * This is the 'lazy restore' window of optimization, which
+        * we track through 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+        *
+        * We detect whether a subsequent task uses the FPU via setting
+        * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+        *
+        * During this window, if the task gets scheduled again, we
+        * might be able to skip having to do a restore from this
+        * memory buffer to the hardware registers - at the cost of
+        * incurring the overhead of #NM fault traps.
+        *
+        * Note that on modern CPUs that support the XSAVEOPT (or other
+        * optimized XSAVE instructions), we don't use #NM traps anymore,
+        * as the hardware can track whether FPU registers need saving
+        * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+        * logic, which unconditionally saves/restores all FPU state
+        * across context switches. (if FPU state exists.)
+        */
+       union fpregs_state              state;
+
+       /*
+        * @last_cpu:
+        *
         * Records the last CPU on which this context was loaded into
-        * FPU registers. (In the lazy-switching case we might be
+        * FPU registers. (In the lazy-restore case we might be
         * able to reuse FPU registers across multiple context switches
         * this way, if no intermediate task used the FPU.)
         *
@@ -170,23 +206,49 @@ struct fpu {
         */
        unsigned int                    last_cpu;
 
-       unsigned int                    fpregs_active;
-       union fpregs_state              state;
        /*
+        * @fpstate_active:
+        *
+        * This flag indicates whether this context is active: if the task
+        * is not running then we can restore from this context, if the task
+        * is running then we should save into this context.
+        */
+       unsigned char                   fpstate_active;
+
+       /*
+        * @fpregs_active:
+        *
+        * This flag determines whether a given context is actively
+        * loaded into the FPU's registers and that those registers
+        * represent the task's current FPU state.
+        *
+        * Note the interaction with fpstate_active:
+        *
+        *   # task does not use the FPU:
+        *   fpstate_active == 0
+        *
+        *   # task uses the FPU and regs are active:
+        *   fpstate_active == 1 && fpregs_active == 1
+        *
+        *   # the regs are inactive but still match fpstate:
+        *   fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu
+        *
+        * The third state is what we use for the lazy restore optimization
+        * on lazy-switching CPUs.
+        */
+       unsigned char                   fpregs_active;
+
+       /*
+        * @counter:
+        *
         * This counter contains the number of consecutive context switches
         * during which the FPU stays used. If this is over a threshold, the
-        * lazy fpu saving logic becomes unlazy, to save the trap overhead.
+        * lazy FPU restore logic becomes eager, to save the trap overhead.
         * This is an unsigned char so that after 256 iterations the counter
         * wraps and the context switch behavior turns lazy again; this is to
         * deal with bursty apps that only use the FPU for a short time:
         */
        unsigned char                   counter;
-       /*
-        * This flag indicates whether this context is fpstate_active: if the task is
-        * not running then we can restore from this context, if the task
-        * is running then we should save into this context.
-        */
-       unsigned char                   fpstate_active;
 };
 
 #endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 0acdfc5f8d19..63496c49a590 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -227,10 +227,8 @@ EXPORT_SYMBOL_GPL(fpstate_init);
 /*
  * Copy the current task's FPU state to a new task's FPU context.
  *
- * In the 'eager' case we just save to the destination context.
- *
- * In the 'lazy' case we save to the source context, mark the FPU lazy
- * via stts() and copy the source context into the destination context.
+ * In both the 'eager' and the 'lazy' case we save hardware registers
+ * directly to the destination buffer.
  */
 static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 733a8aec7bd7..cd7f1a6bd933 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -76,10 +76,11 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
 EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
 
 /*
- * When executing XSAVEOPT (optimized XSAVE), if a processor implementation
- * detects that an FPU state component is still (or is again) in its
- * initialized state, it may clear the corresponding bit in the header.xfeatures
- * field, and can skip the writeout of registers to the corresponding memory layout.
+ * When executing XSAVEOPT (or other optimized XSAVE instructions), if
+ * a processor implementation detects that an FPU state component is still
+ * (or is again) in its initialized state, it may clear the corresponding
+ * bit in the header.xfeatures field, and can skip the writeout of registers
+ * to the corresponding memory layout.
  *
  * This means that when the bit is zero, the state component might still contain
  * some previous - non-initialized register state.
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to