This patch is not for inclusion -- I just want to see if the idea is sound. It's based on suggestions from <[EMAIL PROTECTED]>.
When saving FP context, the current CPU number is saved in the task's thread structure, and a pointer to that structure is saved in a per-CPU data area. On loading an FP context, the per-CPU pointer is cleared. (But the CPU number in the task is untouched.) Upon task switch, the CPU number in the per-task area is compared to the current CPU and the per-CPU pointer is checked. If everything matches, loading of the FPU context will be skipped. To prevent extra overhead when a task does short bursts of FP math and then switches to integer-only work, a normal FPU context load will be forced after 100 skipped loads.

Problems:
- As posted, the code only works on machines with fxsr. GCC internal errors prevent the commented-out code from compiling; I guess a conditional jump is needed.
- May not be preempt-safe (but AFAICT it is.)

Volanomark profile results are promising:

    Before   After
      8304    8176   device_not_available
     11809   12334   math_state_restore
    -----------------
     20114   20500

So it seems to be reducing the number of traps, but each trap takes a bit longer. This is a good result from a worst-case scenario. The other worst-case test is for systems not using FP math at all. This is untested, and best-case results are still pending as well.
Signed-off-by: Chuck Ebbert <[EMAIL PROTECTED]> Index: 2.6.13-rc3-mm3/arch/i386/kernel/i387.c =================================================================== --- 2.6.13-rc3-mm3.orig/arch/i386/kernel/i387.c 2005-07-29 02:26:39.000000000 -0400 +++ 2.6.13-rc3-mm3/arch/i386/kernel/i387.c 2005-07-29 14:41:34.000000000 -0400 @@ -27,6 +27,8 @@ static unsigned long mxcsr_feature_mask = 0xffffffff; +DEFINE_PER_CPU(struct thread_struct *, current_i387_thread); + void mxcsr_feature_mask_init(void) { unsigned long mask = 0; Index: 2.6.13-rc3-mm3/arch/i386/kernel/process.c =================================================================== --- 2.6.13-rc3-mm3.orig/arch/i386/kernel/process.c 2005-07-29 02:26:39.000000000 -0400 +++ 2.6.13-rc3-mm3/arch/i386/kernel/process.c 2005-07-29 14:41:34.000000000 -0400 @@ -475,6 +475,8 @@ p->thread.eip = (unsigned long) ret_from_fork; + p->thread.current_i387_cpu = -1; + savesegment(fs,p->thread.fs); savesegment(gs,p->thread.gs); @@ -679,8 +681,29 @@ /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ - __unlazy_fpu(prev_p); + if (prev_p->thread_info->status & TS_USEDFPU) { + save_init_fpu(prev_p); + goto lazy_load; + } + + /* This breaks GCC 3.3 and 4.0.1 (internal compiler error) */ +// alternative_input( /* do lazy restore if fxsr unsupported */ +// "jmp %1", +// "", +// X86_FEATURE_FXSR, +// "a" (*&&lazy_load)); + + if (next->current_i387_cpu == smp_processor_id() + && next == per_cpu(current_i387_thread, smp_processor_id())) { + + if (likely(++next->lazy_i387_switches < 100)) { + next_p->thread_info->status |= TS_USEDFPU; + clts(); + } else + next->lazy_i387_switches = 0; + } +lazy_load: /* * Reload esp0, LDT and the page table pointer: */ Index: 2.6.13-rc3-mm3/include/asm-i386/i387.h =================================================================== --- 2.6.13-rc3-mm3.orig/include/asm-i386/i387.h 2005-07-29 14:32:03.000000000 -0400 +++ 2.6.13-rc3-mm3/include/asm-i386/i387.h 2005-07-29 14:41:34.000000000 -0400 @@ -17,6 +17,8 @@ #include <asm/sigcontext.h> #include <asm/user.h> +DECLARE_PER_CPU(struct thread_struct *, current_i387_thread); + extern void mxcsr_feature_mask_init(void); extern void init_fpu(struct task_struct *); @@ -24,16 +26,31 @@ * FPU lazy state save handling... */ -/* - * The "nop" is needed to make the instructions the same - * length. - */ -#define restore_fpu(tsk) \ - alternative_input( \ - "nop ; frstor %1", \ - "fxrstor %1", \ - X86_FEATURE_FXSR, \ - "m" ((tsk)->thread.i387.fxsave)) +static inline void restore_fpu( struct task_struct *tsk ) +{ + /* + * The "nop" is needed to make the instructions the same + * length. + */ + alternative_input( + "frstor %1 ; nop", + "fxrstor %1", + X86_FEATURE_FXSR, + "m" (tsk->thread.i387.fxsave)); + + /* This breaks GCC 3.3 and 4.0.1 (internal compiler error) */ +// alternative_input( /* skip ahead if fxsr unsupported */ +// "jmp %1", +// "", +// X86_FEATURE_FXSR, +// "a" (*&&no_fxsr)); + + /* ??? is preempt disabled when this is called? 
*/ + per_cpu(current_i387_thread, smp_processor_id()) = 0; +no_fxsr: + __attribute__((unused)) + return; /* required to avoid gcc error */ +} extern void kernel_fpu_begin(void); #define kernel_fpu_end() do { stts(); preempt_enable(); } while(0) @@ -49,6 +66,18 @@ X86_FEATURE_FXSR, "m" (tsk->thread.i387.fxsave) :"memory"); + + /* This breaks GCC 3.3 and 4.0.1 (internal compiler error) */ +// alternative_input( /* skip ahead if fxsr unsupported */ +// "jmp %1", +// "", +// X86_FEATURE_FXSR, +// "a" (*&&no_fxsr)); + + tsk->thread.current_i387_cpu = smp_processor_id(); + per_cpu(current_i387_thread, smp_processor_id()) = &tsk->thread; +no_fxsr: + __attribute__((unused)) tsk->thread_info->status &= ~TS_USEDFPU; } Index: 2.6.13-rc3-mm3/include/asm-i386/processor.h =================================================================== --- 2.6.13-rc3-mm3.orig/include/asm-i386/processor.h 2005-07-13 16:20:26.000000000 -0400 +++ 2.6.13-rc3-mm3/include/asm-i386/processor.h 2005-07-29 14:41:34.000000000 -0400 @@ -447,6 +447,7 @@ unsigned long cr2, trap_no, error_code; /* floating point info */ union i387_union i387; + int current_i387_cpu, lazy_i387_switches; /* virtual 86 mode info */ struct vm86_struct __user * vm86_info; unsigned long screen_bitmap; __ Chuck - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/