Currently there are two FPU users in the kernel: RAID5 checksumming and the
3DNow! memcpy/memset. RAID5 checksumming is not problematic, but
_mmx_memcpy() has unexpected side effects if someone else is also using the
FPU: memcpy() is a really generic function, and calling it saves the current
FPU state into thread.i387.f{,x}save. IMHO that's wrong; memcpy() must save
into a local buffer, the way RAID5 checksumming does.

I've attached a proposal that supports arbitrary combinations of FPU users
in kernel space:

* Kernel threads can use the FPU freely. Nothing new.

* Two sets of functions for FPU usage in "normal" threads:

  - kfpu_acquire(), kfpu_try_acquire() + release functions.
  - kfpu_full_begin(), kfpu_mmx_begin(), kfpu_nosave_begin(),
    kfpu_try_begin() + _end() functions.

  The first set is only possible from normal process context; the caller can
  sleep between _acquire() and _release(). The second set can be called from
  arbitrary context, but the caller must not sleep between _begin() and
  _end().

  The _try() functions check whether the FPU is unused and fail if it is
  currently in use. That way the memcpy()/memset() functions can avoid FPU
  context saves/restores, and it saves stack space.

  Nesting is partially possible: _begin() within _acquire() is possible,
  _acquire() within _begin() will BUG(). [_acquire()/_release() have
  restrictions similar to down() and up(); _begin()/_end() have restrictions
  similar to spinlocks.]

The patch itself is alpha quality: only the SSE functions are tested, it
boots when compiled for Pentium III, the RAID5 checksum _benchmark_ still
works, and distributed.net still cracks RC5. I haven't yet checked that the
exception handlers are still called properly.

What do you think?

--
	Manfred
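P.S.: To make the intended usage concrete, here is a rough sketch of the two
call styles (illustration only, not part of the patch below; the my_*
functions are made up, everything else is the interface from <asm/kfpu.h>):

	#include <linux/types.h>
	#include <asm/kfpu.h>

	/* style 1: process context only, sleeping is allowed */
	static void my_long_sse_job(void)
	{
		struct kfpubuf_acquire buf;

		kfpu_acquire(&buf);		/* saves the user's FPU state */
		/* ... arbitrary MMX/SSE code, may sleep ... */
		kfpu_release(&buf);		/* restores the user's FPU state */
	}

	/* style 2: any context, must not sleep between _begin() and _end() */
	static void my_short_mmx_op(void)
	{
		struct kfpubuf_mmx fpu_save;

		kfpu_mmx_begin(&fpu_save);
		/* ... short MMX sequence, no sleeping ... */
		kfpu_mmx_end(&fpu_save);
	}

	/* the _try() variants fail instead of saving state */
	static void my_fast_or_slow_copy(void *to, void *from, size_t len)
	{
		if (kfpu_try_begin()) {
			/* FPU was unused: take the MMX/SSE fast path */
			kfpu_try_end();
		} else {
			/* FPU busy: fall back to the plain integer version */
		}
	}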
// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 4
//  SUBLEVEL = 1
//  EXTRAVERSION =
diff -urN 2.4/include/asm-i386/i387.h build-2.4/include/asm-i386/i387.h
--- 2.4/include/asm-i386/i387.h	Fri Feb 2 15:20:36 2001
+++ build-2.4/include/asm-i386/i387.h	Sun Feb 11 19:40:45 2001
@@ -16,6 +16,7 @@
 #include <asm/sigcontext.h>
 #include <asm/user.h>
 
+#include <asm/kfpu.h>
 extern void init_fpu(void);
 /*
  * FPU lazy state save handling...
@@ -23,9 +24,7 @@
 extern void save_init_fpu( struct task_struct *tsk );
 extern void restore_fpu( struct task_struct *tsk );
 
-extern void kernel_fpu_begin(void);
-#define kernel_fpu_end() stts()
-
+#include <asm/kfpu.h>
 #define unlazy_fpu( tsk ) do { \
 	if ( tsk->flags & PF_USEDFPU ) \
 		save_init_fpu( tsk ); \
@@ -36,7 +35,7 @@
 	if ( tsk->flags & PF_USEDFPU ) { \
 		asm volatile("fwait"); \
 		tsk->flags &= ~PF_USEDFPU; \
-		stts(); \
+		kfpu_leave(); \
 	} \
 } while (0)
 
diff -urN 2.4/include/asm-i386/kfpu.h build-2.4/include/asm-i386/kfpu.h
--- 2.4/include/asm-i386/kfpu.h	Thu Jan 1 01:00:00 1970
+++ build-2.4/include/asm-i386/kfpu.h	Sun Feb 11 19:59:51 2001
@@ -0,0 +1,83 @@
+#ifndef _ASM_KFPU_H
+#define _ASM_KFPU_H
+
+/*
+ * FPU support for kernel threads
+ *
+ * currently limited to MMX, SSE and SSE2.
+ * x87 and fpu emulation are not supported.
+ */
+
+/**********************************/
+/*
+ * Enable full fpu access.
+ * Only for kernel threads.
+ */
+void kfpu_start(void);
+
+
+/**********************************/
+/*
+ * Get full fpu access.
+ *
+ * Only permitted from process context.
+ * Caller must check that the FPU is present before calling.
+ */
+struct kfpubuf_acquire {
+	unsigned long saved;
+	unsigned char buffer[512+16];
+};
+
+void kfpu_acquire(struct kfpubuf_acquire *buf);
+void kfpu_release(struct kfpubuf_acquire *buf);
+
+/* returns 1 if it got fpu access, 0 otherwise */
+int kfpu_try_acquire(void);
+void kfpu_try_release(void);
+
+
+/**********************************/
+/*
+ * Get short term fpu access.
+ *
+ * The functions can be called from any context (process,
+ *	softirq, interrupt)
+ * Caller must check that the FPU is present before calling.
+ * The caller must not sleep between _begin() and _end()
+ */
+struct kfpubuf_full {
+	unsigned char buffer[512];
+};
+
+struct kfpubuf_mmx {
+	unsigned char buffer[108];
+};
+
+void kfpu_full_begin(struct kfpubuf_full *buf);
+void kfpu_mmx_begin(struct kfpubuf_mmx *buf);
+/*
+ * ret val 0: caller doesn't need to save clobbered regs
+ * ret val !0: the caller must save & restore any clobbered
+ *	fpu registers.
+ * This function DOES NOT reinitialize the fpu!
+ */
+int kfpu_nosave_begin(void);
+
+void kfpu_full_end(struct kfpubuf_full *buf);
+void kfpu_mmx_end(struct kfpubuf_mmx *buf);
+void kfpu_nosave_end(void);
+
+/* returns 1 if it got fpu access */
+int kfpu_try_begin(void);
+void kfpu_try_end(void);
+
+/**********************************/
+
+/* internal function, called by math_state_restore() */
+void kfpu_enter(void);
+/* internal function, called by save_init_fpu() */
+void kfpu_leave(void);
+/* internal function, called by cpu_init() */
+void kfpu_initialize(void);
+
+#endif
diff -urN 2.4/include/asm-i386/page.h build-2.4/include/asm-i386/page.h
--- 2.4/include/asm-i386/page.h	Thu Jan 4 23:50:46 2001
+++ build-2.4/include/asm-i386/page.h	Sun Feb 11 13:10:17 2001
@@ -11,7 +11,14 @@
 
 #include <linux/config.h>
 
-#ifdef CONFIG_X86_USE_3DNOW
+#ifdef CONFIG_X86_USE_SSE
+
+#include <asm/sse.h>
+
+#define clear_page(page)	sse_clear_page(page)
+#define copy_page(to,from)	sse_copy_page(to,from)
+
+#elif defined(CONFIG_X86_USE_3DNOW)
 
 #include <asm/mmx.h>
 
diff -urN 2.4/include/asm-i386/sse.h build-2.4/include/asm-i386/sse.h
--- 2.4/include/asm-i386/sse.h	Thu Jan 1 01:00:00 1970
+++ build-2.4/include/asm-i386/sse.h	Sun Feb 11 12:59:43 2001
@@ -0,0 +1,11 @@
+#ifndef _ASM_SSE_H
+#define _ASM_SSE_H
+
+/*
+ * SSE helper operations
+ */
+
+extern void sse_clear_page(void *page);
+extern void sse_copy_page(void *to, void *from);
+
+#endif
diff -urN 2.4/include/asm-i386/system.h build-2.4/include/asm-i386/system.h
--- 2.4/include/asm-i386/system.h	Sun Feb 11 00:39:07 2001
+++ build-2.4/include/asm-i386/system.h	Sun Feb 11 12:41:01 2001
@@ -100,7 +100,6 @@
 /*
  * Clear and set 'TS' bit respectively
  */
-#define clts() __asm__ __volatile__ ("clts")
 #define read_cr0() ({ \
 	unsigned int __dummy; \
 	__asm__( \
@@ -110,7 +109,6 @@
 })
 #define write_cr0(x) \
 	__asm__("movl %0,%%cr0": :"r" (x));
-#define stts() write_cr0(8 | read_cr0())
 
 #endif	/* __KERNEL__ */
 
diff -urN 2.4/include/asm-i386/xor.h build-2.4/include/asm-i386/xor.h
--- 2.4/include/asm-i386/xor.h	Mon Nov 13 04:39:51 2000
+++ build-2.4/include/asm-i386/xor.h	Sun Feb 11 19:37:24 2001
@@ -18,18 +18,16 @@
 * Copyright (C) 1998 Ingo Molnar.
 */
 
+#include <asm/kfpu.h>
+
 #define FPU_SAVE \
   do { \
-	if (!(current->flags & PF_USEDFPU)) \
-		__asm__ __volatile__ (" clts;\n"); \
-	__asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0])); \
+	kfpu_mmx_begin(&fpu_save); \
   } while (0)
 
 #define FPU_RESTORE \
   do { \
-	__asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0])); \
-	if (!(current->flags & PF_USEDFPU)) \
-		stts(); \
+	kfpu_mmx_end(&fpu_save); \
   } while (0)
 
 #define LD(x,y)	"	movq 8*("#x")(%1), %%mm"#y"	;\n"
@@ -44,7 +42,7 @@
 xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
 	unsigned long lines = bytes >> 7;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -89,7 +87,7 @@
 	      unsigned long *p3)
 {
 	unsigned long lines = bytes >> 7;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -139,7 +137,7 @@
 	      unsigned long *p3, unsigned long *p4)
 {
 	unsigned long lines = bytes >> 7;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -194,7 +192,7 @@
 	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
 	unsigned long lines = bytes >> 7;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -261,7 +259,7 @@
 xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
 	unsigned long lines = bytes >> 6;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -310,7 +308,7 @@
 	      unsigned long *p3)
 {
 	unsigned long lines = bytes >> 6;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -368,7 +366,7 @@
 	      unsigned long *p3, unsigned long *p4)
 {
 	unsigned long lines = bytes >> 6;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -435,7 +433,7 @@
 	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
 	unsigned long lines = bytes >> 6;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -531,28 +529,31 @@
  */
 
 #define XMMS_SAVE \
-	__asm__ __volatile__ ( \
-		"movl %%cr0,%0		;\n\t" \
-		"clts			;\n\t" \
-		"movups %%xmm0,(%1)	;\n\t" \
-		"movups %%xmm1,0x10(%1)	;\n\t" \
-		"movups %%xmm2,0x20(%1)	;\n\t" \
-		"movups %%xmm3,0x30(%1)	;\n\t" \
-		: "=r" (cr0) \
+	restore = kfpu_nosave_begin(); \
+	if (restore) \
+	__asm__ __volatile__ ( \
+		"movups %%xmm0,(%0)	;\n\t" \
+		"movups %%xmm1,0x10(%0)	;\n\t" \
+		"movups %%xmm2,0x20(%0)	;\n\t" \
+		"movups %%xmm3,0x30(%0)	;\n\t" \
+		: /* no output */ \
 		: "r" (xmm_save) \
 		: "memory")
 
 #define XMMS_RESTORE \
-	__asm__ __volatile__ ( \
-		"sfence			;\n\t" \
-		"movups (%1),%%xmm0	;\n\t" \
-		"movups 0x10(%1),%%xmm1	;\n\t" \
-		"movups 0x20(%1),%%xmm2	;\n\t" \
-		"movups 0x30(%1),%%xmm3	;\n\t" \
-		"movl	%0,%%cr0	;\n\t" \
-		: \
-		: "r" (cr0), "r" (xmm_save) \
-		: "memory")
+	__asm__ __volatile__ ( \
+		"sfence\n\t" \
+		: : : "memory"); \
+	if (restore) \
+	__asm__ __volatile__ ( \
+		"movups (%0),%%xmm0	;\n\t" \
+		"movups 0x10(%0),%%xmm1	;\n\t" \
+		"movups 0x20(%0),%%xmm2	;\n\t" \
+		"movups 0x30(%0),%%xmm3	;\n\t" \
+		: /* no output */ \
+		: "r" (xmm_save) \
+		: "memory"); \
+	kfpu_nosave_end()
 
 #define OFFS(x)	"16*("#x")"
 #define PF0(x)	"	prefetcht0 "OFFS(x)"(%1)	;\n"
@@ -575,7 +576,7 @@
 {
 	unsigned long lines = bytes >> 8;
 	char xmm_save[16*4];
-	int cr0;
+	int restore;
 
 	XMMS_SAVE;
 
@@ -629,7 +630,7 @@
 {
 	unsigned long lines = bytes >> 8;
 	char xmm_save[16*4];
-	int cr0;
+	int restore;
 
 	XMMS_SAVE;
 
@@ -690,7 +691,7 @@
 {
 	unsigned long lines = bytes >> 8;
 	char xmm_save[16*4];
-	int cr0;
+	int restore;
 
 	XMMS_SAVE;
 
@@ -758,7 +759,7 @@
 {
 	unsigned long lines = bytes >> 8;
 	char xmm_save[16*4];
-	int cr0;
+	int restore;
 
 	XMMS_SAVE;
 
diff -urN --exclude .depend 2.4/arch/i386/lib/Makefile build-2.4/arch/i386/lib/Makefile
--- 2.4/arch/i386/lib/Makefile	Sun Feb 11 00:37:51 2001
+++ build-2.4/arch/i386/lib/Makefile	Sun Feb 11 12:59:43 2001
@@ -12,6 +12,7 @@
 	memcpy.o
 
 obj-$(CONFIG_X86_USE_3DNOW) += mmx.o
+obj-$(CONFIG_X86_USE_SSE) += sse.o
 obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
 
 include $(TOPDIR)/Rules.make
diff -urN --exclude .depend 2.4/arch/i386/lib/mmx.c build-2.4/arch/i386/lib/mmx.c
--- 2.4/arch/i386/lib/mmx.c	Sat Feb 3 14:02:24 2001
+++ build-2.4/arch/i386/lib/mmx.c	Sun Feb 11 19:28:12 2001
@@ -28,7 +28,11 @@
 	void *p=to;
 	int i= len >> 6; /* len/64 */
 
-	kernel_fpu_begin();
+	if(!kfpu_try_begin()) {
+		/* FIXME: this belongs into string.h */
+		__memcpy(to, from, len);
+		return p;
+	}
 
 	__asm__ __volatile__ (
 		"1: prefetch (%0)\n"		/* This set is 28 bytes */
@@ -84,7 +88,7 @@
 	 *	Now do the tail of the block
 	 */
 	__memcpy(to, from, len&63);
-	kernel_fpu_end();
+	kfpu_try_end();
 	return p;
 }
 
@@ -92,8 +96,6 @@
 {
 	int i;
 
-	kernel_fpu_begin();
-
 	__asm__ __volatile__ (
 		"  pxor %%mm0, %%mm0\n" : :
 	);
@@ -118,18 +120,12 @@
 	__asm__ __volatile__ (
 		"  sfence \n" : :
 	);
-	kernel_fpu_end();
 }
 
 static void fast_copy_page(void *to, void *from)
 {
 	int i;
 
-	kernel_fpu_begin();
-
-	/* maybe the prefetch stuff can go before the expensive fnsave...
-	 * but that is for later. -AV
-	 */
 	__asm__ __volatile__ (
 		"1: prefetch (%0)\n"
 		"   prefetch 64(%0)\n"
@@ -185,7 +181,6 @@
 	__asm__ __volatile__ (
 		"  sfence \n" : :
 	);
-	kernel_fpu_end();
 }
 
 /*
@@ -205,10 +200,12 @@
 void mmx_clear_page(void * page)
 {
-	if(in_interrupt())
-		slow_zero_page(page);
-	else
+	if (kfpu_try_begin()) {
 		fast_clear_page(page);
+		kfpu_try_end();
+	} else {
+		slow_zero_page(page);
+	}
 }
 
 static void slow_copy_page(void *to, void *from)
 {
@@ -225,8 +222,10 @@
 
 void mmx_copy_page(void *to, void *from)
 {
-	if(in_interrupt())
-		slow_copy_page(to, from);
-	else
+	if (kfpu_try_begin()) {
 		fast_copy_page(to, from);
+		kfpu_try_end();
+	} else {
+		slow_copy_page(to, from);
+	}
 }
diff -urN --exclude .depend 2.4/arch/i386/lib/sse.c build-2.4/arch/i386/lib/sse.c
--- 2.4/arch/i386/lib/sse.c	Thu Jan 1 01:00:00 1970
+++ build-2.4/arch/i386/lib/sse.c	Sun Feb 11 18:02:30 2001
@@ -0,0 +1,107 @@
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+
+#include <asm/i387.h>
+
+/*
+ * SSE library helper functions
+ *
+ * Copyright (C) 2001 Manfred Spraul
+ *
+ * Based on Intel sample code from
+ *	Block Copy Using Pentium(R) III Streaming SIMD Extensions
+ *	Revision 1.9
+ *	January 12, 1999
+ *
+ */
+
+
+void sse_clear_page(void * page)
+{
+	int storage[4];
+	int restore;
+	int d0;
+	restore = kfpu_nosave_begin();
+	if (restore) {
+		__asm__ __volatile__(
+			"movups %%xmm0, (%0)\n\t"
+			: /* no output */
+			: "r" (&storage[0])
+			: "memory" );
+	}
+	__asm__ __volatile__(
+		"xorps %%xmm0, %%xmm0\n\t"
+		"xor %0, %0\n\t"
+		"1: movntps %%xmm0, (%1)\n\t"
+		"movntps %%xmm0, 16(%1)\n\t"
+		"add $32, %1\n\t"
+		"inc %0\n\t"
+		"cmp $128, %0\n\t"
+		"jne 1b\n\t"
+		"sfence\n\t"
+		: "=&r" (d0), "=&r" (page)
+		: "1" (page)
+		: "cc", "memory");
+	if (restore) {
+		__asm__ __volatile__(
+			"movups (%0), %%xmm0\n\t"
+			: /* no output */
+			: "r" (&storage[0])
+			: "memory" );
+	}
+	kfpu_nosave_end();
+}
+
+void sse_copy_page(void *to, void *from)
+{
+	int storage[16];
+	int restore;
+	int d0;
+	restore = kfpu_nosave_begin();
+	if (restore) {
+		__asm__ __volatile__(
+			"movups %%xmm0, (%0)\n\t"
+			"movups %%xmm1, 16(%0)\n\t"
+			"movups %%xmm2, 32(%0)\n\t"
+			"movups %%xmm3, 48(%0)\n\t"
+			: /* no output */
+			: "r" (&storage[0])
+			: "memory" );
+	}
+	__asm__ __volatile__(
+		"mov (%2), %0\n\t"		/* step 1: load the TLB */
+		"xor %0, %0\n\t"		/* step 2: prefetch the page */
+		"1:prefetchnta (%2, %0)\n\t"
+		"prefetchnta 32(%2, %0)\n\t"
+		"add $64,%0\n\t"
+		"cmp $4096, %0\n\t"
+		"jne 1b\n\t"
+		"2: movaps (%2), %%xmm0\n\t"	/* step 3: copy the page */
+		"movaps 16(%2), %%xmm1\n\t"
+		"movaps 32(%2), %%xmm2\n\t"
+		"movaps 48(%2), %%xmm3\n\t"
+		"add $64, %2\n\t"
+		"movntps %%xmm0, (%1)\n\t"
+		"movntps %%xmm1, 16(%1)\n\t"
+		"movntps %%xmm2, 32(%1)\n\t"
+		"movntps %%xmm3, 48(%1)\n\t"
+		"add $64, %1\n\t"
+		"sub $64, %0\n\t"
+		"jnz 2b\n\t"
+		"sfence\n\t"
+		: "=&r" (d0), "=&r" (to), "=&r" (from)
+		: "1" (to), "2" (from)
+		: "cc", "memory");
+	if (restore) {
+		__asm__ __volatile__(
+			"movups (%0), %%xmm0\n\t"
+			"movups 16(%0), %%xmm1\n\t"
+			"movups 32(%0), %%xmm2\n\t"
+			"movups 48(%0), %%xmm3\n\t"
+			: /* no output */
+			: "r" (&storage[0])
+			: "memory" );
+	}
+	kfpu_nosave_end();
+}
--- 2.4/arch/i386/kernel/i387.c	Sun Feb 11 00:37:51 2001
+++ build-2.4/arch/i386/kernel/i387.c	Sun Feb 11 19:35:49 2001
@@ -24,6 +24,8 @@
 #define HAVE_HWFP 1
 #endif
 
+static volatile int cpu_fpuactive[NR_CPUS];
+
 /*
  * The _current_ task is using the FPU for the first time
  * so initialize it and set the mxcsr to its default
@@ -39,10 +41,19 @@
 	current->used_math = 1;
 }
 
+void inline stts(void)
+{
+	write_cr0(8 | read_cr0());
+}
+
+char *sse_aligned(char *buffer)
+{
+	return (char*)((((unsigned long)buffer)+15)&(~15));
+}
+
 /*
  * FPU lazy state save handling.
 */
-
 static inline void __save_init_fpu( struct task_struct *tsk )
 {
 	if ( cpu_has_fxsr ) {
@@ -58,18 +69,191 @@
 void save_init_fpu( struct task_struct *tsk )
 {
 	__save_init_fpu(tsk);
-	stts();
+	kfpu_leave();
+}
+
+void kfpu_start(void)
+{
+	if ( cpu_has_fpu ) {
+		cpu_fpuactive[smp_processor_id()]++;
+		__asm__ __volatile__("clts");	/* Allow maths ops (or we recurse) */
+		init_fpu();
+		current->flags |= PF_USEDFPU;	/* So we fnsave on switch_to() */
+		current->used_math = 1;
+	}
 }
 
-void kernel_fpu_begin(void)
+void kfpu_acquire(struct kfpubuf_acquire *buf)
 {
-	struct task_struct *tsk = current;
+	if (!current->used_math) {
+		buf->saved = 0;
+		kfpu_try_acquire();
+		return;
+	}
+	buf->saved = 1;
+	unlazy_fpu(current);
+	if ( cpu_has_fxsr ) {
+		memcpy(buf->buffer, &current->thread.i387.fxsave,
+				sizeof(current->thread.i387.fxsave));
+	} else {
+		memcpy(buf->buffer, &current->thread.i387.fsave,
+				sizeof(current->thread.i387.fsave));
+	}
+
+	if (cpu_fpuactive[smp_processor_id()])
+		BUG();
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+	init_fpu();
+	current->flags |= PF_USEDFPU;
+}
 
-	if (tsk->flags & PF_USEDFPU) {
-		__save_init_fpu(tsk);
+void kfpu_release(struct kfpubuf_acquire *buf)
+{
+	if (!buf->saved) {
+		kfpu_try_release();
 		return;
 	}
-	clts();
+	clear_fpu(current);
+	if ( cpu_has_fxsr ) {
+		memcpy(&current->thread.i387.fxsave, buf->buffer,
+				sizeof(current->thread.i387.fxsave));
+	} else {
+		memcpy(&current->thread.i387.fsave, buf->buffer,
+				sizeof(current->thread.i387.fsave));
+	}
+	if (cpu_fpuactive[smp_processor_id()])
+		BUG();
+	if (current->flags & PF_USEDFPU)
+		BUG();
+}
+
+/* returns 1 if it got fpu access, 0 otherwise */
+int kfpu_try_acquire(void)
+{
+	if (current->used_math)
+		return 0;
+	current->used_math = 1;
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+	init_fpu();
+	current->flags |= PF_USEDFPU;
+	return 1;
+}
+
+void kfpu_try_release(void)
+{
+	clear_fpu(current);
+	current->used_math = 0;
+	if (cpu_fpuactive[smp_processor_id()])
+		BUG();
+	if (current->flags & PF_USEDFPU)
+		BUG();
+}
+
+void kfpu_full_begin(struct kfpubuf_full *buf)
+{
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+	if(cpu_fpuactive[smp_processor_id()] > 1) {
+		char *buffer = sse_aligned(buf->buffer);
+		asm volatile( "fxsave %0 ; fnclex"
+			: "=m" (*buffer) );
+	}
+	__asm__("fninit");
+	load_mxcsr(0x1f80);
+}
+
+void kfpu_mmx_begin(struct kfpubuf_mmx *buf)
+{
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+	if(cpu_fpuactive[smp_processor_id()] > 1) {
+		asm volatile( "fnsave %0 ; fwait"
+			: "=m" (buf->buffer) );
+	}
+	__asm__("fninit");
+}
+
+/* ret val 0: caller doesn't need to save clobbered regs */
+int kfpu_nosave_begin(void)
+{
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+	return cpu_fpuactive[smp_processor_id()]-1;
+}
+
+void kfpu_full_end(struct kfpubuf_full *buf)
+{
+	if(cpu_fpuactive[smp_processor_id()] > 1) {
+		char *buffer = sse_aligned(buf->buffer);
+		asm volatile( "fxrstor %0"
+			: "=m" (*buffer) );
+		cpu_fpuactive[smp_processor_id()]--;
+	} else {
+		cpu_fpuactive[smp_processor_id()]--;
+		stts();
+	}
+}
+
+void kfpu_mmx_end(struct kfpubuf_mmx *buf)
+{
+	if(cpu_fpuactive[smp_processor_id()] > 1) {
+		asm volatile( "frstor %0"
+			: "=m" (buf->buffer) );
+		cpu_fpuactive[smp_processor_id()]--;
+	} else {
+		cpu_fpuactive[smp_processor_id()]--;
+		stts();
+	}
+}
+
+void kfpu_nosave_end(void)
+{
+	cpu_fpuactive[smp_processor_id()]--;
+	if(cpu_fpuactive[smp_processor_id()] == 0)
+		stts();
+}
+
+/* returns 1 if it got fpu access */
+int kfpu_try_begin(void)
+{
+	if (cpu_fpuactive[smp_processor_id()] > 0)
+		return 0;
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+	__asm__("fninit");
+	if ( cpu_has_xmm )
+		load_mxcsr(0x1f80);
+	return 1;
+}
+
+void kfpu_try_end(void)
+{
+	cpu_fpuactive[smp_processor_id()]--;
+	stts();
+}
+
+void kfpu_initialize(void)
+{
+	cpu_fpuactive[smp_processor_id()] = 0;
+	stts();
+}
+
+void kfpu_enter(void)
+{
+	if(cpu_fpuactive[smp_processor_id()] != 0)
+		BUG();
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+}
+
+void kfpu_leave(void)
+{
+	cpu_fpuactive[smp_processor_id()]--;
+	stts();
+	if(cpu_fpuactive[smp_processor_id()] != 0)
+		BUG();
 }
 
 void restore_fpu( struct task_struct *tsk )
--- 2.4/arch/i386/kernel/traps.c	Sun Feb 11 00:37:51 2001
+++ build-2.4/arch/i386/kernel/traps.c	Sun Feb 11 19:18:59 2001
@@ -731,7 +731,7 @@
  */
 asmlinkage void math_state_restore(struct pt_regs regs)
 {
-	__asm__ __volatile__("clts");		/* Allow maths ops (or we recurse) */
+	kfpu_enter();		/* Allow maths ops (or we recurse) */
 
 	if (current->used_math) {
 		restore_fpu(current);
--- 2.4/arch/i386/kernel/setup.c	Sun Feb 11 00:37:51 2001
+++ build-2.4/arch/i386/kernel/setup.c	Sun Feb 11 14:33:58 2001
@@ -2274,7 +2274,8 @@
 	 */
 	current->flags &= ~PF_USEDFPU;
 	current->used_math = 0;
-	stts();
+	kfpu_initialize();
+
 }
 
 /*
--- 2.4/arch/i386/kernel/i386_ksyms.c	Sun Feb 11 00:37:51 2001
+++ build-2.4/arch/i386/kernel/i386_ksyms.c	Sun Feb 11 19:25:05 2001
@@ -117,6 +117,25 @@
 EXPORT_SYMBOL(mmx_copy_page);
 #endif
 
+#ifdef CONFIG_X86_USE_SSE
+EXPORT_SYMBOL(sse_clear_page);
+EXPORT_SYMBOL(sse_copy_page);
+#endif
+
+EXPORT_SYMBOL(kfpu_start);
+EXPORT_SYMBOL(kfpu_acquire);
+EXPORT_SYMBOL(kfpu_release);
+EXPORT_SYMBOL(kfpu_try_acquire);
+EXPORT_SYMBOL(kfpu_try_release);
+EXPORT_SYMBOL(kfpu_full_begin);
+EXPORT_SYMBOL(kfpu_mmx_begin);
+EXPORT_SYMBOL(kfpu_nosave_begin);
+EXPORT_SYMBOL(kfpu_full_end);
+EXPORT_SYMBOL(kfpu_mmx_end);
+EXPORT_SYMBOL(kfpu_nosave_end);
+EXPORT_SYMBOL(kfpu_try_begin);
+EXPORT_SYMBOL(kfpu_try_end);
+
 #ifdef CONFIG_SMP
 EXPORT_SYMBOL(cpu_data);
 EXPORT_SYMBOL(kernel_flag);
--- 2.4/arch/i386/config.in	Sun Feb 11 00:37:50 2001
+++ build-2.4/arch/i386/config.in	Sun Feb 11 19:18:59 2001
@@ -91,6 +91,7 @@
 	define_bool CONFIG_X86_GOOD_APIC y
 	define_bool CONFIG_X86_PGE y
 	define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+	define_bool CONFIG_X86_USE_SSE y
 fi
 if [ "$CONFIG_MPENTIUM4" = "y" ]; then
 	define_int CONFIG_X86_L1_CACHE_SHIFT 7
@@ -98,6 +99,7 @@
 	define_bool CONFIG_X86_GOOD_APIC y
 	define_bool CONFIG_X86_PGE y
 	define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+	define_bool CONFIG_X86_USE_SSE y
 fi
 if [ "$CONFIG_MK6" = "y" ]; then
 	define_int CONFIG_X86_L1_CACHE_SHIFT 5