This patch adds an optimization: we clear ipsr.mfh and the FPH_VALID
bit in the thread structure at system call time.  This allows
applications to optimistically use the high floating-point partition
without incurring heavy context-switch overhead for saving/restoring
the high partition (FPH), thus significantly boosting application
performance.

This optimization does not cost any cycles in the syscall entry path.

Diff'ed relative to linux-ia64-release-2.6.11.  Please apply.



Signed-off-by: Ken Chen <[EMAIL PROTECTED]>


--- linux-ia64-release-2.6.11/arch/ia64/kernel/asm-offsets.c.orig       
2005-02-08 13:53:27.000000000 -0800
+++ linux-ia64-release-2.6.11/arch/ia64/kernel/asm-offsets.c    2005-02-08 
14:01:44.000000000 -0800
@@ -50,6 +50,7 @@ void foo(void)
        DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid));
        DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, 
thread.ksp));
        DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, 
thread.on_ustack));
+       DEFINE(IA64_TASK_THREAD_FLAGS_OFFSET, offsetof (struct task_struct, 
thread.flags));

        BLANK();

--- linux-ia64-release-2.6.11/arch/ia64/kernel/ivt.S.orig       2005-02-08 
13:53:27.000000000 -0800
+++ linux-ia64-release-2.6.11/arch/ia64/kernel/ivt.S    2005-02-08 
14:09:26.000000000 -0800
@@ -856,6 +856,8 @@ GLOBAL_ENTRY(ia64_syscall_setup)
        st8 [r1]=r19                            // save b6
        add r16=PT(CR_IPSR),r1                  // initialize first base pointer
        add r17=PT(R11),r1                      // initialize second base 
pointer
+       dep r29=0,r29,IA64_PSR_MFH_BIT,1
+       add r30=IA64_TASK_THREAD_FLAGS_OFFSET,r2
        ;;
        alloc r19=ar.pfs,8,0,0,0                // ensure in0-in7 are writable
        st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR)    // save cr.ipsr
@@ -885,6 +887,7 @@ GLOBAL_ENTRY(ia64_syscall_setup)
 (p9)   mov in1=-1
        ;;

+       ld4 r9=[r30]
 (pUStk) sub r18=r18,r22                                // r18=RSE.ndirty*8
        tnat.nz p10,p0=in2
        add r11=8,r11
@@ -924,14 +927,14 @@ GLOBAL_ENTRY(ia64_syscall_setup)

        st8.spill [r17]=r15                     // save r15
        tnat.nz p8,p0=in7
-       nop.i 0
+       dep r9=0,r9,IA64_THREAD_FPH_BIT,1

        mov r13=r2                              // establish `current'
        movl r1=__gp                            // establish kernel global 
pointer
        ;;
+       st4 [r30]=r9
 (p14)  mov in6=-1
 (p8)   mov in7=-1
-       nop.i 0

        cmp.eq pSys,pNonSys=r0,r0               // set pSys=1, pNonSys=0
        movl r17=FPSR_DEFAULT
--- linux-ia64-release-2.6.11/arch/ia64/kernel/process.c.orig   2005-02-08 
14:14:23.000000000 -0800
+++ linux-ia64-release-2.6.11/arch/ia64/kernel/process.c        2005-02-08 
14:17:09.000000000 -0800
@@ -448,12 +448,8 @@ copy_thread (int nr, unsigned long clone
         * child and all we have to do is to make sure that
         * IA64_THREAD_FPH_VALID is cleared in the child.
         *
-        * XXX We could push this optimization a bit further by
-        * clearing IA64_THREAD_FPH_VALID on ANY system call.
-        * However, it's not clear this is worth doing.  Also, it
-        * would be a slight deviation from the normal Linux system
-        * call behavior where scratch registers are preserved across
-        * system calls (unless used by the system call itself).
+        * We push this optimization a bit further by clearing
+        * IA64_THREAD_FPH_VALID and ipsr.mfh on ANY system call.
         */
 #      define THREAD_FLAGS_TO_CLEAR    (IA64_THREAD_FPH_VALID | 
IA64_THREAD_DBG_VALID \
                                         | IA64_THREAD_PM_VALID)
--- linux-ia64-release-2.6.11/include/asm-ia64/processor.h.orig 2005-02-08 
14:07:39.000000000 -0800
+++ linux-ia64-release-2.6.11/include/asm-ia64/processor.h      2005-02-08 
14:07:31.000000000 -0800
@@ -56,7 +56,8 @@
  */
 #define TASK_UNMAPPED_BASE     (current->thread.map_base)

-#define IA64_THREAD_FPH_VALID  (__IA64_UL(1) << 0)     /* floating-point high 
state valid? */
+#define IA64_THREAD_FPH_BIT    0                       /* floating-point high 
state valid? */
+#define IA64_THREAD_FPH_VALID  (__IA64_UL(1) << IA64_THREAD_FPH_BIT)
 #define IA64_THREAD_DBG_VALID  (__IA64_UL(1) << 1)     /* debug registers 
valid? */
 #define IA64_THREAD_PM_VALID   (__IA64_UL(1) << 2)     /* performance 
registers valid? */
 #define IA64_THREAD_UAC_NOPRINT        (__IA64_UL(1) << 3)     /* don't log 
unaligned accesses */


-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to