This patch adds an optimization: we clear ipsr.mfh and the FPH_VALID
bit in the thread structure at system call time. This allows
applications to optimistically use the high floating-point partition
without incurring heavy context-switch overhead for saving/restoring
the FPH state, thus significantly boosting application performance.
This optimization does not cost any cycles in the syscall entry path.
Diff'ed relative to linux-ia64-release-2.6.11. Please apply.
Signed-off-by: Ken Chen <[EMAIL PROTECTED]>
--- linux-ia64-release-2.6.11/arch/ia64/kernel/asm-offsets.c.orig
2005-02-08 13:53:27.000000000 -0800
+++ linux-ia64-release-2.6.11/arch/ia64/kernel/asm-offsets.c 2005-02-08
14:01:44.000000000 -0800
@@ -50,6 +50,7 @@ void foo(void)
DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid));
DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct,
thread.ksp));
DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct,
thread.on_ustack));
+ DEFINE(IA64_TASK_THREAD_FLAGS_OFFSET, offsetof (struct task_struct,
thread.flags));
BLANK();
--- linux-ia64-release-2.6.11/arch/ia64/kernel/ivt.S.orig 2005-02-08
13:53:27.000000000 -0800
+++ linux-ia64-release-2.6.11/arch/ia64/kernel/ivt.S 2005-02-08
14:09:26.000000000 -0800
@@ -856,6 +856,8 @@ GLOBAL_ENTRY(ia64_syscall_setup)
st8 [r1]=r19 // save b6
add r16=PT(CR_IPSR),r1 // initialize first base pointer
add r17=PT(R11),r1 // initialize second base
pointer
+ dep r29=0,r29,IA64_PSR_MFH_BIT,1
+ add r30=IA64_TASK_THREAD_FLAGS_OFFSET,r2
;;
alloc r19=ar.pfs,8,0,0,0 // ensure in0-in7 are writable
st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR) // save cr.ipsr
@@ -885,6 +887,7 @@ GLOBAL_ENTRY(ia64_syscall_setup)
(p9) mov in1=-1
;;
+ ld4 r9=[r30]
(pUStk) sub r18=r18,r22 // r18=RSE.ndirty*8
tnat.nz p10,p0=in2
add r11=8,r11
@@ -924,14 +927,14 @@ GLOBAL_ENTRY(ia64_syscall_setup)
st8.spill [r17]=r15 // save r15
tnat.nz p8,p0=in7
- nop.i 0
+ dep r9=0,r9,IA64_THREAD_FPH_BIT,1
mov r13=r2 // establish `current'
movl r1=__gp // establish kernel global
pointer
;;
+ st4 [r30]=r9
(p14) mov in6=-1
(p8) mov in7=-1
- nop.i 0
cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
movl r17=FPSR_DEFAULT
--- linux-ia64-release-2.6.11/arch/ia64/kernel/process.c.orig 2005-02-08
14:14:23.000000000 -0800
+++ linux-ia64-release-2.6.11/arch/ia64/kernel/process.c 2005-02-08
14:17:09.000000000 -0800
@@ -448,12 +448,8 @@ copy_thread (int nr, unsigned long clone
* child and all we have to do is to make sure that
* IA64_THREAD_FPH_VALID is cleared in the child.
*
- * XXX We could push this optimization a bit further by
- * clearing IA64_THREAD_FPH_VALID on ANY system call.
- * However, it's not clear this is worth doing. Also, it
- * would be a slight deviation from the normal Linux system
- * call behavior where scratch registers are preserved across
- * system calls (unless used by the system call itself).
+ * We push this optimization a bit further by clearing
+ * IA64_THREAD_FPH_VALID and ipsr.mfh on ANY system call.
*/
# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID |
IA64_THREAD_DBG_VALID \
| IA64_THREAD_PM_VALID)
--- linux-ia64-release-2.6.11/include/asm-ia64/processor.h.orig 2005-02-08
14:07:39.000000000 -0800
+++ linux-ia64-release-2.6.11/include/asm-ia64/processor.h 2005-02-08
14:07:31.000000000 -0800
@@ -56,7 +56,8 @@
*/
#define TASK_UNMAPPED_BASE (current->thread.map_base)
-#define IA64_THREAD_FPH_VALID (__IA64_UL(1) << 0) /* floating-point high
state valid? */
+#define IA64_THREAD_FPH_BIT 0 /* floating-point high
state valid? */
+#define IA64_THREAD_FPH_VALID (__IA64_UL(1) << IA64_THREAD_FPH_BIT)
#define IA64_THREAD_DBG_VALID (__IA64_UL(1) << 1) /* debug registers
valid? */
#define IA64_THREAD_PM_VALID (__IA64_UL(1) << 2) /* performance
registers valid? */
#define IA64_THREAD_UAC_NOPRINT (__IA64_UL(1) << 3) /* don't log
unaligned accesses */
-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html