Here is a new trick. Let us label the previous implementation (a system with only [1/2] applied) as V.1. It does the following:
 - At kernel exit (ia64_leave_kernel), calculate the cycles elapsed since the
   last checkpoint using the last stamp (ac_stamp), accumulate them as
   "system cycles" (ac_stime), and update the stamp.
 - At kernel entry (break_fault etc.), calculate the cycles elapsed since the
   last checkpoint using the last stamp, accumulate them as "user cycles"
   (ac_utime), and update the stamp.

This costs time on both the kernel entry path and the kernel exit path, so unfortunately it results in a considerable increase in system call overhead.

This patch, 2 of 2, is the magical patch that reduces the overhead. Let us label the new implementation (a system with both [1/2] and [2/2] applied) as V.2. It does the following:

 - At kernel exit (ia64_leave_kernel), do nothing except save the "leave
   time" as ac_leave, kept separate from the usual time stamp (ac_stamp).
 - At kernel entry (break_fault etc.):
   1. calculate the cycles from the last checkpoint (in the kernel) to the
      "last leave" using ac_stamp and ac_leave, then accumulate them into
      ac_stime.
   2. calculate the cycles from the "last leave" to now, then accumulate
      them into ac_utime.
   3. update the stamp (ac_stamp).

In other words, this patch combines most of the previously separated work and moves it to the entry side.

The change is simple, however:

 - The exit path becomes quite simple. All that is needed is to store the
   value of the ITC to memory. There were only a few registers and bundle
   slots available for extra work, but fortunately I managed it without
   increasing the number of bundles ;-)
 - The entry path becomes slightly more complicated. But we can load/store
   the data all at once, and no longer need to do it at both exit and entry.

The following benchmark results show the performance impact of my patches.
(V.1 = 2.6.24-rc5 + [1/2], V.2 = rc5 + [1/2] + [2/2], orig. = rc5)

===========================================================================
                                    INDEX VALUES          RATIO(%)
TEST (Unixbench-v4.1.0)            V.1    V.2  orig.   V.1   V.2 orig.
=============================== ====== ====== ====== ===== ===== =====
Dhrystone 2 using register var.  304.3  304.3  304.4 100.0 100.0 100.0
Double-Precision Whetstone       171.3  171.3  171.1 100.1 100.1 100.0
Execl Throughput                 471.3  466.2  467.3 100.9  99.8 100.0
File Copy 1024 buf 2000 maxblks  496.6  511.1  507.1  97.9 100.8 100.0
File Copy 256 buf 500 maxblks    352.4  355.3  366.2  96.2  97.0 100.0
File Copy 4096 buf 8000 maxblks  765.8  768.6  778.1  98.4  98.8 100.0
Pipe Throughput                  422.0  427.0  416.1 101.4 102.6 100.0
Process Creation                 945.1  949.3  948.0  99.7 100.1 100.0
Shell Scripts (8 concurrent)    1646.7 1646.2 1654.5  99.5  99.5 100.0
System Call Overhead             695.4  732.1  820.0  84.8  89.3 100.0
=============================== ====== ====== ====== ===== ===== =====
FINAL SCORE                      522.0  527.1  533.9  97.8  98.7 100.0
===========================================================================
(@ Madison 1.5GHz x 4)

The most affected test is undoubtedly the system call path, which was originally well optimized. But from a macro viewpoint, unless you are a full-time system-call-aholic, I believe the concession is worthwhile.

The faster the hardware gets (or the more active the software becomes), the more the accuracy of traditional tick-sampling-based CPU time accounting drops. When will the decision point be?
Signed-off-by: Hidetoshi Seto <[EMAIL PROTECTED]> --- arch/ia64/kernel/asm-offsets.c | 1 arch/ia64/kernel/entry.S | 87 ++++++++++++++++++++++++++++++----------- arch/ia64/kernel/fsys.S | 20 ++++++--- arch/ia64/kernel/ivt.S | 42 ++++++++++++------- include/asm-ia64/thread_info.h | 1 5 files changed, 107 insertions(+), 44 deletions(-) Index: linux-2.6.24-rc5/arch/ia64/kernel/asm-offsets.c =================================================================== --- linux-2.6.24-rc5.orig/arch/ia64/kernel/asm-offsets.c +++ linux-2.6.24-rc5/arch/ia64/kernel/asm-offsets.c @@ -41,6 +41,7 @@ DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); #ifdef CONFIG_VIRT_CPU_ACCOUNTING DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp)); + DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave)); DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime)); DEFINE(TI_AC_UTIME, offsetof(struct thread_info, ac_utime)); #endif Index: linux-2.6.24-rc5/arch/ia64/kernel/entry.S =================================================================== --- linux-2.6.24-rc5.orig/arch/ia64/kernel/entry.S +++ linux-2.6.24-rc5/arch/ia64/kernel/entry.S @@ -710,6 +710,16 @@ (pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk #endif .work_processed_syscall: +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + adds r2=PT(LOADRS)+16,r12 +(pUStk) mov.m r22=ar.itc // fetch time at leave + adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; +(p6) ld4 r31=[r18] // load current_thread_info()->flags + ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" + adds r3=PT(AR_BSPSTORE)+16,r12 // deferred + ;; +#else adds r2=PT(LOADRS)+16,r12 adds r3=PT(AR_BSPSTORE)+16,r12 adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 @@ -718,6 +728,7 @@ ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" nop.i 0 ;; +#endif mov r16=ar.bsp // M2 get existing backing store pointer ld8 r18=[r2],PT(R9)-PT(B6) // load b6 (p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? 
@@ -737,12 +748,21 @@ ld8 r29=[r2],16 // M0|1 load cr.ipsr ld8 r28=[r3],16 // M0|1 load cr.iip +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) add r14=TI_AC_LEAVE+IA64_TASK_SIZE,r13 + ;; + ld8 r30=[r2],16 // M0|1 load cr.ifs + ld8 r25=[r3],16 // M0|1 load ar.unat +(pUStk) add r15=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 + ;; +#else mov r22=r0 // A clear r22 ;; ld8 r30=[r2],16 // M0|1 load cr.ifs ld8 r25=[r3],16 // M0|1 load ar.unat (pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 ;; +#endif ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs (pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled nop 0 @@ -759,7 +779,11 @@ ld8.fill r1=[r3],16 // M0|1 load r1 (pUStk) mov r17=1 // A ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) st1 [r15]=r17 // M2|3 +#else (pUStk) st1 [r14]=r17 // M2|3 +#endif ld8.fill r13=[r3],16 // M0|1 mov f8=f0 // F clear f8 ;; @@ -775,12 +799,22 @@ shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition cover // B add current frame into dirty partition & set cr.ifs ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + mov r19=ar.bsp // M2 get new backing store pointer + st8 [r14]=r22 // M save time at leave + mov f10=f0 // F clear f10 + + mov r22=r0 // A clear r22 + movl r14=__kernel_syscall_via_epc // X + ;; +#else mov r19=ar.bsp // M2 get new backing store pointer mov f10=f0 // F clear f10 nop.m 0 movl r14=__kernel_syscall_via_epc // X ;; +#endif mov.m ar.csd=r0 // M2 clear ar.csd mov.m ar.ccv=r0 // M2 clear ar.ccv mov b7=r14 // I0 clear b7 (hint with __kernel_syscall_via_epc) @@ -913,10 +947,18 @@ adds r16=PT(CR_IPSR)+16,r12 adds r17=PT(CR_IIP)+16,r12 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + .pred.rel.mutex pUStk,pKStk +(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled +(pUStk) mov.m r22=ar.itc // M fetch time at leave + nop.i 0 + ;; +#else (pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled nop.i 0 nop.i 0 ;; +#endif ld8 r29=[r16],16 // load cr.ipsr ld8 r28=[r17],16 // load cr.iip ;; @@ -938,15 +980,37 
@@ ;; ld8.fill r12=[r16],16 ld8.fill r13=[r17],16 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) adds r3=TI_AC_LEAVE+IA64_TASK_SIZE,r18 +#else (pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 +#endif ;; ld8 r20=[r16],16 // ar.fpsr ld8.fill r15=[r17],16 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 // deferred +#endif ;; ld8.fill r14=[r16],16 ld8.fill r2=[r17] (pUStk) mov r17=1 ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + // mmi_ : ld8 st1 shr;; mmi_ : st8 st1 shr;; + // mib : mov add br -> mib : ld8 add br + // bbb_ : br nop cover;; mbb_ : mov br cover;; + // + // no one require bsp in r16 if (pKStk) branch is selected. +(pUStk) st8 [r3]=r22 // save time at leave +(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack + shr.u r18=r19,16 // get byte size of existing "dirty" partition + ;; + ld8.fill r3=[r16] // deferred + LOAD_PHYS_STACK_REG_SIZE(r17) +(pKStk) br.cond.dpnt skip_rbs_switch + mov r16=ar.bsp // get existing backing store pointer +#else ld8.fill r3=[r16] (pUStk) st1 [r18]=r17 // restore current->thread.on_ustack shr.u r18=r19,16 // get byte size of existing "dirty" partition @@ -954,6 +1018,7 @@ mov r16=ar.bsp // get existing backing store pointer LOAD_PHYS_STACK_REG_SIZE(r17) (pKStk) br.cond.dpnt skip_rbs_switch +#endif /* * Restore user backing store. 
@@ -995,28 +1060,6 @@ shladd in0=loc1,3,r17 mov in1=0 ;; -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -account_sys_leave: - // The size of current frame - // (in * 2 + loc * N (N = 8 or 12) + out * 2) - // is enough to work, so just take care to keep in0,in1 - mov loc0=ar.itc - mov loc1=IA64_KR(CURRENT) // M2 (12 cycle read latency) - ;; - add loc2=TI_AC_STAMP+IA64_TASK_SIZE,loc1 - add loc3=TI_AC_STIME+IA64_TASK_SIZE,loc1 - ;; - ld8 loc4=[loc2] // get last stamp - ld8 loc5=[loc3] // cumulated stime - ;; - sub loc4=loc0,loc4 // elapsed time - ;; - add loc5=loc5,loc4 // sum - ;; - st8 [loc2]=loc0 // update stamp - st8 [loc3]=loc5 // update stime - ;; -#endif TEXT_ALIGN(32) rse_clear_invalid: #ifdef CONFIG_ITANIUM Index: linux-2.6.24-rc5/arch/ia64/kernel/fsys.S =================================================================== --- linux-2.6.24-rc5.orig/arch/ia64/kernel/fsys.S +++ linux-2.6.24-rc5/arch/ia64/kernel/fsys.S @@ -689,17 +689,23 @@ #ifdef CONFIG_VIRT_CPU_ACCOUNTING // mov.m r30=ar.itc is called in advance add r16=TI_AC_STAMP+IA64_TASK_SIZE,r2 - add r17=TI_AC_UTIME+IA64_TASK_SIZE,r2 + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r2 ;; - ld8 r18=[r16] // get last stamp - ld8 r19=[r17] // cumulated utime + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // time at last check in kernel + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // time at leave kernel ;; - sub r18=r30,r18 // elapsed time + ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME // cumulated stime + ld8 r21=[r17] // cumulated utime + sub r22=r19,r18 // stime before leave kernel ;; - add r19=r19,r18 // sum + st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP // update stamp + sub r18=r30,r19 // elapsed time in user mode ;; - st8 [r16]=r30 // update stamp - st8 [r17]=r19 // update utime + add r20=r20,r22 // sum stime + add r21=r21,r18 // sum utime + ;; + st8 [r16]=r20 // update stime + st8 [r17]=r21 // update utime ;; #endif mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 Index: linux-2.6.24-rc5/arch/ia64/kernel/ivt.S 
=================================================================== --- linux-2.6.24-rc5.orig/arch/ia64/kernel/ivt.S +++ linux-2.6.24-rc5/arch/ia64/kernel/ivt.S @@ -841,18 +841,24 @@ #ifdef CONFIG_VIRT_CPU_ACCOUNTING // mov.m r30=ar.itc is called in advance, and r13 is current add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13 // A - add r17=TI_AC_UTIME+IA64_TASK_SIZE,r13 // A + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13 // A (pKStk) br.cond.spnt .skip_accounting // B unlikely skip ;; - ld8 r18=[r16] // M get last stamp - ld8 r19=[r17] // M cumulated utime + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // M get last stamp + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // M time at leave ;; - sub r18=r30,r18 // A elapsed time + ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME // M cumulated stime + ld8 r21=[r17] // M cumulated utime + sub r22=r19,r18 // A stime before leave ;; - add r19=r19,r18 // A sum + st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP // M update stamp + sub r18=r30,r19 // A elapsed time in user ;; - st8 [r16]=r30 // M update stamp - st8 [r17]=r19 // M update utime + add r20=r20,r22 // A sum stime + add r21=r21,r18 // A sum utime + ;; + st8 [r16]=r20 // M update stime + st8 [r17]=r21 // M update utime ;; .skip_accounting: #endif @@ -1131,18 +1137,24 @@ ENTRY(account_sys_enter) // mov.m r20=ar.itc is called in advance, and r13 is current add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13 - add r17=TI_AC_UTIME+IA64_TASK_SIZE,r13 + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13 + ;; + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // time at last check in kernel + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // time at left from kernel + ;; + ld8 r23=[r16],TI_AC_STAMP-TI_AC_STIME // cumulated stime + ld8 r21=[r17] // cumulated utime + sub r22=r19,r18 // stime before leave kernel ;; - ld8 r18=[r16] // get last stamp - ld8 r19=[r17] // cumulated utime + st8 [r16]=r20,TI_AC_STIME-TI_AC_STAMP // update stamp + sub r18=r20,r19 // elapsed time in user mode ;; - sub r18=r20,r18 // elapsed time + add r23=r23,r22 // sum stime + add r21=r21,r18 
// sum utime ;; - add r19=r19,r18 // sum + st8 [r16]=r23 // update stime + st8 [r17]=r21 // update utime ;; - st8 [r16]=r20 // update stamp - st8 [r17]=r19 // update utime - ;; br.ret.sptk.many rp END(account_sys_enter) #endif Index: linux-2.6.24-rc5/include/asm-ia64/thread_info.h =================================================================== --- linux-2.6.24-rc5.orig/include/asm-ia64/thread_info.h +++ linux-2.6.24-rc5/include/asm-ia64/thread_info.h @@ -33,6 +33,7 @@ struct restart_block restart_block; #ifdef CONFIG_VIRT_CPU_ACCOUNTING __u64 ac_stamp; + __u64 ac_leave; __u64 ac_stime; __u64 ac_utime; #endif - To unsubscribe from this list: send the line "unsubscribe linux-ia64" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html