On 08/18/2016 11:21 AM, Denys Vlasenko wrote:
Of course, if somebody uses native_restore_fl() to actually *disable*
interrupts (when they weren't already disabled), then this untested
patch will just not work. But why would you do something so stupid?
Famous last words...

Looking around, the vsmp code actually uses "native_restore_fl()" to
not just set the interrupt flag, but AC as well.

And the PV spinlock case has that "push;popf" sequence encoded in an alternate.

So that trivial patch may (or may not - still not tested) work for
some quick testing, but needs more effort for any *real* use.
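
For reference, native_restore_fl() as it stands today is the unconditional push/popf (roughly as in arch/x86/include/asm/irqflags.h; shown here only to illustrate why a caller relying on it to clear IF, or to restore AC, would not survive a conditional-STI replacement):

static inline void native_restore_fl(unsigned long flags)
{
	asm volatile("push %0 ; popf"
		     : /* no output */
		     : "g" (flags)
		     : "memory", "cc");
}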

I'm going to test the attached patch.
...

+# CONFIG_UPROBES is not set
+# CONFIG_SCHED_OMIT_FRAME_POINTER is not set
+# CONFIG_HYPERVISOR_GUEST is not set
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_KMEMCHECK is not set
+# CONFIG_DEBUG_LOCK_ALLOC is not set
+# CONFIG_PROVE_LOCKING is not set
+# CONFIG_LOCK_STAT is not set
+# CONFIG_PROVE_RCU is not set
+# CONFIG_LATENCYTOP is not set
+# CONFIG_FTRACE is not set
+# CONFIG_BINARY_PRINTF is not set

!CONFIG_DEBUG_SPINLOCK is also needed; then the unpatched irqrestore is:

ffffffff817115a0 <_raw_spin_unlock_irqrestore>:
ffffffff817115a0:       c6 07 00                movb   $0x0,(%rdi)
ffffffff817115a3:       56                      push   %rsi
ffffffff817115a4:       9d                      popfq
ffffffff817115a5:       65 ff 0d e4 ad 8f 7e    decl   %gs:__preempt_count
ffffffff817115ac:       c3                      retq
ffffffff817115ad:       0f 1f 00                nopl   (%rax)

The patched one is:

ffffffff81711660 <_raw_spin_unlock_irqrestore>:
ffffffff81711660:       f7 c6 00 02 00 00       test   $0x200,%esi
ffffffff81711666:       c6 07 00                movb   $0x0,(%rdi)
ffffffff81711669:       74 01                   je     ffffffff8171166c <_raw_spin_unlock_irqrestore+0xc>
ffffffff8171166b:       fb                      sti
ffffffff8171166c:       65 ff 0d 1d ad 8f 7e    decl   %gs:__preempt_count
ffffffff81711673:       c3                      retq
ffffffff81711674:       66 90                   xchg   %ax,%ax
ffffffff81711676:       66 2e 0f 1f 84 00 00 00 00 00   nopw   %cs:0x0(%rax,%rax,1)

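For context, _raw_spin_unlock_irqrestore() above is the fast path behind every spin_unlock_irqrestore() call; a rough caller-side sketch (illustrative only, the function name is made up and not part of the patch):

#include <linux/spinlock.h>

/* Hypothetical caller, just to show where the patched sequence is hit. */
static void example_unlock_path(spinlock_t *lock)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);		/* save flags, then cli */
	/* ... critical section ... */
	spin_unlock_irqrestore(lock, flags);	/* patched: sti only if IF was set */
}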

Ran the following twice on a quiesced machine:

taskset 1 perf stat -r60 perf bench sched messaging
taskset 1 perf stat -r60 perf bench sched pipe

Out of these four runs, both "perf bench sched pipe" runs show improvements:

-       2648.279829      task-clock (msec)         #    1.000 CPUs utilized            ( +-  0.24% )
+       2483.143469      task-clock (msec)         #    0.998 CPUs utilized            ( +-  0.31% )
-         2,000,002      context-switches          #    0.755 M/sec                    ( +-  0.00% )
+         2,000,013      context-switches          #    0.805 M/sec                    ( +-  0.00% )
-               547      page-faults               #    0.206 K/sec                    ( +-  0.04% )
+               546      page-faults               #    0.220 K/sec                    ( +-  0.04% )
-     8,723,284,926      cycles                    #    3.294 GHz                      ( +-  0.06% )
+     8,157,949,449      cycles                    #    3.285 GHz                      ( +-  0.07% )
-    12,286,937,344      instructions              #    1.41  insn per cycle           ( +-  0.03% )
+    12,255,616,405      instructions              #    1.50  insn per cycle           ( +-  0.05% )
-     2,588,839,023      branches                  #  977.555 M/sec                    ( +-  0.02% )
+     2,599,319,615      branches                  # 1046.786 M/sec                    ( +-  0.04% )
-         3,620,273      branch-misses             #    0.14% of all branches          ( +-  0.67% )
+         3,577,771      branch-misses             #    0.14% of all branches          ( +-  0.69% )
-       2.648799072 seconds time elapsed                                               ( +-  0.24% )
+       2.487452268 seconds time elapsed                                               ( +-  0.31% )

Good: we run more insns/cycle, as expected. However, there are slightly more branches.

But of the two "perf bench sched messaging" runs, one was slower on the patched kernel,
and perf shows why: more branches, and a larger branch-miss percentage:

-        690.008697      task-clock (msec)         #    0.996 CPUs utilized            ( +-  0.45% )
+        699.526509      task-clock (msec)         #    0.994 CPUs utilized            ( +-  0.28% )
-            26,796      context-switches          #    0.039 M/sec                    ( +-  8.31% )
+            29,088      context-switches          #    0.042 M/sec                    ( +-  6.62% )
-            35,477      page-faults               #    0.051 M/sec                    ( +-  0.11% )
+            35,504      page-faults               #    0.051 M/sec                    ( +-  0.14% )
-     2,157,701,609      cycles                    #    3.127 GHz                      ( +-  0.35% )
+     2,143,781,407      cycles                    #    3.065 GHz                      ( +-  0.25% )
-     3,115,212,672      instructions              #    1.44  insn per cycle           ( +-  0.28% )
+     3,253,499,549      instructions              #    1.52  insn per cycle           ( +-  0.19% )
-       661,888,593      branches                  #  959.247 M/sec                    ( +-  0.36% )
+       707,862,655      branches                  # 1011.917 M/sec                    ( +-  0.20% )
-         2,793,203      branch-misses             #    0.42% of all branches          ( +-  1.04% )
+         3,453,397      branch-misses             #    0.49% of all branches          ( +-  0.32% )
-       0.693004918 seconds time elapsed                                               ( +-  0.45% )
+       0.703630988 seconds time elapsed                                               ( +-  0.27% )

This tipped the scales: despite the higher insns/cycle, the run time is worse.

The other "perf bench sched messaging" run was luckier:

-        706.944245      task-clock (msec)         #    0.995 CPUs utilized            ( +-  0.32% )
+        687.038856      task-clock (msec)         #    0.993 CPUs utilized            ( +-  0.31% )
-            23,489      context-switches          #    0.033 M/sec                    ( +-  7.02% )
+            26,644      context-switches          #    0.039 M/sec                    ( +-  7.46% )
-            35,360      page-faults               #    0.050 M/sec                    ( +-  0.12% )
+            35,417      page-faults               #    0.052 M/sec                    ( +-  0.13% )
-     2,183,639,816      cycles                    #    3.089 GHz                      ( +-  0.35% )
+     2,123,086,753      cycles                    #    3.090 GHz                      ( +-  0.27% )
-     3,131,362,238      instructions              #    1.43  insn per cycle           ( +-  0.19% )
+     3,236,613,433      instructions              #    1.52  insn per cycle           ( +-  0.19% )
-       667,874,319      branches                  #  944.734 M/sec                    ( +-  0.21% )
+       703,677,908      branches                  # 1024.219 M/sec                    ( +-  0.20% )
-         2,859,521      branch-misses             #    0.43% of all branches          ( +-  0.56% )
+         3,462,063      branch-misses             #    0.49% of all branches          ( +-  0.33% )
-       0.710738536 seconds time elapsed                                               ( +-  0.32% )
+       0.691908533 seconds time elapsed                                               ( +-  0.31% )

However, it still had more branches (~5% more) and a worse branch-miss percentage.

The patch seems to work. It also does not bloat the kernel:

   text    data     bss     dec     hex filename
8199556 5026784 2924544 16150884 f67164 vmlinux
8199897 5026784 2924544 16151225 f672b9 vmlinux.patched

However, a "conditional CLI/STI from r/m" insn could be better still.

The patch is attached.
diff -urp linux-4.7.1.org/arch/x86/include/asm/irqflags.h linux-4.7.1.obj/arch/x86/include/asm/irqflags.h
--- linux-4.7.1.org/arch/x86/include/asm/irqflags.h	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/include/asm/irqflags.h	2016-08-18 10:01:09.514219644 +0200
@@ -44,6 +44,16 @@ static inline void native_irq_enable(voi
 	asm volatile("sti": : :"memory");
 }
 
+/*
+ * This produces a "test; jz; sti" insn sequence.
+ * It's faster than "push reg; popf". If sti is skipped, it's much faster.
+ */
+static inline void native_cond_irq_enable(unsigned long flags)
+{
+	if (flags & X86_EFLAGS_IF)
+		native_irq_enable();
+}
+
 static inline void native_safe_halt(void)
 {
 	asm volatile("sti; hlt": : :"memory");
@@ -69,7 +79,8 @@ static inline notrace unsigned long arch
 
 static inline notrace void arch_local_irq_restore(unsigned long flags)
 {
-	native_restore_fl(flags);
+	if (flags & X86_EFLAGS_IF)
+		native_irq_enable();
 }
 
 static inline notrace void arch_local_irq_disable(void)
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt.c linux-4.7.1.obj/arch/x86/kernel/paravirt.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt.c	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt.c	2016-08-18 10:01:24.797155109 +0200
@@ -313,7 +313,7 @@ struct pv_time_ops pv_time_ops = {
 
 __visible struct pv_irq_ops pv_irq_ops = {
 	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
-	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+	.restore_fl = __PV_IS_CALLEE_SAVE(native_cond_irq_enable),
 	.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
 	.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
 	.safe_halt = native_safe_halt,
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt_patch_32.c linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_32.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt_patch_32.c	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_32.c	2016-08-18 04:43:19.388102755 +0200
@@ -2,7 +2,7 @@
 
 DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
 DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, restore_fl, "testb $((1<<9)>>8),%ah; jz 1f; sti; 1: ;");
 DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
 DEF_NATIVE(pv_cpu_ops, iret, "iret");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt_patch_64.c linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_64.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt_patch_64.c	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_64.c	2016-08-18 04:42:56.364545509 +0200
@@ -4,7 +4,7 @@
 
 DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
 DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
+DEF_NATIVE(pv_irq_ops, restore_fl, "testw $(1<<9),%di; jz 1f; sti; 1: ;");
 DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
