On Thu, Oct 11, 2018 at 12:31 AM Peter Zijlstra <pet...@infradead.org> wrote: > > On Wed, Oct 10, 2018 at 05:33:36PM -0700, Eric Dumazet wrote: > > While looking at native_sched_clock() disassembly I had > > the surprise to see the compiler (gcc 7.3 here) had > > optimized out the loop, meaning the code is broken. > > > > Using the documented and approved API not only fixes the bug, > > it also makes the code more readable. > > > > Replacing five this_cpu_read() by one this_cpu_ptr() makes > > the generated code smaller. > > Does not for me, that is, the resulting asm is actually larger
[resent in non html] I counted the number of bytes of text, and really the after my patch code size is smaller. %gs prefixes are one-byte, but the 32bit offsets are adding up. Prior version (with resinstaed loop, thanks to one smp_rmb() diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b52bd2b6cdb443ba0c89d78aaa52b02b82a10b6e..d4ad30bbaa0cc8bf81da0272829da26f229734bf 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -71,7 +71,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data) data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - + smp_rmb(); } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } Total length : 0xA7 bytes 00000000000002a0 <native_sched_clock>: 2a0: 4c 8d 54 24 08 lea 0x8(%rsp),%r10 2a5: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp 2a9: 41 ff 72 f8 pushq -0x8(%r10) 2ad: 55 push %rbp 2ae: 48 89 e5 mov %rsp,%rbp 2b1: 41 52 push %r10 2b3: e9 60 00 00 00 jmpq 318 <native_sched_clock+0x78> 2b8: 0f 31 rdtsc 2ba: 48 c1 e2 20 shl $0x20,%rdx 2be: 48 09 c2 or %rax,%rdx 2c1: 49 89 d2 mov %rdx,%r10 2c4: 49 c7 c1 00 00 00 00 mov $0x0,%r9 2c7: R_X86_64_32S .data..percpu..shared_aligned 2cb: 65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax # 2d2 <native_sched_clock+0x32> 2ce: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c 2d2: 89 c1 mov %eax,%ecx 2d4: 83 e1 01 and $0x1,%ecx 2d7: 48 c1 e1 04 shl $0x4,%rcx 2db: 4c 01 c9 add %r9,%rcx 2de: 65 48 8b 79 08 mov %gs:0x8(%rcx),%rdi 2e3: 65 8b 31 mov %gs:(%rcx),%esi 2e6: 65 8b 49 04 mov %gs:0x4(%rcx),%ecx 2ea: 65 44 8b 05 00 00 00 mov %gs:0x0(%rip),%r8d # 2f2 <native_sched_clock+0x52> 2f1: 00 2ee: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c 2f2: 44 39 c0 cmp %r8d,%eax 2f5: 75 d4 jne 2cb <native_sched_clock+0x2b> 2f7: 89 f6 mov %esi,%esi 2f9: 48 89 f0 mov %rsi,%rax 2fc: 49 f7 e2 mul %r10 2ff: 48 0f ad d0 shrd %cl,%rdx,%rax 303: 48 d3 ea shr %cl,%rdx 306: f6 c1 40 test $0x40,%cl 309: 48 0f 45 c2 cmovne %rdx,%rax 30d: 48 01 f8 add %rdi,%rax 310: 41 5a pop %r10 312: 5d pop %rbp 313: 49 8d 62 f8 lea -0x8(%r10),%rsp 317: c3 retq 318: 48 69 05 00 00 00 00 imul $0x3d0900,0x0(%rip),%rax # 323 <native_sched_clock+0x83> 31f: 00 09 3d 00 31b: R_X86_64_PC32 jiffies_64-0x8 323: 41 5a pop %r10 325: 48 ba 00 b8 64 d9 45 movabs $0xffc2f745d964b800,%rdx 32c: f7 c2 ff 32f: 5d pop %rbp 330: 49 8d 62 f8 lea -0x8(%r10),%rsp 334: 48 01 d0 add %rdx,%rax 337: c3 retq New version (my cleanup patch) Total length = 0x91 bytes 00000000000002a0 <native_sched_clock>: 2a0: 4c 8d 54 24 08 lea 0x8(%rsp),%r10 2a5: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp 2a9: 41 ff 72 f8 pushq -0x8(%r10) 2ad: 55 push %rbp 2ae: 48 89 e5 mov %rsp,%rbp 2b1: 41 52 push %r10 2b3: e9 59 00 00 00 jmpq 311 <native_sched_clock+0x71> 2b8: 0f 31 rdtsc 2ba: 48 c1 e2 20 shl $0x20,%rdx 2be: 48 09 c2 or %rax,%rdx 2c1: 49 89 d1 mov %rdx,%r9 2c4: 49 c7 c0 00 00 00 00 mov $0x0,%r8 2c7: R_X86_64_32S .data..percpu..shared_aligned 2cb: 65 4c 03 05 00 00 00 add %gs:0x0(%rip),%r8 # 2d3 <native_sched_clock+0x33> 2d2: 00 2cf: R_X86_64_PC32 this_cpu_off-0x4 2d3: 41 8b 40 20 mov 0x20(%r8),%eax 2d7: 89 c6 mov %eax,%esi 2d9: 83 e6 01 and $0x1,%esi 2dc: 48 c1 e6 04 shl $0x4,%rsi 2e0: 4c 01 c6 add %r8,%rsi 2e3: 8b 3e mov (%rsi),%edi 2e5: 8b 4e 04 mov 0x4(%rsi),%ecx 2e8: 48 8b 76 08 mov 0x8(%rsi),%rsi 2ec: 41 3b 40 20 cmp 0x20(%r8),%eax 2f0: 75 e1 jne 2d3 <native_sched_clock+0x33> 2f2: 48 89 f8 mov %rdi,%rax 2f5: 49 f7 e1 mul %r9 2f8: 48 0f ad d0 shrd %cl,%rdx,%rax 2fc: 48 d3 ea shr %cl,%rdx 2ff: f6 c1 40 test $0x40,%cl 302: 48 0f 45 c2 cmovne %rdx,%rax 306: 48 01 f0 add %rsi,%rax 309: 41 5a pop %r10 30b: 5d pop %rbp 30c: 49 8d 62 f8 lea -0x8(%r10),%rsp 310: c3 retq 311: 48 69 05 00 00 00 00 imul $0x3d0900,0x0(%rip),%rax # 31c <native_sched_clock+0x7c> 318: 00 09 3d 00 314: R_X86_64_PC32 jiffies_64-0x8 31c: 41 5a pop %r10 31e: 48 ba 00 b8 64 d9 45 movabs $0xffc2f745d964b800,%rdx 325: f7 c2 ff 328: 5d pop %rbp 329: 49 8d 62 f8 lea -0x8(%r10),%rsp 32d: 48 01 d0 add %rdx,%rax 330: c3 retq 331: