On Thu, Oct 11, 2018 at 09:31:33AM +0200, Peter Zijlstra wrote:
> On Wed, Oct 10, 2018 at 05:33:36PM -0700, Eric Dumazet wrote:
> > While looking at native_sched_clock() disassembly I had
> > the surprise to see the compiler (gcc 7.3 here) had
> > optimized out the loop, meaning the code is broken.
> > 
> > Using the documented and approved API not only fixes the bug,
> > it also makes the code more readable.
> > 
> > Replacing five this_cpu_read() by one this_cpu_ptr() makes
> > the generated code smaller.
> 
> Does not for me, that is, the resulting asm is actually larger
> 
> You're quite right the loop went missing; no idea wth that compiler is
> smoking (gcc-8.2 for me). In order to eliminate that loop it needs to
> think that two consecutive loads of this_cpu_read(cyc2ns.seq.sequence)
> will return the same value. But this_cpu_read() is an asm() statement,
> it _should_ not assume such.
> 
> We assume that this_cpu_read() implies READ_ONCE() in a number of
> locations, this really should not happen.
> 
> The reason it was written using this_cpu_read() is so that it can use
> %gs: prefixed instructions and avoid ever loading that percpu offset and
> doing manual address computation.
> 
> Let me prod at this with a sharp stick.
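
For reference, the read side being miscompiled, cyc2ns_read_begin() in
arch/x86/kernel/tsc.c, is (give or take) the below; the whole point of the
seqcount loop is that the first and the last this_cpu_read() of
cyc2ns.seq.sequence must both hit memory, otherwise the retry vanishes:

	do {
		seq = this_cpu_read(cyc2ns.seq.sequence);
		idx = seq & 1;

		data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
		data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
		data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);

	} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));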

OK, so apart from the inlining being crap, which is fixed by:

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6490f618e096..638491062fea 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -57,7 +57,7 @@ struct cyc2ns {
 
 static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
 
-void cyc2ns_read_begin(struct cyc2ns_data *data)
+void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
 {
        int seq, idx;
 
@@ -74,7 +74,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data)
        } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
 }
 
-void cyc2ns_read_end(void)
+void __always_inline cyc2ns_read_end(void)
 {
        preempt_enable_notrace();
 }
@@ -103,7 +103,7 @@ void cyc2ns_read_end(void)
  *                      [email protected] "math is hard, lets go shopping!"
  */
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
        struct cyc2ns_data data;
        unsigned long long ns;
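
For context, the tail of cycles_2_ns() is roughly:

	cyc2ns_read_begin(&data);

	ns = data.cyc2ns_offset;
	ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);

	cyc2ns_read_end();

	return ns;

so the asm below is the rdtsc value pushed through exactly that.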


That gets us:

native_sched_clock:
        pushq   %rbp    #
        movq    %rsp, %rbp      #,

 ... jump label ...

        rdtsc
        salq    $32, %rdx       #, tmp110
        orq     %rax, %rdx      # low, tmp110
        movl %gs:cyc2ns+32(%rip),%ecx   # cyc2ns.seq.sequence, pfo_ret__
        andl    $1, %ecx        #, idx
        salq    $4, %rcx        #, tmp116
        movl %gs:cyc2ns(%rcx),%esi      # cyc2ns.data[idx_14].cyc2ns_mul, pfo_ret__
        movl    %esi, %esi      # pfo_ret__, pfo_ret__
        movq    %rsi, %rax      # pfo_ret__, tmp133
        mulq    %rdx    # _23
        movq %gs:cyc2ns+8(%rcx),%rdi    # cyc2ns.data[idx_14].cyc2ns_offset, pfo_ret__
        addq    $cyc2ns, %rcx   #, tmp117
        movl %gs:4(%rcx),%ecx   # cyc2ns.data[idx_14].cyc2ns_shift, pfo_ret__
        shrdq   %rdx, %rax      # pfo_ret__,, tmp134
        shrq    %cl, %rdx       # pfo_ret__,
        testb   $64, %cl        #, pfo_ret__
        cmovne  %rdx, %rax      #,, tmp134
        addq    %rdi, %rax      # pfo_ret__, <retval>
        popq    %rbp    #
        ret


Note how the seqcount is still only read the one time, so the retry loop is
still gone. If we then also fix the percpu mess, with:

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e9202a0de8f0..1a19d11cfbbd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -185,22 +185,22 @@ do {                                                                \
        typeof(var) pfo_ret__;                          \
        switch (sizeof(var)) {                          \
        case 1:                                         \
-               asm(op "b "__percpu_arg(1)",%0"         \
+               asm volatile(op "b "__percpu_arg(1)",%0"\
                    : "=q" (pfo_ret__)                  \
                    : "m" (var));                       \
                break;                                  \
        case 2:                                         \
-               asm(op "w "__percpu_arg(1)",%0"         \
+               asm volatile(op "w "__percpu_arg(1)",%0"\
                    : "=r" (pfo_ret__)                  \
                    : "m" (var));                       \
                break;                                  \
        case 4:                                         \
-               asm(op "l "__percpu_arg(1)",%0"         \
+               asm volatile(op "l "__percpu_arg(1)",%0"\
                    : "=r" (pfo_ret__)                  \
                    : "m" (var));                       \
                break;                                  \
        case 8:                                         \
-               asm(op "q "__percpu_arg(1)",%0"         \
+               asm volatile(op "q "__percpu_arg(1)",%0"\
                    : "=r" (pfo_ret__)                  \
                    : "m" (var));                       \
                break;                                  \
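
The volatile is what matters here: a plain asm() with outputs gets treated as a
pure function of its operands, so two back-to-back this_cpu_read()s of the same
variable can be folded into one, which is exactly what ate the seqcount loop.
A standalone sketch of the effect (made-up names, nothing to do with the kernel
proper):

	static unsigned int seq;

	static unsigned int read_seq(void)
	{
		unsigned int ret;

		/* without "volatile", gcc may CSE two of these into one load */
		asm volatile("movl %1, %0" : "=r" (ret) : "m" (seq));
		return ret;
	}

	int seq_changed(void)
	{
		/* with a plain asm() above, this may legitimately become "return 0" */
		return read_seq() != read_seq();
	}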


With the asm volatile change, native_sched_clock() turns into:

native_sched_clock:
        pushq   %rbp    #
        movq    %rsp, %rbp      #,

 ... jump label ...

        rdtsc
        salq    $32, %rdx       #, tmp110
        orq     %rax, %rdx      # low, tmp110
        movq    %rdx, %r10      # tmp110, _23
        movq    $cyc2ns, %r9    #, tmp136
.L235:
        movl %gs:cyc2ns+32(%rip),%eax   # cyc2ns.seq.sequence, pfo_ret__
        movl    %eax, %ecx      # pfo_ret__, idx
        andl    $1, %ecx        #, idx
        salq    $4, %rcx        #, tmp116
        addq    %r9, %rcx       # tmp136, tmp117
        movq %gs:8(%rcx),%rdi   # cyc2ns.data[idx_14].cyc2ns_offset, pfo_ret__
        movl %gs:(%rcx),%esi    # cyc2ns.data[idx_14].cyc2ns_mul, pfo_ret__
        movl %gs:4(%rcx),%ecx   # cyc2ns.data[idx_14].cyc2ns_shift, pfo_ret__
        movl %gs:cyc2ns+32(%rip),%r8d   # cyc2ns.seq.sequence, pfo_ret__
        cmpl    %r8d, %eax      # pfo_ret__, pfo_ret__
        jne     .L235   #,
        movl    %esi, %esi      # pfo_ret__, pfo_ret__
        movq    %rsi, %rax      # pfo_ret__, tmp133
        mulq    %r10    # _23
        shrdq   %rdx, %rax      # pfo_ret__,, tmp134
        shrq    %cl, %rdx       # pfo_ret__,
        testb   $64, %cl        #, pfo_ret__
        cmovne  %rdx, %rax      #,, tmp134
        addq    %rdi, %rax      # pfo_ret__, <retval>
        popq    %rbp    #
        ret

which is exactly right. Except perhaps for the mess that
mul_u64_u32_shr() turns into.
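
For the record, mul_u64_u32_shr() on x86_64 should be the
CONFIG_ARCH_SUPPORTS_INT128 variant from include/linux/math64.h, roughly:

	static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
	{
		return (u64)(((unsigned __int128)a * mul) >> shift);
	}

and the shrd/shr/test/cmov dance above is gcc open-coding the 128-bit right
shift by a run-time (cyc2ns_shift) count.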
