[Bug tree-optimization/88713] Vectorized code slow vs. flang

hjl.tools at gmail dot com Wed, 23 Jan 2019 08:13:31 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88713


--- Comment #46 from H.J. Lu <hjl.tools at gmail dot com> ---
We generate sqrtss for scalar sqrtf:

[hjl@gnu-skx-1 pr88713]$ cat s.i
extern float sqrtf(float x);

float
rsqrt(float r)
{
  return sqrtf (r);
}
[hjl@gnu-skx-1 pr88713]$ gcc -Ofast -S s.i
[hjl@gnu-skx-1 pr88713]$ cat s.s
        .file   "s.i"
        .text
        .p2align 4,,15
        .globl  rsqrt
        .type   rsqrt, @function
rsqrt:
.LFB0:
        .cfi_startproc
        sqrtss  %xmm0, %xmm0
        ret
        .cfi_endproc
.LFE0:
        .size   rsqrt, .-rsqrt
        .ident  "GCC: (GNU) 8.2.1 20190109 (Red Hat 8.2.1-7)"
        .section        .note.GNU-stack,"",@progbits
[hjl@gnu-skx-1 pr88713]$ 

But why don't we generate sqrtps for vector sqrtf?  The vectorized loop
below uses rsqrtps plus a Newton-Raphson refinement step instead:


[hjl@gnu-skx-1 pr88713]$ cat y.i
extern float sqrtf(float x);

void
rsqrt(float* restrict r, float* restrict a){
    for (int i = 0; i < 16; i++){
        r[i] = sqrtf(a[i]);
    }
}
[hjl@gnu-skx-1 pr88713]$ gcc -S -Ofast y.i 
[hjl@gnu-skx-1 pr88713]$ cat y.s
        .file   "y.i"
        .text
        .p2align 4,,15
        .globl  rsqrt
        .type   rsqrt, @function
rsqrt:
.LFB0:
        .cfi_startproc
        movups  (%rsi), %xmm1
        pxor    %xmm2, %xmm2
        movaps  .LC0(%rip), %xmm4
        movaps  %xmm2, %xmm3
        rsqrtps %xmm1, %xmm0
        cmpneqps        %xmm1, %xmm3
        movaps  %xmm1, %xmm5
        andps   %xmm3, %xmm0
        movaps  .LC1(%rip), %xmm3
        mulps   %xmm0, %xmm5
        mulps   %xmm5, %xmm0
        mulps   %xmm3, %xmm5
        movaps  %xmm0, %xmm1
        movups  16(%rsi), %xmm0
        addps   %xmm4, %xmm1
        mulps   %xmm5, %xmm1
        movaps  %xmm2, %xmm5
        cmpneqps        %xmm0, %xmm5
        movups  %xmm1, (%rdi)
        rsqrtps %xmm0, %xmm1
        andps   %xmm5, %xmm1
        movaps  %xmm2, %xmm5
        mulps   %xmm1, %xmm0
        mulps   %xmm0, %xmm1
        mulps   %xmm3, %xmm0
        addps   %xmm4, %xmm1
        mulps   %xmm0, %xmm1
        movups  32(%rsi), %xmm0
        cmpneqps        %xmm0, %xmm5
        movups  %xmm1, 16(%rdi)
        rsqrtps %xmm0, %xmm1
        andps   %xmm5, %xmm1
        mulps   %xmm1, %xmm0
        mulps   %xmm0, %xmm1
        mulps   %xmm3, %xmm0
        addps   %xmm4, %xmm1
        mulps   %xmm0, %xmm1
        movups  %xmm1, 32(%rdi)
        movups  48(%rsi), %xmm1
        rsqrtps %xmm1, %xmm0
        cmpneqps        %xmm1, %xmm2
        andps   %xmm2, %xmm0
        mulps   %xmm0, %xmm1
        mulps   %xmm1, %xmm0
        mulps   %xmm3, %xmm1
        addps   %xmm4, %xmm0
        mulps   %xmm1, %xmm0
        movups  %xmm0, 48(%rdi)
        ret
        .cfi_endproc
.LFE0:
        .size   rsqrt, .-rsqrt
        .section        .rodata.cst16,"aM",@progbits,16
        .align 16
.LC0:
        .long   3225419776
        .long   3225419776
        .long   3225419776
        .long   3225419776
        .align 16
.LC1:
        .long   3204448256
        .long   3204448256
        .long   3204448256
        .long   3204448256
        .ident  "GCC: (GNU) 8.2.1 20190109 (Red Hat 8.2.1-7)"
        .section        .note.GNU-stack,"",@progbits
[hjl@gnu-skx-1 pr88713]$

[Bug tree-optimization/88713] Vectorized code slow vs. flang

Reply via email to