https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89176
H.J. Lu changed:
What|Removed |Added
CC||crazylht at gmail dot com
--- Comment #2 from H.J. Lu ---
(In reply to Richard Biener from comment #1)
>
> The epilogue vectorization issue also needs investigation.
Epilogue vectorization doesn't seem to work:
[hjl@gnu-cfl-1 pr89176]$ cat x.i
/* Input and output arrays are reached through global pointers that are
   only declared here; their definitions live in another translation unit.  */
extern float *v1;
extern float *v2;
extern float *res;
/* Store the element-wise product v2[i] * v1[i] into res[i] for every
   i in [0, n).  Does nothing when n <= 0.  */
void
foo (int n)
{
int i;
for (i = 0; i < n; i++)
res[i] = v2[i] * v1[i];
}
[hjl@gnu-cfl-1 pr89176]$ make x.s
/export/build/gnu/tools-build/gcc-debug/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/tools-build/gcc-debug/build-x86_64-linux/gcc/ -O3
-march=skylake -S x.i
[hjl@gnu-cfl-1 pr89176]$ cat x.s
#-----------------------------------------------------------------------
# void foo (int n)     res[i] = v2[i] * v1[i]  for i in [0, n)
# Syntax: GNU as, AT&T operand order (src, dst), x86-64 with AVX.
# In:     %edi = n
# Roles:  %rcx = v2, %rdx = res, %rsi = v1 (loaded from the globals)
# Shape:  256-bit main loop (8 floats per iteration) followed by a fully
#         unrolled remainder that handles up to 7 leftover elements one
#         float at a time, plus a plain scalar loop (.L3/.L6) used when
#         the vector loop cannot be entered.
#-----------------------------------------------------------------------
        .file   "x.i"
        .text
        .p2align 4
        .globl  foo
        .type   foo, @function
foo:
.LFB0:
        .cfi_startproc
        testl   %edi, %edi
        jle     .L23                    # n <= 0: return immediately
        movq    v2(%rip), %rcx          # rcx = v2
        movq    res(%rip), %rdx         # rdx = res
        movq    v1(%rip), %rsi          # rsi = v1
        leaq    31(%rcx), %r8
        subq    %rdx, %r8               # r8 = v2 + 31 - res
        cmpq    $62, %r8
        leaq    31(%rsi), %r8           # lea does not modify the flags set by cmpq
        seta    %r9b                    # r9b = 1 iff v2 is not within 31 bytes of res
        subq    %rdx, %r8               # r8 = v1 + 31 - res
        cmpq    $62, %r8
        seta    %r8b                    # r8b = 1 iff v1 is not within 31 bytes of res
        leal    -1(%rdi), %eax          # eax = n - 1
        testb   %r8b, %r9b
        je      .L3                     # either distance check failed: scalar loop
        cmpl    $6, %eax
        jbe     .L3                     # n - 1 <= 6, i.e. fewer than 8 elements: scalar loop
        movl    %edi, %r8d
        shrl    $3, %r8d                # r8 = n / 8 (number of full 8-float vectors)
        salq    $5, %r8                 # r8 = byte offset at which the vector loop ends
        xorl    %eax, %eax              # rax = byte offset 0
        .p2align 4,,10
        .p2align 3
.L4:                                    # main loop: 32 bytes (8 floats) per iteration
        vmovups (%rcx,%rax), %ymm1      # ymm1 = v2[..]
        vmulps  (%rsi,%rax), %ymm1, %ymm0 # ymm0 = v2[..] * v1[..]
        vmovups %ymm0, (%rdx,%rax)      # res[..] = ymm0
        addq    $32, %rax
        cmpq    %r8, %rax
        jne     .L4

        # Remainder: handled element by element with scalar vmovss/vmulss.
        # No narrower vector loop is emitted for it.
        movl    %edi, %eax
        andl    $-8, %eax               # eax = n rounded down to a multiple of 8
        testb   $7, %dil
        je      .L22                    # n is a multiple of 8: nothing left
        movl    %eax, %r8d              # r8 = index of first leftover element
        vmovss  (%rcx,%r8,4), %xmm0
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        leal    1(%rax), %r8d           # next index; stop once it reaches n
        cmpl    %r8d, %edi
        jle     .L22
        movslq  %r8d, %r8
        vmovss  (%rcx,%r8,4), %xmm0
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        leal    2(%rax), %r8d
        cmpl    %r8d, %edi
        jle     .L22
        movslq  %r8d, %r8
        vmovss  (%rcx,%r8,4), %xmm0
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        leal    3(%rax), %r8d
        cmpl    %r8d, %edi
        jle     .L22
        movslq  %r8d, %r8
        vmovss  (%rcx,%r8,4), %xmm0
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        leal    4(%rax), %r8d
        cmpl    %r8d, %edi
        jle     .L22
        movslq  %r8d, %r8
        vmovss  (%rcx,%r8,4), %xmm0
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        leal    5(%rax), %r8d
        cmpl    %r8d, %edi
        jle     .L22
        movslq  %r8d, %r8
        vmovss  (%rcx,%r8,4), %xmm0
        addl    $6, %eax                # eax = index of the seventh leftover element
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        cmpl    %eax, %edi
        jle     .L22
        cltq                            # sign-extend eax into rax for addressing
        vmovss  (%rcx,%rax,4), %xmm0
        vmulss  (%rsi,%rax,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%rax,4)
        vzeroupper                      # ymm registers were used on this path
        ret
        .p2align 4,,10
        .p2align 3
.L22:
        vzeroupper                      # exit for the paths that ran the ymm loop
.L23:
        ret
        .p2align 4,,10
        .p2align 3
.L3:                                    # scalar fallback; only xmm registers are used
        movl    %eax, %edi              # rdi = n - 1 (index of the last element)
        xorl    %eax, %eax              # rax = i = 0
        .p2align 4,,10
        .p2align 3
.L6:
        vmovss  (%rcx,%rax,4), %xmm0
        movq    %rax, %r8               # r8 = i before the increment
        vmulss  (%rsi,%rax,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%rax,4)
        incq    %rax
        cmpq    %rdi, %r8
        jne     .L6                     # repeat until element n - 1 has been stored
        ret
        .cfi_endproc
.LFE0:
        .size   foo, .-foo
        .ident  "GCC: (GNU) 10.0.0 20190723 (experimental)"
        .section        .note.GNU-stack,"",@progbits
[hjl@gnu-cfl-1 pr89176]$