[Bug tree-optimization/89176] Vectorizer fails to consider narrower vector width for res[i] = v1[i] < v2[i] ? v2[i] : v1[i]

2021-08-16 Thread crazylht at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89176

Hongtao.liu  changed:

   What|Removed |Added

 CC||crazylht at gmail dot com

--- Comment #3 from Hongtao.liu  ---
It's fixed in GCC10, https://godbolt.org/z/17WdcoEbW

[Bug tree-optimization/89176] Vectorizer fails to consider narrower vector width for res[i] = v1[i] < v2[i] ? v2[i] : v1[i]

2021-08-16 Thread pinskia at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89176

Andrew Pinski  changed:

   What|Removed |Added

   Keywords||missed-optimization
   Severity|normal  |enhancement

[Bug tree-optimization/89176] Vectorizer fails to consider narrower vector width for res[i] = v1[i] < v2[i] ? v2[i] : v1[i]

2019-08-08 Thread hjl.tools at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89176

H.J. Lu  changed:

   What|Removed |Added

 CC||crazylht at gmail dot com

--- Comment #2 from H.J. Lu  ---
(In reply to Richard Biener from comment #1)
> 
> The epilogue vectorization issue also needs investigation.

Epilogue vectorization doesn't seem to work:

[hjl@gnu-cfl-1 pr89176]$ cat x.i
extern float *v1;
extern float *v2;
extern float *res;


void
foo (int n)
{
  int i;

  for (i = 0; i < n; i++)
    res[i] = v2[i] * v1[i];
}
[hjl@gnu-cfl-1 pr89176]$ make x.s
/export/build/gnu/tools-build/gcc-debug/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/tools-build/gcc-debug/build-x86_64-linux/gcc/ -O3
-march=skylake  -S x.i
[hjl@gnu-cfl-1 pr89176]$ cat x.s
.file   "x.i"
.text
.p2align 4
.globl  foo
.type   foo, @function
foo:
.LFB0:
.cfi_startproc
testl   %edi, %edi
jle .L23
movq    v2(%rip), %rcx
movq    res(%rip), %rdx
movq    v1(%rip), %rsi
leaq    31(%rcx), %r8
subq    %rdx, %r8
cmpq    $62, %r8
leaq    31(%rsi), %r8
seta    %r9b
subq    %rdx, %r8
cmpq    $62, %r8
seta    %r8b
leal    -1(%rdi), %eax
testb   %r8b, %r9b
je  .L3
cmpl    $6, %eax
jbe .L3
movl    %edi, %r8d
shrl    $3, %r8d
salq    $5, %r8
xorl    %eax, %eax
.p2align 4,,10
.p2align 3
.L4:
vmovups (%rcx,%rax), %ymm1
vmulps  (%rsi,%rax), %ymm1, %ymm0
vmovups %ymm0, (%rdx,%rax)
addq    $32, %rax
cmpq    %r8, %rax
jne .L4
movl    %edi, %eax
andl    $-8, %eax
testb   $7, %dil
je  .L22
movl    %eax, %r8d
vmovss  (%rcx,%r8,4), %xmm0
vmulss  (%rsi,%r8,4), %xmm0, %xmm0
vmovss  %xmm0, (%rdx,%r8,4)
leal    1(%rax), %r8d
cmpl    %r8d, %edi
jle .L22
movslq  %r8d, %r8
vmovss  (%rcx,%r8,4), %xmm0
vmulss  (%rsi,%r8,4), %xmm0, %xmm0
vmovss  %xmm0, (%rdx,%r8,4)
leal    2(%rax), %r8d
cmpl    %r8d, %edi
jle .L22
movslq  %r8d, %r8
vmovss  (%rcx,%r8,4), %xmm0
vmulss  (%rsi,%r8,4), %xmm0, %xmm0
vmovss  %xmm0, (%rdx,%r8,4)
leal    3(%rax), %r8d
cmpl    %r8d, %edi
jle .L22
movslq  %r8d, %r8
vmovss  (%rcx,%r8,4), %xmm0
vmulss  (%rsi,%r8,4), %xmm0, %xmm0
vmovss  %xmm0, (%rdx,%r8,4)
leal    4(%rax), %r8d
cmpl    %r8d, %edi
jle .L22
movslq  %r8d, %r8
vmovss  (%rcx,%r8,4), %xmm0
vmulss  (%rsi,%r8,4), %xmm0, %xmm0
vmovss  %xmm0, (%rdx,%r8,4)
leal    5(%rax), %r8d
cmpl    %r8d, %edi
jle .L22
movslq  %r8d, %r8
vmovss  (%rcx,%r8,4), %xmm0
addl    $6, %eax
vmulss  (%rsi,%r8,4), %xmm0, %xmm0
vmovss  %xmm0, (%rdx,%r8,4)
cmpl    %eax, %edi
jle .L22
cltq
vmovss  (%rcx,%rax,4), %xmm0
vmulss  (%rsi,%rax,4), %xmm0, %xmm0
vmovss  %xmm0, (%rdx,%rax,4)
vzeroupper
ret
.p2align 4,,10
.p2align 3
.L22:
vzeroupper
.L23:
ret
.p2align 4,,10
.p2align 3
.L3:
movl    %eax, %edi
xorl    %eax, %eax
.p2align 4,,10
.p2align 3
.L6:
vmovss  (%rcx,%rax,4), %xmm0
movq    %rax, %r8
vmulss  (%rsi,%rax,4), %xmm0, %xmm0
vmovss  %xmm0, (%rdx,%rax,4)
incq    %rax
cmpq    %rdi, %r8
jne .L6
ret
.cfi_endproc
.LFE0:
.size   foo, .-foo
.ident  "GCC: (GNU) 10.0.0 20190723 (experimental)"
.section    .note.GNU-stack,"",@progbits
[hjl@gnu-cfl-1 pr89176]$

[Bug tree-optimization/89176] Vectorizer fails to consider narrower vector width for res[i] = v1[i] < v2[i] ? v2[i] : v1[i]

2019-02-04 Thread rguenth at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89176

Richard Biener  changed:

   What|Removed |Added

 Status|UNCONFIRMED |NEW
   Last reconfirmed||2019-02-04
 Blocks||53947
 Ever confirmed|0   |1

--- Comment #1 from Richard Biener  ---
Oddly enough with --param vect-epilogues-nomask=1 we vectorize the epilogue but
immediately throw it away afterwards.

The reason it works with multiplication is that basic-block vectorization
works there, but in the conditional case it is presented with the
non-if-converted form spread across multiple BBs, which it doesn't handle.

I think this one is then a duplicate of a bug that mentions that phiopt
doesn't use .FMIN/MAX internal functions if available.

The epilogue vectorization issue also needs investigation.


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations