[Bug rtl-optimization/46391] false dependencies are computed after vectorization (#2)

2021-09-08 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46391

Richard Biener  changed:

   What|Removed |Added

 Status|UNCONFIRMED |RESOLVED
  Known to work||12.0, 7.5.0
 Resolution|--- |FIXED
  Known to fail||4.3.5

--- Comment #5 from Richard Biener  ---
It is.  Slightly altered testcase (to avoid unrolling on GIMPLE), with
-fno-schedule-insns2 on x86_64:

.L2:
movdqu  (%rdi,%rax), %xmm1
paddb   %xmm0, %xmm1
movups  %xmm1, (%rsi,%rax)
movdqu  16(%rdi,%rax), %xmm2
paddb   %xmm0, %xmm2
movups  %xmm2, 16(%rsi,%rax)
movdqu  32(%rdi,%rax), %xmm3
paddb   %xmm0, %xmm3
movups  %xmm3, 32(%rsi,%rax)
movdqu  48(%rdi,%rax), %xmm4
paddb   %xmm0, %xmm4
movups  %xmm4, 48(%rsi,%rax)
movdqu  64(%rdi,%rax), %xmm5
paddb   %xmm0, %xmm5
movups  %xmm5, 64(%rsi,%rax)
movdqu  80(%rdi,%rax), %xmm6
paddb   %xmm0, %xmm6
movups  %xmm6, 80(%rsi,%rax)
movdqu  96(%rdi,%rax), %xmm7
paddb   %xmm0, %xmm7
movups  %xmm7, 96(%rsi,%rax)
movdqu  112(%rdi,%rax), %xmm8
paddb   %xmm0, %xmm8
movups  %xmm8, 112(%rsi,%rax)
subq    $-128, %rax
cmpq    $4096, %rax
jne .L2

and without:

.L2:
movdqu  (%rdi,%rax), %xmm1
movdqu  16(%rdi,%rax), %xmm2
movdqu  32(%rdi,%rax), %xmm3
movdqu  48(%rdi,%rax), %xmm4
movdqu  64(%rdi,%rax), %xmm5
paddb   %xmm0, %xmm1
paddb   %xmm0, %xmm2
movdqu  80(%rdi,%rax), %xmm6
movdqu  96(%rdi,%rax), %xmm7
paddb   %xmm0, %xmm3
paddb   %xmm0, %xmm4
movups  %xmm1, (%rsi,%rax)
movdqu  112(%rdi,%rax), %xmm8
paddb   %xmm0, %xmm5
paddb   %xmm0, %xmm6
movups  %xmm2, 16(%rsi,%rax)
paddb   %xmm0, %xmm7
movups  %xmm3, 32(%rsi,%rax)
paddb   %xmm0, %xmm8
movups  %xmm4, 48(%rsi,%rax)
movups  %xmm5, 64(%rsi,%rax)
movups  %xmm6, 80(%rsi,%rax)
movups  %xmm7, 96(%rsi,%rax)
movups  %xmm8, 112(%rsi,%rax)
subq    $-128, %rax
cmpq    $4096, %rax
jne .L2

and that's only possible if this dependence is not visible.  4.3 shows the
problem still, GCC 7 doesn't.

[Bug rtl-optimization/46391] false dependencies are computed after vectorization (#2)

2021-08-15 Thread pinskia at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46391

Andrew Pinski  changed:

   What|Removed |Added

   Keywords||alias, missed-optimization
 Blocks||53947

--- Comment #4 from Andrew Pinski  ---
I suspect this has been long fixed.


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations

[Bug rtl-optimization/46391] false dependencies are computed after vectorization (#2)

2010-11-11 Thread roy.1rosen at gmail dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46391

--- Comment #3 from Roy Rosen roy.1rosen at gmail dot com 2010-11-11 08:03:57 
UTC ---
It happens also on i386 with ./cc1 -O3 a.c -fdump-rtl-all -funroll-loops
-fsched-verbose=8 -fschedule-insns:

In asmcons:

(insn 16 15 17 4 a.c:5 (set (reg:SI 72 [ vect_var_.26 ])
(mem:SI (plus:SI (reg/v/f:SI 74 [ c ])
(reg:SI 70 [ ivtmp.42 ])) [0 MEM[(char * restrict)vect_p.22]+0
S4 A32])) 50 {*movsi_internal} (nil))

(insn 17 16 134 4 a.c:5 (set (mem:SI (plus:SI (reg/v/f:SI 75 [ d ])
(reg:SI 70 [ ivtmp.42 ])) [0 MEM[(char * restrict)vect_p.27]+0
S4 A32])
(reg:SI 72 [ vect_var_.26 ])) 50 {*movsi_internal} (expr_list:REG_DEAD
(reg:SI 72 [ vect_var_.26 ])
(nil)))

(insn 134 17 135 4 a.c:5 (set (reg:SI 82 [ vect_var_.26 ])
(mem:SI (plus:SI (plus:SI (reg/v/f:SI 74 [ c ])
(reg:SI 70 [ ivtmp.42 ]))
(const_int 4 [0x4])) [0 MEM[(char * restrict)vect_p.22]+0 S4
A32])) 50 {*movsi_internal} (nil))

and in sched1:

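;;   ======================================================
;;   -- basic block 4 from 16 to 198 -- before reload
;;   ======================================================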

;;   --- forward dependences:  

;;   --- Region Dependences --- b 4 bb 0 
;;      insn  code    bb   dep  prio  cost   reservation
;;      ----  ----    --   ---  ----  ----   -----------
;;       16    50     4     0    35     4   decodern,p2: 198 196 195 185 175 165 155 145 135 17
;;       17    50     4     1    31     1   decoder0,(p4+p3): 198 196 194 184 174 164 154 144 134
;;      134    50     4     1    31     4   decodern,p2: 198 196 195 185 175 165 155 145 135
;;      135    50     4     2    27     1   decoder0,(p4+p3): 198 196 194 184 174 164 154 144

There should not be any dependency between 17 (store) and 134 (load).
BTW, I failed building i386 from the current snapshot so I used an old one.
(../../gcc-4.6-20101106/gcc/config/i386/bdver1.md:528: unknown mode `V4DF'
../../gcc-4.6-20101106/gcc/config/i386/bdver1.md:528: following context is `0
r  egister_operand)'
)


[Bug rtl-optimization/46391] false dependencies are computed after vectorization (#2)

2010-11-09 Thread rguenth at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46391

--- Comment #1 from Richard Guenther rguenth at gcc dot gnu.org 2010-11-09 
13:18:47 UTC ---
On x86_64 restrict information is there (-fdump-tree-optimized-alias):

bb 5:
  # PT = nonlocal { PARM_RESTRICT.0 } (restr)
  vect_p.27_71 = c_8(D) + prolog_loop_niters.16_27;
  # PT = nonlocal { PARM_RESTRICT.1 } (restr)
  vect_p.32_75 = d_5(D) + prolog_loop_niters.16_27;

bb 6:
  # ivtmp.52_46 = PHI ivtmp.52_31(6), 0(5)
  # ivtmp.55_47 = PHI ivtmp.55_33(6), 0(5)
  vect_var_.28_74 = MEM[base: vect_p.27_71, index: ivtmp.55_47, offset: 0B];
  MEM[base: vect_p.32_75, index: ivtmp.55_47, offset: 0B] = vect_var_.28_74;
  ivtmp.52_31 = ivtmp.52_46 + 1;
  ivtmp.55_33 = ivtmp.55_47 + 16;
  if (ivtmp.52_31 < bnd.20_49)
goto bb 6;
  else
goto bb 7;


[Bug rtl-optimization/46391] false dependencies are computed after vectorization (#2)

2010-11-09 Thread roy.1rosen at gmail dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46391

--- Comment #2 from Roy Rosen roy.1rosen at gmail dot com 2010-11-09 13:48:15 
UTC ---
It seems to me that the restrict information is also there on ia64, but the dependencies are still wrong:

;; Function nor (nor)

nor (char * restrict c, char * restrict d)
{
  long unsigned int D.2085;
  long unsigned int D.2086;
  vector(8) char * restrict D.2087;
  void * D.2084;
  long unsigned int ivtmp?52;
  void * D.2071;
  void * D.2070;
  long unsigned int ivtmp?41;
  long unsigned int ivtmp?38;
  vector(8) char vect_var_?25;
  long int andmask?20;
  long int orptrs1?19;
  long int addr2int1?18;
  long int addr2int0?14;
  int i;
  char D.2021;

bb 2:
  addr2int0?14_22 = (long int) c_8(D);
  addr2int1?18_24 = (long int) d_5(D);
  orptrs1?19_25 = addr2int1?18_24 | addr2int0?14_22;
  andmask?20_26 = orptrs1?19_25 & 7;
  if (andmask?20_26 == 0)
goto bb 3;
  else
goto bb 5;

bb 3:
  ivtmp?52_37 = (long unsigned int) c_8(D);
  D.2085_32 = (long unsigned int) d_5(D);
  D.2086_48 = D.2085_32 + 256;
  D.2087_49 = (vector(8) char * restrict) D.2086_48;

bb 4:
  # PT = nonlocal { PARM_RESTRICT?1 } (restr)
  # ALIGN = 8, MISALIGN = 0
  # d_43 = PHI d_44(4), d_5(D)(3)
  # ivtmp?52_31 = PHI ivtmp?52_30(4), ivtmp?52_37(3)
  # PT = nonlocal { PARM_RESTRICT?0 } (restr)
  # ALIGN = 8, MISALIGN = 0
  D.2084_33 = (void *) ivtmp?52_31;
  vect_var_?25_41 = MEM[(char *)D.2084_33];
  ivtmp?52_30 = ivtmp?52_31 + 8;
  MEM[(char *)d_43] = vect_var_?25_41;
  # PT = nonlocal { PARM_RESTRICT?1 } (restr)
  d_44 = d_43 + 8;
  if (d_44 != D.2087_49)
goto bb 4;
  else
goto bb 7;

bb 5:
  ivtmp?38_4 = (long unsigned int) c_8(D);
  ivtmp?41_10 = (long unsigned int) d_5(D);

bb 6:
  # i_28 = PHI 0(5), i_36(6)
  # ivtmp?38_19 = PHI ivtmp?38_4(5), ivtmp?38_17(6)
  # ivtmp?41_6 = PHI ivtmp?41_10(5), ivtmp?41_9(6)
  # PT = nonlocal { PARM_RESTRICT?0 } (restr)
  D.2070_11 = (void *) ivtmp?38_19;
  D.2021_34 = MEM[(char *)D.2070_11];
  ivtmp?38_17 = ivtmp?38_19 + 1;
  # PT = nonlocal { PARM_RESTRICT?1 } (restr)
  D.2071_13 = (void *) ivtmp?41_6;
  MEM[(char *)D.2071_13] = D.2021_34;
  ivtmp?41_9 = ivtmp?41_6 + 1;
  i_36 = i_28 + 1;
  if (i_36 != 256)
goto bb 6;
  else
goto bb 7;

bb 7:
  return 0;

}