[Bug target/88963] gcc generates terrible code for vectors of 64+ length which are not natively supported

rguenth at gcc dot gnu.org Tue, 22 Jan 2019 01:18:08 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88963


Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
                 CC|                            |jakub at gcc dot gnu.org
             Blocks|                            |88670
     Ever confirmed|0                           |1

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
Confirmed.  The reason is that vector lowering only lowers the arithmetic
but leaves the loads and stores alone:

   _1 = *b_5(D);
   _2 = *c_6(D);
-  _3 = _1 + _2;
+  _9 = BIT_FIELD_REF <_1, 128, 0>;
+  _10 = BIT_FIELD_REF <_2, 128, 0>;
+  _11 = _9 + _10;
+  _12 = BIT_FIELD_REF <_1, 128, 128>;
+  _13 = BIT_FIELD_REF <_2, 128, 128>;
+  _14 = _12 + _13;
+  _15 = BIT_FIELD_REF <_1, 128, 256>;
+  _16 = BIT_FIELD_REF <_2, 128, 256>;
+  _17 = _15 + _16;
+  _18 = BIT_FIELD_REF <_1, 128, 384>;
+  _19 = BIT_FIELD_REF <_2, 128, 384>;
+  _20 = _18 + _19;
+  _3 = {_11, _14, _17, _20};
   *a_7(D) = _3;

there's some hack^Wcode in tree-ssa-forwprop.c to deal with similar cases
using {REAL,IMAG}PART_EXPR and COMPLEX_EXPR, splitting feeding/destination
memory accesses.  The same trick is missing for vector loads/stores.

OTOH it would be more reasonable for vector lowering to split the loads.
It's not so difficult to do - the main "issue" would be making sure
the wide vector load goes away (or maybe that's even a secondary issue
that could be ignored).

With just the loads handled code generation improves to

test:
.LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-64, %rsp
        subq    $8, %rsp
        movdqa  (%rsi), %xmm3
        movdqa  16(%rsi), %xmm2
        movdqa  32(%rsi), %xmm1
        movdqa  48(%rsi), %xmm0
        paddd   (%rdx), %xmm3
        paddd   16(%rdx), %xmm2
        paddd   32(%rdx), %xmm1
        paddd   48(%rdx), %xmm0
        movaps  %xmm3, (%rdi)
        movaps  %xmm2, 16(%rdi)
        movaps  %xmm1, 32(%rdi)
        movaps  %xmm0, 48(%rdi)
        leave
        ret
        .cfi_endproc

for SSE2 and

test:
.LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-64, %rsp
        subq    $8, %rsp
        vmovdqa (%rsi), %ymm3
        vpaddd  (%rdx), %ymm3, %ymm0
        vmovdqa %xmm0, %xmm2
        vmovdqa %ymm0, -120(%rsp)
        vmovdqa 32(%rsi), %ymm0
        vmovdqa -104(%rsp), %xmm4
        vpaddd  32(%rdx), %ymm0, %ymm0
        vmovaps %xmm2, (%rdi)
        vmovdqa %ymm0, -88(%rsp)
        vmovdqa -72(%rsp), %xmm5
        vmovaps %xmm4, 16(%rdi)
        vmovaps %xmm0, 32(%rdi)
        vmovaps %xmm5, 48(%rdi)
        vzeroupper
        leave
        .cfi_def_cfa 7, 8
        ret

for skylake.  Not sure why we spill anything with the above; with the SSE
code we manage to elide the spills (but not the stack reservation).  I
guess we need to handle the stores as well.

The odd thing is that if I simply do

  _12 = BIT_FIELD_REF <*b_5(D), 256, 256>;
  _9 = BIT_FIELD_REF <*b_5(D), 256, 0>;
  _13 = BIT_FIELD_REF <*c_6(D), 256, 256>;
  _10 = BIT_FIELD_REF <*c_6(D), 256, 0>;
  _11 = _9 + _10;
  _14 = _12 + _13;
  BIT_FIELD_REF <*a_7(D), 256, 0> = _11;
  BIT_FIELD_REF <*a_7(D), 256, 256> = _14;

code-generation is even worse:

        vmovdqa (%rsi), %ymm0
        vmovdqa 32(%rsi), %ymm2
        vpaddd  (%rdx), %ymm0, %ymm3
        vpaddd  32(%rdx), %ymm2, %ymm1
        vmovdqa %ymm3, -64(%rsp)
        movq    -56(%rsp), %rax
        vmovdqa %ymm1, -32(%rsp)
        movq    %rax, 8(%rdi)
        movq    -48(%rsp), %rax
        vmovdqa -64(%rsp), %xmm0
        movq    %rax, 16(%rdi)
        movq    -40(%rsp), %rax
        vmovq   %xmm0, (%rdi)
        movq    %rax, 24(%rdi)
        movq    -24(%rsp), %rax
        vmovdqa -32(%rsp), %xmm0
        movq    %rax, 40(%rdi)
        movq    -16(%rsp), %rax
        vmovq   %xmm0, 32(%rdi)
        movq    %rax, 48(%rdi)
        movq    -8(%rsp), %rax
        movq    %rax, 56(%rdi)
        vzeroupper
        leave
        .cfi_def_cfa 7, 8
        ret

the stores expand to

;; BIT_FIELD_REF <*a_7(D), 256, 0> = _11;

(insn 14 13 15 (set (mem/j:DI (reg/v/f:DI 88 [ a ]) [1 *a_7(D)+0 S8 A256])
        (subreg:DI (reg:V8SI 84 [ _11 ]) 0)) "t.c":6:6 -1
     (nil))

(insn 15 14 16 (set (mem/j:DI (plus:DI (reg/v/f:DI 88 [ a ])
                (const_int 8 [0x8])) [1 *a_7(D)+8 S8 A64])
        (subreg:DI (reg:V8SI 84 [ _11 ]) 8)) "t.c":6:6 -1
     (nil))

(insn 16 15 17 (set (mem/j:DI (plus:DI (reg/v/f:DI 88 [ a ])
                (const_int 16 [0x10])) [1 *a_7(D)+16 S8 A128])
        (subreg:DI (reg:V8SI 84 [ _11 ]) 16)) "t.c":6:6 -1
     (nil))

(insn 17 16 0 (set (mem/j:DI (plus:DI (reg/v/f:DI 88 [ a ])
                (const_int 24 [0x18])) [1 *a_7(D)+24 S8 A64])
        (subreg:DI (reg:V8SI 84 [ _11 ]) 24)) "t.c":6:6 -1
     (nil))

for whatever reason.


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88670
[Bug 88670] [meta-bug] generic vector extension issues

[Bug target/88963] gcc generates terrible code for vectors of 64+ length which are not natively supported

Reply via email to