https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99409

            Bug ID: 99409
           Summary: s252 benchmark of TSVC is vectorized by clang and not
                    by gcc
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

// Element type used by the test case (single-precision float).
typedef float real_t;
// Number of times the outer loop in main() repeats the kernel.
#define iterations 100000
// Length of the one-dimensional arrays declared below.
#define LEN_1D 32000
// Not referenced by the code shown in this test case.
#define LEN_2D 256
// Global arrays: main() reads b and c and writes a; d and e are not
// referenced by the code shown in this test case.
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];

// s252 kernel from the report: a[i] is the sum of the current product
// b[i]*c[i] and the product computed for the previous element (0 for i == 0).
// NOTE(review): `void main()` is not a standard signature for main; it is
// kept exactly as in the report so the test case stays unchanged.
void main()
{

//    scalar and array expansion
//    loop with ambiguous scalar temporary

    real_t t, s;
    // Repeat the kernel `iterations` times; b and c are never written, so
    // every pass recomputes the same values into a[].
    for (int nl = 0; nl < iterations; nl++) {
        // t holds the previous element's product; reset before each pass.
        t = (real_t) 0.;
        for (int i = 0; i < LEN_1D; i++) {
            s = b[i] * c[i];
            a[i] = s + t;
            // Loop-carried dependence: this product is consumed by the
            // next iteration through t.
            t = s;
        }
    }

}

clang does:
# clang's AVX code for the loop nest in main(). Each inner-loop iteration
# handles four 32-byte ymm vectors (128 bytes) of b, c and a.
main:                                   # @main
        .cfi_startproc
# %bb.0:
        xorl    %eax, %eax
        .p2align        4, 0x90
.LBB0_1:                                # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
        # ymm0 carries the previous product vector; zeroed per outer iteration.
        vxorps  %xmm0, %xmm0, %xmm0
        # rcx is a negative byte offset that counts up to 0 in steps of 128.
        movq    $-128000, %rcx                  # imm = 0xFFFE0C00
        .p2align        4, 0x90
.LBB0_2:                                #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        # ymm1..ymm4 = four consecutive vectors of products b[i] * c[i].
        vmovups c+128000(%rcx), %ymm1
        vmovups c+128032(%rcx), %ymm2
        vmovups c+128064(%rcx), %ymm3
        vmovups c+128096(%rcx), %ymm4
        vmulps  b+128000(%rcx), %ymm1, %ymm1
        vmulps  b+128032(%rcx), %ymm2, %ymm2
        vmulps  b+128064(%rcx), %ymm3, %ymm3
        vmulps  b+128096(%rcx), %ymm4, %ymm4
        # The vperm2f128/vshufps sequence below builds, for each product
        # vector, the same products shifted by one element: the last element
        # of the preceding vector followed by the first seven of this one.
        vperm2f128      $33, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[2,3],ymm1[0,1]
        vperm2f128      $33, %ymm2, %ymm1, %ymm5 # ymm5 = ymm1[2,3],ymm2[0,1]
        vperm2f128      $33, %ymm3, %ymm2, %ymm6 # ymm6 = ymm2[2,3],ymm3[0,1]
        vperm2f128      $33, %ymm4, %ymm3, %ymm7 # ymm7 = ymm3[2,3],ymm4[0,1]
        vshufps $3, %ymm1, %ymm0, %ymm0         # ymm0 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4]
        vshufps $3, %ymm2, %ymm5, %ymm5         # ymm5 = ymm5[3,0],ymm2[0,0],ymm5[7,4],ymm2[4,4]
        vshufps $3, %ymm3, %ymm6, %ymm6         # ymm6 = ymm6[3,0],ymm3[0,0],ymm6[7,4],ymm3[4,4]
        vshufps $3, %ymm4, %ymm7, %ymm7         # ymm7 = ymm7[3,0],ymm4[0,0],ymm7[7,4],ymm4[4,4]
        vshufps $152, %ymm1, %ymm0, %ymm0       # ymm0 = ymm0[0,2],ymm1[1,2],ymm0[4,6],ymm1[5,6]
        vshufps $152, %ymm2, %ymm5, %ymm5       # ymm5 = ymm5[0,2],ymm2[1,2],ymm5[4,6],ymm2[5,6]
        vshufps $152, %ymm3, %ymm6, %ymm6       # ymm6 = ymm6[0,2],ymm3[1,2],ymm6[4,6],ymm3[5,6]
        vshufps $152, %ymm4, %ymm7, %ymm7       # ymm7 = ymm7[0,2],ymm4[1,2],ymm7[4,6],ymm4[5,6]
        # a[i] = product + shifted product, then store the four result vectors.
        vaddps  %ymm0, %ymm1, %ymm0
        vaddps  %ymm5, %ymm2, %ymm1
        vaddps  %ymm6, %ymm3, %ymm2
        vaddps  %ymm7, %ymm4, %ymm3
        vmovups %ymm0, a+128000(%rcx)
        vmovups %ymm1, a+128032(%rcx)
        vmovups %ymm2, a+128064(%rcx)
        vmovups %ymm3, a+128096(%rcx)
        subq    $-128, %rcx
        # Carry the last product vector into the next inner iteration.
        vmovaps %ymm4, %ymm0
        jne     .LBB0_2
# %bb.3:                                #   in Loop: Header=BB0_1 Depth=1
        incl    %eax
        cmpl    $100000, %eax                   # imm = 0x186A0
        jne     .LBB0_1
# %bb.4:
        vzeroupper
        retq

s252.c:18:27: note:   worklist: examine stmt: _3 = s_11 + t_21;
s252.c:18:27: note:   vect_is_simple_use: operand _1 * _2, type of def: internal
s252.c:18:27: note:   mark relevant 5, live 0: s_11 = _1 * _2;
s252.c:18:27: note:   vect_is_simple_use: operand t_21 = PHI <s_11(8), 0.0(5)>, type of def: unknown
s252.c:18:27: missed:   Unsupported pattern.
s252.c:20:22: missed:   not vectorized: unsupported use in stmt.
s252.c:18:27: missed:  unexpected pattern.

  <bb 8> [local count: 1052266996]:

  <bb 3> [local count: 1063004409]:
  # t_21 = PHI <s_11(8), 0.0(5)>
  # i_23 = PHI <i_13(8), 0(5)>
  # ivtmp_20 = PHI <ivtmp_19(8), 32000(5)>
  _1 = b[i_23];
  _2 = c[i_23];
  s_11 = _1 * _2;
  _3 = s_11 + t_21;
  a[i_23] = _3;
  i_13 = i_23 + 1;
  ivtmp_19 = ivtmp_20 - 1;
  if (ivtmp_19 != 0)
    goto <bb 8>; [98.99%]
  else
    goto <bb 4>; [1.01%]

Reply via email to