https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99409
Bug ID: 99409
Summary: s252 benchmark of TSVC is vectorized by clang and not by gcc
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---

typedef float real_t;

#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
void main()
{
    // scalar and array expansion
    // loop with ambiguous scalar temporary

    real_t t, s;
    for (int nl = 0; nl < iterations; nl++) {
        t = (real_t) 0.;
        for (int i = 0; i < LEN_1D; i++) {
            s = b[i] * c[i];
            a[i] = s + t;
            t = s;
        }
    }
}

clang does:

main: # @main
    .cfi_startproc
# %bb.0:
    xorl %eax, %eax
    .p2align 4, 0x90
.LBB0_1: # =>This Loop Header: Depth=1
         # Child Loop BB0_2 Depth 2
    vxorps %xmm0, %xmm0, %xmm0
    movq $-128000, %rcx # imm = 0xFFFE0C00
    .p2align 4, 0x90
.LBB0_2: # Parent Loop BB0_1 Depth=1
         # => This Inner Loop Header: Depth=2
    vmovups c+128000(%rcx), %ymm1
    vmovups c+128032(%rcx), %ymm2
    vmovups c+128064(%rcx), %ymm3
    vmovups c+128096(%rcx), %ymm4
    vmulps b+128000(%rcx), %ymm1, %ymm1
    vmulps b+128032(%rcx), %ymm2, %ymm2
    vmulps b+128064(%rcx), %ymm3, %ymm3
    vmulps b+128096(%rcx), %ymm4, %ymm4
    vperm2f128 $33, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[2,3],ymm1[0,1]
    vperm2f128 $33, %ymm2, %ymm1, %ymm5 # ymm5 = ymm1[2,3],ymm2[0,1]
    vperm2f128 $33, %ymm3, %ymm2, %ymm6 # ymm6 = ymm2[2,3],ymm3[0,1]
    vperm2f128 $33, %ymm4, %ymm3, %ymm7 # ymm7 = ymm3[2,3],ymm4[0,1]
    vshufps $3, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4]
    vshufps $3, %ymm2, %ymm5, %ymm5 # ymm5 = ymm5[3,0],ymm2[0,0],ymm5[7,4],ymm2[4,4]
    vshufps $3, %ymm3, %ymm6, %ymm6 # ymm6 = ymm6[3,0],ymm3[0,0],ymm6[7,4],ymm3[4,4]
    vshufps $3, %ymm4, %ymm7, %ymm7 # ymm7 = ymm7[3,0],ymm4[0,0],ymm7[7,4],ymm4[4,4]
    vshufps $152, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[0,2],ymm1[1,2],ymm0[4,6],ymm1[5,6]
    vshufps $152, %ymm2, %ymm5, %ymm5 # ymm5 = ymm5[0,2],ymm2[1,2],ymm5[4,6],ymm2[5,6]
    vshufps $152, %ymm3, %ymm6, %ymm6 # ymm6 = ymm6[0,2],ymm3[1,2],ymm6[4,6],ymm3[5,6]
    vshufps $152, %ymm4, %ymm7, %ymm7 # ymm7 = ymm7[0,2],ymm4[1,2],ymm7[4,6],ymm4[5,6]
    vaddps %ymm0, %ymm1, %ymm0
    vaddps %ymm5, %ymm2, %ymm1
    vaddps %ymm6, %ymm3, %ymm2
    vaddps %ymm7, %ymm4, %ymm3
    vmovups %ymm0, a+128000(%rcx)
    vmovups %ymm1, a+128032(%rcx)
    vmovups %ymm2, a+128064(%rcx)
    vmovups %ymm3, a+128096(%rcx)
    subq $-128, %rcx
    vmovaps %ymm4, %ymm0
    jne .LBB0_2
# %bb.3: # in Loop: Header=BB0_1 Depth=1
    incl %eax
    cmpl $100000, %eax # imm = 0x186A0
    jne .LBB0_1
# %bb.4:
    vzeroupper
    retq

GCC does not vectorize the loop; its vectorizer dump shows:

s252.c:18:27: note: worklist: examine stmt: _3 = s_11 + t_21;
s252.c:18:27: note: vect_is_simple_use: operand _1 * _2, type of def: internal
s252.c:18:27: note: mark relevant 5, live 0: s_11 = _1 * _2;
s252.c:18:27: note: vect_is_simple_use: operand t_21 = PHI <s_11(8), 0.0(5)>, type of def: unknown
s252.c:18:27: missed: Unsupported pattern.
s252.c:20:22: missed: not vectorized: unsupported use in stmt.
s252.c:18:27: missed: unexpected pattern.

GIMPLE for the loop:

<bb 8> [local count: 1052266996]:

<bb 3> [local count: 1063004409]:
# t_21 = PHI <s_11(8), 0.0(5)>
# i_23 = PHI <i_13(8), 0(5)>
# ivtmp_20 = PHI <ivtmp_19(8), 32000(5)>
_1 = b[i_23];
_2 = c[i_23];
s_11 = _1 * _2;
_3 = s_11 + t_21;
a[i_23] = _3;
i_13 = i_23 + 1;
ivtmp_19 = ivtmp_20 - 1;
if (ivtmp_19 != 0)
  goto <bb 8>; [98.99%]
else
  goto <bb 4>; [1.01%]