https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114570
            Bug ID: 114570
           Summary: GCC doesn't perform good loop invariant code motion
                    for very long vector operations.
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: liuhongt at gcc dot gnu.org
  Target Milestone: ---

typedef float v128_32 __attribute__((vector_size (128 * 4), aligned(2048)));

v128_32
foo (v128_32 a, v128_32 b, v128_32 c, int n)
{
  for (int i = 0; i != 2048; i++)
    {
      a = a / c;
      a = a / b;
    }
  return a;
}

The 4096-bit vector operations are lowered to sixteen 256-bit pieces, so the
loop body in GIMPLE is:

  <bb 3> [local count: 1063004408]:
  # a_13 = PHI <a_9(3), a_3(D)(2)>
  # ivtmp_2 = PHI <ivtmp_1(3), 2048(2)>
  # DEBUG i => NULL
  # DEBUG a => NULL
  # DEBUG BEGIN_STMT
  _14 = BIT_FIELD_REF <a_13, 256, 0>;
  _15 = BIT_FIELD_REF <c_6(D), 256, 0>;
  _10 = _14 / _15;
  _11 = BIT_FIELD_REF <a_13, 256, 256>;
  _12 = BIT_FIELD_REF <c_6(D), 256, 256>;
  _16 = _11 / _12;
  _17 = BIT_FIELD_REF <a_13, 256, 512>;
  _18 = BIT_FIELD_REF <c_6(D), 256, 512>;
  _19 = _17 / _18;
  _20 = BIT_FIELD_REF <a_13, 256, 768>;
  _21 = BIT_FIELD_REF <c_6(D), 256, 768>;
  _22 = _20 / _21;
  _23 = BIT_FIELD_REF <a_13, 256, 1024>;
  _24 = BIT_FIELD_REF <c_6(D), 256, 1024>;
  _25 = _23 / _24;
  _26 = BIT_FIELD_REF <a_13, 256, 1280>;
  _27 = BIT_FIELD_REF <c_6(D), 256, 1280>;
  _28 = _26 / _27;
  _29 = BIT_FIELD_REF <a_13, 256, 1536>;
  _30 = BIT_FIELD_REF <c_6(D), 256, 1536>;
  _31 = _29 / _30;
  _32 = BIT_FIELD_REF <a_13, 256, 1792>;
  _33 = BIT_FIELD_REF <c_6(D), 256, 1792>;
  _34 = _32 / _33;
  _35 = BIT_FIELD_REF <a_13, 256, 2048>;
  _36 = BIT_FIELD_REF <c_6(D), 256, 2048>;
  _37 = _35 / _36;
  _38 = BIT_FIELD_REF <a_13, 256, 2304>;
  _39 = BIT_FIELD_REF <c_6(D), 256, 2304>;
  _40 = _38 / _39;
  _41 = BIT_FIELD_REF <a_13, 256, 2560>;
  _42 = BIT_FIELD_REF <c_6(D), 256, 2560>;
  _43 = _41 / _42;
  _44 = BIT_FIELD_REF <a_13, 256, 2816>;
  _45 = BIT_FIELD_REF <c_6(D), 256, 2816>;
  _46 = _44 / _45;
  _47 = BIT_FIELD_REF <a_13, 256, 3072>;
  _48 = BIT_FIELD_REF <c_6(D), 256, 3072>;
  _49 = _47 / _48;
  _50 = BIT_FIELD_REF <a_13, 256, 3328>;
  _51 = BIT_FIELD_REF <c_6(D), 256, 3328>;
  _52 = _50 / _51;
  _53 = BIT_FIELD_REF <a_13, 256, 3584>;
  _54 = BIT_FIELD_REF <c_6(D), 256, 3584>;
  _55 = _53 / _54;
  _56 = BIT_FIELD_REF <a_13, 256, 3840>;
  _57 = BIT_FIELD_REF <c_6(D), 256, 3840>;
  _58 = _56 / _57;
  # DEBUG a => {_10, _16, _19, _22, _25, _28, _31, _34, _37, _40, _43, _46,
_49, _52, _55, _58}
  # DEBUG BEGIN_STMT
  _59 = BIT_FIELD_REF <b_8(D), 256, 0>;
  _60 = _10 / _59;
  _61 = BIT_FIELD_REF <b_8(D), 256, 256>;
  _62 = _16 / _61;
  _63 = BIT_FIELD_REF <b_8(D), 256, 512>;
  _64 = _19 / _63;
  _65 = BIT_FIELD_REF <b_8(D), 256, 768>;
  _66 = _22 / _65;
  _67 = BIT_FIELD_REF <b_8(D), 256, 1024>;
  _68 = _25 / _67;
  _69 = BIT_FIELD_REF <b_8(D), 256, 1280>;
  _70 = _28 / _69;
  _71 = BIT_FIELD_REF <b_8(D), 256, 1536>;
  _72 = _31 / _71;
  _73 = BIT_FIELD_REF <b_8(D), 256, 1792>;
  _74 = _34 / _73;
  _75 = BIT_FIELD_REF <b_8(D), 256, 2048>;
  _76 = _37 / _75;
  _77 = BIT_FIELD_REF <b_8(D), 256, 2304>;
  _78 = _40 / _77;
  _79 = BIT_FIELD_REF <b_8(D), 256, 2560>;
  _80 = _43 / _79;
  _81 = BIT_FIELD_REF <b_8(D), 256, 2816>;
  _82 = _46 / _81;
  _83 = BIT_FIELD_REF <b_8(D), 256, 3072>;
  _84 = _49 / _83;
  _85 = BIT_FIELD_REF <b_8(D), 256, 3328>;
  _86 = _52 / _85;
  _87 = BIT_FIELD_REF <b_8(D), 256, 3584>;
  _88 = _55 / _87;
  _89 = BIT_FIELD_REF <b_8(D), 256, 3840>;
  _90 = _58 / _89;
  a_9 = {_60, _62, _64, _66, _68, _70, _72, _74, _76, _78, _80, _82, _84,
_86, _88, _90};
  # DEBUG a => a_9
  # DEBUG BEGIN_STMT
  # DEBUG i => NULL
  # DEBUG a => a_9
  # DEBUG BEGIN_STMT
  ivtmp_1 = ivtmp_2 + 4294967295;
  if (ivtmp_1 != 0)
    goto <bb 3>; [98.99%]
  else
    goto <bb 4>; [1.01%]
Ideally, those BIT_FIELD_REFs of the loop-invariant operands b_8(D) and
c_6(D) could be hoisted out of the loop, and # a_13 = PHI <a_9(3), a_3(D)(2)>
could be split into sixteen 256-bit vector PHIs, so the whole loop would
operate on 256-bit vectors.
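For illustration, here is a minimal source-level sketch of that shape. It
reuses the v128_32 typedef from the testcase; the foo_split name, the v8sf
helper type and the __builtin_memcpy splitting are hypothetical, only meant
to show the intended structure, not what any GCC pass emits:

typedef float v8sf __attribute__((vector_size (32)));

v128_32
foo_split (v128_32 a, v128_32 b, v128_32 c, int n)
{
  v8sf av[16], bv[16], cv[16];

  /* Hoisted: the 256-bit pieces of b and c are loop invariant, so
     extract them once before the loop.  */
  for (int j = 0; j != 16; j++)
    {
      __builtin_memcpy (&av[j], (float *) &a + 8 * j, sizeof (v8sf));
      __builtin_memcpy (&bv[j], (float *) &b + 8 * j, sizeof (v8sf));
      __builtin_memcpy (&cv[j], (float *) &c + 8 * j, sizeof (v8sf));
    }

  /* The loop now carries sixteen 256-bit accumulators instead of one
     4096-bit vector; the divisions are elementwise, so splitting them
     per piece preserves the semantics.  */
  for (int i = 0; i != 2048; i++)
    for (int j = 0; j != 16; j++)
      {
        av[j] = av[j] / cv[j];
        av[j] = av[j] / bv[j];
      }

  /* Rebuild the full-width result once after the loop.  */
  for (int j = 0; j != 16; j++)
    __builtin_memcpy ((float *) &a + 8 * j, &av[j], sizeof (v8sf));
  return a;
}

Each 8-float accumulator can then stay in a single ymm register across
iterations, with the hoisted pieces of b and c used as memory operands of
vdivps, which is essentially the better code shown at the end of this
report.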
We finally generate:

foo:
        pushq   %rbp
        movq    %rdi, %rax
        movl    $2048, %edx
        movq    %rsp, %rbp
        subq    $408, %rsp
        leaq    -120(%rsp), %r8
.L2:
        vmovaps 16(%rbp), %ymm15
        vmovaps 48(%rbp), %ymm14
        movq    %r8, %rsi
        vdivps  1040(%rbp), %ymm15, %ymm15
        vmovaps 80(%rbp), %ymm13
        vmovaps 112(%rbp), %ymm12
        vdivps  528(%rbp), %ymm15, %ymm15
        vdivps  1072(%rbp), %ymm14, %ymm14
        vmovaps 144(%rbp), %ymm11
        vmovaps 176(%rbp), %ymm10
        vdivps  560(%rbp), %ymm14, %ymm14
        vdivps  1104(%rbp), %ymm13, %ymm13
        vmovaps 208(%rbp), %ymm9
        vmovaps 240(%rbp), %ymm8
        vdivps  592(%rbp), %ymm13, %ymm13
        vdivps  1136(%rbp), %ymm12, %ymm12
        vmovaps 272(%rbp), %ymm7
        vmovaps 304(%rbp), %ymm6
        vdivps  624(%rbp), %ymm12, %ymm12
        vdivps  1168(%rbp), %ymm11, %ymm11
        vmovaps 336(%rbp), %ymm5
        vdivps  656(%rbp), %ymm11, %ymm11
        vdivps  1200(%rbp), %ymm10, %ymm10
        vdivps  1232(%rbp), %ymm9, %ymm9
        vdivps  688(%rbp), %ymm10, %ymm10
        vdivps  720(%rbp), %ymm9, %ymm9
        vdivps  1264(%rbp), %ymm8, %ymm8
        vdivps  1296(%rbp), %ymm7, %ymm7
        vdivps  752(%rbp), %ymm8, %ymm8
        vdivps  784(%rbp), %ymm7, %ymm7
        vdivps  1328(%rbp), %ymm6, %ymm6
        movl    $64, %ecx
        vdivps  816(%rbp), %ymm6, %ymm6
        leaq    16(%rbp), %rdi
        vdivps  1360(%rbp), %ymm5, %ymm5
        vdivps  848(%rbp), %ymm5, %ymm5
        vmovaps 368(%rbp), %ymm4
        vmovaps 400(%rbp), %ymm3
        vdivps  1392(%rbp), %ymm4, %ymm4
        vdivps  1424(%rbp), %ymm3, %ymm3
        vmovaps 432(%rbp), %ymm2
        vmovaps 464(%rbp), %ymm1
        vdivps  880(%rbp), %ymm4, %ymm4
        vdivps  912(%rbp), %ymm3, %ymm3
        vmovaps 496(%rbp), %ymm0
        vdivps  1456(%rbp), %ymm2, %ymm2
        vdivps  1488(%rbp), %ymm1, %ymm1
        vdivps  944(%rbp), %ymm2, %ymm2
        vdivps  976(%rbp), %ymm1, %ymm1
        vdivps  1520(%rbp), %ymm0, %ymm0
        vmovaps %ymm15, -120(%rsp)
        vdivps  1008(%rbp), %ymm0, %ymm0
        vmovaps %ymm14, -88(%rsp)
        vmovaps %ymm13, -56(%rsp)
        vmovaps %ymm12, -24(%rsp)
        vmovaps %ymm11, 8(%rsp)
        vmovaps %ymm10, 40(%rsp)
        vmovaps %ymm9, 72(%rsp)
        vmovaps %ymm8, 104(%rsp)
        vmovaps %ymm7, 136(%rsp)
        vmovaps %ymm6, 168(%rsp)
        vmovaps %ymm5, 200(%rsp)
        vmovaps %ymm4, 232(%rsp)
        vmovaps %ymm3, 264(%rsp)
        vmovaps %ymm2, 296(%rsp)
        vmovaps %ymm1, 328(%rsp)
        vmovaps %ymm0, 360(%rsp)
        rep movsq
        subl    $1, %edx
        jne     .L2
        leaq    16(%rbp), %rsi
        movl    $64, %ecx
        movq    %rax, %rdi
        rep movsq
        vzeroupper
        leave
        ret

But it can be better, with just:

foo:                                    # @foo
        pushq   %rbp
        movq    %rsp, %rbp
        andq    $-512, %rsp                     # imm = 0xFE00
        subq    $1536, %rsp                     # imm = 0x600
        movq    %rdi, %rax
        vmovaps 496(%rbp), %ymm14
        vmovaps 464(%rbp), %ymm13
        vmovaps 432(%rbp), %ymm12
        vmovaps 400(%rbp), %ymm11
        vmovaps 368(%rbp), %ymm10
        vmovaps 336(%rbp), %ymm9
        vmovaps 304(%rbp), %ymm8
        vmovaps 272(%rbp), %ymm7
        vmovaps 240(%rbp), %ymm6
        vmovaps 208(%rbp), %ymm5
        vmovaps 176(%rbp), %ymm4
        vmovaps 144(%rbp), %ymm3
        vmovaps 16(%rbp), %ymm0
        vmovaps %ymm0, 416(%rsp)                # 32-byte Spill
        vmovaps 48(%rbp), %ymm2
        vmovaps 80(%rbp), %ymm15
        vmovaps 112(%rbp), %ymm0
        vmovaps %ymm0, 448(%rsp)                # 32-byte Spill
        movl    $2048, %ecx                     # imm = 0x800
        vmovaps 1008(%rbp), %ymm0
        vmovaps %ymm0, 1472(%rsp)               # 32-byte Spill
        vmovaps 976(%rbp), %ymm1
        vmovaps %ymm1, 1440(%rsp)               # 32-byte Spill
        vmovaps %ymm2, %ymm1
        vmovaps 944(%rbp), %ymm2
        vmovaps %ymm2, 1408(%rsp)               # 32-byte Spill
        vmovaps %ymm3, %ymm2
        vmovaps 912(%rbp), %ymm3
        vmovaps %ymm3, 1376(%rsp)               # 32-byte Spill
        vmovaps %ymm4, %ymm3
        vmovaps 880(%rbp), %ymm4
        vmovaps %ymm4, 1344(%rsp)               # 32-byte Spill
        vmovaps %ymm5, %ymm4
        vmovaps 848(%rbp), %ymm5
        vmovaps %ymm5, 1312(%rsp)               # 32-byte Spill
        vmovaps %ymm6, %ymm5
        vmovaps 816(%rbp), %ymm6
        vmovaps %ymm6, 1280(%rsp)               # 32-byte Spill
        vmovaps %ymm7, %ymm6
        vmovaps 784(%rbp), %ymm7
        vmovaps %ymm7, 1248(%rsp)               # 32-byte Spill
        vmovaps %ymm8, %ymm7
        vmovaps 752(%rbp), %ymm8
        vmovaps %ymm8, 1216(%rsp)               # 32-byte Spill
        vmovaps %ymm9, %ymm8
        vmovaps 720(%rbp), %ymm9
        vmovaps %ymm9, 1184(%rsp)               # 32-byte Spill
        vmovaps %ymm10, %ymm9
        vmovaps 688(%rbp), %ymm10
        vmovaps %ymm10, 1152(%rsp)              # 32-byte Spill
        vmovaps %ymm11, %ymm10
        vmovaps 656(%rbp), %ymm11
        vmovaps %ymm11, 1120(%rsp)              # 32-byte Spill
        vmovaps %ymm12, %ymm11
        vmovaps 528(%rbp), %ymm12
        vmovaps %ymm12, 1088(%rsp)              # 32-byte Spill
        vmovaps %ymm13, %ymm12
        vmovaps 560(%rbp), %ymm13
        vmovaps %ymm13, 1056(%rsp)              # 32-byte Spill
        vmovaps %ymm14, %ymm13
        vmovaps 592(%rbp), %ymm14
        vmovaps %ymm14, 1024(%rsp)              # 32-byte Spill
        vmovaps %ymm15, %ymm14
        vmovaps 624(%rbp), %ymm15
        vmovaps %ymm15, 992(%rsp)               # 32-byte Spill
        vmovaps 448(%rsp), %ymm15               # 32-byte Reload
        vmovaps 1520(%rbp), %ymm0
        vmovaps %ymm0, 960(%rsp)                # 32-byte Spill
        vmovaps 1488(%rbp), %ymm0
        vmovaps %ymm0, 928(%rsp)                # 32-byte Spill
        vmovaps 1456(%rbp), %ymm0
        vmovaps %ymm0, 896(%rsp)                # 32-byte Spill
        vmovaps 1424(%rbp), %ymm0
        vmovaps %ymm0, 864(%rsp)                # 32-byte Spill
        vmovaps 1392(%rbp), %ymm0
        vmovaps %ymm0, 832(%rsp)                # 32-byte Spill
        vmovaps 1360(%rbp), %ymm0
        vmovaps %ymm0, 800(%rsp)                # 32-byte Spill
        vmovaps 1328(%rbp), %ymm0
        vmovaps %ymm0, 768(%rsp)                # 32-byte Spill
        vmovaps 1296(%rbp), %ymm0
        vmovaps %ymm0, 736(%rsp)                # 32-byte Spill
        vmovaps 1264(%rbp), %ymm0
        vmovaps %ymm0, 704(%rsp)                # 32-byte Spill
        vmovaps 1232(%rbp), %ymm0
        vmovaps %ymm0, 672(%rsp)                # 32-byte Spill
        vmovaps 1200(%rbp), %ymm0
        vmovaps %ymm0, 640(%rsp)                # 32-byte Spill
        vmovaps 1168(%rbp), %ymm0
        vmovaps %ymm0, 608(%rsp)                # 32-byte Spill
        vmovaps 1040(%rbp), %ymm0
        vmovaps %ymm0, 576(%rsp)                # 32-byte Spill
        vmovaps 1072(%rbp), %ymm0
        vmovaps %ymm0, 544(%rsp)                # 32-byte Spill
        vmovaps 1104(%rbp), %ymm0
        vmovaps %ymm0, 512(%rsp)                # 32-byte Spill
        vmovaps 1136(%rbp), %ymm0
        vmovaps %ymm0, 480(%rsp)                # 32-byte Spill
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vdivps  960(%rsp), %ymm13, %ymm13       # 32-byte Folded Reload
        vdivps  928(%rsp), %ymm12, %ymm12       # 32-byte Folded Reload
        vdivps  896(%rsp), %ymm11, %ymm11       # 32-byte Folded Reload
        vdivps  864(%rsp), %ymm10, %ymm10       # 32-byte Folded Reload
        vdivps  832(%rsp), %ymm9, %ymm9         # 32-byte Folded Reload
        vdivps  800(%rsp), %ymm8, %ymm8         # 32-byte Folded Reload
        vdivps  768(%rsp), %ymm7, %ymm7         # 32-byte Folded Reload
        vdivps  736(%rsp), %ymm6, %ymm6         # 32-byte Folded Reload
        vdivps  704(%rsp), %ymm5, %ymm5         # 32-byte Folded Reload
        vdivps  672(%rsp), %ymm4, %ymm4         # 32-byte Folded Reload
        vdivps  640(%rsp), %ymm3, %ymm3         # 32-byte Folded Reload
        vdivps  608(%rsp), %ymm2, %ymm2         # 32-byte Folded Reload
        vdivps  480(%rsp), %ymm15, %ymm15       # 32-byte Folded Reload
        vdivps  512(%rsp), %ymm14, %ymm14       # 32-byte Folded Reload
        vdivps  544(%rsp), %ymm1, %ymm1         # 32-byte Folded Reload
        vmovaps 416(%rsp), %ymm0                # 32-byte Reload
        vdivps  576(%rsp), %ymm0, %ymm0         # 32-byte Folded Reload
        vdivps  1088(%rsp), %ymm0, %ymm0        # 32-byte Folded Reload
        vmovaps %ymm0, 416(%rsp)                # 32-byte Spill
        vdivps  1056(%rsp), %ymm1, %ymm1        # 32-byte Folded Reload
        vdivps  1024(%rsp), %ymm14, %ymm14      # 32-byte Folded Reload
        vdivps  992(%rsp), %ymm15, %ymm15       # 32-byte Folded Reload
        vdivps  1120(%rsp), %ymm2, %ymm2        # 32-byte Folded Reload
        vdivps  1152(%rsp), %ymm3, %ymm3        # 32-byte Folded Reload
        vdivps  1184(%rsp), %ymm4, %ymm4        # 32-byte Folded Reload
        vdivps  1216(%rsp), %ymm5, %ymm5        # 32-byte Folded Reload
        vdivps  1248(%rsp), %ymm6, %ymm6        # 32-byte Folded Reload
        vdivps  1280(%rsp), %ymm7, %ymm7        # 32-byte Folded Reload
        vdivps  1312(%rsp), %ymm8, %ymm8        # 32-byte Folded Reload
        vdivps  1344(%rsp), %ymm9, %ymm9        # 32-byte Folded Reload
        vdivps  1376(%rsp), %ymm10, %ymm10      # 32-byte Folded Reload
        vdivps  1408(%rsp), %ymm11, %ymm11      # 32-byte Folded Reload
        vdivps  1440(%rsp), %ymm12, %ymm12      # 32-byte Folded Reload
        vdivps  1472(%rsp), %ymm13, %ymm13      # 32-byte Folded Reload
        decl    %ecx
        jne     .LBB0_1
        vmovaps 416(%rsp), %ymm0                # 32-byte Reload
        vmovaps %ymm0, (%rax)
        vmovaps %ymm1, 32(%rax)
        vmovaps %ymm14, 64(%rax)
        vmovaps %ymm15, 96(%rax)
        vmovaps %ymm2, 128(%rax)
        vmovaps %ymm3, 160(%rax)
        vmovaps %ymm4, 192(%rax)
        vmovaps %ymm5, 224(%rax)
        vmovaps %ymm6, 256(%rax)
        vmovaps %ymm7, 288(%rax)
        vmovaps %ymm8, 320(%rax)
        vmovaps %ymm9, 352(%rax)
        vmovaps %ymm10, 384(%rax)
        vmovaps %ymm11, 416(%rax)
        vmovaps %ymm12, 448(%rax)
        vmovaps %ymm13, 480(%rax)
        movq    %rbp, %rsp
        popq    %rbp
        vzeroupper
        retq