https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #15 from Richard Biener <rguenth at gcc dot gnu.org> ---
I can confirm we get

.L3:
        vmovupd (%rsi), %ymm1
        vmovupd 32(%rsi), %ymm0
        addl    $1, %eax
        addq    $64, %rdi
        addq    $64, %rsi
        vblendpd        $14, %ymm1, %ymm0, %ymm3
        vblendpd        $14, %ymm0, %ymm1, %ymm2
        vfnmadd213pd    -64(%rdi), %ymm5, %ymm3
        vfmadd213pd     -32(%rdi), %ymm7, %ymm1
        vfnmadd132pd    %ymm4, %ymm3, %ymm2
        vfnmadd132pd    %ymm6, %ymm1, %ymm0
        vmovupd %ymm2, -64(%rdi)
        vmovupd %ymm0, -32(%rdi)
        cmpl    %edx, %eax
        jb      .L3

instead of

.L3:
        vmovupd (%rdx), %ymm1
        vmovupd (%rdx), %ymm0
        addl    $1, %ecx
        addq    $64, %rax
        vfmadd213pd     -32(%rax), %ymm3, %ymm1
        vfnmadd213pd    -64(%rax), %ymm2, %ymm0
        addq    $64, %rdx
        vfnmadd231pd    -32(%rdx), %ymm3, %ymm0
        vfnmadd231pd    -32(%rdx), %ymm2, %ymm1
        vmovupd %ymm0, -64(%rax)
        vmovupd %ymm1, -32(%rax)
        cmpl    %esi, %ecx
        jb      .L3

the good case sees

  <bb 4> [local count: 214748368]:
  # ivtmp.27_211 = PHI <ivtmp.27_210(4), 0(3)>
  # ivtmp.32_209 = PHI <ivtmp.32_208(4), ivtmp.32_212(3)>
  # ivtmp.34_28 = PHI <ivtmp.34_51(4), ivtmp.34_52(3)>
  _53 = (void *) ivtmp.34_28;
  vect_x_re_54.13_193 = MEM <const vector(4) double> [(const double *)_53];
  vect_x_im_60.21_176 = MEM <const vector(4) double> [(const double *)_53 + 32B];
  _54 = (void *) ivtmp.32_209;
  vect_y_re_62.9_200 = MEM <vector(4) double> [(double *)_54];
  vect_y_re_62.10_198 = MEM <vector(4) double> [(double *)_54 + 32B];
  vect__154.17_185 = .FMA (vect_x_re_54.13_193, _197, vect_y_re_62.10_198);
  vect__66.16_188 = .FNMA (vect_x_re_54.13_193, _196, vect_y_re_62.9_200);
  vect_y_re_68.23_173 = .FNMA (vect_x_im_60.21_176, _197, vect__66.16_188);
  vect_y_re_68.23_172 = .FNMA (vect_x_im_60.21_176, _196, vect__154.17_185);
  MEM <vector(4) double> [(double *)_54] = vect_y_re_68.23_173;
  MEM <vector(4) double> [(double *)_54 + 32B] = vect_y_re_68.23_172;
  ivtmp.27_210 = ivtmp.27_211 + 1;
  ivtmp.32_208 = ivtmp.32_209 + 64;
  ivtmp.34_51 = ivtmp.34_28 + 64;
  if (bnd.6_207 > ivtmp.27_210)
    goto <bb 4>; [90.00%]

while the bad has

  <bb 4> [local count: 214748368]:
  # ivtmp.31_65 = PHI <ivtmp.31_64(4), 0(3)>
  # ivtmp.36_63 = PHI <ivtmp.36_62(4), ivtmp.36_204(3)>
  # ivtmp.38_203 = PHI <ivtmp.38_59(4), ivtmp.38_60(3)>
  _61 = (void *) ivtmp.38_203;
  vect_x_im_60.13_211 = MEM <const vector(4) double> [(const double *)_61];
  vect_x_im_60.14_209 = MEM <const vector(4) double> [(const double *)_61 + 32B];
  vect_x_re_54.15_208 = VEC_PERM_EXPR <vect_x_im_60.14_209, vect_x_im_60.13_211, { 0, 5, 6, 7 }>;
  vect_x_re_54.23_192 = VEC_PERM_EXPR <vect_x_im_60.13_211, vect_x_im_60.14_209, { 0, 5, 6, 7 }>;
  _58 = (void *) ivtmp.36_63;
  vect_y_re_62.9_218 = MEM <vector(4) double> [(double *)_58];
  vect_y_re_62.10_216 = MEM <vector(4) double> [(double *)_58 + 32B];
  vect__41.18_202 = .FMA (vect_x_im_60.13_211, _215, vect_y_re_62.10_216);
  vect_y_re_68.17_205 = .FNMA (vect_x_re_54.15_208, _214, vect_y_re_62.9_218);
  vect_y_re_68.25_189 = .FNMA (vect_x_re_54.23_192, _198, vect_y_re_68.17_205);
  vect_y_re_68.25_188 = .FNMA (_199, vect_x_im_60.14_209, vect__41.18_202);
  MEM <vector(4) double> [(double *)_58] = vect_y_re_68.25_189;
  MEM <vector(4) double> [(double *)_58 + 32B] = vect_y_re_68.25_188;
  ivtmp.31_64 = ivtmp.31_65 + 1;
  ivtmp.36_62 = ivtmp.36_63 + 64;
  ivtmp.38_59 = ivtmp.38_203 + 64;
  if (ivtmp.31_64 < bnd.6_225)
    goto <bb 4>; [90.00%]

the blends do not look like no-ops so I wonder if this is really computing the same thing ... (it swaps lane 0 from the two loads from x but not the stores)