https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113590
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
Moving the vector induction update to the loop latch, with the following patch:
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index edd7d4d8763..8b282019840 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10808,7 +10808,9 @@ vectorizable_induction (loop_vec_info loop_vinfo,
vec_def = gimple_build (&stmts,
PLUS_EXPR, step_vectype, vec_def, up);
vec_def = gimple_convert (&stmts, vectype, vec_def);
- gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+ gimple_stmt_iterator si2
+ = gsi_after_labels (loop_latch_edge (iv_loop)->src);
+ gsi_insert_seq_before (&si2, stmts, GSI_SAME_STMT);
add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
UNKNOWN_LOCATION);
generates the following loop:
jmp .L6
.p2align 5
.p2align 4,,10
.p2align 3
.L4:
paddd %xmm3, %xmm0
.L6:
movdqa %xmm0, %xmm1
addq $16, %rax
paddd %xmm2, %xmm1
movups %xmm1, -16(%rax)
cmpq %rdx, %rax
jne .L4
instead of the loop generated without the patch:
.p2align 5
.p2align 4
.p2align 3
.L4:
movdqa %xmm0, %xmm1
addq $16, %rax
paddd %xmm2, %xmm0
paddd %xmm3, %xmm1
movups %xmm1, -16(%rax)
cmpq %rdx, %rax
jne .L4