https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69710
--- Comment #3 from Andrew Pinski ---
For the double-precision one, in the .optimized dump on trunk for aarch64
(--with-cpu=thunderx), I get:
<bb 11>:
# ivtmp.22_60 = PHI <0(10), ivtmp.22_59(11)>
# ivtmp.25_75 = PHI <0(10), ivtmp.25_79(11)>
vect__12.14_86 = MEM[base: vectp_dy.13_82, index: ivtmp.25_75, offset: 0B];
vect__15.17_91 = MEM[base: vectp_dx.16_87, index: ivtmp.25_75, offset: 0B];
vect__17.19_94 = vect__15.17_91 * vect_cst__92 + vect__12.14_86;
MEM[base: vectp_dy.13_82, index: ivtmp.25_75, offset: 0B] = vect__17.19_94;
ivtmp.22_59 = ivtmp.22_60 + 1;
ivtmp.25_79 = ivtmp.25_75 + 16;
if (bnd.9_55 > ivtmp.22_59)
goto <bb 11>;
else
So yes, there are two IVs: one for the memory accesses and one for the comparison.
The assembly code for the above loop (without -funroll-all-loops):
.L8:
ldr q1, [x7, x0]
add w5, w5, 1
ldr q2, [x2, x0]
fmla v1.2d, v2.2d, v3.2d
str q1, [x7, x0]
add x0, x0, 16
cmp w6, w5
bhi .L8
For SP I get:
<bb 13>:
# ivtmp.22_80 = PHI <0(12), ivtmp.22_79(13)>
# ivtmp.25_113 = PHI <0(12), ivtmp.25_112(13)>
vect__12.14_87 = MEM[base: vectp_dy.13_83, index: ivtmp.25_113, offset: 0B];
vect__15.17_92 = MEM[base: vectp_dx.16_88, index: ivtmp.25_113, offset: 0B];
vect__17.19_95 = vect__15.17_92 * vect_cst__93 + vect__12.14_87;
MEM[base: vectp_dy.13_83, index: ivtmp.25_113, offset: 0B] = vect__17.19_95;
ivtmp.22_79 = ivtmp.22_80 + 1;
ivtmp.25_112 = ivtmp.25_113 + 16;
if (bnd.9_55 > ivtmp.22_79)
goto <bb 13>;
else
goto ;
.L8:
ldr q1, [x8, x4]
add w7, w7, 1
ldr q2, [x2, x4]
fmla v1.4s, v2.4s, v3.4s
str q1, [x8, x4]
add x4, x4, 16
cmp w5, w7
bhi .L8
So it works correctly for me on aarch64.
With -funroll-all-loops I get (and similarly for DP):
.L8:
ldr q7, [x15, x4]
add x7, x4, 16
ldr q16, [x16, x4]
add x9, x4, 32
ldr q17, [x16, x7]
add x8, x4, 48
ldr q19, [x16, x9]
add x11, x4, 64
ldr q22, [x16, x8]
add x14, x4, 80
fmla v7.4s, v16.4s, v21.4s
ldr q24, [x16, x11]
ldr q26, [x16, x14]
add x12, x4, 96
ldr q28, [x16, x12]
add x17, x4, 112
ldr q30, [x16, x17]
add w2, w2, 8
str q7, [x15, x4]
add x4, x4, 128
ldr q18, [x15, x7]
fmla v18.4s, v17.4s, v21.4s
str q18, [x15, x7]
ldr q20, [x15, x9]
fmla v20.4s, v19.4s, v21.4s
str q20, [x15, x9]
ldr q23, [x15, x8]
fmla v23.4s, v22.4s, v21.4s
str q23, [x15, x8]
ldr q25, [x15, x11]
fmla v25.4s, v24.4s, v21.4s
str q25, [x15, x11]
ldr q27, [x15, x14]
fmla v27.4s, v26.4s, v21.4s
str q27, [x15, x14]
ldr q29, [x15, x12]
fmla v29.4s, v28.4s, v21.4s
str q29, [x15, x12]
ldr q31, [x15, x17]
fmla v31.4s, v30.4s, v21.4s
str q31, [x15, x17]
cmp w13, w2
bhi .L8