On Mon, 23 Jun 2025, Tamar Christina wrote: > Consider the loop > > void f1 (int *restrict a, int n) > { > #pragma GCC unroll 4 > for (int i = 0; i < n; i++) > a[i] *= 2; > } > > Which today is vectorized and then unrolled 3x by the RTL unroller due to the > use of the pragma. This is unfortunate because the pragma was intended for the > scalar loop but we end up with an unrolled vector loop and a longer path to the > entry which has a low enough VF requirement to enter. > > This patch instead seeds the suggested_unroll_factor with the value the user > requested and uses it to maintain the total VF that the user wanted the > scalar loop to maintain. > > In effect it applies the unrolling inside the vector loop itself. This has > benefits for things like reductions, as it allows us to split the accumulator > and so the unrolled loop is more efficient. For early-break it allows the > cbranch call to be shared between the unrolled elements, giving you more > effective unrolling because it doesn't need the repeated cbranch which can be > expensive. > > The target can then choose to create multiple epilogues to deal with the > "rest". > > The example above now generates: > > .L4: > ldr q31, [x2] > add v31.4s, v31.4s, v31.4s > str q31, [x2], 16 > cmp x2, x3 > bne .L4 > > as V4SI maintains the requested VF, but e.g. pragma unroll 8 generates: > > .L4: > ldp q30, q31, [x2] > add v30.4s, v30.4s, v30.4s > add v31.4s, v31.4s, v31.4s > stp q30, q31, [x2], 32 > cmp x3, x2 > bne .L4 > > Bootstrapped and regtested on aarch64-none-linux-gnu, > arm-none-linux-gnueabihf, x86_64-pc-linux-gnu > -m32, -m64 with no issues. > > Ok for master?
OK. Thanks, Richard. > Thanks, > Tamar > > gcc/ChangeLog: > > * doc/extend.texi: Document pragma unroll interaction with vectorizer. > * tree-vectorizer.h (LOOP_VINFO_USER_UNROLL): New. > (class _loop_vec_info): Add user_unroll. > * tree-vect-loop.cc (vect_analyze_loop_1): Set > suggested_unroll_factor and retry. > (_loop_vec_info::_loop_vec_info): Initialize user_unroll. > (vect_transform_loop): Clear the loop->unroll value if the pragma was > used. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/unroll-vect.c: New test. > > --- > diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi > index > 69c6512074642ece47f1f9a3d7bdde20ec800d40..7da99f77ec82b23f7a79558f3a0fa98b208f8283 > 100644 > --- a/gcc/doc/extend.texi > +++ b/gcc/doc/extend.texi > @@ -10382,6 +10382,11 @@ loop or a @code{#pragma GCC ivdep}, and applies only > to the loop that follows. > @var{n} is an integer constant expression specifying the unrolling factor. > The values of @math{0} and @math{1} block any unrolling of the loop. > > +If the loop was vectorized the unroll factor specified will be used to seed > the > +vectorizer unroll factor. Whether the loop is unrolled or not will be > +determined by target costing. The resulting vectorized loop may still be > +unrolled more in later passes depending on the target costing. > + > @end table > > @node Thread-Local > diff --git a/gcc/testsuite/gcc.target/aarch64/unroll-vect.c > b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..3cb774ba95787ebee488fbe7306299ef28e6bb35 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c > @@ -0,0 +1,20 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O3 -march=armv8-a --param > aarch64-autovec-preference=asimd-only -std=gnu99" } */ > +/* { dg-final { check-function-bodies "**" "" "" } } */ > + > +/* > +** f1: > +** ... 
> +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > +** ... > +*/ > +void f1 (int *restrict a, int n) > +{ > +#pragma GCC unroll 16 > + for (int i = 0; i < n; i++) > + a[i] *= 2; > +} > + > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index > 9ac4d7e5f7a099a7039cd4186666cf64328b8ee6..44f304b6e3766d43d388599b6a80ab9e8e3123ef > 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -1073,6 +1073,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, > vec_info_shared *shared) > peeling_for_gaps (false), > peeling_for_niter (false), > early_breaks (false), > + user_unroll (false), > no_data_dependencies (false), > has_mask_store (false), > scalar_loop_scaling (profile_probability::uninitialized ()), > @@ -3428,27 +3429,50 @@ vect_analyze_loop_1 (class loop *loop, > vec_info_shared *shared, > res ? "succeeded" : "failed", > GET_MODE_NAME (loop_vinfo->vector_mode)); > > - if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) && suggested_unroll_factor > > 1) > + auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll; > + if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) > + /* Check to see if the user wants to unroll or if the target wants to. 
> */ > + && (suggested_unroll_factor > 1 || user_unroll > 1)) > { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_NOTE, vect_location, > + if (suggested_unroll_factor == 1) > + { > + int assumed_vf = vect_vf_for_cost (loop_vinfo); > + suggested_unroll_factor = user_unroll / assumed_vf; > + if (suggested_unroll_factor > 1) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "setting unroll factor to %d based on user requested " > + "unroll factor %d and suggested vectorization " > + "factor: %d\n", > + suggested_unroll_factor, user_unroll, assumed_vf); > + } > + } > + > + if (suggested_unroll_factor > 1) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > "***** Re-trying analysis for unrolling" > " with unroll factor %d and slp %s.\n", > suggested_unroll_factor, > slp_done_for_suggested_uf ? "on" : "off"); > - loop_vec_info unroll_vinfo > - = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL); > - unroll_vinfo->vector_mode = vector_mode; > - unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor; > - opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL, > - slp_done_for_suggested_uf); > - if (new_res) > - { > - delete loop_vinfo; > - loop_vinfo = unroll_vinfo; > - } > - else > - delete unroll_vinfo; > + loop_vec_info unroll_vinfo > + = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL); > + unroll_vinfo->vector_mode = vector_mode; > + unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor; > + opt_result new_res > + = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL, > + slp_done_for_suggested_uf); > + if (new_res) > + { > + delete loop_vinfo; > + loop_vinfo = unroll_vinfo; > + LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1; > + } > + else > + delete unroll_vinfo; > + } > } > > /* Remember the autodetected vector mode. 
*/ > @@ -12041,6 +12065,13 @@ vect_transform_loop (loop_vec_info loop_vinfo, > gimple *loop_vectorized_call) > dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to" > " variable-length vectorization factor\n"); > } > + > + /* When we have unrolled the loop due to a user requested value we should > + leave it up to the RTL unroll heuristics to determine if it's still > worth > + while to unroll more. */ > + if (LOOP_VINFO_USER_UNROLL (loop_vinfo)) > + loop->unroll = 0; > + > /* Free SLP instances here because otherwise stmt reference counting > won't work. */ > slp_instance instance; > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index > 7aa2b02b63cb88a61b88295c43d4325ee487c619..5351e1aee44fb3b9a75c13940b5d239fb48871cb > 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -970,6 +970,10 @@ public: > /* Main loop IV cond. */ > gcond* loop_iv_cond; > > + /* True if we have an unroll factor requested by the user through pragma > GCC > + unroll. */ > + bool user_unroll; > + > /* True if there are no loop carried data dependencies in the loop. > If loop->safelen <= 1, then this is always true, either the loop > didn't have any loop carried data dependencies, or the loop is being > @@ -1094,6 +1098,7 @@ public: > #define LOOP_VINFO_CHECK_UNEQUAL_ADDRS(L) (L)->check_unequal_addrs > #define LOOP_VINFO_CHECK_NONZERO(L) (L)->check_nonzero > #define LOOP_VINFO_LOWER_BOUNDS(L) (L)->lower_bounds > +#define LOOP_VINFO_USER_UNROLL(L) (L)->user_unroll > #define LOOP_VINFO_GROUPED_STORES(L) (L)->grouped_stores > #define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances > #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor > > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)