Hi,

This RFC is motivated by the IV sharing RFC in
https://gcc.gnu.org/pipermail/gcc-patches/2021-May/569502.html and the
need to have the IVOPTs pass be able to clean up IVs shared between
multiple loops. When creating a similar problem with C code I noticed
IVOPTs treated IVs with uses outside the loop differently; this didn't
even require multiple loops. Take for instance the following example
using SVE intrinsics:
#include <arm_sve.h>
#include <limits.h>
extern void use (char *);
void bar (char * __restrict__ a, char * __restrict__ b,
          char * __restrict__ c, unsigned n)
{
  svbool_t all_true = svptrue_b8 ();
  unsigned i = 0;
  if (n < (UINT_MAX - svcntb() - 1))
    {
      for (; i < n; i += svcntb())
        {
          svuint8_t va = svld1 (all_true, (uint8_t*)a);
          svuint8_t vb = svld1 (all_true, (uint8_t*)b);
          svst1 (all_true, (uint8_t *)c, svadd_z (all_true, va, vb));
          a += svcntb();
          b += svcntb();
          c += svcntb();
        }
    }
  use (a);
}
IVOPTs tends to generate a shared IV for SVE memory accesses, as we
don't have post-increment addressing for SVE loads/stores. If we had
not included 'use (a);' in this example, IVOPTs would have replaced the
IVs for a, b and c with a single one (also used for the loop control). See:
<bb 4> [local count: 955630225]:
# ivtmp.7_8 = PHI <ivtmp.7_25(7), 0(6)>
va_14 = MEM <svuint8_t> [(unsigned char *)a_10(D) + ivtmp.7_8 * 1];
vb_15 = MEM <svuint8_t> [(unsigned char *)b_11(D) + ivtmp.7_8 * 1];
_2 = svadd_u8_z ({ -1, ... }, va_14, vb_15);
MEM <__SVUint8_t> [(unsigned char *)c_12(D) + ivtmp.7_8 * 1] = _2;
ivtmp.7_25 = ivtmp.7_8 + POLY_INT_CST [16, 16];
i_23 = (unsigned int) ivtmp.7_25;
if (n_9(D) > i_23)
goto <bb 7>; [89.00%]
else
goto <bb 5>; [11.00%]
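In plain C terms, that single-IV form corresponds roughly to the
hand-written sketch below (illustrative only, not compiler output; the
UINT_MAX guard from the original example is omitted), where one byte
offset drives all three accesses as well as the exit test:

#include <arm_sve.h>
#include <stdint.h>

void bar_shared_iv (char * __restrict__ a, char * __restrict__ b,
                    char * __restrict__ c, unsigned n)
{
  svbool_t all_true = svptrue_b8 ();
  /* One offset replaces the separate increments of a, b, c and i.  */
  for (uint64_t off = 0; (unsigned) off < n; off += svcntb ())
    {
      svuint8_t va = svld1 (all_true, (uint8_t *) (a + off));
      svuint8_t vb = svld1 (all_true, (uint8_t *) (b + off));
      svst1 (all_true, (uint8_t *) (c + off), svadd_z (all_true, va, vb));
    }
}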
However, due to the 'use (a);' it will create two IVs: one for the loop
control, b and c, and one for a. See:
<bb 4> [local count: 955630225]:
# a_28 = PHI <a_18(7), a_11(D)(6)>
# ivtmp.7_25 = PHI <ivtmp.7_24(7), 0(6)>
va_15 = MEM <svuint8_t> [(unsigned char *)a_28];
vb_16 = MEM <svuint8_t> [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
_2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
a_18 = a_28 + POLY_INT_CST [16, 16];
ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
i_8 = (unsigned int) ivtmp.7_24;
if (n_10(D) > i_8)
goto <bb 7>; [89.00%]
else
goto <bb 10>; [11.00%]
With the first patch attached to this RFC, 'no_cost.patch', I tell
IVOPTs not to cost uses outside of the loop. This makes IVOPTs generate
a single IV, but unfortunately it decides to create the variable for the
outside use inside the loop, and it also seems to use the pre-increment
value of the shared IV and add the [16, 16] to it. See:
<bb 4> [local count: 955630225]:
# ivtmp.7_25 = PHI <ivtmp.7_24(7), 0(6)>
va_15 = MEM <svuint8_t> [(unsigned char *)a_11(D) + ivtmp.7_25 * 1];
vb_16 = MEM <svuint8_t> [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
_2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
_8 = (unsigned long) a_11(D);
_7 = _8 + ivtmp.7_25;
_6 = _7 + POLY_INT_CST [16, 16];
a_18 = (char * restrict) _6;
ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
i_5 = (unsigned int) ivtmp.7_24;
if (n_10(D) > i_5)
goto <bb 7>; [89.00%]
else
goto <bb 10>; [11.00%]
With the patch 'var_after.patch' I make get_computation_aff_1 use
'cand->var_after' for outside uses, thus using the post-increment
variable of the candidate IV. This means I have to insert the
computation in a different place and make sure to delete the old
use->stmt. I'm sure there is a better way to do this within IVOPTs'
current framework, but I haven't found one yet. See the result:
<bb 4> [local count: 955630225]:
# ivtmp.7_25 = PHI <ivtmp.7_24(7), 0(6)>
va_15 = MEM <svuint8_t> [(unsigned char *)a_11(D) + ivtmp.7_25 * 1];
vb_16 = MEM <svuint8_t> [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
_2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
_8 = (unsigned long) a_11(D);
_7 = _8 + ivtmp.7_24;
a_18 = (char * restrict) _7;
i_6 = (unsigned int) ivtmp.7_24;
if (n_10(D) > i_6)
goto <bb 7>; [89.00%]
else
goto <bb 10>; [11.00%]
This is still not optimal, as we are still doing the update inside the
loop even though there is absolutely no need for that. I found that
running sink would solve it, and it seems someone has added a second
sink pass, so that saves me a third patch :) See the result after sink2:
<bb 4> [local count: 955630225]:
# ivtmp.7_25 = PHI <ivtmp.7_24(7), 0(6)>
va_15 = MEM <svuint8_t> [(unsigned char *)a_11(D) + ivtmp.7_25 * 1];
vb_16 = MEM <svuint8_t> [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
_2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
i_6 = (unsigned int) ivtmp.7_24;
if (i_6 < n_10(D))
goto <bb 7>; [89.00%]
else
goto <bb 10>; [11.00%]
<bb 10> [local count: 105119324]:
_8 = (unsigned long) a_11(D);
_7 = _8 + ivtmp.7_24;
a_18 = (char * restrict) _7;
goto <bb 5>; [100.00%]
I haven't tested this at all, but I wanted to get the opinion of someone
more knowledgeable in IVOPTs before continuing down this avenue. I have
two main questions:

1) How should we cost outside uses? Right now I use no_cost, but that's
not entirely accurate. Should we use a constant multiplication factor
for inside-loop uses to make them outweigh outside uses? Should we use
the iteration count if available? Do we want a backend hook to let
targets provide their own costing for these? (A standalone sketch of
such a weighting follows these questions.)

2) Is there a cleaner way to generate the optimal 'post-increment' use
for the outside-use variable? I first thought the position in the
candidate might be something I could use, or even the var_at_stmt
functionality, but the outside IV has the actual increment of the
variable as its use, rather than the outside uses. I find this to be
this RFC's main weakness.
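As a strawman for question 1, the weighting could look like the
standalone sketch below (purely illustrative; the helper is hypothetical
and does not use IVOPTs' comp_cost machinery). An inside-loop use
executes once per iteration while an outside use executes at most once
per loop entry, so the former is scaled by the estimated iteration count
instead of simply giving outside uses no_cost:

#include <stdint.h>

/* Hypothetical weighting helper: scale the cost of a use by how often
   it is expected to execute relative to the loop.  */
static uint64_t
weighted_use_cost (uint64_t per_use_cost, uint64_t est_niter,
                   int inside_loop_p)
{
  /* Inside-loop uses run est_niter times; uses after the loop run at
     most once per loop entry.  */
  return inside_loop_p ? per_use_cost * est_niter : per_use_cost;
}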
Kind regards,
Andre
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 12a8a49a3071c09f222fbb6aef68c2a24a107252..1e80da3826ec427fefc9d9e8d882c21d2b3b05c8 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -413,6 +413,9 @@ struct iv_use
tree addr_base; /* Base address with const offset stripped. */
poly_uint64_pod addr_offset;
/* Const offset stripped from base address. */
+  bool outside;    /* True if the use of this IV is outside of the loop,
+                      use this to make such uses 'less costly' and avoid
+                      updating it inside the loop. */
};
/* Group of uses. */
@@ -1538,6 +1541,7 @@ record_use (struct iv_group *group, tree *use_p, struct iv *iv,
use->op_p = use_p;
use->addr_base = addr_base;
use->addr_offset = addr_offset;
+ use->outside = false;
group->vuses.safe_push (use);
return use;
@@ -1666,6 +1670,23 @@ find_interesting_uses_op (struct ivopts_data *data, tree op)
use = record_group_use (data, NULL, iv, stmt, USE_NONLINEAR_EXPR, NULL_TREE);
iv->nonlin_use = use;
+
+ /* Find out whether this is only used outside of the loop. */
+ use->outside = true;
+ tree def;
+ if (gimple_code (stmt) == GIMPLE_PHI)
+ def = PHI_RESULT (stmt);
+ else
+ def = gimple_get_lhs (stmt);
+
+ imm_use_iterator imm_iter;
+ FOR_EACH_IMM_USE_STMT (stmt, imm_iter, def)
+ {
+ /* Do not count its own PHI. */
+ if (gimple_code (stmt) != GIMPLE_PHI
+ && flow_bb_inside_loop_p (data->current_loop, gimple_bb (stmt)))
+ use->outside = false;
+ }
return use;
}
@@ -4958,7 +4979,8 @@ determine_group_iv_cost_generic (struct ivopts_data *data,
original biv, the cost is 0. This also prevents us from counting the
cost of increment twice -- once at this use and once in the cost of
the candidate. */
- if (cand->pos == IP_ORIGINAL && cand->incremented_at == use->stmt)
+ if (use->outside
+ || (cand->pos == IP_ORIGINAL && cand->incremented_at == use->stmt))
cost = no_cost;
else
cost = get_computation_cost (data, use, cand, false,
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 1e80da3826ec427fefc9d9e8d882c21d2b3b05c8..ba6ced36e27b7b3a30d51135fd6aba72d66dbe0d 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -3994,7 +3994,13 @@ get_computation_aff_1 (class loop *loop, gimple *at, struct iv_use *use,
if (TYPE_PRECISION (utype) > TYPE_PRECISION (ctype))
return false;
- var = var_at_stmt (loop, cand, at);
+ if (use->outside)
+ {
+ var = cand->var_after;
+ ubase = fold_build2 (MINUS_EXPR, utype, ubase, ustep);
+ }
+ else
+ var = var_at_stmt (loop, cand, at);
uutype = unsigned_type_for (utype);
/* If the conversion is not noop, perform it. */
@@ -7328,19 +7334,32 @@ rewrite_use_nonlinear_expr (struct ivopts_data *data,
}
}
- gsi_insert_seq_before (&bsi, stmt_list, GSI_SAME_STMT);
- if (gimple_code (use->stmt) == GIMPLE_PHI)
+ if (use->outside)
{
+ gcc_assert (gimple_code (use->stmt) != GIMPLE_PHI);
ass = gimple_build_assign (tgt, comp);
- gsi_insert_before (&bsi, ass, GSI_SAME_STMT);
-
+ gimple_seq_add_stmt (&stmt_list, ass);
+ bsi = gsi_for_stmt (SSA_NAME_DEF_STMT (cand->var_after));
+ gsi_insert_seq_after (&bsi, stmt_list, GSI_SAME_STMT);
bsi = gsi_for_stmt (use->stmt);
- remove_phi_node (&bsi, false);
+ gsi_remove (&bsi, true);
}
else
{
- gimple_assign_set_rhs_from_tree (&bsi, comp);
- use->stmt = gsi_stmt (bsi);
+ gsi_insert_seq_before (&bsi, stmt_list, GSI_SAME_STMT);
+ if (gimple_code (use->stmt) == GIMPLE_PHI)
+ {
+ ass = gimple_build_assign (tgt, comp);
+ gsi_insert_before (&bsi, ass, GSI_SAME_STMT);
+
+ bsi = gsi_for_stmt (use->stmt);
+ remove_phi_node (&bsi, false);
+ }
+ else
+ {
+ gimple_assign_set_rhs_from_tree (&bsi, comp);
+ use->stmt = gsi_stmt (bsi);
+ }
}
}