https://gcc.gnu.org/g:867ca749704d1d7bd042f1a4d8403801df8cc172
commit r17-495-g867ca749704d1d7bd042f1a4d8403801df8cc172 Author: Tamar Christina <[email protected]> Date: Wed May 13 12:36:07 2026 +0100 scev: maintain affine CHRECs in the presence of type conversions The example float *e; void f (float *f, float *g, char *h, int n, int b, int c, int d) { float a = 0; for (int i = 0; i < n; ++i) { int j = b + i, k = c + i * d; float l = g[j], m = h[i] ? g[k] : l; a += f[i] * m; } *e = a; } gets vectorized using gathers for the access to g: .L5: ld1b z4.s, p7/z, [x2, x6] cmpne p6.b, p7/z, z4.b, #0 ld1w z2.s, p7/z, [x0, x6, lsl 2] add z7.s, z30.s, z16.s add z6.s, z16.s, z18.s add x6, x6, x7 ld1w z5.s, p7/z, [x1, z6.s, sxtw 2] ld1w z3.s, p6/z, [x1, z7.s, sxtw 2] incw z16.s sel z3.s, p6, z3.s, z5.s fmla z17.s, p7/m, z2.s, z3.s whilelo p7.s, w6, w3 b.any .L5 however the first g is g[b+i] and second is g[c + i*d]; since b is loop invariant the access to g[b+i] is actually linear and since c is loop invariant, then the base of the second access g[c + i *d] can be simplified by recognizing the base as g + c. Today however SCEV fails to analyze these accesses as affine and as a consequence we end up with gathers: : missed: failed: evolution of base is not affine. 
base_address: offset from base address: constant offset from base address: step: base alignment: 0 base misalignment: 0 offset alignment: 0 step alignment: 0 base_object: *_63 Looking at SCEV, this is because of an outer cast around the CHREC: ) (set_scalar_evolution instantiated_below = 25 (scalar = _65) (scalar_evolution = (long unsigned int) {b_22(D), +, 1}_2)) ) (instantiate_scev (instantiate_below = 25 -> 12) (evolution_loop = 2) (chrec = (long unsigned int) {b_22(D), +, 1}_2) (instantiate_scev (instantiate_below = 25 -> 12) (evolution_loop = 2) (chrec = g_27(D)) (res = g_27(D))) which corresponds to j_66 = b_22(D) + i_67; _65 = (long unsigned int) j_66; _64 = _65 * 4; _63 = g_27(D) + _64; l_62 = *_63; and _64 is deemed not to be affine: (instantiate_scev (instantiate_below = 25 -> 12) (evolution_loop = 2) (chrec = _64) (analyze_scalar_evolution (loop_nb = 2) (scalar = _64) (get_scalar_evolution (scalar = _64) (scalar_evolution = _64)) ) (res = scev_not_known)) This patch fixes it by (very carefully) folding the conversion to an unsigned type into the affine CHREC itself, which in turn lets the multiply be folded into the CHREC, 
which results in (instantiate_scev (instantiate_below = 25 -> 12) (evolution_loop = 2) (chrec = 4) (res = 4)) (set_scalar_evolution instantiated_below = 25 (scalar = _64) (scalar_evolution = {(long unsigned int) b_22(D) * 4, +, 4}_2)) ) (instantiate_scev (instantiate_below = 25 -> 12) (evolution_loop = 2) (chrec = g_27(D)) (res = g_27(D))) (instantiate_scev (instantiate_below = 25 -> 12) (evolution_loop = 2) (chrec = {(long unsigned int) b_22(D) * 4, +, 4}_2) (res = {(long unsigned int) b_22(D) * 4, +, 4}_2)) (set_scalar_evolution instantiated_below = 25 (scalar = _63) (scalar_evolution = {g_27(D) + (long unsigned int) b_22(D) * 4, +, 4}_2)) ) and dataref now correctly analyzes the base base_address: g_27(D) + (sizetype) b_22(D) * 4 offset from base address: 0 constant offset from base address: 0 step: 4 base alignment: 4 base misalignment: 0 offset alignment: 128 step alignment: 4 base_object: *g_27(D) + (sizetype) b_22(D) * 4 Access function 0: {0B, +, 4}_2 producing the final codegen: .L7: ld1b z4.s, p7/z, [x2, x6] cmpne p6.b, p7/z, z4.b, #0 ld1w z29.s, p7/z, [x4, x6, lsl 2] ld1w z2.s, p7/z, [x0, x6, lsl 2] ld1w z3.s, p6/z, [x5] add x6, x6, x7 sel z3.s, p6, z3.s, z29.s add x5, x5, x1 fmla z30.s, p7/m, z2.s, z3.s whilelo p7.s, w6, w3 b.any .L7 faddv s31, p5, z30.s gcc/ChangeLog: * tree-chrec.cc (chrec_convert_1): Fold unsigned CHREC converts. gcc/testsuite/ChangeLog: * gcc.dg/vect/vect-scev-affine_1.c: New test. 
Diff: --- gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c | 17 +++++++++++++++++ gcc/tree-chrec.cc | 25 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c new file mode 100644 index 000000000000..929012184e0a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_float } */ + +float *e; +void f (float *f, float *g, char *h, int n, + int b, int c, int d) +{ + float a = 0; + for (int i = 0; i < n; ++i) { + int j = b + i, k = c + i * d; + float l = g[j], m = h[i] ? g[k] : l; + a += f[i] * m; + } + *e = a; +} + +/* { dg-final { scan-tree-dump-not {failed: evolution of base is not affine} "vect" { target aarch64*-*-* } } } */ diff --git a/gcc/tree-chrec.cc b/gcc/tree-chrec.cc index 20beaeb09ec0..bad0396d0407 100644 --- a/gcc/tree-chrec.cc +++ b/gcc/tree-chrec.cc @@ -1598,6 +1598,31 @@ keep_cast: CHREC_RIGHT (chrec))); res = chrec_convert_1 (type, res, at_stmt, use_overflow_semantics, from); } + /* Similarly, perform the trick that (unsigned T)(base + step) can be + folded to ((unsigned T)base + (unsigned T)step). */ + else if (use_overflow_semantics + && TREE_CODE (chrec) == POLYNOMIAL_CHREC + && INTEGRAL_TYPE_P (ct) + && INTEGRAL_TYPE_P (type) + && TYPE_OVERFLOW_UNDEFINED (type) + /* Must be unsigned so we don't introduce any UB. */ + && TYPE_UNSIGNED (type) + /* The outer type must be at least as wide as the inner type so we + don't truncate when we fold, and the inner CHREC must be + non-wrapping so we don't change the behavior when folding to + a wider type. 
*/ + && TYPE_PRECISION (type) >= TYPE_PRECISION (ct) + && (!TYPE_UNSIGNED (ct) + || TYPE_PRECISION (type) == TYPE_PRECISION (ct) + || nonwrapping_chrec_p (chrec))) + { + res = build_polynomial_chrec (CHREC_VARIABLE (chrec), + fold_convert (type, + CHREC_LEFT (chrec)), + fold_convert (type, + CHREC_RIGHT (chrec))); + res = chrec_convert_1 (type, res, at_stmt, use_overflow_semantics, from); + } else res = fold_convert (type, chrec);
