Yuliang Wang <yuliang.w...@arm.com> writes: > Hi, > > The C snippets below (signed division/modulo by a power-of-2 immediate > value): > > #define P ... > > void foo_div (int *a, int *b, int N) > { > for (int i = 0; i < N; i++) > a[i] = b[i] / (1 << P); > } > void foo_mod (int *a, int *b, int N) > { > for (int i = 0; i < N; i++) > a[i] = b[i] % (1 << P); > } > > Vectorize to the following on AArch64 + SVE: > > foo_div: > mov x0, 0 > mov w2, N > ptrue p1.b, all > whilelo p0.s, wzr, w2 > .p2align 3,,7 > .L2: > ld1w z1.s, p0/z, [x3, x0, lsl 2] > cmplt p2.s, p1/z, z1.s, #0 // > mov z0.s, p2/z, #7 // > add z0.s, z0.s, z1.s // > asr z0.s, z0.s, #3 // > st1w z0.s, p0, [x1, x0, lsl 2] > incw x0 > whilelo p0.s, w0, w2 > b.any .L2 > ret > > foo_mod: > ... > .L2: > ld1w z0.s, p0/z, [x3, x0, lsl 2] > cmplt p2.s, p1/z, z0.s, #0 // > mov z1.s, p2/z, #-1 // > lsr z1.s, z1.s, #29 // > add z0.s, z0.s, z1.s // > and z0.s, z0.s, #{2^P-1} // > sub z0.s, z0.s, z1.s // > st1w z0.s, p0, [x1, x0, lsl 2] > incw x0 > whilelo p0.s, w0, w2 > b.any .L2 > ret > > This patch utilizes the special-purpose ASRD (arithmetic shift-right for > divide by immediate) instruction: > > foo_div: > ... > .L2: > ld1w z0.s, p0/z, [x3, x0, lsl 2] > asrd z0.s, p1/m, z0.s, #{P} // > st1w z0.s, p0, [x1, x0, lsl 2] > incw x0 > whilelo p0.s, w0, w2 > b.any .L2 > ret > > foo_mod: > ... > .L2: > ld1w z0.s, p0/z, [x3, x0, lsl 2] > movprfx z1, z0 // > asrd z1.s, p1/m, z1.s, #{P} // > lsl z1.s, z1.s, #{P} // > sub z0.s, z0.s, z1.s // > st1w z0.s, p0, [x1, x0, lsl 2] > incw x0 > whilelo p0.s, w0, w2 > b.any .L2 > ret > > Added new tests. Built and regression tested on aarch64-none-elf. > > Best Regards, > Yuliang Wang > > > gcc/ChangeLog: > > 2019-09-23 Yuliang Wang <yuliang.w...@arm.com> > > * config/aarch64/aarch64-sve.md (asrd<mode>3): New pattern for ASRD. > * config/aarch64/iterators.md (UNSPEC_ASRD): New unspec. > (ASRDIV): New int iterator. > * internal-fn.def (IFN_ASHR_DIV): New internal function. > * optabs.def (ashr_div_optab): New optab. 
> * tree-vect-patterns.c (vect_recog_divmod_pattern): > Modify pattern to support new operation. > * doc/md.texi (asrd@var{m3}): Documentation for the above. > * doc/sourcebuild.texi (vect_asrdiv_si): Document new target selector.
This looks good to me. My only real question is about naming: maybe IFN_DIV_POW2 would be a better name for the internal function and sdiv_pow2_optab/"div_pow2$a3" for the optab? But I'm useless at naming things, so maybe others would prefer your names. Thanks, Richard > > gcc/testsuite/ChangeLog: > > 2019-09-23 Yuliang Wang <yuliang.w...@arm.com> > > * gcc.dg/vect/vect-asrdiv-1.c: New test. > * gcc.target/aarch64/sve/asrdiv_1.c: As above. > * lib/target-supports.exp (check_effective_target_vect_asrdiv_si): > Return true for AArch64 with SVE. > > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64-sve.md > index > f58353e9c6dc0df97ce4074db6bb22181f426e5b..607440b7ba16d5616695f29a9cf7c4c277a4a502 > 100644 > --- a/gcc/config/aarch64/aarch64-sve.md > +++ b/gcc/config/aarch64/aarch64-sve.md > @@ -71,6 +71,7 @@ > ;; ---- [INT] Binary logical operations > ;; ---- [INT] Binary logical operations (inverted second input) > ;; ---- [INT] Shifts > +;; ---- [INT] Shifts (rounding towards 0) > ;; ---- [FP] General binary arithmetic corresponding to rtx codes > ;; ---- [FP] General binary arithmetic corresponding to unspecs > ;; ---- [FP] Addition > @@ -2563,6 +2564,46 @@ > [(set_attr "movprfx" "yes")] > ) > > +;; ------------------------------------------------------------------------- > +;; ---- [INT] Shifts (rounding towards 0) > +;; ------------------------------------------------------------------------- > +;; Includes: > +;; - ASRD > +;; ------------------------------------------------------------------------- > + > +;; Unpredicated arithmetic right shift for division by power-of-2. 
> +(define_expand "asrd<mode>3" > + [(set (match_operand:SVE_I 0 "register_operand" "") > + (unspec:SVE_I > + [(match_dup 3) > + (unspec:SVE_I > + [(match_operand:SVE_I 1 "register_operand" "") > + (match_operand 2 "aarch64_simd_rshift_imm")] > + UNSPEC_ASRD)] > + UNSPEC_PRED_X))] > + "TARGET_SVE" > + { > + operands[3] = aarch64_ptrue_reg (<VPRED>mode); > + } > +) > + > +;; Predicated ASRD with PTRUE. > +(define_insn "*asrd<mode>3" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (unspec:SVE_I > + [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") > + (unspec:SVE_I > + [(match_operand:SVE_I 2 "register_operand" "0, w") > + (match_operand 3 "aarch64_simd_rshift_imm")] > + UNSPEC_ASRD)] > + UNSPEC_PRED_X))] > + "TARGET_SVE" > + "@ > + asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3 > + movprfx\t%0, %2\;asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3" > + [(set_attr "movprfx" "*,yes")] > +) > + > ;; ------------------------------------------------------------------------- > ;; ---- [FP] General binary arithmetic corresponding to rtx codes > ;; ------------------------------------------------------------------------- > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index > 03b3ce363021a71578803e07b3548d3dd9c9de32..1e321af710bfe80606eedee7e0d191f36c70355b > 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -538,6 +538,7 @@ > UNSPEC_SMULHRS ; Used in aarch64-sve2.md. > UNSPEC_UMULHS ; Used in aarch64-sve2.md. > UNSPEC_UMULHRS ; Used in aarch64-sve2.md. > + UNSPEC_ASRD ; Used in aarch64-sve.md. 
> ]) > > ;; ------------------------------------------------------------------ > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi > index > f35fd2b1b19cef1deb41566d7614d80d449d69fc..2c0396c0d2ba14ef942db6dcdfea8250819bebfd > 100644 > --- a/gcc/doc/md.texi > +++ b/gcc/doc/md.texi > @@ -5414,6 +5414,17 @@ op0 = (narrow) (((((wide) op1 * (wide) op2) >> (N / 2 > - 2)) + 1) >> 1); > where the sign of @samp{narrow} determines whether this is a signed > or unsigned operation, and @var{N} is the size of @samp{wide} in bits. > > +@cindex @code{asrd@var{m3}} instruction pattern > +@item @samp{asrd@var{m3}} > +@cindex @code{asrd@var{m3}} instruction pattern > +@itemx @samp{asrd@var{m3}} > +Arithmetic shift right for division by power-of-2 immediate. Equivalent to: > +@smallexample > +signed op0, op1; > +@dots{} > +op0 = op1 / (1 << imm); > +@end smallexample > + > @cindex @code{vec_shl_insert_@var{m}} instruction pattern > @item @samp{vec_shl_insert_@var{m}} > Shift the elements in vector input operand 1 left one element (i.e.@: > diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi > index > 4ace224a8ff5ed4fafed10a69ef00ffb2d7d8c39..b01ba570bfb44b6316c3f391b7be92f1244e2030 > 100644 > --- a/gcc/doc/sourcebuild.texi > +++ b/gcc/doc/sourcebuild.texi > @@ -1446,6 +1446,10 @@ of bytes. > Target supports both signed and unsigned multiply-high-with-round-and-scale > operations on vectors of half-words. > > +@item vect_asrdiv_si > +Target supports arithmetic shift-right division by constant power-of-2 > +operations on vectors of words. > + > @item vect_condition > Target supports vector conditional operations. 
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index > 49f57978c88a3a8c1a0206d983e1720ed09b0385..f994129747854b5921bd72cc3ec7105fb6a061b7 > 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -140,6 +140,8 @@ DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | > ECF_NOTHROW, while_ult, while) > DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW, > vec_shl_insert, binary) > > +DEF_INTERNAL_OPTAB_FN (ASHR_DIV, ECF_CONST | ECF_NOTHROW, ashr_div, binary) > + > DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary) > DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary) > DEF_INTERNAL_OPTAB_FN (FNMS, ECF_CONST, fnms, ternary) > diff --git a/gcc/optabs.def b/gcc/optabs.def > index > 308696846d4926fdd94133b87f4f59b8d1cc7f20..bdf0e5ccc68f0809aeb4d949d290d88074af0f9b > 100644 > --- a/gcc/optabs.def > +++ b/gcc/optabs.def > @@ -347,6 +347,7 @@ OPTAB_D (smulhs_optab, "smulhs$a3") > OPTAB_D (smulhrs_optab, "smulhrs$a3") > OPTAB_D (umulhs_optab, "umulhs$a3") > OPTAB_D (umulhrs_optab, "umulhrs$a3") > +OPTAB_D (ashr_div_optab, "asrd$a3") > OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a") > OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a") > OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a") > diff --git a/gcc/testsuite/gcc.dg/vect/vect-asrdiv-1.c > b/gcc/testsuite/gcc.dg/vect/vect-asrdiv-1.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..961c8b0d05659fcbd70083982169af9a584a6ad3 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-asrdiv-1.c > @@ -0,0 +1,79 @@ > +/* { dg-require-effective-target vect_int } */ > + > +#include "tree-vect.h" > + > +#define DIV(x,y) ((x)/(y)) > +#define MOD(x,y) ((x)%(y)) > + > +#define TEMPLATE(PO2,OP) \ > +void __attribute__ ((noipa)) \ > +f_##PO2##_##OP (int *restrict a, int *restrict b, __INTPTR_TYPE__ n) \ > +{ \ > + for (__INTPTR_TYPE__ i = 0; i < n; ++i) \ > + a[i] = OP (b[i], (1 << PO2)); \ > +} > +#define TEMPLATES(PO2) \ > +TEMPLATE (PO2,DIV); \ > +TEMPLATE 
(PO2,MOD); > + > +TEMPLATES (1); > +TEMPLATES (2); > +TEMPLATES (3); > +TEMPLATES (7); > +TEMPLATES (8); > +TEMPLATES (10); > +TEMPLATES (15); > +TEMPLATES (16); > +TEMPLATES (20); > + > +typedef void (*func_t) (int *, int *, __INTPTR_TYPE__); > +typedef struct { > + int po2; > + func_t div; > + func_t mod; > +} fn_t; > +const fn_t fns[] = { > +#define FN_PAIR(PO2) { PO2, f_##PO2##_DIV, f_##PO2##_MOD } > + FN_PAIR (1), > + FN_PAIR (2), > + FN_PAIR (3), > + FN_PAIR (7), > + FN_PAIR (8), > + FN_PAIR (10), > + FN_PAIR (15), > + FN_PAIR (16), > + FN_PAIR (20), > +}; > + > +int __attribute__ ((noipa, noinline)) > +power2 (int x) > +{ > + return 1 << x; > +} > + > +#define N 50 > + > +int > +main (void) > +{ > + int a[N], b[N], c[N]; > + > + for (int i = 0; i < (sizeof(fns)/sizeof(fns[0])); i++) > + { > + int p = power2 (fns[i].po2); > + for (int j = 0; j < N; j++) > + a[j] = ((p << 4) * j) / (N - 1) - (p << 5); > + > + fns[i].div (b, a, N); > + fns[i].mod (c, a, N); > + > + for (int j = 0; j < N; j++) > + if (a[j] != (b[j] * p + c[j])) > + __builtin_abort (); > + } > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump {\.ASHR_DIV} "vect" { target vect_asrdiv_si } > } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 18 "vect" { target > vect_asrdiv_si } } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..692df2012a4544f1cc36c3e6f671c121c5e550ff > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c > @@ -0,0 +1,52 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" > } */ > + > +#include <stdint.h> > + > +#define SIGNED(S) int##S##_t > + > +#define DIV(x,y) ((x)/(y)) > +#define MOD(x,y) ((x)%(y)) > + > +#define TEMPLATE(OP,SIZE) \ > +void __attribute__ ((noinline, noclone)) \ > +f_##OP##_##SIZE (SIGNED(SIZE) *restrict a, 
SIGNED(SIZE) *restrict b, \ > + __INTPTR_TYPE__ n) \ > +{ \ > + for (__INTPTR_TYPE__ i = 0; i < n; ++i) \ > + a[i] = OP (b[i], ((SIGNED(SIZE))1 << ((SIZE)/2+1))); \ > +} > +#define DIVMOD(SIZE) \ > +TEMPLATE (DIV,SIZE); \ > +TEMPLATE (MOD,SIZE); > + > +DIVMOD (8); > +DIVMOD (16); > +DIVMOD (32); > +DIVMOD (64); > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 > "vect" } } */ > + > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 4 } } */ > + > +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.b, p[0-9]+/m, > z[0-9]+\.b, #5\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #5\n} 1 > } } */ > +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, z[0-9]+\.b, > z[0-9]+\.b\n} 1 } } */ > + > +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.h, p[0-9]+/m, > z[0-9]+\.h, #9\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #9\n} 1 > } } */ > +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, z[0-9]+\.h, > z[0-9]+\.h\n} 1 } } */ > + > +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.s, p[0-9]+/m, > z[0-9]+\.s, #17\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #17\n} > 1 } } */ > +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, z[0-9]+\.s, > z[0-9]+\.s\n} 1 } } */ > + > +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.d, p[0-9]+/m, > z[0-9]+\.d, #33\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #33\n} > 1 } } */ > +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, z[0-9]+\.d, > z[0-9]+\.d\n} 1 } } */ > + > +/* { dg-final { scan-assembler-not {\tasr\t%} } } */ > +/* { dg-final { scan-assembler-not {\tlsr\t%} } } */ > +/* { dg-final { scan-assembler-not {\tcmplt\t%} } } */ > +/* { dg-final { scan-assembler-not {\tand\t%} } } */ > + > diff --git a/gcc/testsuite/lib/target-supports.exp > b/gcc/testsuite/lib/target-supports.exp > index > 
414bf80003b9192806f79afed9393f9ef4750a7d..8d6956471c7dc4e5a3147bd2958ea37e5183f408 > 100644 > --- a/gcc/testsuite/lib/target-supports.exp > +++ b/gcc/testsuite/lib/target-supports.exp > @@ -6192,6 +6192,14 @@ proc check_effective_target_vect_mulhrs_hi {} { > && [check_effective_target_aarch64_sve2] }] > } > > +# Return 1 if the target plus current options supports arithmetic > +# shift-right division by power-of-2 operations on vectors of words. > + > +proc check_effective_target_vect_asrdiv_si {} { > + return [expr { [istarget aarch64*-*-*] > + && [check_effective_target_aarch64_sve] }] > +} > + > # Return 1 if the target plus current options supports a vector > # demotion (packing) of shorts (to chars) and ints (to shorts) > # using modulo arithmetic, 0 otherwise. > diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c > index > 2f86f9e4fc7039add1b1d7b82574cb8262eb4ba4..8dbe8d9ea3a5c3e9db37cf15b651fa6aa1bea567 > 100644 > --- a/gcc/tree-vect-patterns.c > +++ b/gcc/tree-vect-patterns.c > @@ -2925,6 +2925,38 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, > tree *type_out) > /* Pattern detected. */ > vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt); > > + *type_out = vectype; > + > + /* Check if the target supports this internal function. 
*/ > + internal_fn ifn = IFN_ASHR_DIV; > + if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED)) > + { > + tree shift = build_int_cst (itype, tree_log2 (oprnd1)); > + > + tree var_div = vect_recog_temp_ssa_var (itype, NULL); > + gimple *div_stmt = gimple_build_call_internal (ifn, 2, oprnd0, shift); > + gimple_call_set_lhs (div_stmt, var_div); > + > + if (rhs_code == TRUNC_MOD_EXPR) > + { > + append_pattern_def_seq (stmt_vinfo, div_stmt); > + def_stmt > + = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), > + LSHIFT_EXPR, var_div, shift); > + append_pattern_def_seq (stmt_vinfo, def_stmt); > + pattern_stmt > + = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), > + MINUS_EXPR, oprnd0, > + gimple_assign_lhs (def_stmt)); > + } > + else > + { > + pattern_stmt = div_stmt; > + gimple_set_location (pattern_stmt, gimple_location (last_stmt)); > + } > + return pattern_stmt; > + } > + > cond = build2 (LT_EXPR, boolean_type_node, oprnd0, > build_int_cst (itype, 0)); > if (rhs_code == TRUNC_DIV_EXPR > @@ -3001,7 +3033,6 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, > tree *type_out) > signmask); > } > > - *type_out = vectype; > return pattern_stmt; > } >