From: Victor Do Nascimento <vicdo...@e133397.arm.com> At present, the compiler offers the `{u|s|us}dot_prod_optab' direct optabs for dealing with vectorizable dot product code sequences. The consequence of using a direct optab for this is that backend-pattern selection is only ever able to match against one datatype - Either that of the operands or of the accumulated value, never both.
With the introduction of the 2-way (un)signed dot-product insn [1][2] in AArch64 SVE2, the existing direct opcode approach is no longer sufficient for full specification of all the possible dot product machine instructions to be matched to the code sequence; a dot product resulting in VNx4SI may result from either dot products on VNx16QI or VNx8HI values for the 4- and 2-way dot product operations, respectively. This means that the following example fails autovectorization: uint32_t foo(int n, uint16_t* data) { uint32_t sum = 0; for (int i=0; i<n; i+=1) { sum += data[i] * data[i]; } return sum; } To remedy the issue a new optab is added, tentatively named `udot_prod_twoway_optab', whose selection is dependent upon checking of both input and output types involved in the operation. In order to minimize changes to the existing codebase, `optab_for_tree_code' is renamed `optab_for_tree_code_1' and a new argument is added to its signature - `const_tree otype', allowing type information to be specified for both input and output types. The existing nterface is retained by defining a new `optab_for_tree_code', which serves as a shim to `optab_for_tree_code_1', passing old parameters as-is and setting the new `optype' argument to `NULL_TREE'. For DOT_PROD_EXPR tree codes, we can call `optab_for_tree_code_1' directly, passing it both types, adding the internal logic to the function to distinguish between competing optabs. Finally, necessary changes are made to `expand_widen_pattern_expr' to ensure the new icode can be correctly selected, given the new optab. [1] https://developer.arm.com/documentation/ddi0602/2024-03/SVE-Instructions/UDOT--2-way--vectors---Unsigned-integer-dot-product- [2] https://developer.arm.com/documentation/ddi0602/2024-03/SVE-Instructions/SDOT--2-way--vectors---Signed-integer-dot-product- gcc/ChangeLog: * config/aarch64/aarch64-sve2.md (@aarch64_sve_<sur>dotvnx4sivnx8hi): renamed to `<sur>dot_prod_twoway_vnx8hi'. * config/aarch64/aarch64-sve-builtins-base.cc (svdot_impl.expand): update icodes used in line with above rename. * optabs-tree.cc (optab_for_tree_code_1): Renamed `optab_for_tree_code' and added new argument. (optab_for_tree_code): Now a call to `optab_for_tree_code_1'. * optabs-tree.h (optab_for_tree_code_1): New. * optabs.cc (expand_widen_pattern_expr): Expand support for DOT_PROD_EXPR patterns. * optabs.def (udot_prod_twoway_optab): New. (sdot_prod_twoway_optab): Likewise. * tree-vect-patterns.cc (vect_supportable_direct_optab_p): Add support for misc optabs that use two modes. gcc/testsuite/ChangeLog: * gcc.dg/vect/vect-dotprod-twoway.c: New. --- .../aarch64/aarch64-sve-builtins-base.cc | 4 ++-- gcc/config/aarch64/aarch64-sve2.md | 2 +- gcc/optabs-tree.cc | 23 ++++++++++++++++-- gcc/optabs-tree.h | 2 ++ gcc/optabs.cc | 2 +- gcc/optabs.def | 2 ++ .../gcc.dg/vect/vect-dotprod-twoway.c | 24 +++++++++++++++++++ gcc/tree-vect-patterns.cc | 2 +- 8 files changed, 54 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 0d2edf3f19e..e457db09f66 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -764,8 +764,8 @@ public: icode = (e.type_suffix (0).float_p ? CODE_FOR_aarch64_sve_fdotvnx4sfvnx8hf : e.type_suffix (0).unsigned_p - ? CODE_FOR_aarch64_sve_udotvnx4sivnx8hi - : CODE_FOR_aarch64_sve_sdotvnx4sivnx8hi); + ? CODE_FOR_udot_prod_twoway_vnx8hi + : CODE_FOR_sdot_prod_twoway_vnx8hi); return e.use_unpred_insn (icode); } }; diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 934e57055d3..5677de7108d 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2021,7 +2021,7 @@ (define_insn "@aarch64_sve_qsub_<sve_int_op>_lane_<mode>" ) ;; Two-way dot-product. -(define_insn "@aarch64_sve_<sur>dotvnx4sivnx8hi" +(define_insn "<sur>dot_prod_twoway_vnx8hi" [(set (match_operand:VNx4SI 0 "register_operand") (plus:VNx4SI (unspec:VNx4SI diff --git a/gcc/optabs-tree.cc b/gcc/optabs-tree.cc index b69a5bc3676..e3c5a618ea2 100644 --- a/gcc/optabs-tree.cc +++ b/gcc/optabs-tree.cc @@ -35,8 +35,8 @@ along with GCC; see the file COPYING3. If not see cannot give complete results for multiplication or division) but probably ought to be relied on more widely throughout the expander. */ optab -optab_for_tree_code (enum tree_code code, const_tree type, - enum optab_subtype subtype) +optab_for_tree_code_1 (enum tree_code code, const_tree type, + const_tree otype, enum optab_subtype subtype) { bool trapv; switch (code) @@ -149,6 +149,14 @@ optab_for_tree_code (enum tree_code code, const_tree type, case DOT_PROD_EXPR: { + if (otype && (TYPE_PRECISION (TREE_TYPE (type)) * 2 + == TYPE_PRECISION (TREE_TYPE (otype)))) + { + if (TYPE_UNSIGNED (type) && TYPE_UNSIGNED (otype)) + return udot_prod_twoway_optab; + if (!TYPE_UNSIGNED (type) && !TYPE_UNSIGNED (otype)) + return sdot_prod_twoway_optab; + } if (subtype == optab_vector_mixed_sign) return usdot_prod_optab; @@ -285,6 +293,17 @@ optab_for_tree_code (enum tree_code code, const_tree type, } } +/* Return the optab used for computing the operation given by the tree code, + CODE and the tree EXP. This function is not always usable (for example, it + cannot give complete results for multiplication or division) but probably + ought to be relied on more widely throughout the expander. */ +optab +optab_for_tree_code (enum tree_code code, const_tree type, + enum optab_subtype subtype) +{ + return optab_for_tree_code_1 (code, type, NULL_TREE, subtype); +} + /* Check whether an operation represented by CODE is a 'half' widening operation in which the input vector type has half the number of bits of the output vector type e.g. V8QI->V8HI. diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h index f2b49991462..13ff7ca2e4b 100644 --- a/gcc/optabs-tree.h +++ b/gcc/optabs-tree.h @@ -36,6 +36,8 @@ enum optab_subtype /* Return the optab used for computing the given operation on the type given by the second argument. The third argument distinguishes between the types of vector shifts and rotates. */ +optab optab_for_tree_code_1 (enum tree_code, const_tree, const_tree __attribute__((unused)), + enum optab_subtype ); optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype); bool supportable_half_widening_operation (enum tree_code, tree, tree, diff --git a/gcc/optabs.cc b/gcc/optabs.cc index ce91f94ed43..3a1c6c7b90e 100644 --- a/gcc/optabs.cc +++ b/gcc/optabs.cc @@ -311,7 +311,7 @@ expand_widen_pattern_expr (sepops ops, rtx op0, rtx op1, rtx wide_op, gcc_unreachable (); widen_pattern_optab - = optab_for_tree_code (ops->code, TREE_TYPE (oprnd0), subtype); + = optab_for_tree_code_1 (ops->code, TREE_TYPE (oprnd0), TREE_TYPE (oprnd2), subtype); } else widen_pattern_optab diff --git a/gcc/optabs.def b/gcc/optabs.def index ad14f9328b9..cf1a6e7a7dc 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -408,6 +408,8 @@ OPTAB_D (sdot_prod_optab, "sdot_prod$I$a") OPTAB_D (ssum_widen_optab, "widen_ssum$I$a3") OPTAB_D (udot_prod_optab, "udot_prod$I$a") OPTAB_D (usdot_prod_optab, "usdot_prod$I$a") +OPTAB_D (sdot_prod_twoway_optab, "sdot_prod_twoway_$I$a") +OPTAB_D (udot_prod_twoway_optab, "udot_prod_twoway_$I$a") OPTAB_D (usum_widen_optab, "widen_usum$I$a3") OPTAB_D (usad_optab, "usad$I$a") OPTAB_D (ssad_optab, "ssad$I$a") diff --git a/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c b/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c new file mode 100644 index 00000000000..cba2aadbec8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c @@ -0,0 +1,24 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-additional-options "-march=-O3 -march=armv9.2-a+sme2 -fdump-tree-vect-details" { target { aarch64*-*-* } } } */ + +#include <stdint.h> + +uint32_t udot2(int n, uint16_t* data) { + uint32_t sum = 0; + for (int i=0; i<n; i+=1) { + sum += data[i] * data[i]; + } + return sum; +} + +int32_t sdot2(int n, int16_t* data) { + int32_t sum = 0; + for (int i=0; i<n; i+=1) { + sum += data[i] * data[i]; + } + return sum; +} + +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected:" 2 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc index dfb7d800526..0760f25d94d 100644 --- a/gcc/tree-vect-patterns.cc +++ b/gcc/tree-vect-patterns.cc @@ -233,7 +233,7 @@ vect_supportable_direct_optab_p (vec_info *vinfo, tree otype, tree_code code, if (!vecotype) return false; - optab optab = optab_for_tree_code (code, vecitype, subtype); + optab optab = optab_for_tree_code_1 (code, vecitype, vecotype, subtype); if (!optab) return false; -- 2.34.1