On Wed, 2 Oct 2019, Jakub Jelinek wrote: > On Wed, Nov 09, 2016 at 09:14:55AM +0100, Richard Biener wrote: > > The following implements vectorization of bswap via VEC_PERM_EXPR > > on the corresponding QImode vector. > > > > ARM already has backend handling via the builtin_vectorized_call > > hook and thus there were already testcases available. It doesn't > > end up working for vect-bswap16.c because we have a promoted > > argument to __builtin_bswap16 which confuses vectorization. > > Indeed. The following patch handles that in tree-vect-patterns.c. > If it sees a __builtin_bswap16 with the promoted argument, it checks if we'd > vectorize the builtin if it didn't have a promoted argument and if yes, > it just changes it in a pattern_stmt to use an unpromoted argument or casts > it first to the right type. This works e.g. for the SSE4 case. > Otherwise, it handles __builtin_bswap16 like x r<< 8: if that is > vectorizable, it emits a pattern_stmt with x r<< 8; if it isn't, it falls back to > (x << 8) | (x >> 8) if that can be vectorized. The last case matters for > SSE2. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK. Thanks, Richard. > 2019-10-02 Jakub Jelinek <ja...@redhat.com> > > PR tree-optimization/91940 > * tree-vect-patterns.c: Include tree-vector-builder.h and > vec-perm-indices.h. > (vect_recog_rotate_pattern): Also handle __builtin_bswap16, either by > unpromoting the argument back to uint16_t, or by converting into a > rotate, or into shifts plus ior. > > * gcc.dg/vect/vect-bswap16.c: Add -msse4 on x86, run on all targets, > expect vectorized 1 loops message on both vect_bswap and sse4_runtime > targets. > * gcc.dg/vect/vect-bswap16a.c: New test. > > --- gcc/tree-vect-patterns.c.jj 2019-09-20 12:25:48.186387075 +0200 > +++ gcc/tree-vect-patterns.c 2019-10-01 11:29:18.229215895 +0200 > @@ -46,6 +46,8 @@ along with GCC; see the file COPYING3. > #include "cgraph.h" > #include "omp-simd-clone.h" > #include "predict.h" > +#include "tree-vector-builder.h" > +#include "vec-perm-indices.h" > > /* Return true if we have a useful VR_RANGE range for VAR, storing it > in *MIN_VALUE and *MAX_VALUE if so. Note the range in the dump files. */ > @@ -2168,24 +2170,107 @@ vect_recog_rotate_pattern (stmt_vec_info > enum vect_def_type dt; > optab optab1, optab2; > edge ext_def = NULL; > + bool bswap16_p = false; > > - if (!is_gimple_assign (last_stmt)) > - return NULL; > - > - rhs_code = gimple_assign_rhs_code (last_stmt); > - switch (rhs_code) > + if (is_gimple_assign (last_stmt)) > { > - case LROTATE_EXPR: > - case RROTATE_EXPR: > - break; > - default: > - return NULL; > + rhs_code = gimple_assign_rhs_code (last_stmt); > + switch (rhs_code) > + { > + case LROTATE_EXPR: > + case RROTATE_EXPR: > + break; > + default: > + return NULL; > + } > + > + lhs = gimple_assign_lhs (last_stmt); > + oprnd0 = gimple_assign_rhs1 (last_stmt); > + type = TREE_TYPE (oprnd0); > + oprnd1 = gimple_assign_rhs2 (last_stmt); > + } > + else if (gimple_call_builtin_p (last_stmt, BUILT_IN_BSWAP16)) > + { > + /* __builtin_bswap16 (x) is another form of x r>> 8. 
> + The vectorizer has bswap support, but only if the argument isn't > + promoted. */ > + lhs = gimple_call_lhs (last_stmt); > + oprnd0 = gimple_call_arg (last_stmt, 0); > + type = TREE_TYPE (oprnd0); > + if (TYPE_PRECISION (TREE_TYPE (lhs)) != 16 > + || TYPE_PRECISION (type) <= 16 > + || TREE_CODE (oprnd0) != SSA_NAME > + || BITS_PER_UNIT != 8 > + || !TYPE_UNSIGNED (TREE_TYPE (lhs))) > + return NULL; > + > + stmt_vec_info def_stmt_info; > + if (!vect_is_simple_use (oprnd0, vinfo, &dt, &def_stmt_info, > &def_stmt)) > + return NULL; > + > + if (dt != vect_internal_def) > + return NULL; > + > + if (gimple_assign_cast_p (def_stmt)) > + { > + def = gimple_assign_rhs1 (def_stmt); > + if (INTEGRAL_TYPE_P (TREE_TYPE (def)) > + && TYPE_PRECISION (TREE_TYPE (def)) == 16) > + oprnd0 = def; > + } > + > + type = TREE_TYPE (lhs); > + vectype = get_vectype_for_scalar_type (type); > + if (vectype == NULL_TREE) > + return NULL; > + > + if (tree char_vectype = get_same_sized_vectype (char_type_node, > vectype)) > + { > + /* The encoding uses one stepped pattern for each byte in the > + 16-bit word. */ > + vec_perm_builder elts (TYPE_VECTOR_SUBPARTS (char_vectype), 2, 3); > + for (unsigned i = 0; i < 3; ++i) > + for (unsigned j = 0; j < 2; ++j) > + elts.quick_push ((i + 1) * 2 - j - 1); > + > + vec_perm_indices indices (elts, 1, > + TYPE_VECTOR_SUBPARTS (char_vectype)); > + if (can_vec_perm_const_p (TYPE_MODE (char_vectype), indices)) > + { > + /* vectorizable_bswap can handle the __builtin_bswap16 if we > + undo the argument promotion. */ > + if (!useless_type_conversion_p (type, TREE_TYPE (oprnd0))) > + { > + def = vect_recog_temp_ssa_var (type, NULL); > + def_stmt = gimple_build_assign (def, NOP_EXPR, oprnd0); > + append_pattern_def_seq (stmt_vinfo, def_stmt); > + oprnd0 = def; > + } > + > + /* Pattern detected. */ > + vect_pattern_detected ("vect_recog_rotate_pattern", last_stmt); > + > + *type_out = vectype; > + > + /* Pattern supported. 
Create a stmt to be used to replace the > + pattern, with the unpromoted argument. */ > + var = vect_recog_temp_ssa_var (type, NULL); > + pattern_stmt = gimple_build_call (gimple_call_fndecl (last_stmt), > + 1, oprnd0); > + gimple_call_set_lhs (pattern_stmt, var); > + gimple_call_set_fntype (as_a <gcall *> (pattern_stmt), > + gimple_call_fntype (last_stmt)); > + return pattern_stmt; > + } > + } > + > + oprnd1 = build_int_cst (integer_type_node, 8); > + rhs_code = LROTATE_EXPR; > + bswap16_p = true; > } > + else > + return NULL; > > - lhs = gimple_assign_lhs (last_stmt); > - oprnd0 = gimple_assign_rhs1 (last_stmt); > - type = TREE_TYPE (oprnd0); > - oprnd1 = gimple_assign_rhs2 (last_stmt); > if (TREE_CODE (oprnd0) != SSA_NAME > || TYPE_PRECISION (TREE_TYPE (lhs)) != TYPE_PRECISION (type) > || !INTEGRAL_TYPE_P (type) > @@ -2210,14 +2295,39 @@ vect_recog_rotate_pattern (stmt_vec_info > optab1 = optab_for_tree_code (rhs_code, vectype, optab_vector); > if (optab1 > && optab_handler (optab1, TYPE_MODE (vectype)) != CODE_FOR_nothing) > - return NULL; > + { > + use_rotate: > + if (bswap16_p) > + { > + if (!useless_type_conversion_p (type, TREE_TYPE (oprnd0))) > + { > + def = vect_recog_temp_ssa_var (type, NULL); > + def_stmt = gimple_build_assign (def, NOP_EXPR, oprnd0); > + append_pattern_def_seq (stmt_vinfo, def_stmt); > + oprnd0 = def; > + } > + > + /* Pattern detected. */ > + vect_pattern_detected ("vect_recog_rotate_pattern", last_stmt); > + > + *type_out = vectype; > + > + /* Pattern supported. Create a stmt to be used to replace the > + pattern. 
*/ > + var = vect_recog_temp_ssa_var (type, NULL); > + pattern_stmt = gimple_build_assign (var, LROTATE_EXPR, oprnd0, > + oprnd1); > + return pattern_stmt; > + } > + return NULL; > + } > > if (is_a <bb_vec_info> (vinfo) || dt != vect_internal_def) > { > optab2 = optab_for_tree_code (rhs_code, vectype, optab_scalar); > if (optab2 > && optab_handler (optab2, TYPE_MODE (vectype)) != CODE_FOR_nothing) > - return NULL; > + goto use_rotate; > } > > /* If vector/vector or vector/scalar shifts aren't supported by the target, > @@ -2242,6 +2352,14 @@ vect_recog_rotate_pattern (stmt_vec_info > > *type_out = vectype; > > + if (bswap16_p && !useless_type_conversion_p (type, TREE_TYPE (oprnd0))) > + { > + def = vect_recog_temp_ssa_var (type, NULL); > + def_stmt = gimple_build_assign (def, NOP_EXPR, oprnd0); > + append_pattern_def_seq (stmt_vinfo, def_stmt); > + oprnd0 = def; > + } > + > if (dt == vect_external_def > && TREE_CODE (oprnd1) == SSA_NAME) > ext_def = vect_get_external_def_edge (vinfo, oprnd1); > --- gcc/testsuite/gcc.dg/vect/vect-bswap16.c.jj 2017-11-09 > 20:24:05.957112733 +0100 > +++ gcc/testsuite/gcc.dg/vect/vect-bswap16.c 2019-10-01 12:39:56.993268063 > +0200 > @@ -1,4 +1,4 @@ > -/* { dg-require-effective-target vect_bswap } */ > +/* { dg-additional-options "-msse4" { target sse4_runtime } } */ > > #include "tree-vect.h" > > @@ -39,4 +39,4 @@ main (void) > return 0; > } > > -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target > { vect_bswap || sse4_runtime } } } } */ > --- gcc/testsuite/gcc.dg/vect/vect-bswap16a.c.jj 2019-10-01 > 12:35:29.965294527 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-bswap16a.c 2019-10-01 12:40:45.700533603 > +0200 > @@ -0,0 +1,5 @@ > +/* { dg-additional-options "-msse2 -mno-sse3" { target sse2_runtime } } */ > + > +#include "vect-bswap16.c" > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target > { vect_shift } 
} } } */ > > > Jakub > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany; GF: Felix Imendörffer; HRB 247165 (AG München)