On Fri, 26 Jun 2020, Richard Biener wrote:

> (sorry for the duplicate, forgot to copy the list)
> 
> This teaches SLP analysis about vector typed externals that are
> fed into the SLP operations via lane extracting BIT_FIELD_REFs.
> It shows that there's currently no good representation for
> vector code on the SLP side so I went a half way and represent
> such vector externals uses always using a SLP permutation node
> with a single external SLP child which has a non-standard
> representation of no scalar defs but only a vector def.  That
> works best for shielding the rest of the vectorizer from it.
> 
> I'm not sure it's actually worth the trouble and what real-world
> cases benefit from this.  In theory vectorized unrolled code
> interfacing with scalar code might be one case but there
> we necessarily go through memory and there's no intermediate
> pass transforming that to registers [to make BB vectorization
> cheaper].
> 
> It's also not even close to ready for re-vectorizing vectorized
> code with a larger VF.
> 
> Any opinions?

I have now installed this.

Richard.

> Bootstrapped / tested on x86_64-unknown-linux-gnu.
>                                                                               
>   
> Thanks,         
> Richard.                                                     
> 
> 2020-06-26  Richard Biener  <rguent...@suse.de>
> 
>       PR tree-optimization/95839
>       * tree-vect-slp.c (vect_slp_tree_uniform_p): Pre-existing
>       vectors are not uniform.
>       (vect_build_slp_tree_1): Handle BIT_FIELD_REFs of
>       vector registers.
>       (vect_build_slp_tree_2): For groups of lane extracts
>       from a vector register generate a permute node
>       with a special child representing the pre-existing vector.
>       (vect_prologue_cost_for_slp): Pre-existing vectors cost nothing.
>       (vect_slp_analyze_node_operations): Use SLP_TREE_LANES.
>       (vectorizable_slp_permutation): Do not generate or cost identity
>       permutes.
>       (vect_schedule_slp_instance): Handle pre-existing vectors
>       that are function arguments.
> 
>       * gcc.dg/vect/bb-slp-pr95839-2.c: New testcase.
> ---
>  gcc/testsuite/gcc.dg/vect/bb-slp-pr95839-2.c |  20 ++++
>  gcc/tree-vect-slp.c                          | 119 ++++++++++++++++---
>  2 files changed, 124 insertions(+), 15 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-pr95839-2.c
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr95839-2.c 
> b/gcc/testsuite/gcc.dg/vect/bb-slp-pr95839-2.c
> new file mode 100644
> index 00000000000..49e75d8c95c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr95839-2.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-additional-options "-w -Wno-psabi" } */
> +
> +typedef double __attribute__((vector_size(16))) v2df;
> +
> +v2df f(v2df a, v2df b)
> +{
> +  return (v2df){a[0] + b[0], a[1] + b[1]};
> +}
> +
> +v2df g(v2df a, v2df b)
> +{
> +  return (v2df){a[0] + b[1], a[1] + b[0]};
> +}
> +
> +/* Verify we manage to vectorize this using the original vectors
> +   and do not end up with any vector CTORs.  */
> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-not "vect_cst" "slp2" } } */
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index b223956e3af..83ec382ee0d 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -247,6 +247,10 @@ vect_slp_tree_uniform_p (slp_tree node)
>    gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
>             || SLP_TREE_DEF_TYPE (node) == vect_external_def);
>  
> +  /* Pre-existing vectors.  */
> +  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
> +    return false;
> +
>    unsigned i;
>    tree op, first = NULL_TREE;
>    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
> @@ -838,7 +842,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
> *swap,
>        else
>       {
>         rhs_code = gimple_assign_rhs_code (stmt);
> -       load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
> +       load_p = gimple_vuse (stmt);
>       }
>  
>        /* Check the operation.  */
> @@ -899,6 +903,22 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
> *swap,
>                need_same_oprnds = true;
>                first_op1 = gimple_assign_rhs2 (stmt);
>              }
> +       else if (!load_p
> +                && rhs_code == BIT_FIELD_REF)
> +         {
> +           tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
> +           if (TREE_CODE (vec) != SSA_NAME
> +               || !types_compatible_p (vectype, TREE_TYPE (vec)))
> +             {
> +               if (dump_enabled_p ())
> +                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                  "Build SLP failed: "
> +                                  "BIT_FIELD_REF not supported\n");
> +               /* Fatal mismatch.  */
> +               matches[0] = false;
> +               return false;
> +             }
> +         }
>         else if (call_stmt
>                  && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
>           {
> @@ -957,6 +977,18 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
> *swap,
>                 continue;
>               }
>           }
> +       if (!load_p
> +           && first_stmt_code == BIT_FIELD_REF
> +           && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
> +               != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
> +         {
> +           if (dump_enabled_p ())
> +             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                              "Build SLP failed: different BIT_FIELD_REF "
> +                              "arguments in %G", stmt);
> +           /* Mismatch.  */
> +           continue;
> +         }
>  
>         if (!load_p && rhs_code == CALL_EXPR)
>           {
> @@ -1026,7 +1058,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
> *swap,
>             && TREE_CODE_CLASS (rhs_code) != tcc_expression
>             && TREE_CODE_CLASS (rhs_code) != tcc_comparison
>             && rhs_code != VIEW_CONVERT_EXPR
> -           && rhs_code != CALL_EXPR)
> +           && rhs_code != CALL_EXPR
> +           && rhs_code != BIT_FIELD_REF)
>           {
>             if (dump_enabled_p ())
>               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -1287,6 +1320,45 @@ vect_build_slp_tree_2 (vec_info *vinfo,
>         return node;
>       }
>      }
> +  else if (gimple_assign_single_p (stmt_info->stmt)
> +        && !gimple_vuse (stmt_info->stmt)
> +        && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
> +    {
> +      /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
> +      the same SSA name vector of a compatible type to vectype.  */
> +      vec<std::pair<unsigned, unsigned> > lperm = vNULL;
> +      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
> +      stmt_vec_info estmt_info;
> +      FOR_EACH_VEC_ELT (stmts, i, estmt_info)
> +     {
> +       gassign *estmt = as_a <gassign *> (estmt_info->stmt);
> +       tree bfref = gimple_assign_rhs1 (estmt);
> +       HOST_WIDE_INT lane;
> +       if (!known_eq (bit_field_size (bfref),
> +                      tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
> +           || !constant_multiple_p (bit_field_offset (bfref),
> +                                    bit_field_size (bfref), &lane))
> +         {
> +           lperm.release ();
> +           return NULL;
> +         }
> +       lperm.safe_push (std::make_pair (0, (unsigned)lane));
> +     }
> +      slp_tree vnode = vect_create_new_slp_node (vNULL);
> +      SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
> +      SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
> +      /* We are always building a permutation node even if it is an identity
> +      permute to shield the rest of the vectorizer from the odd node
> +      representing an actual vector without any scalar ops.
> +      ???  We could hide it completely with making the permute node
> +      external?  */
> +      node = vect_create_new_slp_node (stmts, 1);
> +      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
> +      SLP_TREE_LANE_PERMUTATION (node) = lperm;
> +      SLP_TREE_VECTYPE (node) = vectype;
> +      SLP_TREE_CHILDREN (node).quick_push (vnode);
> +      return node;
> +    }
>  
>    /* Get at the operands, verifying they are compatible.  */
>    vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, 
> group_size);
> @@ -2744,6 +2816,10 @@ static void
>  vect_prologue_cost_for_slp (slp_tree node,
>                           stmt_vector_for_cost *cost_vec)
>  {
> +  /* There's a special case of an existing vector, that costs nothing.  */
> +  if (SLP_TREE_SCALAR_OPS (node).length () == 0
> +      && !SLP_TREE_VEC_DEFS (node).is_empty ())
> +    return;
>    /* Without looking at the actual initializer a vector of
>       constants can be implemented as load from the constant pool.
>       When all elements are the same we can use a splat.  */
> @@ -2857,7 +2933,7 @@ vect_slp_analyze_node_operations (vec_info *vinfo, 
> slp_tree node,
>                         && j == 1);
>             continue;
>           }
> -       unsigned group_size = SLP_TREE_SCALAR_OPS (child).length ();
> +       unsigned group_size = SLP_TREE_LANES (child);
>         poly_uint64 vf = 1;
>         if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
>           vf = loop_vinfo->vectorization_factor;
> @@ -4139,7 +4215,9 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>       {
>         indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
>                             const_nunits);
> -       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
> +       bool identity_p = indices.series_p (0, 1, 0, 1);
> +       if (!identity_p
> +           && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
>           {
>             if (dump_enabled_p ())
>               {
> @@ -4157,11 +4235,10 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>             return false;
>           }
>  
> -       nperms++;
> +       if (!identity_p)
> +         nperms++;
>         if (gsi)
>           {
> -           tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
> -
>             if (second_vec.first == -1U)
>               second_vec = first_vec;
>  
> @@ -4169,14 +4246,22 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>             slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
>             tree first_def
>               = vect_get_slp_vect_def (first_node, first_vec.second);
> -           slp_tree second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
> -           tree second_def
> -             = vect_get_slp_vect_def (second_node, second_vec.second);
> +           gassign *perm_stmt;
>             tree perm_dest = make_ssa_name (vectype);
> -           gassign *perm_stmt
> -             = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
> -                                    first_def, second_def,
> -                                    mask_vec);
> +           if (!identity_p)
> +             {
> +               slp_tree second_node
> +                 = SLP_TREE_CHILDREN (node)[second_vec.first];
> +               tree second_def
> +                 = vect_get_slp_vect_def (second_node, second_vec.second);
> +               tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
> +               perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
> +                                                first_def, second_def,
> +                                                mask_vec);
> +             }
> +           else
> +             /* We need a copy here in case the def was external.  */
> +             perm_stmt = gimple_build_assign (perm_dest, first_def);
>             vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
>             /* Store the vector statement in NODE.  */
>             SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
> @@ -4300,13 +4385,17 @@ vect_schedule_slp_instance (vec_info *vinfo,
>           unsigned j;
>           tree vdef;
>           FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
> -           if (TREE_CODE (vdef) == SSA_NAME)
> +           if (TREE_CODE (vdef) == SSA_NAME
> +               && !SSA_NAME_IS_DEFAULT_DEF (vdef))
>               {
>                 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
>                 if (!last_stmt
>                     || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
>                   last_stmt = vstmt;
>               }
> +         /* This can happen when all children are pre-existing vectors.  */
> +         if (!last_stmt)
> +           last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
>         }
>        if (is_a <gphi *> (last_stmt))
>       si = gsi_after_labels (gimple_bb (last_stmt));
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

Reply via email to