One more formatting nit, sorry:

Joel Hutton <joel.hut...@arm.com> writes:
> +bool
> +supportable_half_widening_operation (enum tree_code code,
> +                            tree vectype_out, tree vectype_in,
> +                            enum tree_code *code1)

The arguments need reindenting for the new function name, so that
they line up under “enum”.  (Obviously doesn't need a full retest.)
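
I.e. something like this (spaces rather than tabs here, since mail
tends to mangle the tabs):

bool
supportable_half_widening_operation (enum tree_code code,
                                     tree vectype_out, tree vectype_in,
                                     enum tree_code *code1)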

OK with that change, thanks.

Richard

> +{
> +  machine_mode m1, m2;
> +  enum tree_code dummy_code;
> +  optab op;
> +
> +  gcc_assert (VECTOR_TYPE_P (vectype_out) && VECTOR_TYPE_P (vectype_in));
> +
> +  m1 = TYPE_MODE (vectype_out);
> +  m2 = TYPE_MODE (vectype_in);
> +
> +  if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
> +    return false;
> +
> +  if (maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in),
> +               TYPE_VECTOR_SUBPARTS (vectype_out)))
> +    return false;
> +
> +  switch (code)
> +    {
> +    case WIDEN_LSHIFT_EXPR:
> +      *code1 = LSHIFT_EXPR;
> +      break;
> +    case WIDEN_MINUS_EXPR:
> +      *code1 = MINUS_EXPR;
> +      break;
> +    case WIDEN_PLUS_EXPR:
> +      *code1 = PLUS_EXPR;
> +      break;
> +    case WIDEN_MULT_EXPR:
> +      *code1 = MULT_EXPR;
> +      break;
> +    default:
> +      return false;
> +    }
> +
> +  if (!supportable_convert_operation (NOP_EXPR, vectype_out, vectype_in,
> +                                  &dummy_code))
> +    return false;
> +
> +  op = optab_for_tree_code (*code1, vectype_out, optab_vector);
> +  return (optab_handler (op, TYPE_MODE (vectype_out)) != CODE_FOR_nothing);
> +}
> +
>  /* Function supportable_convert_operation
>  
>     Check whether an operation represented by the code CODE is a
> diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h
> index 173da0d3bd2..c3aaa1a4169 100644
> --- a/gcc/optabs-tree.h
> +++ b/gcc/optabs-tree.h
> @@ -36,6 +36,9 @@ enum optab_subtype
>     the second argument.  The third argument distinguishes between the types of
>     vector shifts and rotates.  */
>  optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype);
> +bool supportable_half_widening_operation (enum tree_code, tree, tree,
> +                                          enum tree_code *);
>  bool supportable_convert_operation (enum tree_code, tree, tree,
>                                   enum tree_code *);
>  bool expand_vec_cmp_expr_p (tree, tree, enum tree_code);
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr98772.c b/gcc/testsuite/gcc.target/aarch64/pr98772.c
> new file mode 100644
> index 00000000000..663221514e9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr98772.c
> @@ -0,0 +1,155 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps" } */
> +#include <stdint.h>
> +#include <string.h>
> +
> +#define DSIZE 16
> +#define PIXSIZE 64
> +
> +extern void
> +wplus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +     for (int x = 0; x < 4; x++ )
> +         d[x + y*4] = pix1[x] + pix2[x];
> +     pix1 += 16;
> +     pix2 += 16;
> +    }
> +}
> +extern void __attribute__((optimize (0)))
> +wplus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +     for (int x = 0; x < 4; x++ )
> +         d[x + y*4] = pix1[x] + pix2[x];
> +     pix1 += 16;
> +     pix2 += 16;
> +    }
> +}
> +
> +extern void
> +wminus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +     for (int x = 0; x < 4; x++ )
> +         d[x + y*4] = pix1[x] - pix2[x];
> +     pix1 += 16;
> +     pix2 += 16;
> +    }
> +}
> +extern void __attribute__((optimize (0)))
> +wminus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +     for (int x = 0; x < 4; x++ )
> +         d[x + y*4] = pix1[x] - pix2[x];
> +     pix1 += 16;
> +     pix2 += 16;
> +    }
> +}
> +
> +extern void
> +wmult (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +     for (int x = 0; x < 4; x++ )
> +         d[x + y*4] = pix1[x] * pix2[x];
> +     pix1 += 16;
> +     pix2 += 16;
> +    }
> +}
> +extern void __attribute__((optimize (0)))
> +wmult_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +     for (int x = 0; x < 4; x++ )
> +         d[x + y*4] = pix1[x] * pix2[x];
> +     pix1 += 16;
> +     pix2 += 16;
> +    }
> +}
> +
> +extern void
> +wlshift (uint16_t *d, uint8_t *restrict pix1)
> +
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +     for (int x = 0; x < 4; x++ )
> +         d[x + y*4] = pix1[x] << 8;
> +     pix1 += 16;
> +    }
> +}
> +extern void __attribute__((optimize (0)))
> +wlshift_no_opt (uint16_t *d, uint8_t *restrict pix1)
> +
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +     for (int x = 0; x < 4; x++ )
> +         d[x + y*4] = pix1[x] << 8;
> +     pix1 += 16;
> +    }
> +}
> +
> +void __attribute__((optimize (0)))
> +init_arrays (uint16_t *d_a, uint16_t *d_b, uint8_t *pix1, uint8_t *pix2)
> +{
> +  for (int i = 0; i < DSIZE; i++)
> +  {
> +    d_a[i] = (1074 * i)%17;
> +    d_b[i] = (1074 * i)%17;
> +  }
> +  for (int i = 0; i < PIXSIZE; i++)
> +  {
> +    pix1[i] = (1024 * i)%17;
> +    pix2[i] = (1024 * i)%17;
> +  }
> +}
> +
> +/* Don't optimize main so we don't get confused over where the vector
> +   instructions are generated. */
> +__attribute__((optimize (0)))
> +int main ()
> +{
> +  uint16_t d_a[DSIZE];
> +  uint16_t d_b[DSIZE];
> +  uint8_t pix1[PIXSIZE];
> +  uint8_t pix2[PIXSIZE];
> +
> +  init_arrays (d_a, d_b, pix1, pix2);
> +  wplus (d_a, pix1, pix2);
> +  wplus_no_opt (d_b, pix1, pix2);
> +  if (memcmp (d_a, d_b, DSIZE * sizeof (uint16_t)) != 0)
> +    return 1;
> +
> +  init_arrays (d_a, d_b, pix1, pix2);
> +  wminus (d_a, pix1, pix2);
> +  wminus_no_opt (d_b, pix1, pix2);
> +  if (memcmp (d_a, d_b, DSIZE * sizeof (uint16_t)) != 0)
> +    return 2;
> +
> +  init_arrays (d_a, d_b, pix1, pix2);
> +  wmult (d_a, pix1, pix2);
> +  wmult_no_opt (d_b, pix1, pix2);
> +  if (memcmp (d_a, d_b, DSIZE * sizeof (uint16_t)) != 0)
> +    return 3;
> +
> +  init_arrays (d_a, d_b, pix1, pix2);
> +  wlshift (d_a, pix1);
> +  wlshift_no_opt (d_b, pix1);
> +  if (memcmp (d_a, d_b, DSIZE * sizeof (uint16_t)) != 0)
> +    return 4;
> +
> +}
> +
> +/* { dg-final { scan-assembler-times "uaddl\\tv" 2 } } */
> +/* { dg-final { scan-assembler-times "usubl\\tv" 2 } } */
> +/* { dg-final { scan-assembler-times "umull\\tv" 2 } } */
> +/* { dg-final { scan-assembler-times "shl\\tv" 2 } } */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index f180ced3124..bf23499f7e3 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -4545,6 +4545,64 @@ vect_create_vectorized_promotion_stmts (vec_info *vinfo,
>    *vec_oprnds0 = vec_tmp;
>  }
>  
> +/* Create vectorized promotion stmts for widening stmts using only half the
> +   potential vector size for input.  */
> +static void
> +vect_create_half_widening_stmts (vec_info *vinfo,
> +                                     vec<tree> *vec_oprnds0,
> +                                     vec<tree> *vec_oprnds1,
> +                                     stmt_vec_info stmt_info, tree vec_dest,
> +                                     gimple_stmt_iterator *gsi,
> +                                     enum tree_code code1,
> +                                     int op_type)
> +{
> +  int i;
> +  tree vop0, vop1;
> +  gimple *new_stmt1;
> +  gimple *new_stmt2;
> +  gimple *new_stmt3;
> +  vec<tree> vec_tmp = vNULL;
> +
> +  vec_tmp.create (vec_oprnds0->length ());
> +  FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
> +    {
> +      tree new_tmp1, new_tmp2, new_tmp3, out_type;
> +
> +      gcc_assert (op_type == binary_op);
> +      vop1 = (*vec_oprnds1)[i];
> +
> +      /* Widen the first vector input.  */
> +      out_type = TREE_TYPE (vec_dest);
> +      new_tmp1 = make_ssa_name (out_type);
> +      new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
> +      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
> +      if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
> +     {
> +       /* Widen the second vector input.  */
> +       new_tmp2 = make_ssa_name (out_type);
> +       new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
> +       vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
> +       /* Perform the operation, with both vector inputs widened.  */
> +       new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, new_tmp2);
> +     }
> +      else
> +     {
> +       /* Perform the operation, with the single vector input widened.  */
> +       new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, vop1);
> +     }
> +
> +      new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
> +      gimple_assign_set_lhs (new_stmt3, new_tmp3);
> +      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
> +
> +      /* Store the results for the next step.  */
> +      vec_tmp.quick_push (new_tmp3);
> +    }
> +
> +  vec_oprnds0->release ();
> +  *vec_oprnds0 = vec_tmp;
> +}
> +
>  
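FWIW, a scalar C analogue of the sequence this emits, in case it helps
anyone reading along -- this is my own illustration rather than
anything in the patch.  The idea is that a "half widening" operation
on equal-element-count vectors can be open-coded as plain conversions
(the NOP_EXPRs above) followed by the normal operation at the wider
type:

#include <stdint.h>
#include <stdio.h>

/* Scalar stand-in for one vector lane: widen both uint8_t inputs,
   then do an ordinary addition at the wider type.  */
static uint16_t
half_widen_add (uint8_t a, uint8_t b)
{
  uint16_t wa = a;   /* first NOP_EXPR-style conversion  */
  uint16_t wb = b;   /* second NOP_EXPR-style conversion  */
  return wa + wb;    /* the PLUS_EXPR at the wider type  */
}

int
main (void)
{
  /* 250 + 250 = 500 overflows uint8_t, so the widening is what
     keeps the result exact.  */
  printf ("%u\n", (unsigned) half_widen_add (250, 250));
  return 0;
}
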
>  /* Check if STMT_INFO performs a conversion operation that can be vectorized.
>     If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
> @@ -4697,7 +4755,13 @@ vectorizable_conversion (vec_info *vinfo,
>    nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
>    nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
>    if (known_eq (nunits_out, nunits_in))
> -    modifier = NONE;
> +    if (code == WIDEN_MINUS_EXPR
> +     || code == WIDEN_PLUS_EXPR
> +     || code == WIDEN_LSHIFT_EXPR
> +     || code == WIDEN_MULT_EXPR)
> +      modifier = WIDEN;
> +    else
> +      modifier = NONE;
>    else if (multiple_p (nunits_out, nunits_in))
>      modifier = NARROW;
>    else
> @@ -4743,9 +4807,18 @@ vectorizable_conversion (vec_info *vinfo,
>        return false;
>  
>      case WIDEN:
> -      if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
> -                                       vectype_in, &code1, &code2,
> -                                       &multi_step_cvt, &interm_types))
> +      if (known_eq (nunits_in, nunits_out))
> +     {
> +       if (!supportable_half_widening_operation (code, vectype_out,
> +                                                vectype_in, &code1))
> +         goto unsupported;
> +       gcc_assert (!(multi_step_cvt && op_type == binary_op));
> +       break;
> +     }
> +      if (supportable_widening_operation (vinfo, code, stmt_info,
> +                                            vectype_out, vectype_in, &code1,
> +                                            &code2, &multi_step_cvt,
> +                                            &interm_types))
>       {
>         /* Binary widening operation can only be supported directly by the
>            architecture.  */
> @@ -4981,10 +5054,16 @@ vectorizable_conversion (vec_info *vinfo,
>             c1 = codecvt1;
>             c2 = codecvt2;
>           }
> -       vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> -                                               &vec_oprnds1, stmt_info,
> -                                               this_dest, gsi,
> -                                               c1, c2, op_type);
> +       if (known_eq (nunits_out, nunits_in))
> +         vect_create_half_widening_stmts (vinfo, &vec_oprnds0,
> +                                                 &vec_oprnds1, stmt_info,
> +                                                 this_dest, gsi,
> +                                                 c1, op_type);
> +       else
> +         vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> +                                                 &vec_oprnds1, stmt_info,
> +                                                 this_dest, gsi,
> +                                                 c1, c2, op_type);
>       }
>  
>        FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
