Re: [PATCH] Use get_nonzero_bits to improve vectorization

Richard Biener Tue, 29 Oct 2013 05:12:17 -0700

On Fri, 25 Oct 2013, Jakub Jelinek wrote:

> Hi!
> 
> The following patch makes use of the computed nonzero_bits preserved
> in the SSA_NAME_RANGE_INFO.
> I chose to write a new routine instead of improving current
> highest_pow2_factor, because that routine didn't care about overflows etc.
> and by working on ctz numbers instead of powers of two in UHWI we can handle
> even larger constants etc.  highest_pow2_factor could very well overflow to
> zero etc.  So, the patch introduces a new tree_ctz routine and reimplements
> highest_pow2_factor on top of that, plus uses tree_ctz also in
> get_object_alignment_2 and in the vectorizer to determine if it can avoid
> scalar loop for bound (and indirectly also in the analysis whether peeling
> for alignment is needed).
> 
> With this patch, e.g.
> int a[1024];
> 
> void
> foo (int x, int y)
> {
>   int i;
>   x &= -32;
>   y &= -32;
>   for (i = x + 32; i < y; i++)
>     a[i]++;
> }
> can be vectorized without any peeling for alignment or scalar loop
> afterwards.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> 2013-10-25  Jakub Jelinek  <ja...@redhat.com>
> 
>       * tree.c (tree_ctz): New function.
>       * tree.h (tree_ctz): New prototype.
>       * tree-ssanames.h (get_range_info, get_nonzero_bits): Change
>       first argument from tree to const_tree.
>       * tree-ssanames.c (get_range_info, get_nonzero_bits): Likewise.
>       * tree-vectorizer.h (vect_generate_tmps_on_preheader): New prototype.
>       * tree-vect-loop-manip.c (vect_generate_tmps_on_preheader): No longer
>       static.
>       * expr.c (highest_pow2_factor): Reimplemented using tree_ctz.
>       * tree-vect-loop.c (vect_analyze_loop_operations,
>       vect_transform_loop): Don't force scalar loop for bound just because
>       number of iterations is unknown, only do it if it is not known to be
>       a multiple of vectorization_factor.
>       * builtins.c (get_object_alignment_2): Use tree_ctz on offset.
> 
> --- gcc/tree.c.jj     2013-10-23 14:43:12.000000000 +0200
> +++ gcc/tree.c        2013-10-25 15:00:55.296178794 +0200
> @@ -2213,6 +2213,110 @@ tree_floor_log2 (const_tree expr)
>         : floor_log2 (low));
>  }
>  
> +/* Return number of known trailing zero bits in EXPR, or, if the value of
> +   EXPR is known to be zero, the precision of it's type.  */
> +
> +int


unsigned int?

> +tree_ctz (const_tree expr)
> +{
> +  if (!INTEGRAL_TYPE_P (TREE_TYPE (expr))
> +      && !POINTER_TYPE_P (TREE_TYPE (expr)))
> +    return 0;
> +
> +  int ret1, ret2, prec = TYPE_PRECISION (TREE_TYPE (expr));
> +  switch (TREE_CODE (expr))
> +    {
> +    case INTEGER_CST:
> +      ret1 = tree_to_double_int (expr).trailing_zeros ();
> +      return MIN (ret1, prec);
> +    case SSA_NAME:
> +      ret1 = get_nonzero_bits (expr).trailing_zeros ();
> +      return MIN (ret1, prec);
> +    case PLUS_EXPR:
> +    case MINUS_EXPR:
> +    case BIT_IOR_EXPR:
> +    case BIT_XOR_EXPR:
> +    case MIN_EXPR:
> +    case MAX_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));

This first recurses but if either one returns 0 you don't have
to (that cuts down the recursion from exponential to linear in
the common case?).  Thus, early out on ret == 0?

> +      return MIN (ret1, ret2);
> +    case POINTER_PLUS_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> +      ret2 = MIN (ret2, prec);

Why do you need that here but not elsewhere when processing
binary ops?

> +      return MIN (ret1, ret2);
> +    case BIT_AND_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> +      return MAX (ret1, ret2);
> +    case MULT_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
> +      return MIN (ret1 + ret2, prec);
> +    case LSHIFT_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      if (host_integerp (TREE_OPERAND (expr, 1), 1)

check that first before recursing for op0 - if op1 is negative
you simply return ret1 which looks wrong, too.

> +       && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1)
> +           < (unsigned HOST_WIDE_INT) prec))

This check is to avoid overflowing ret1 + ret2?  If so, why not add

> +     {
> +       ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1);

  ret2 = MIN (ret2, prec);

instead?

> +       return MIN (ret1 + ret2, prec);
> +     }
> +      return ret1;
> +    case RSHIFT_EXPR:
> +      if (host_integerp (TREE_OPERAND (expr, 1), 1)
> +       && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1)
> +           < (unsigned HOST_WIDE_INT) prec))
> +     {
> +       ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +       ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1);
> +       if (ret1 > ret2)
> +         return ret1 - ret2;
> +     }
> +      return 0;

Seems to be slightly better structured.  Looks like you assume only
positive shift amounts exist in the LSHIFT_EXPR case, I'm not sure
that's a valid assumption (see constant folding code dealing with that).

> +    case TRUNC_DIV_EXPR:
> +    case CEIL_DIV_EXPR:
> +    case FLOOR_DIV_EXPR:
> +    case ROUND_DIV_EXPR:
> +    case EXACT_DIV_EXPR:
> +      if (TREE_CODE (TREE_OPERAND (expr, 1)) == INTEGER_CST)
> +     {
> +       ret2 = tree_log2 (TREE_OPERAND (expr, 1));
> +       if (ret2 >= 0 && tree_int_cst_sgn (TREE_OPERAND (expr, 1)) == 1)

cheaper to test the sign first.

> +         {
> +           ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +           if (ret1 > ret2)
> +             return ret1 - ret2;
> +         }
> +     }
> +      return 0;
> +    CASE_CONVERT:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
> +      if (ret1 && ret1 == TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (expr, 
> 0))))
> +     ret1 = prec;
> +      return MIN (ret1, prec);
> +    case SAVE_EXPR:
> +      return tree_ctz (TREE_OPERAND (expr, 0));
> +    case COND_EXPR:
> +      ret1 = tree_ctz (TREE_OPERAND (expr, 1));
> +      ret2 = tree_ctz (TREE_OPERAND (expr, 2));
> +      return MIN (ret1, ret2);
> +    case COMPOUND_EXPR:
> +      return tree_ctz (TREE_OPERAND (expr, 1));
> +    case ADDR_EXPR:
> +      ret1 = get_object_alignment (TREE_OPERAND (expr, 0));

Use get_pointer_alignment, this isn't a memory reference so type
alignment rules don't apply.

The rest looks ok to me.

Thanks,
Richard.

> +      if (ret1 > BITS_PER_UNIT)
> +     {
> +       ret1 = ctz_hwi (ret1 / BITS_PER_UNIT);
> +       return MIN (ret1, prec);
> +     }
> +      return 0;
> +    default:
> +      return 0;
> +    }
> +}
> +
>  /* Return 1 if EXPR is the real constant zero.  Trailing zeroes matter for
>     decimal float constants, so don't return 1 for them.  */
>  
> --- gcc/tree.h.jj     2013-10-17 22:30:45.000000000 +0200
> +++ gcc/tree.h        2013-10-25 12:20:05.473673186 +0200
> @@ -4546,6 +4546,7 @@ extern void get_type_static_bounds (cons
>  extern bool variably_modified_type_p (tree, tree);
>  extern int tree_log2 (const_tree);
>  extern int tree_floor_log2 (const_tree);
> +extern int tree_ctz (const_tree);
>  extern int simple_cst_equal (const_tree, const_tree);
>  extern hashval_t iterative_hash_expr (const_tree, hashval_t);
>  extern hashval_t iterative_hash_exprs_commutative (const_tree,
> --- gcc/tree-ssanames.h.jj    2013-10-24 15:52:53.000000000 +0200
> +++ gcc/tree-ssanames.h       2013-10-25 14:09:21.227015919 +0200
> @@ -72,9 +72,10 @@ enum value_range_type { VR_UNDEFINED, VR
>  /* Sets the value range to SSA.  */
>  extern void set_range_info (tree, double_int, double_int);
>  /* Gets the value range from SSA.  */
> -extern enum value_range_type get_range_info (tree, double_int *, double_int 
> *);
> +extern enum value_range_type get_range_info (const_tree, double_int *,
> +                                          double_int *);
>  extern void set_nonzero_bits (tree, double_int);
> -extern double_int get_nonzero_bits (tree);
> +extern double_int get_nonzero_bits (const_tree);
>  extern void init_ssanames (struct function *, int);
>  extern void fini_ssanames (void);
>  extern void ssanames_print_statistics (void);
> --- gcc/tree-ssanames.c.jj    2013-10-24 17:32:22.000000000 +0200
> +++ gcc/tree-ssanames.c       2013-10-25 14:08:46.218187581 +0200
> @@ -221,7 +221,7 @@ set_range_info (tree name, double_int mi
>     is used to determine if MIN and MAX are valid values.  */
>  
>  enum value_range_type
> -get_range_info (tree name, double_int *min, double_int *max)
> +get_range_info (const_tree name, double_int *min, double_int *max)
>  {
>    enum value_range_type range_type;
>    gcc_assert (!POINTER_TYPE_P (TREE_TYPE (name)));
> @@ -271,7 +271,7 @@ set_nonzero_bits (tree name, double_int
>     NAME, or double_int_minus_one if unknown.  */
>  
>  double_int
> -get_nonzero_bits (tree name)
> +get_nonzero_bits (const_tree name)
>  {
>    if (POINTER_TYPE_P (TREE_TYPE (name)))
>      {
> --- gcc/tree-vectorizer.h.jj  2013-10-24 10:19:20.000000000 +0200
> +++ gcc/tree-vectorizer.h     2013-10-25 14:02:58.198999063 +0200
> @@ -901,6 +901,8 @@ extern void slpeel_make_loop_iterate_nti
>  extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
>  struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge);
>  extern void vect_loop_versioning (loop_vec_info, unsigned int, bool);
> +extern void vect_generate_tmps_on_preheader (loop_vec_info, tree *, tree *,
> +                                          tree *, gimple_seq);
>  extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *,
>                                           unsigned int, bool);
>  extern void vect_do_peeling_for_alignment (loop_vec_info, unsigned int, 
> bool);
> --- gcc/tree-vect-loop-manip.c.jj     2013-10-24 10:19:22.000000000 +0200
> +++ gcc/tree-vect-loop-manip.c        2013-10-25 14:02:00.544284058 +0200
> @@ -1437,7 +1437,7 @@ vect_build_loop_niters (loop_vec_info lo
>   and places them at the loop preheader edge or in COND_EXPR_STMT_LIST
>   if that is non-NULL.  */
>  
> -static void
> +void
>  vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
>                                tree *ni_name_ptr,
>                                tree *ratio_mult_vf_name_ptr,
> --- gcc/expr.c.jj     2013-10-23 14:43:15.000000000 +0200
> +++ gcc/expr.c        2013-10-25 15:05:23.893781676 +0200
> @@ -7282,74 +7282,14 @@ safe_from_p (const_rtx x, tree exp, int
>  unsigned HOST_WIDE_INT
>  highest_pow2_factor (const_tree exp)
>  {
> -  unsigned HOST_WIDE_INT c0, c1;
> -
> -  switch (TREE_CODE (exp))
> -    {
> -    case INTEGER_CST:
> -      /* We can find the lowest bit that's a one.  If the low
> -      HOST_BITS_PER_WIDE_INT bits are zero, return BIGGEST_ALIGNMENT.
> -      We need to handle this case since we can find it in a COND_EXPR,
> -      a MIN_EXPR, or a MAX_EXPR.  If the constant overflows, we have an
> -      erroneous program, so return BIGGEST_ALIGNMENT to avoid any
> -      later ICE.  */
> -      if (TREE_OVERFLOW (exp))
> -     return BIGGEST_ALIGNMENT;
> -      else
> -     {
> -       /* Note: tree_low_cst is intentionally not used here,
> -          we don't care about the upper bits.  */
> -       c0 = TREE_INT_CST_LOW (exp);
> -       c0 &= -c0;
> -       return c0 ? c0 : BIGGEST_ALIGNMENT;
> -     }
> -      break;
> -
> -    case PLUS_EXPR:  case MINUS_EXPR:  case MIN_EXPR:  case MAX_EXPR:
> -      c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
> -      c1 = highest_pow2_factor (TREE_OPERAND (exp, 1));
> -      return MIN (c0, c1);
> -
> -    case MULT_EXPR:
> -      c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
> -      c1 = highest_pow2_factor (TREE_OPERAND (exp, 1));
> -      return c0 * c1;
> -
> -    case ROUND_DIV_EXPR:  case TRUNC_DIV_EXPR:  case FLOOR_DIV_EXPR:
> -    case CEIL_DIV_EXPR:
> -      if (integer_pow2p (TREE_OPERAND (exp, 1))
> -       && host_integerp (TREE_OPERAND (exp, 1), 1))
> -     {
> -       c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
> -       c1 = tree_low_cst (TREE_OPERAND (exp, 1), 1);
> -       return MAX (1, c0 / c1);
> -     }
> -      break;
> -
> -    case BIT_AND_EXPR:
> -      /* The highest power of two of a bit-and expression is the maximum of
> -      that of its operands.  We typically get here for a complex LHS and
> -      a constant negative power of two on the RHS to force an explicit
> -      alignment, so don't bother looking at the LHS.  */
> -      return highest_pow2_factor (TREE_OPERAND (exp, 1));
> -
> -    CASE_CONVERT:
> -    case SAVE_EXPR:
> -      return highest_pow2_factor (TREE_OPERAND (exp, 0));
> -
> -    case COMPOUND_EXPR:
> -      return highest_pow2_factor (TREE_OPERAND (exp, 1));
> -
> -    case COND_EXPR:
> -      c0 = highest_pow2_factor (TREE_OPERAND (exp, 1));
> -      c1 = highest_pow2_factor (TREE_OPERAND (exp, 2));
> -      return MIN (c0, c1);
> -
> -    default:
> -      break;
> -    }
> -
> -  return 1;
> +  unsigned HOST_WIDE_INT ret;
> +  int trailing_zeros = tree_ctz (exp);
> +  if (trailing_zeros >= HOST_BITS_PER_WIDE_INT)
> +    return BIGGEST_ALIGNMENT;
> +  ret = (unsigned HOST_WIDE_INT) 1 << trailing_zeros;
> +  if (ret > BIGGEST_ALIGNMENT)
> +    return BIGGEST_ALIGNMENT;
> +  return ret;
>  }
>  
>  /* Similar, except that the alignment requirements of TARGET are
> --- gcc/tree-vect-loop.c.jj   2013-10-24 10:19:23.000000000 +0200
> +++ gcc/tree-vect-loop.c      2013-10-25 14:01:35.968407222 +0200
> @@ -1586,9 +1586,9 @@ vect_analyze_loop_operations (loop_vec_i
>        return false;
>      }
>  
> -  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> -      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
> -      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
> +  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)
> +      || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
> +       < exact_log2 (vectorization_factor)))
>      {
>        if (dump_enabled_p ())
>          dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.\n");
> @@ -5656,15 +5656,20 @@ vect_transform_loop (loop_vec_info loop_
>       will remain scalar and will compute the remaining (n%VF) iterations.
>       (VF is the vectorization factor).  */
>  
> -  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> -       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> -        && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
> -       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
> +  if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
> +      < exact_log2 (vectorization_factor)
> +      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
>      vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
>                                   th, check_profitability);
> -  else
> +  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
>      ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
>               LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
> +  else
> +    {
> +      tree ni_name, ratio_mult_vf;
> +      vect_generate_tmps_on_preheader (loop_vinfo, &ni_name, &ratio_mult_vf,
> +                                    &ratio, NULL);
> +    }
>  
>    /* 1) Make sure the loop header has exactly two entries
>       2) Make sure we have a preheader basic block.  */
> --- gcc/builtins.c.jj 2013-10-23 14:43:12.000000000 +0200
> +++ gcc/builtins.c    2013-10-25 12:31:49.426022284 +0200
> @@ -309,7 +309,7 @@ get_object_alignment_2 (tree exp, unsign
>    tree offset;
>    enum machine_mode mode;
>    int unsignedp, volatilep;
> -  unsigned int inner, align = BITS_PER_UNIT;
> +  unsigned int align = BITS_PER_UNIT;
>    bool known_alignment = false;
>  
>    /* Get the innermost object and the constant (bitpos) and possibly
> @@ -418,50 +418,16 @@ get_object_alignment_2 (tree exp, unsign
>  
>    /* If there is a non-constant offset part extract the maximum
>       alignment that can prevail.  */
> -  inner = ~0U;
> -  while (offset)
> +  if (offset)
>      {
> -      tree next_offset;
> -
> -      if (TREE_CODE (offset) == PLUS_EXPR)
> -     {
> -       next_offset = TREE_OPERAND (offset, 0);
> -       offset = TREE_OPERAND (offset, 1);
> -     }
> -      else
> -     next_offset = NULL;
> -      if (host_integerp (offset, 1))
> -     {
> -       /* Any overflow in calculating offset_bits won't change
> -          the alignment.  */
> -       unsigned offset_bits
> -         = ((unsigned) tree_low_cst (offset, 1) * BITS_PER_UNIT);
> -
> -       if (offset_bits)
> -         inner = MIN (inner, (offset_bits & -offset_bits));
> -     }
> -      else if (TREE_CODE (offset) == MULT_EXPR
> -            && host_integerp (TREE_OPERAND (offset, 1), 1))
> -     {
> -       /* Any overflow in calculating offset_factor won't change
> -          the alignment.  */
> -       unsigned offset_factor
> -         = ((unsigned) tree_low_cst (TREE_OPERAND (offset, 1), 1)
> -            * BITS_PER_UNIT);
> -
> -       if (offset_factor)
> -         inner = MIN (inner, (offset_factor & -offset_factor));
> -     }
> -      else
> +      int trailing_zeros = tree_ctz (offset);
> +      if (trailing_zeros < HOST_BITS_PER_INT)
>       {
> -       inner = MIN (inner, BITS_PER_UNIT);
> -       break;
> +       unsigned int inner = (1U << trailing_zeros) * BITS_PER_UNIT;
> +       if (inner)
> +         align = MIN (align, inner);
>       }
> -      offset = next_offset;
>      }
> -  /* Alignment is innermost object alignment adjusted by the constant
> -     and non-constant offset parts.  */
> -  align = MIN (align, inner);
>  
>    *alignp = align;
>    *bitposp = bitpos & (*alignp - 1);
> 
> 
>       Jakub
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imend

Re: [PATCH] Use get_nonzero_bits to improve vectorization

Reply via email to