On Fri, 25 Oct 2013, Jakub Jelinek wrote: > Hi! > > The following patch makes use of the computed nonzero_bits preserved > in the SSA_NAME_RANGE_INFO. > I chose to write a new routine instead of improving current > highest_pow2_factor, because that routine didn't care about overflows etc. > and by working on ctz numbers instead of powers of two in UHWI we can handle > even larger constants etc. highest_pow2_factor could very well overflow to > zero etc. So, the patch introduces a new tree_ctz routine and reimplements > highest_pow2_factor on top of that, plus uses tree_ctz also in > get_object_alignment_2 and in the vectorizer to determine if it can avoid > scalar loop for bound (and indirectly also in the analysis whether peeling > for alignment is needed). > > With this patch, e.g. > int a[1024]; > > void > foo (int x, int y) > { > int i; > x &= -32; > y &= -32; > for (i = x + 32; i < y; i++) > a[i]++; > } > can be vectorized without any peeling for alignment or scalar loop > afterwards. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2013-10-25 Jakub Jelinek <ja...@redhat.com> > > * tree.c (tree_ctz): New function. > * tree.h (tree_ctz): New prototype. > * tree-ssanames.h (get_range_info, get_nonzero_bits): Change > first argument from tree to const_tree. > * tree-ssanames.c (get_range_info, get_nonzero_bits): Likewise. > * tree-vectorizer.h (vect_generate_tmps_on_preheader): New prototype. > * tree-vect-loop-manip.c (vect_generate_tmps_on_preheader): No longer > static. > * expr.c (highest_pow2_factor): Reimplemented using tree_ctz. > * tree-vect-loop.c (vect_analyze_loop_operations, > vect_transform_loop): Don't force scalar loop for bound just because > number of iterations is unknown, only do it if it is not known to be > a multiple of vectorization_factor. > * builtins.c (get_object_alignment_2): Use tree_ctz on offset. > > --- gcc/tree.c.jj 2013-10-23 14:43:12.000000000 +0200 > +++ gcc/tree.c 2013-10-25 15:00:55.296178794 +0200 > @@ -2213,6 +2213,110 @@ tree_floor_log2 (const_tree expr) > : floor_log2 (low)); > } > > +/* Return number of known trailing zero bits in EXPR, or, if the value of > + EXPR is known to be zero, the precision of it's type. */ > + > +int
unsigned int? > +tree_ctz (const_tree expr) > +{ > + if (!INTEGRAL_TYPE_P (TREE_TYPE (expr)) > + && !POINTER_TYPE_P (TREE_TYPE (expr))) > + return 0; > + > + int ret1, ret2, prec = TYPE_PRECISION (TREE_TYPE (expr)); > + switch (TREE_CODE (expr)) > + { > + case INTEGER_CST: > + ret1 = tree_to_double_int (expr).trailing_zeros (); > + return MIN (ret1, prec); > + case SSA_NAME: > + ret1 = get_nonzero_bits (expr).trailing_zeros (); > + return MIN (ret1, prec); > + case PLUS_EXPR: > + case MINUS_EXPR: > + case BIT_IOR_EXPR: > + case BIT_XOR_EXPR: > + case MIN_EXPR: > + case MAX_EXPR: > + ret1 = tree_ctz (TREE_OPERAND (expr, 0)); > + ret2 = tree_ctz (TREE_OPERAND (expr, 1)); This first recurses but if either one returns 0 you don't have to (that cuts down the recursion from exponential to linear in the common case?). Thus, early out on ret == 0? > + return MIN (ret1, ret2); > + case POINTER_PLUS_EXPR: > + ret1 = tree_ctz (TREE_OPERAND (expr, 0)); > + ret2 = tree_ctz (TREE_OPERAND (expr, 1)); > + ret2 = MIN (ret2, prec); Why do you need that here but not elsewhere when processing binary ops? > + return MIN (ret1, ret2); > + case BIT_AND_EXPR: > + ret1 = tree_ctz (TREE_OPERAND (expr, 0)); > + ret2 = tree_ctz (TREE_OPERAND (expr, 1)); > + return MAX (ret1, ret2); > + case MULT_EXPR: > + ret1 = tree_ctz (TREE_OPERAND (expr, 0)); > + ret2 = tree_ctz (TREE_OPERAND (expr, 1)); > + return MIN (ret1 + ret2, prec); > + case LSHIFT_EXPR: > + ret1 = tree_ctz (TREE_OPERAND (expr, 0)); > + if (host_integerp (TREE_OPERAND (expr, 1), 1) check that first before recursing for op0 - if op1 is negative you simply return ret1 which looks wrong, too. > + && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1) > + < (unsigned HOST_WIDE_INT) prec)) This check is to avoid overflowing ret1 + ret2? If so, why not add > + { > + ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1); ret2 = MIN (ret2, prec); instead? > + return MIN (ret1 + ret2, prec); > + } > + return ret1; > + case RSHIFT_EXPR: > + if (host_integerp (TREE_OPERAND (expr, 1), 1) > + && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1) > + < (unsigned HOST_WIDE_INT) prec)) > + { > + ret1 = tree_ctz (TREE_OPERAND (expr, 0)); > + ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1); > + if (ret1 > ret2) > + return ret1 - ret2; > + } > + return 0; Seems to be slightly better structured. Looks like you assume only positive shift amounts exist in the LSHIFT_EXPR case, I'm not sure that's a valid assumption (see constant folding code dealing with that). > + case TRUNC_DIV_EXPR: > + case CEIL_DIV_EXPR: > + case FLOOR_DIV_EXPR: > + case ROUND_DIV_EXPR: > + case EXACT_DIV_EXPR: > + if (TREE_CODE (TREE_OPERAND (expr, 1)) == INTEGER_CST) > + { > + ret2 = tree_log2 (TREE_OPERAND (expr, 1)); > + if (ret2 >= 0 && tree_int_cst_sgn (TREE_OPERAND (expr, 1)) == 1) cheaper to test the sign first. > + { > + ret1 = tree_ctz (TREE_OPERAND (expr, 0)); > + if (ret1 > ret2) > + return ret1 - ret2; > + } > + } > + return 0; > + CASE_CONVERT: > + ret1 = tree_ctz (TREE_OPERAND (expr, 0)); > + if (ret1 && ret1 == TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (expr, > 0)))) > + ret1 = prec; > + return MIN (ret1, prec); > + case SAVE_EXPR: > + return tree_ctz (TREE_OPERAND (expr, 0)); > + case COND_EXPR: > + ret1 = tree_ctz (TREE_OPERAND (expr, 1)); > + ret2 = tree_ctz (TREE_OPERAND (expr, 2)); > + return MIN (ret1, ret2); > + case COMPOUND_EXPR: > + return tree_ctz (TREE_OPERAND (expr, 1)); > + case ADDR_EXPR: > + ret1 = get_object_alignment (TREE_OPERAND (expr, 0)); Use get_pointer_alignment, this isn't a memory reference so type alignment rules don't apply. The rest looks ok to me. Thanks, Richard. > + if (ret1 > BITS_PER_UNIT) > + { > + ret1 = ctz_hwi (ret1 / BITS_PER_UNIT); > + return MIN (ret1, prec); > + } > + return 0; > + default: > + return 0; > + } > +} > + > /* Return 1 if EXPR is the real constant zero. Trailing zeroes matter for > decimal float constants, so don't return 1 for them. */ > > --- gcc/tree.h.jj 2013-10-17 22:30:45.000000000 +0200 > +++ gcc/tree.h 2013-10-25 12:20:05.473673186 +0200 > @@ -4546,6 +4546,7 @@ extern void get_type_static_bounds (cons > extern bool variably_modified_type_p (tree, tree); > extern int tree_log2 (const_tree); > extern int tree_floor_log2 (const_tree); > +extern int tree_ctz (const_tree); > extern int simple_cst_equal (const_tree, const_tree); > extern hashval_t iterative_hash_expr (const_tree, hashval_t); > extern hashval_t iterative_hash_exprs_commutative (const_tree, > --- gcc/tree-ssanames.h.jj 2013-10-24 15:52:53.000000000 +0200 > +++ gcc/tree-ssanames.h 2013-10-25 14:09:21.227015919 +0200 > @@ -72,9 +72,10 @@ enum value_range_type { VR_UNDEFINED, VR > /* Sets the value range to SSA. */ > extern void set_range_info (tree, double_int, double_int); > /* Gets the value range from SSA. */ > -extern enum value_range_type get_range_info (tree, double_int *, double_int > *); > +extern enum value_range_type get_range_info (const_tree, double_int *, > + double_int *); > extern void set_nonzero_bits (tree, double_int); > -extern double_int get_nonzero_bits (tree); > +extern double_int get_nonzero_bits (const_tree); > extern void init_ssanames (struct function *, int); > extern void fini_ssanames (void); > extern void ssanames_print_statistics (void); > --- gcc/tree-ssanames.c.jj 2013-10-24 17:32:22.000000000 +0200 > +++ gcc/tree-ssanames.c 2013-10-25 14:08:46.218187581 +0200 > @@ -221,7 +221,7 @@ set_range_info (tree name, double_int mi > is used to determine if MIN and MAX are valid values. */ > > enum value_range_type > -get_range_info (tree name, double_int *min, double_int *max) > +get_range_info (const_tree name, double_int *min, double_int *max) > { > enum value_range_type range_type; > gcc_assert (!POINTER_TYPE_P (TREE_TYPE (name))); > @@ -271,7 +271,7 @@ set_nonzero_bits (tree name, double_int > NAME, or double_int_minus_one if unknown. */ > > double_int > -get_nonzero_bits (tree name) > +get_nonzero_bits (const_tree name) > { > if (POINTER_TYPE_P (TREE_TYPE (name))) > { > --- gcc/tree-vectorizer.h.jj 2013-10-24 10:19:20.000000000 +0200 > +++ gcc/tree-vectorizer.h 2013-10-25 14:02:58.198999063 +0200 > @@ -901,6 +901,8 @@ extern void slpeel_make_loop_iterate_nti > extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); > struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge); > extern void vect_loop_versioning (loop_vec_info, unsigned int, bool); > +extern void vect_generate_tmps_on_preheader (loop_vec_info, tree *, tree *, > + tree *, gimple_seq); > extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *, > unsigned int, bool); > extern void vect_do_peeling_for_alignment (loop_vec_info, unsigned int, > bool); > --- gcc/tree-vect-loop-manip.c.jj 2013-10-24 10:19:22.000000000 +0200 > +++ gcc/tree-vect-loop-manip.c 2013-10-25 14:02:00.544284058 +0200 > @@ -1437,7 +1437,7 @@ vect_build_loop_niters (loop_vec_info lo > and places them at the loop preheader edge or in COND_EXPR_STMT_LIST > if that is non-NULL. */ > > -static void > +void > vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo, > tree *ni_name_ptr, > tree *ratio_mult_vf_name_ptr, > --- gcc/expr.c.jj 2013-10-23 14:43:15.000000000 +0200 > +++ gcc/expr.c 2013-10-25 15:05:23.893781676 +0200 > @@ -7282,74 +7282,14 @@ safe_from_p (const_rtx x, tree exp, int > unsigned HOST_WIDE_INT > highest_pow2_factor (const_tree exp) > { > - unsigned HOST_WIDE_INT c0, c1; > - > - switch (TREE_CODE (exp)) > - { > - case INTEGER_CST: > - /* We can find the lowest bit that's a one. If the low > - HOST_BITS_PER_WIDE_INT bits are zero, return BIGGEST_ALIGNMENT. > - We need to handle this case since we can find it in a COND_EXPR, > - a MIN_EXPR, or a MAX_EXPR. If the constant overflows, we have an > - erroneous program, so return BIGGEST_ALIGNMENT to avoid any > - later ICE. */ > - if (TREE_OVERFLOW (exp)) > - return BIGGEST_ALIGNMENT; > - else > - { > - /* Note: tree_low_cst is intentionally not used here, > - we don't care about the upper bits. */ > - c0 = TREE_INT_CST_LOW (exp); > - c0 &= -c0; > - return c0 ? c0 : BIGGEST_ALIGNMENT; > - } > - break; > - > - case PLUS_EXPR: case MINUS_EXPR: case MIN_EXPR: case MAX_EXPR: > - c0 = highest_pow2_factor (TREE_OPERAND (exp, 0)); > - c1 = highest_pow2_factor (TREE_OPERAND (exp, 1)); > - return MIN (c0, c1); > - > - case MULT_EXPR: > - c0 = highest_pow2_factor (TREE_OPERAND (exp, 0)); > - c1 = highest_pow2_factor (TREE_OPERAND (exp, 1)); > - return c0 * c1; > - > - case ROUND_DIV_EXPR: case TRUNC_DIV_EXPR: case FLOOR_DIV_EXPR: > - case CEIL_DIV_EXPR: > - if (integer_pow2p (TREE_OPERAND (exp, 1)) > - && host_integerp (TREE_OPERAND (exp, 1), 1)) > - { > - c0 = highest_pow2_factor (TREE_OPERAND (exp, 0)); > - c1 = tree_low_cst (TREE_OPERAND (exp, 1), 1); > - return MAX (1, c0 / c1); > - } > - break; > - > - case BIT_AND_EXPR: > - /* The highest power of two of a bit-and expression is the maximum of > - that of its operands. We typically get here for a complex LHS and > - a constant negative power of two on the RHS to force an explicit > - alignment, so don't bother looking at the LHS. */ > - return highest_pow2_factor (TREE_OPERAND (exp, 1)); > - > - CASE_CONVERT: > - case SAVE_EXPR: > - return highest_pow2_factor (TREE_OPERAND (exp, 0)); > - > - case COMPOUND_EXPR: > - return highest_pow2_factor (TREE_OPERAND (exp, 1)); > - > - case COND_EXPR: > - c0 = highest_pow2_factor (TREE_OPERAND (exp, 1)); > - c1 = highest_pow2_factor (TREE_OPERAND (exp, 2)); > - return MIN (c0, c1); > - > - default: > - break; > - } > - > - return 1; > + unsigned HOST_WIDE_INT ret; > + int trailing_zeros = tree_ctz (exp); > + if (trailing_zeros >= HOST_BITS_PER_WIDE_INT) > + return BIGGEST_ALIGNMENT; > + ret = (unsigned HOST_WIDE_INT) 1 << trailing_zeros; > + if (ret > BIGGEST_ALIGNMENT) > + return BIGGEST_ALIGNMENT; > + return ret; > } > > /* Similar, except that the alignment requirements of TARGET are > --- gcc/tree-vect-loop.c.jj 2013-10-24 10:19:23.000000000 +0200 > +++ gcc/tree-vect-loop.c 2013-10-25 14:01:35.968407222 +0200 > @@ -1586,9 +1586,9 @@ vect_analyze_loop_operations (loop_vec_i > return false; > } > > - if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) > - || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0 > - || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)) > + if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > + || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) > + < exact_log2 (vectorization_factor))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.\n"); > @@ -5656,15 +5656,20 @@ vect_transform_loop (loop_vec_info loop_ > will remain scalar and will compute the remaining (n%VF) iterations. > (VF is the vectorization factor). */ > > - if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) > - || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) > - && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0) > - || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) > + if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) > + < exact_log2 (vectorization_factor) > + || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) > vect_do_peeling_for_loop_bound (loop_vinfo, &ratio, > th, check_profitability); > - else > + else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) > ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), > LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor); > + else > + { > + tree ni_name, ratio_mult_vf; > + vect_generate_tmps_on_preheader (loop_vinfo, &ni_name, &ratio_mult_vf, > + &ratio, NULL); > + } > > /* 1) Make sure the loop header has exactly two entries > 2) Make sure we have a preheader basic block. */ > --- gcc/builtins.c.jj 2013-10-23 14:43:12.000000000 +0200 > +++ gcc/builtins.c 2013-10-25 12:31:49.426022284 +0200 > @@ -309,7 +309,7 @@ get_object_alignment_2 (tree exp, unsign > tree offset; > enum machine_mode mode; > int unsignedp, volatilep; > - unsigned int inner, align = BITS_PER_UNIT; > + unsigned int align = BITS_PER_UNIT; > bool known_alignment = false; > > /* Get the innermost object and the constant (bitpos) and possibly > @@ -418,50 +418,16 @@ get_object_alignment_2 (tree exp, unsign > > /* If there is a non-constant offset part extract the maximum > alignment that can prevail. */ > - inner = ~0U; > - while (offset) > + if (offset) > { > - tree next_offset; > - > - if (TREE_CODE (offset) == PLUS_EXPR) > - { > - next_offset = TREE_OPERAND (offset, 0); > - offset = TREE_OPERAND (offset, 1); > - } > - else > - next_offset = NULL; > - if (host_integerp (offset, 1)) > - { > - /* Any overflow in calculating offset_bits won't change > - the alignment. */ > - unsigned offset_bits > - = ((unsigned) tree_low_cst (offset, 1) * BITS_PER_UNIT); > - > - if (offset_bits) > - inner = MIN (inner, (offset_bits & -offset_bits)); > - } > - else if (TREE_CODE (offset) == MULT_EXPR > - && host_integerp (TREE_OPERAND (offset, 1), 1)) > - { > - /* Any overflow in calculating offset_factor won't change > - the alignment. */ > - unsigned offset_factor > - = ((unsigned) tree_low_cst (TREE_OPERAND (offset, 1), 1) > - * BITS_PER_UNIT); > - > - if (offset_factor) > - inner = MIN (inner, (offset_factor & -offset_factor)); > - } > - else > + int trailing_zeros = tree_ctz (offset); > + if (trailing_zeros < HOST_BITS_PER_INT) > { > - inner = MIN (inner, BITS_PER_UNIT); > - break; > + unsigned int inner = (1U << trailing_zeros) * BITS_PER_UNIT; > + if (inner) > + align = MIN (align, inner); > } > - offset = next_offset; > } > - /* Alignment is innermost object alignment adjusted by the constant > - and non-constant offset parts. */ > - align = MIN (align, inner); > > *alignp = align; > *bitposp = bitpos & (*alignp - 1); > > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE / SUSE Labs SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 GF: Jeff Hawn, Jennifer Guild, Felix Imend