"juzhe.zh...@rivai.ai" <juzhe.zh...@rivai.ai> writes:
> Hi, Richard.
> Would you mind taking a look at the loop control part again:
>
> static gcond *
> vect_set_loop_condition_partial_vectors (class loop *loop,
> loop_vec_info loop_vinfo, tree niters,
> tree final_iv, bool niters_maybe_zero,
> gimple_stmt_iterator loop_cond_gsi)
> ...
> tree loop_len_x = NULL_TREE;
>   FOR_EACH_VEC_ELT (*controls, i, rgc)
>     if (!rgc->controls.is_empty ())
>       {
> ...
>
> /* Set up all controls for this group.  */
> if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
>    OPTIMIZE_FOR_SPEED))
>  test_ctrl
>    = vect_set_loop_controls_by_select_vl (loop, loop_vinfo,
>   &preheader_seq, &header_seq,
>   rgc, niters, &loop_len_x);
> ...
>       }
>
> static tree
> vect_set_loop_controls_by_select_vl (class loop *loop, loop_vec_info 
> loop_vinfo,
>     gimple_seq *preheader_seq,
>     gimple_seq *header_seq,
>     rgroup_controls *rgc, tree niters, tree *x)
> {
>   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
>   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>   /* We are not allowing masked approach in SELECT_VL.  */
>   gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
>
>   tree ctrl_type = rgc->type;
>   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
>   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * 
> rgc->factor;
>   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>
>   /* Calculate the maximum number of item values that the rgroup
>      handles in total, the number that it handles for each iteration
>      of the vector loop.  */
>   tree nitems_total = niters;
>   if (nitems_per_iter != 1)
>     {
>       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> these multiplications don't overflow.  */
>       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
>       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
>   nitems_total, compare_factor);
>     }
>
>   /* Convert the comparison value to the IV type (either a no-op or
>      a promotion).  */
>   nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
>
>   /* Create an induction variable that counts the number of items
>      processed.  */
>   tree index_before_incr, index_after_incr;
>   gimple_stmt_iterator incr_gsi;
>   bool insert_after;
>   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
>
>   /* Test the decremented IV, which will never underflow 0 since we have
>      IFN_SELECT_VL to guarantee that.  */
>   tree test_limit = nitems_total;
>
>   /* Provide a definition of each control in the group.  */
>   tree ctrl;
>   unsigned int i;
>   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
>     {
>       /* Previous controls will cover BIAS items.  This control covers the
> next batch.  */
>       poly_uint64 bias = nitems_per_ctrl * i;
>       tree bias_tree = build_int_cst (iv_type, bias);
>
>       /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> control and adjust the bound down by BIAS.  */
>       tree this_test_limit = test_limit;
>       if (i != 0)
> {
>  this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
>  this_test_limit, bias_tree);
>  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
>  this_test_limit, bias_tree);
> }
>
>       /* Create decrement IV.  */
>       create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop, 
> &incr_gsi,
> insert_after, &index_before_incr, &index_after_incr);
>
>       tree res_len;
>       if (rgc->controls.length () != 1)
> {
>  if (nitems_per_iter == 1)
>    {
>      /* Generate length = (X - VF*I/N) capped to the range [0, VF/N]. */
>      /* step = VF * I / N.  */
>      tree step
> = build_int_cst (iv_type,
> exact_div (vf * i, rgc->controls.length ()));
>      /* Make sure (X - VF*I/N) never underflow zero.  */
>      tree max = gimple_build (header_seq, MAX_EXPR, iv_type, *x, step);
>      res_len
> = gimple_build (header_seq, MIN_EXPR, iv_type,
> index_before_incr,
> build_int_cst (iv_type, vf * nitems_per_iter));
>    }
>  else
>    {
>      /* For SLP, we can't allow non-VF number of elements to be
> processed in non-final iteration. We force the number of
> elements to be processed in each non-final iteration is VF
> elements. If we allow non-VF elements processing in non-final
> iteration will make SLP too complicated and produce inferior
> codegen.
>
>   For example:
>
>    If non-final iteration process VF elements.
>
>      ...
>      .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
>      .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
>      ...
>
>    If non-final iteration process non-VF elements.
>
>      ...
>      .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
>      if (_71 % 2 == 0)
>       .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
>      else
>       .LEN_STORE (vectp_f.8_56, 128B, _72, { 2, 1, 2, 1 }, 0);
>      ...
>
> This is the simple case of 2-elements interleaved vector SLP.
> We consider other interleave vector, the situation will become
> more complicated.  */
>      res_len
> = gimple_build (header_seq, MIN_EXPR, iv_type,
> index_before_incr,
> build_int_cst (iv_type, vf * nitems_per_iter));
>    }
> }
>       else
> {
>  res_len
>    = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
>    index_before_incr, build_int_cst (iv_type, vf));
> }
>       gassign *assign = gimple_build_assign (ctrl, res_len);
>       gimple_seq_add_stmt (header_seq, assign);
>       if (rgc->controls.length () == 1)
> *x = ctrl;
>     }
>
>   return index_after_incr;
> }
>
> Am I understanding this correctly? I think I may need to implement
> VEC_PACK_TRUNC to test your idea.

The formatting didn't reach me intact, so TBH it's a bit difficult to
follow.  But in this code, VF/N should be equivalent to nitems_per_ctrl.
There shouldn't be any need to build an exact_div.

Also, only the single-control rgroup is (or needs) an IV.  The other cases
just go at the start of the loop body, using the single-control IV as input.

Unless I'm missing something, the same length > 1 code could be used
for both SLP and non-SLP.  (The length == 1 handling would still be
different for SLP and non-SLP.)

Thanks,
Richard

Reply via email to