Thanks, this looks functionally correct to me.  And I agree it handles
the cases that previously needed multiplication.

But I think it regresses code quality when no multiplication was needed.
We can now generate duplicate IVs.  Perhaps ivopts would remove the
duplicates, but it might be hard, because of the variable steps.

For example, we would generate duplicate IVs for non-SLP code that
operates on multiple vector sizes.  (Can't remembrer what the status
of unpack/truncate patterns is on RVV.)  But it also shows up for SLP.
E.g., I would expect duplicate IVs for:

uint16_t x[100];
uint32_t y[200];

void f() {
  for (int i = 0; i < 100; i += 2) {
    x[i + 0] += 1;
    x[i + 1] += 2;
    y[i + 0] += 1;
    y[i + 1] += 2;
  }
}

So I think the call to vect_set_loop_controls_directly does still
need to be inside an "if".  But the "if" condition should be based
on whether the IV step is different.  As discussed yesterday, the
IV step is different if nitems_per_iter, aka:

  max_nscalars_per_iter * factor

is different.

Because of that, I think I was wrong to suggest storing the IV in
loop_vinfo.  It should probably be stored in rgroup_controls instead.

Then we could have a structure like this:

  rgroup_controls *rgc;
  rgroup_controls *iv_rgc = nullptr;
  ...
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    if (!rgc->controls.is_empty ())
      {
        ...
        if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
            || !iv_rgc
            || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
                != rgc->max_nscalars_per_iter * rgc->factor))
          {
            /* See whether zero-based IV would ever generate all-false masks
               or zero length before wrapping around.  */
            bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);

            /* Set up all controls for this group.  */
            test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
                                                         &preheader_seq,
                                                         &header_seq,
                                                         loop_cond_gsi, rgc,
                                                         niters, niters_skip,
                                                         might_wrap_p);

            iv_rgc = rgc;
          }

        if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
            && rgc->controls.length () > 1)
          {
            ...your code, using the iv in iv_rgc...;
          }
      }

Some other comments:

> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index ff6159e08d5..f9d92ced982 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -468,6 +468,38 @@ vect_set_loop_controls_directly (class loop *loop, 
> loop_vec_info loop_vinfo,
>    gimple_stmt_iterator incr_gsi;
>    bool insert_after;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +    {
> +      /* single rgroup:

Instead of "single rgroup", how about:

      /* Create an IV that counts down from niters_total and whose step
         is the (variable) amount processed in the current iteration:
        
But please keep the example below as well.

> +      ...
> +      _10 = (unsigned long) count_12(D);
> +      ...
> +      # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> +      _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> +      ...
> +      vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
> +      ...
> +      ivtmp_35 = ivtmp_9 - _36;
> +      ...
> +      if (ivtmp_35 != 0)
> +        goto <bb 4>; [83.33%]
> +      else
> +        goto <bb 5>; [16.67%]
> +      */
> +      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +      tree step = rgc->controls.length () == 1 ? rgc->controls[0]
> +                                            : make_ssa_name (iv_type);
> +      /* Create decrement IV.  */
> +      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
> +              insert_after, &index_before_incr, &index_after_incr);
> +      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
> +                                                         index_before_incr,
> +                                                         nitems_step));
> +      LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo) = step;
> +      return index_after_incr;
> +    }
> +
> +  /* Create increment IV.  */
>    create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
>            loop, &incr_gsi, insert_after, &index_before_incr,
>            &index_after_incr);
> @@ -683,6 +715,63 @@ vect_set_loop_controls_directly (class loop *loop, 
> loop_vec_info loop_vinfo,
>    return next_ctrl;
>  }
>  
> +/* Try to use adjust loop lens for multiple-rgroups.

This is no longer "try", since the function always does something.

How about:

/* Populate DEST_RGM->controls, given that they should add up to STEP.

> +
> +     _36 = MIN_EXPR <ivtmp_34, VF>;

Suggest using "STEP" instead of "_36" here, since this corresponds
to the function's "step" parameter.

> +
> +     First length (MIN (X, VF/N)):
> +       loop_len_15 = MIN_EXPR <_36, VF/N>;

Likewise here.

> +
> +     Second length:
> +       tmp = _36 - loop_len_15;

And here.

> +       loop_len_16 = MIN (tmp, VF/N);
> +
> +     Third length:
> +       tmp2 = tmp - loop_len_16;
> +       loop_len_17 = MIN (tmp2, VF/N);
> +
> +     Last length:
> +       loop_len_18 = tmp2 - loop_len_17;
> +*/
> +
> +static void
> +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
> +                            rgroup_controls *dest_rgm, tree step)
> +{
> +  tree ctrl_type = dest_rgm->type;
> +  poly_uint64 nitems_per_ctrl
> +    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
> +
> +  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> +    {
> +      tree ctrl = dest_rgm->controls[i];
> +      if (i == 0)
> +     {
> +       /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
> +       gassign *assign
> +         = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +       gimple_seq_add_stmt (seq, assign);
> +     }
> +      else if (i == dest_rgm->controls.length () - 1)
> +     {
> +       /* Last iteration: Remain capped to the range [0, VF/N].  */
> +       gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
> +                                              dest_rgm->controls[i - 1]);
> +       gimple_seq_add_stmt (seq, assign);
> +     }
> +      else
> +     {
> +       /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
> +       step = gimple_build (seq, MINUS_EXPR, iv_type, step,
> +                            dest_rgm->controls[i - 1]);
> +       gassign *assign
> +         = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +       gimple_seq_add_stmt (seq, assign);
> +     }
> +    }
> +}
> +
>  /* Set up the iteration condition and rgroup controls for LOOP, given
>     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
>     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> @@ -764,6 +853,70 @@ vect_set_loop_condition_partial_vectors (class loop 
> *loop,
>                                                    loop_cond_gsi, rgc,
>                                                    niters, niters_skip,
>                                                    might_wrap_p);
> +
> +     /* Decrement IV only run vect_set_loop_controls_directly once.  */
> +     if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +         && rgc->controls.length () > 1)
> +       {
> +         /*
> +            - Multiple rgroup (SLP):
> +              ...
> +              _38 = (unsigned long) bnd.7_29;
> +              _39 = _38 * 2;
> +              ...
> +              # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
> +              ...
> +              _43 = MIN_EXPR <ivtmp_41, 32>;
> +              loop_len_26 = MIN_EXPR <_43, 16>;
> +              loop_len_25 = _43 - loop_len_26;
> +              ...
> +              .LEN_STORE (_6, 8B, loop_len_26, ...);
> +              ...
> +              .LEN_STORE (_25, 8B, loop_len_25, ...);
> +              _33 = loop_len_26 / 2;
> +              ...
> +              .LEN_STORE (_8, 16B, _33, ...);
> +              _36 = loop_len_25 / 2;
> +              ...
> +              .LEN_STORE (_15, 16B, _36, ...);
> +              ivtmp_42 = ivtmp_41 - _43;
> +              ...
> +
> +            - Multiple rgroup (non-SLP):
> +              ...
> +              _38 = (unsigned long) n_12(D);
> +              ...
> +              # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> +              ...
> +              loop_len_38 = MIN_EXPR <ivtmp_41, POLY_INT_CST [8, 8]>;
> +              _43 = MIN_EXPR <ivtmp_44, POLY_INT_CST [8, 8]>;
> +              loop_len_24 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> +              _46 = _43 - loop_len_24;
> +              loop_len_23 = MIN_EXPR <_46, POLY_INT_CST [2, 2]>;
> +              _47 = _46 - loop_len_23;
> +              loop_len_22 = MIN_EXPR <_47, POLY_INT_CST [2, 2]>;
> +              loop_len_21 = _47 - loop_len_22;
> +              ...
> +              vect__4.8_17 = .LEN_LOAD (_6, 64B, loop_len_24, 0);
> +              ...
> +              vect__4.9_9 = .LEN_LOAD (_49, 64B, loop_len_23, 0);
> +              ...
> +              vect__4.10_30 = .LEN_LOAD (_52, 64B, loop_len_22, 0);
> +              ...
> +              vect__4.11_32 = .LEN_LOAD (_55, 64B, loop_len_21, 0);
> +              vect__7.13_31 = VEC_PACK_TRUNC_EXPR <...>,
> +              vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
> +              vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
> +              ...
> +              .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> +              ivtmp_39 = ivtmp_38 - _40;
> +              ...
> +         */

I don't think this comment is really necessary, and might be a bit
misleading, since it includes things that are not done directly
by this code.

I think the comment above vect_adjust_loop_lens_control accurately
describes what's going on, so we can just say:

            /* vect_set_loop_controls_directly creates an IV whose step
               is equal to the expected sum of RGC->controls.  Use that
               information to populate RGC->controls.  */

Thanks,
Richard

Reply via email to