On Thu, May 19, 2016 at 9:46 PM, Ilya Enkovich <enkovich....@gmail.com> wrote:
> Hi,
>
> This patch enables vectorization of loop epilogues and low trip count
> loops using masking.

I wonder why we have the epilogue masking restriction with respect to
the original vectorization factor - shouldn't this simply be handled by
vectorizing the epilogue?  First try the original VF (which requires masking
and is equivalent to low-tripcount loop vectorization), then, if that is not
profitable, iterate to smaller VFs?  [Yes, ideally we'd be able to compare
the cost of vectorization with different VFs and choose the best VF.]

Thanks,
Richard.

> Thanks,
> Ilya
> --
> gcc/
>
> 2016-05-19  Ilya Enkovich  <ilya.enkov...@intel.com>
>
>         * dbgcnt.def (vect_tail_mask): New.
>         * tree-vect-loop.c (vect_analyze_loop_2): Support masked loop
>         epilogues and low trip count loops.
>         (vect_get_known_peeling_cost): Ignore scalar epilogue cost for
>         loops we are going to mask.
>         (vect_estimate_min_profitable_iters): Support masked loop
>         epilogues and low trip count loops.
>         * tree-vectorizer.c (vectorize_loops): Add a message for a case
>         when loop epilogue can't be vectorized.
>
>
> diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
> index 73c2966..5aad1d7 100644
> --- a/gcc/dbgcnt.def
> +++ b/gcc/dbgcnt.def
> @@ -193,4 +193,5 @@ DEBUG_COUNTER (tree_sra)
>  DEBUG_COUNTER (vect_loop)
>  DEBUG_COUNTER (vect_slp)
>  DEBUG_COUNTER (vect_tail_combine)
> +DEBUG_COUNTER (vect_tail_mask)
>  DEBUG_COUNTER (dom_unreachable_edges)
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 1a80c42..7075f29 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -2199,7 +2199,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool 
> &fatal)
>    int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>    HOST_WIDE_INT estimated_niter;
>    unsigned th;
> -  int min_scalar_loop_bound;
> +  int min_scalar_loop_bound = 0;
>
>    /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
>    ok = vect_analyze_slp (loop_vinfo, n_stmts);
> @@ -2224,6 +2224,30 @@ start_over:
>    unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>    gcc_assert (vectorization_factor != 0);
>
> +  /* For now we mask loop epilogue using the same VF since it was used
> +     for cost estimations and it should be easier for reduction
> +     optimization.  */
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
> +      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) != 
> (int)vectorization_factor)
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "not vectorized: VF for loop epilogue doesn't "
> +                        "match original loop VF.\n");
> +      return false;
> +    }
> +
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
> +      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) <= 
> (int)vectorization_factor)
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "not vectorized: VF for loop epilogue is too small\n");
> +      return false;
> +    }
> +
>    if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
>                      "vectorization_factor = %d, niters = "
> @@ -2237,11 +2261,29 @@ start_over:
>        || (max_niter != -1
>           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
>      {
> -      if (dump_enabled_p ())
> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "not vectorized: iteration count smaller than "
> -                        "vectorization factor.\n");
> -      return false;
> +      /* Allow low trip count for loop epilogue we want to mask.  */
> +      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +         && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
> +       ;
> +      /* Allow low trip count for non-epilogue loops if flag is enabled.  */
> +      else if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +              && flag_tree_vectorize_short_loops)
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "iteration count is small, masking is "
> +                            "required for chosen vectorization factor.\n");
> +
> +         LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
> +       }
> +      else
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "not vectorized: iteration count smaller than "
> +                            "vectorization factor.\n");
> +         return false;
> +       }
>      }
>
>    /* Analyze the alignment of the data-refs in the loop.
> @@ -2282,6 +2324,16 @@ start_over:
>        return false;
>      }
>
> +  LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = true;
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "vectorizing loop epilogue with masking.\n");
> +      LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
> +    }
> +
>    if (slp)
>      {
>        /* Analyze operations in the SLP instances.  Note this may
> @@ -2305,6 +2357,19 @@ start_over:
>        return false;
>      }
>
> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +      && !LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +    {
> +      gcc_assert (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +                 || !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo));
> +
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "not vectorized: loop cannot be masked.\n");
> +
> +      return false;
> +    }
> +
>    /* Analyze cost.  Decide if worth while to vectorize.  */
>    int min_profitable_estimate, min_profitable_iters;
>    int min_profitable_combine_iters;
> @@ -2324,8 +2389,9 @@ start_over:
>        goto again;
>      }
>
> -  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
> -                           * vectorization_factor) - 1);
> +  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +    min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
> +                             * vectorization_factor) - 1);
>
>    /* Use the cost model only if it is more conservative than user specified
>       threshold.  */
> @@ -2425,18 +2491,28 @@ start_over:
>    else if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>            && min_profitable_combine_iters >= 0)
>      {
> -      if (((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> -           && (LOOP_VINFO_INT_NITERS (loop_vinfo)
> -               >= (unsigned) min_profitable_combine_iters))
> +      if ((LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +          || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +              && (LOOP_VINFO_INT_NITERS (loop_vinfo)
> +                  >= (unsigned) min_profitable_combine_iters))
>            || estimated_niter == -1
>            || estimated_niter >= min_profitable_combine_iters)
> -         && dbg_cnt (vect_tail_combine))
> +         && (LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +             || dbg_cnt (vect_tail_combine)))
>         {
>           LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
>           LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo) = true;
>
> -         dump_printf_loc (MSG_NOTE, vect_location,
> -                          "Decided to combine loop with its epilogue.\n");
> +          if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) && dump_enabled_p ())
> +           {
> +             if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +               dump_printf_loc (MSG_NOTE, vect_location,
> +                                "Decided to vectorize low trip count loop "
> +                                "with masking.\n");
> +             else
> +               dump_printf_loc (MSG_NOTE, vect_location,
> +                                "Decided to combine loop with its epilogue.\n");
> +           }
>
>           /* We need to adjust profitability check if combine
>              epilogue considering additional vector iteration
> @@ -2463,6 +2539,22 @@ start_over:
>         }
>      }
>
> +  /* Check for not profitable low trip count loop vectorization.  */
> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && !LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_NOTE, vect_location,
> +                        "not vectorized: low trip count loop "
> +                        "vectorization is not profitable.\n");
> +      return false;
> +    }
> +
> +  if (LOOP_VINFO_MASK_EPILOGUE (loop_vinfo)
> +      && !dbg_cnt (vect_tail_mask))
> +    LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
> +
>    /* Ok to vectorize!  */
>    return true;
>
> @@ -3413,7 +3505,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, 
> int peel_iters_prologue,
>                                   si->count * peel_iters_prologue,
>                                   si->kind, NULL, si->misalign,
>                                   vect_prologue);
> -  if (*peel_iters_epilogue)
> +  if (*peel_iters_epilogue && !LOOP_VINFO_NEED_MASKING (loop_vinfo))
>      FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
>        retval += record_stmt_cost (epilogue_cost_vec,
>                                   si->count * *peel_iters_epilogue,
> @@ -3451,12 +3543,50 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
>    int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
>
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> +    {
> +      /* Currently we don't produce scalar epilogue version in case
> +        its masked version is provided.  It means we don't need to
> +        compute profitability one more time here.  Just make a
> +        masked loop version.  */
> +      if (LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
> +       {
> +         gcc_assert (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo));
> +
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                          "cost model: mask loop epilogue.\n");
> +
> +         *ret_min_profitable_niters = 0;
> +         *ret_min_profitable_estimate = 0;
> +         *ret_min_profitable_combine_niters = 0;
> +         return;
> +       }
> +      else if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
> +       {
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                          "cost model disabled for epilogue.\n");
> +         *ret_min_profitable_niters = 0;
> +         *ret_min_profitable_estimate = 0;
> +         return;
> +       }
> +    }
>    /* Cost model disabled.  */
> -  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
> +  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>      {
>        dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
>        *ret_min_profitable_niters = 0;
>        *ret_min_profitable_estimate = 0;
> +      *ret_min_profitable_combine_niters = -1;
> +
> +      if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +       *ret_min_profitable_combine_niters = 0;
> +      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
> +              && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +       LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
> +      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
> +              && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +       *ret_min_profitable_combine_niters = 0;
> +
>        return;
>      }
>
> @@ -3544,10 +3674,13 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
>                                 si->count * peel_iters_prologue,
>                                 si->kind, stmt_info, si->misalign,
>                                 vect_prologue);
> -         (void) add_stmt_cost (target_cost_data,
> -                               si->count * peel_iters_epilogue,
> -                               si->kind, stmt_info, si->misalign,
> -                               vect_epilogue);
> +         /* We shouldn't add scalar epilogue cost for low trip
> +            count loops which are masked and have no epilogue.  */
> +         if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +           (void) add_stmt_cost (target_cost_data,
> +                                 si->count * peel_iters_epilogue,
> +                                 si->kind, stmt_info, si->misalign,
> +                                 vect_epilogue);
>         }
>      }
>    else
> @@ -3744,8 +3877,9 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
>                "  Calculated minimum iters for profitability: %d\n",
>                min_profitable_iters);
>
> -  min_profitable_iters =
> -       min_profitable_iters < vf ? vf : min_profitable_iters;
> +  /* Adjust to VF for non-masked loops.  */
> +  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +    min_profitable_iters = MAX (min_profitable_iters, vf);
>
>    /* Because the condition we create is:
>       if (niters <= min_profitable_iters)
> @@ -3787,6 +3921,25 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
>
>    *ret_min_profitable_combine_niters = -1;
>
> +  /* Handle low trip count loops.  */
> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +    {
> +      /* Masked iteration should be better than a scalar loop:
> +        MIC + VIC + MOC < SIC * epilogue_niters  */
> +      if ((int)(masking_inside_cost + masking_prologue_cost + 
> vec_inside_cost)
> +         >= (scalar_single_iter_cost * peel_iters_epilogue))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_NOTE, vect_location,
> +                            "Low trip count loop vectorization is not "
> +                            "profitable.\n");
> +         return;
> +       }
> +
> +      *ret_min_profitable_combine_niters = 0;
> +      return;
> +    }
> +
>    /* Don't try to vectorize epilogue of epilogue.  */
>    if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>      return;
> @@ -3795,7 +3948,9 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
>      {
>        if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
>         {
> -         if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
> +         if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
> +           LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
> +         else if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
>             *ret_min_profitable_combine_niters = 0;
>           return;
>         }
> @@ -3854,6 +4009,29 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
>                              profitable_iters);
>           *ret_min_profitable_combine_niters = profitable_iters;
>         }
> +
> +      if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK))
> +       return;
> +
> +      /* Now compute profitability for loop epilogue masking.
> +        The following condition must hold true:
> +        SIC * epilogue_niters + SOC > VIC + MIC + MPC  */
> +      int min_profitable_masking_niters
> +       = (vec_inside_cost + masking_inside_cost + masking_prologue_cost
> +          - scalar_outside_cost) / scalar_single_iter_cost;
> +      if (min_profitable_masking_niters > peel_iters_epilogue)
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_NOTE, vect_location,
> +                            "Loop epilogue masking is not pofitable.\n");
> +       }
> +      else
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_NOTE, vect_location,
> +                            "Loop epilogue masking is pofitable.\n");
> +         LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
> +       }
>      }
>  }
>
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index 5f15246..f70aed6 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -539,7 +539,16 @@ vectorize_loops (void)
>         loop->aux = loop_vinfo;
>
>         if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
> -         continue;
> +         {
> +           if (loop_vinfo
> +               && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +               && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
> +               && dump_enabled_p ())
> +             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                              "loop epilogue can't be vectorized.\n");
> +
> +           continue;
> +         }
>
>          if (!dbg_cnt (vect_loop))
>           {

Reply via email to