2016-06-15 15:00 GMT+03:00 Richard Biener <richard.guent...@gmail.com>:
> On Thu, May 19, 2016 at 9:46 PM, Ilya Enkovich <enkovich....@gmail.com> wrote:
>> Hi,
>>
>> This patch enables vectorization of loop epilogues and low trip count
>> loops using masking.
>
> I wonder why we have the epilogue masking restriction with respect to
> the original vectorization factor - shouldn't this simply be handled by
> vectorizing the epilogue?  First trying the original VF (requires masking
> and is equivalent to low-tripcount loop vectorization), then if that is not
> profitable iterate to smaller VFs?   [yes, ideally we'd be able to compare
> cost for vectorization with different VFs and choose the best VF]

When the main loop is vectorized using some VF, we compute epilogue masking
profitability and generate an epilogue to be vectorized and masked using exactly
the same VF.  In the ideal case we never fail to vectorize the epilogue because
we check that it can be masked.  Unfortunately, we may lose some info when
generating a loop copy (e.g. scev info is lost) and therefore may fail to
vectorize the epilogue.

I expect that if we lose some info and thus fail to vectorize for a specified
VF (for which the main loop was successfully vectorized), then we are going to
fail to vectorize for other vector sizes too.  Actually, I'd prefer to try only
one vector size for vectorization with masking, to save compilation time.

Thanks,
Ilya

>
> Thanks,
> Richard.
>
>> Thanks,
>> Ilya
>> --
>> gcc/
>>
>> 2016-05-19  Ilya Enkovich  <ilya.enkov...@intel.com>
>>
>>         * dbgcnt.def (vect_tail_mask): New.
>>         * tree-vect-loop.c (vect_analyze_loop_2): Support masked loop
>>         epilogues and low trip count loops.
>>         (vect_get_known_peeling_cost): Ignore scalar epilogue cost for
>>         loops we are going to mask.
>>         (vect_estimate_min_profitable_iters): Support masked loop
>>         epilogues and low trip count loops.
>>         * tree-vectorizer.c (vectorize_loops): Add a message for a case
>>         when loop epilogue can't be vectorized.
>>
>>
>> diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
>> index 73c2966..5aad1d7 100644
>> --- a/gcc/dbgcnt.def
>> +++ b/gcc/dbgcnt.def
>> @@ -193,4 +193,5 @@ DEBUG_COUNTER (tree_sra)
>>  DEBUG_COUNTER (vect_loop)
>>  DEBUG_COUNTER (vect_slp)
>>  DEBUG_COUNTER (vect_tail_combine)
>> +DEBUG_COUNTER (vect_tail_mask)
>>  DEBUG_COUNTER (dom_unreachable_edges)
>> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
>> index 1a80c42..7075f29 100644
>> --- a/gcc/tree-vect-loop.c
>> +++ b/gcc/tree-vect-loop.c
>> @@ -2199,7 +2199,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool 
>> &fatal)
>>    int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>>    HOST_WIDE_INT estimated_niter;
>>    unsigned th;
>> -  int min_scalar_loop_bound;
>> +  int min_scalar_loop_bound = 0;
>>
>>    /* Check the SLP opportunities in the loop, analyze and build SLP trees.  
>> */
>>    ok = vect_analyze_slp (loop_vinfo, n_stmts);
>> @@ -2224,6 +2224,30 @@ start_over:
>>    unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>>    gcc_assert (vectorization_factor != 0);
>>
>> +  /* For now we mask loop epilogue using the same VF since it was used
>> +     for cost estimations and it should be easier for reduction
>> +     optimization.  */
>> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
>> +      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) != 
>> (int)vectorization_factor)
>> +    {
>> +      if (dump_enabled_p ())
>> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                        "not vectorized: VF for loop epilogue doesn't "
>> +                        "match original loop VF.\n");
>> +      return false;
>> +    }
>> +
>> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +      && !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
>> +      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) <= 
>> (int)vectorization_factor)
>> +    {
>> +      if (dump_enabled_p ())
>> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                        "not vectorized: VF for loop epilogue is too 
>> small\n");
>> +      return false;
>> +    }
>> +
>>    if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
>>      dump_printf_loc (MSG_NOTE, vect_location,
>>                      "vectorization_factor = %d, niters = "
>> @@ -2237,11 +2261,29 @@ start_over:
>>        || (max_niter != -1
>>           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
>>      {
>> -      if (dump_enabled_p ())
>> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> -                        "not vectorized: iteration count smaller than "
>> -                        "vectorization factor.\n");
>> -      return false;
>> +      /* Allow low trip count for loop epilogue we want to mask.  */
>> +      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +         && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
>> +       ;
>> +      /* Allow low trip count for non-epilogue loops if flag is enabled.  */
>> +      else if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +              && flag_tree_vectorize_short_loops)
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                            "iteration count is small, masking is "
>> +                            "required for chosen vectorization factor.\n");
>> +
>> +         LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
>> +       }
>> +      else
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                            "not vectorized: iteration count smaller than "
>> +                            "vectorization factor.\n");
>> +         return false;
>> +       }
>>      }
>>
>>    /* Analyze the alignment of the data-refs in the loop.
>> @@ -2282,6 +2324,16 @@ start_over:
>>        return false;
>>      }
>>
>> +  LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = true;
>> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
>> +    {
>> +      if (dump_enabled_p ())
>> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                        "vectorizing loop epilogue with masking.\n");
>> +      LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
>> +    }
>> +
>>    if (slp)
>>      {
>>        /* Analyze operations in the SLP instances.  Note this may
>> @@ -2305,6 +2357,19 @@ start_over:
>>        return false;
>>      }
>>
>> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
>> +      && !LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>> +    {
>> +      gcc_assert (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +                 || !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo));
>> +
>> +      if (dump_enabled_p ())
>> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                        "not vectorized: loop cannot be masked.\n");
>> +
>> +      return false;
>> +    }
>> +
>>    /* Analyze cost.  Decide if worth while to vectorize.  */
>>    int min_profitable_estimate, min_profitable_iters;
>>    int min_profitable_combine_iters;
>> @@ -2324,8 +2389,9 @@ start_over:
>>        goto again;
>>      }
>>
>> -  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
>> -                           * vectorization_factor) - 1);
>> +  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +    min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
>> +                             * vectorization_factor) - 1);
>>
>>    /* Use the cost model only if it is more conservative than user specified
>>       threshold.  */
>> @@ -2425,18 +2491,28 @@ start_over:
>>    else if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>>            && min_profitable_combine_iters >= 0)
>>      {
>> -      if (((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
>> -           && (LOOP_VINFO_INT_NITERS (loop_vinfo)
>> -               >= (unsigned) min_profitable_combine_iters))
>> +      if ((LOOP_VINFO_NEED_MASKING (loop_vinfo)
>> +          || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
>> +              && (LOOP_VINFO_INT_NITERS (loop_vinfo)
>> +                  >= (unsigned) min_profitable_combine_iters))
>>            || estimated_niter == -1
>>            || estimated_niter >= min_profitable_combine_iters)
>> -         && dbg_cnt (vect_tail_combine))
>> +         && (LOOP_VINFO_NEED_MASKING (loop_vinfo)
>> +             || dbg_cnt (vect_tail_combine)))
>>         {
>>           LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
>>           LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo) = true;
>>
>> -         dump_printf_loc (MSG_NOTE, vect_location,
>> -                          "Decided to combine loop with its epilogue.\n");
>> +          if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) && dump_enabled_p ())
>> +           {
>> +             if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +               dump_printf_loc (MSG_NOTE, vect_location,
>> +                                "Decided to vectorize low trip count loop "
>> +                                "with masking.\n");
>> +             else
>> +               dump_printf_loc (MSG_NOTE, vect_location,
>> +                                "Decided to combine loop with its 
>> epilogue.\n");
>> +           }
>>
>>           /* We need to adjust profitability check if combine
>>              epilogue considering additional vector iteration
>> @@ -2463,6 +2539,22 @@ start_over:
>>         }
>>      }
>>
>> +  /* Check for not profitable low trip count loop vectorization.  */
>> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
>> +      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +      && !LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
>> +    {
>> +      if (dump_enabled_p ())
>> +       dump_printf_loc (MSG_NOTE, vect_location,
>> +                        "not vectorized: low trip count loop "
>> +                        "vectorization is not profitable.\n");
>> +      return false;
>> +    }
>> +
>> +  if (LOOP_VINFO_MASK_EPILOGUE (loop_vinfo)
>> +      && !dbg_cnt (vect_tail_mask))
>> +    LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
>> +
>>    /* Ok to vectorize!  */
>>    return true;
>>
>> @@ -3413,7 +3505,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, 
>> int peel_iters_prologue,
>>                                   si->count * peel_iters_prologue,
>>                                   si->kind, NULL, si->misalign,
>>                                   vect_prologue);
>> -  if (*peel_iters_epilogue)
>> +  if (*peel_iters_epilogue && !LOOP_VINFO_NEED_MASKING (loop_vinfo))
>>      FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
>>        retval += record_stmt_cost (epilogue_cost_vec,
>>                                   si->count * *peel_iters_epilogue,
>> @@ -3451,12 +3543,50 @@ vect_estimate_min_profitable_iters (loop_vec_info 
>> loop_vinfo,
>>    int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
>>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
>>
>> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>> +    {
>> +      /* Currently we don't produce scalar epilogue version in case
>> +        its masked version is provided.  It means we don't need to
>> +        compute profitability one more time here.  Just make a
>> +        masked loop version.  */
>> +      if (LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
>> +       {
>> +         gcc_assert (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo));
>> +
>> +         dump_printf_loc (MSG_NOTE, vect_location,
>> +                          "cost model: mask loop epilogue.\n");
>> +
>> +         *ret_min_profitable_niters = 0;
>> +         *ret_min_profitable_estimate = 0;
>> +         *ret_min_profitable_combine_niters = 0;
>> +         return;
>> +       }
>> +      else if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
>> +       {
>> +         dump_printf_loc (MSG_NOTE, vect_location,
>> +                          "cost model disabled for epilogue.\n");
>> +         *ret_min_profitable_niters = 0;
>> +         *ret_min_profitable_estimate = 0;
>> +         return;
>> +       }
>> +    }
>>    /* Cost model disabled.  */
>> -  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>> +  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>>      {
>>        dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
>>        *ret_min_profitable_niters = 0;
>>        *ret_min_profitable_estimate = 0;
>> +      *ret_min_profitable_combine_niters = -1;
>> +
>> +      if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +       *ret_min_profitable_combine_niters = 0;
>> +      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
>> +              && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>> +       LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
>> +      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
>> +              && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>> +       *ret_min_profitable_combine_niters = 0;
>> +
>>        return;
>>      }
>>
>> @@ -3544,10 +3674,13 @@ vect_estimate_min_profitable_iters (loop_vec_info 
>> loop_vinfo,
>>                                 si->count * peel_iters_prologue,
>>                                 si->kind, stmt_info, si->misalign,
>>                                 vect_prologue);
>> -         (void) add_stmt_cost (target_cost_data,
>> -                               si->count * peel_iters_epilogue,
>> -                               si->kind, stmt_info, si->misalign,
>> -                               vect_epilogue);
>> +         /* We shouldn't add scalar epilogue cost for low trip
>> +            count loops which are masked and have no epilogue.  */
>> +         if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +           (void) add_stmt_cost (target_cost_data,
>> +                                 si->count * peel_iters_epilogue,
>> +                                 si->kind, stmt_info, si->misalign,
>> +                                 vect_epilogue);
>>         }
>>      }
>>    else
>> @@ -3744,8 +3877,9 @@ vect_estimate_min_profitable_iters (loop_vec_info 
>> loop_vinfo,
>>                "  Calculated minimum iters for profitability: %d\n",
>>                min_profitable_iters);
>>
>> -  min_profitable_iters =
>> -       min_profitable_iters < vf ? vf : min_profitable_iters;
>> +  /* Adjust to VF for non-masked loops.  */
>> +  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +    min_profitable_iters = MAX (min_profitable_iters, vf);
>>
>>    /* Because the condition we create is:
>>       if (niters <= min_profitable_iters)
>> @@ -3787,6 +3921,25 @@ vect_estimate_min_profitable_iters (loop_vec_info 
>> loop_vinfo,
>>
>>    *ret_min_profitable_combine_niters = -1;
>>
>> +  /* Handle low trip count loops.  */
>> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +    {
>> +      /* Masked iteration should be better than a scalar loop:
>> +        MIC + VIC + MOC < SIC * epilogue_niters  */
>> +      if ((int)(masking_inside_cost + masking_prologue_cost + 
>> vec_inside_cost)
>> +         >= (scalar_single_iter_cost * peel_iters_epilogue))
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_NOTE, vect_location,
>> +                            "Low trip count loop vectorization is not "
>> +                            "profitable.\n");
>> +         return;
>> +       }
>> +
>> +      *ret_min_profitable_combine_niters = 0;
>> +      return;
>> +    }
>> +
>>    /* Don't try to vectorize epilogue of epilogue.  */
>>    if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>>      return;
>> @@ -3795,7 +3948,9 @@ vect_estimate_min_profitable_iters (loop_vec_info 
>> loop_vinfo,
>>      {
>>        if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
>>         {
>> -         if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
>> +         if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
>> +           LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
>> +         else if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
>>             *ret_min_profitable_combine_niters = 0;
>>           return;
>>         }
>> @@ -3854,6 +4009,29 @@ vect_estimate_min_profitable_iters (loop_vec_info 
>> loop_vinfo,
>>                              profitable_iters);
>>           *ret_min_profitable_combine_niters = profitable_iters;
>>         }
>> +
>> +      if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK))
>> +       return;
>> +
>> +      /* Now compute profitability for loop epilogue masking.
>> +        The following condition must hold true:
>> +        SIC * epilogue_niters + SOC > VIC + MIC + MPC  */
>> +      int min_profitable_masking_niters
>> +       = (vec_inside_cost + masking_inside_cost + masking_prologue_cost
>> +          - scalar_outside_cost) / scalar_single_iter_cost;
>> +      if (min_profitable_masking_niters > peel_iters_epilogue)
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_NOTE, vect_location,
>> +                            "Loop epilogue masking is not pofitable.\n");
>> +       }
>> +      else
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_NOTE, vect_location,
>> +                            "Loop epilogue masking is pofitable.\n");
>> +         LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
>> +       }
>>      }
>>  }
>>
>> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
>> index 5f15246..f70aed6 100644
>> --- a/gcc/tree-vectorizer.c
>> +++ b/gcc/tree-vectorizer.c
>> @@ -539,7 +539,16 @@ vectorize_loops (void)
>>         loop->aux = loop_vinfo;
>>
>>         if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
>> -         continue;
>> +         {
>> +           if (loop_vinfo
>> +               && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +               && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
>> +               && dump_enabled_p ())
>> +             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                              "loop epilogue can't be vectorized.\n");
>> +
>> +           continue;
>> +         }
>>
>>          if (!dbg_cnt (vect_loop))
>>           {

Reply via email to