> -----Original Message-----
> From: Richard Biener <[email protected]>
> Sent: Thursday, January 22, 2026 9:16 PM
> To: [email protected]
> Cc: [email protected]; Liu, Hongtao
> <[email protected]>
> Subject: [PATCH] Avoid selecting masked epilogs for in-order reduction
> vectorization
> 
> When masking an in-order reduction we are applying the mask with a
> COND_EXPR followed by an in-order accumulation of all elements, including
> the masked ones.  That makes loop masking not profitable.
> 
> Ideally we'd apply this logic to all loops, even when masking is selected via
> --param vect-partial-vector-usage=2, but the current way we iterate over modes
> (and opt out of cost compares) does not allow us to iterate over masked vs.
> non-masked, so that does not work.
> I plan to fix that for GCC 17; for now this fixes a regression for targets
> opting in to avx512_masked_epilogues.
> 
> Bootstrap and regtest running on x86_64-unknown-linux-gnu.  OK for trunk?
OK.  The plan to let the vectorizer cost comparison iterate over masked vs.
non-masked sounds great.
> 
> Thanks,
> Richard.
> 
>       * config/i386/i386.cc (ix86_vector_costs::finish_cost):
>       Avoid selecting masked epilogs for in-order reductions.
> 
>       * gcc.dg/vect/costmodel/x86_64/costmodel-vect-epil-1.c: New
>       testcase.
> ---
>  gcc/config/i386/i386.cc                       | 20 +++++++
>  .../costmodel/x86_64/costmodel-vect-epil-1.c  | 58 +++++++++++++++++++
>  2 files changed, 78 insertions(+)
>  create mode 100644
> gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-epil-1.c
> 
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index
> a3d0f7cb649..42ae9ccb051 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -26572,6 +26572,12 @@ ix86_vector_costs::finish_cost (const
> vector_costs *scalar_costs)
>             > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
>       m_costs[vect_body] = INT_MAX;
> 
> +      /* We'd like to avoid using masking if there's an in-order reduction
> +      to vectorize because that will also perform in-order adds of
> +      masked elements (as neutral value, of course) here, but there
> +      is currently no way to indicate to try un-masked with the same
> +      mode.  */
> +
>        bool any_reduc_p = false;
>        for (int i = 0; i != X86_REDUC_LAST; i++)
>       if (m_num_reduc[i])
> @@ -26687,6 +26693,20 @@ ix86_vector_costs::finish_cost (const
> vector_costs *scalar_costs)
>                 }
>             }
>         }
> +      /* Avoid using masking if there's an in-order reduction
> +      to vectorize because that will also perform in-order adds of
> +      masked elements (as neutral value, of course).  */
> +      if (!avoid)
> +     {
> +       for (auto inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
> +         if (SLP_INSTANCE_KIND (inst) == slp_inst_kind_reduc_group
> +             && (vect_reduc_type (loop_vinfo, SLP_INSTANCE_TREE (inst))
> +                 == FOLD_LEFT_REDUCTION))
> +           {
> +             avoid = true;
> +             break;
> +           }
> +     }
>        if (!avoid)
>       {
>         m_suggested_epilogue_mode = loop_vinfo->vector_mode; diff --git
> a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-epil-1.c
> b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-epil-1.c
> new file mode 100644
> index 00000000000..5b8c358b2a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-epil-1.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mavx512bw
> +-mtune-ctrl=avx512_masked_epilogues" } */
> +
> +void test (const unsigned char * __restrict__ pi,
> +        const float * __restrict__ blk,
> +        int texel_count,
> +        float *pp_avg_rgb)
> +{
> +    float pp_avg_rgb_0 = 0.0f;
> +    float pp_avg_rgb_1 = 0.0f;
> +    float pp_avg_rgb_2 = 0.0f;
> +    float pp_avg_rgb_3 = 0.0f;
> +    for (int lane_id = 0; lane_id < texel_count; lane_id++) {
> +        unsigned char r_byte = pi[lane_id * 4 + 0];
> +        unsigned char g_byte = pi[lane_id * 4 + 1];
> +        unsigned char b_byte = pi[lane_id * 4 + 2];
> +        unsigned char a_byte = pi[lane_id * 4 + 3];
> +
> +        float r_float = blk[lane_id * 4 + 0];
> +        float g_float = blk[lane_id * 4 + 1];
> +        float b_float = blk[lane_id * 4 + 2];
> +        float a_float = blk[lane_id * 4 + 3];
> +
> +        int r_is_zero = (r_byte == 0) ? 1 : 0;
> +        int r_in_bounds = (texel_count > lane_id) ? 1 : 0;
> +        int r_mask = r_is_zero * (-r_in_bounds);
> +        if (r_mask != 0) {
> +            pp_avg_rgb_0 += r_float;
> +        }
> +        int g_is_zero = (g_byte == 0) ? 1 : 0;
> +        int g_in_bounds = (texel_count > lane_id) ? 1 : 0;
> +        int g_mask = g_is_zero * (-g_in_bounds);
> +        if (g_mask != 0) {
> +            pp_avg_rgb_1 += g_float;
> +        }
> +        int b_is_zero = (b_byte == 0) ? 1 : 0;
> +        int b_in_bounds = (texel_count > lane_id) ? 1 : 0;
> +        int b_mask = b_is_zero * (-b_in_bounds);
> +        if (b_mask != 0) {
> +            pp_avg_rgb_2 += b_float;
> +        }
> +        int a_is_zero = (a_byte == 0) ? 1 : 0;
> +        int a_in_bounds = (texel_count > lane_id) ? 1 : 0;
> +        int a_mask = a_is_zero * (-a_in_bounds);
> +        if (a_mask != 0) {
> +            pp_avg_rgb_3 += a_float;
> +        }
> +    }
> +    pp_avg_rgb[0] = pp_avg_rgb_0;
> +    pp_avg_rgb[1] = pp_avg_rgb_1;
> +    pp_avg_rgb[2] = pp_avg_rgb_2;
> +    pp_avg_rgb[3] = pp_avg_rgb_3;
> +}
> +
> +/* Even though there's an SLP opportunity, in-order reductions should never use
> +   masked epilogs.  */
> +/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64
> +byte vectors" "vect" } } */
> +/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized
> +using 32 byte vectors" "vect" } } */
> --
> 2.51.0

Reply via email to