On Wed, 11 Oct 2023, Juzhe-Zhong wrote:

> This patch fixes the following FAILs in the RISC-V regression tests:
> 
> FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects  scan-tree-dump 
> vect "Loop contains only SLP stmts"
> FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP 
> stmts"
> FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects  scan-tree-dump 
> vect "Loop contains only SLP stmts"
> FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP 
> stmts"
> 
> The root cause of these FAILs is that GCC SLP fails on MASK_LEN_GATHER_LOAD.
> 
> For RVV, we build MASK_LEN_GATHER_LOAD with a dummy mask (-1) in
> tree-vect-patterns.cc when the situation is the same as for GATHER_LOAD
> (no conditional mask).
> 
> So we make MASK_LEN_GATHER_LOAD follow the GATHER_LOAD flow when the mask
> argument is a dummy mask.
> 
> gcc/ChangeLog:
> 
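>       * tree-vect-slp.cc (vect_get_operand_map): Add MASK_LEN_GATHER_LOAD.
>       (vect_build_slp_tree_1): Ditto.
>       (vect_build_slp_tree_2): Ditto.
>       * tree-vect-stmts.cc (vectorizable_load): Skip the scalar mask check
>       for internal functions that have a len index.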
> 
> ---
>  gcc/tree-vect-slp.cc   | 18 ++++++++++++++++--
>  gcc/tree-vect-stmts.cc |  4 ++--
>  2 files changed, 18 insertions(+), 4 deletions(-)
> 
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index fa098f9ff4e..712c04ec278 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char 
> swap = 0)
>         case IFN_MASK_GATHER_LOAD:
>           return arg1_arg4_map;
>  
> +       case IFN_MASK_LEN_GATHER_LOAD:
> +         /* In tree-vect-patterns.cc, we will have these 2 situations:
> +
> +             - Unconditional gather load transforms
> +               into MASK_LEN_GATHER_LOAD with dummy mask which is -1.
> +
> +             - Conditional gather load transforms
> +               into MASK_LEN_GATHER_LOAD with real conditional mask.*/
> +         return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map
> +                                                               : nullptr;
> +
>         case IFN_MASK_STORE:
>           return arg3_arg2_map;
>  
> @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
> *swap,
>  
>         if (cfn == CFN_MASK_LOAD
>             || cfn == CFN_GATHER_LOAD
> -           || cfn == CFN_MASK_GATHER_LOAD)
> +           || cfn == CFN_MASK_GATHER_LOAD
> +           || cfn == CFN_MASK_LEN_GATHER_LOAD)
>           ldst_p = true;
>         else if (cfn == CFN_MASK_STORE)
>           {
> @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
> *swap,
>         if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
>             && rhs_code != CFN_GATHER_LOAD
>             && rhs_code != CFN_MASK_GATHER_LOAD
> +           && rhs_code != CFN_MASK_LEN_GATHER_LOAD
>             /* Not grouped loads are handled as externals for BB
>                vectorization.  For loop vectorization we can handle
>                splats the same we handle single element interleaving.  */
> @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>        if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
>       gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
>                   || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
> -                 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
> +                 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
> +                 || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
>        else
>       {
>         *max_nunits = this_max_nunits;
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index cd7c1090d88..263acf5d3cd 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo,
>       return false;
>  
>        mask_index = internal_fn_mask_index (ifn);
> -      if (mask_index >= 0 && slp_node)
> +      if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0)
>       mask_index = vect_slp_child_index_for_operand (call, mask_index);
> -      if (mask_index >= 0
> +      if (mask_index >= 0 && internal_fn_len_index (ifn) < 0
>         && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
>                                     &mask, NULL, &mask_dt, &mask_vectype))
>       return false;

You are ignoring the mask argument, and here you only handle it when the
IFN doesn't have a _LEN.  This doesn't seem to be forward-looking
to the point where you want to actually handle masked (aka conditional)
gathers.

Did you check that SLP is actually used to vectorize this?

Richard.

Reply via email to