Re: Handle SLP permutations for variable-length vectors

2018-08-24 Thread Richard Biener
On Thu, Aug 23, 2018 at 11:08 AM Richard Sandiford wrote:
>
> The SLP code currently punts for all variable-length permutes.
> This patch makes it handle the easy case of N->N permutes in which
> the number of vector lanes is a multiple of N.  Every permute then
> uses the same mask, and that mask repeats (with a stride) every
> N elements.
>
> The patch uses the same path for constant-length vectors,
> since it should be slightly cheaper in terms of compile time.
>
> Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
> and x86_64-linux-gnu.  OK to install?

OK.

Thanks,
Richard.

> Richard
>
>
> 2018-08-23  Richard Sandiford  
>
> gcc/
> * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
> the case in which the permute needs only a single element and
> repeats for every vector of the result.  Extend that case to
> handle variable-length vectors.
> * tree-vect-stmts.c (vectorizable_load): Update accordingly.
>
> gcc/testsuite/
> * gcc.target/aarch64/sve/slp_perm_1.c: New test.
> * gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
> * gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
> * gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
> * gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
> * gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
> * gcc.target/aarch64/sve/slp_perm_7.c: Likewise.
>
> Index: gcc/tree-vect-slp.c
> ===
> *** gcc/tree-vect-slp.c 2018-08-21 14:47:08.339163256 +0100
> --- gcc/tree-vect-slp.c 2018-08-23 09:59:35.245682525 +0100
> *** vect_transform_slp_perm_load (slp_tree n
> *** 3606,3618 
>   {
> stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
> vec_info *vinfo = stmt_info->vinfo;
> -   tree mask_element_type = NULL_TREE, mask_type;
> int vec_index = 0;
> tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> !   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
> unsigned int mask_element;
> machine_mode mode;
> -   unsigned HOST_WIDE_INT nunits, const_vf;
>
> if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
>   return false;
> --- 3606,3616 
>   {
> stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
> vec_info *vinfo = stmt_info->vinfo;
> int vec_index = 0;
> tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> !   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
> unsigned int mask_element;
> machine_mode mode;
>
> if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
>   return false;
> *** vect_transform_slp_perm_load (slp_tree n
> *** 3620,3641 
> stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
>
> mode = TYPE_MODE (vectype);
> !
> !   /* At the moment, all permutations are represented using per-element
> !  indices, so we can't cope with variable vector lengths or
> !  vectorization factors.  */
> !   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> !   || !vf.is_constant (&const_vf))
> ! return false;
> !
> !   /* The generic VEC_PERM_EXPR code always uses an integral type of the
> !  same size as the vector element being permuted.  */
> !   mask_element_type = lang_hooks.types.type_for_mode
> ! (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
> !   mask_type = get_vectype_for_scalar_type (mask_element_type);
> !   vec_perm_builder mask (nunits, nunits, 1);
> !   mask.quick_grow (nunits);
> !   vec_perm_indices indices;
>
> /* Initialize the vect stmts of NODE to properly insert the generated
>stmts later.  */
> --- 3618,3624 
> stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
>
> mode = TYPE_MODE (vectype);
> !   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
>
> /* Initialize the vect stmts of NODE to properly insert the generated
>stmts later.  */
> *** vect_transform_slp_perm_load (slp_tree n
> *** 3669,3682 
> bool noop_p = true;
> *n_perms = 0;
>
> !   for (unsigned int j = 0; j < const_vf; j++)
>   {
> !   for (int k = 0; k < group_size; k++)
> {
> ! unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
> !   + j * DR_GROUP_SIZE (stmt_info));
> ! vec_index = i / nunits;
> ! mask_element = i % nunits;
>   if (vec_index == first_vec_index
>   || first_vec_index == -1)
> {
> --- 3652,3704 
> bool noop_p = true;
> *n_perms = 0;
>
> !   vec_perm_builder mask;
> !   unsigned int nelts_to_build;
> !   unsigned int nvectors_per_build;
> !   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> ! && multiple_p (nunits, group_size));
> !   if (repeating_p)
> ! {
> !   /* A single vector contains a whole number of copies of the node, so:
> !(a) all permutes can use the same mask; and
> !(b) the permutes only need a single vector input.  */

Handle SLP permutations for variable-length vectors

2018-08-23 Thread Richard Sandiford
The SLP code currently punts for all variable-length permutes.
This patch makes it handle the easy case of N->N permutes in which
the number of vector lanes is a multiple of N.  Every permute then
uses the same mask, and that mask repeats (with a stride) every
N elements.

The patch uses the same path for constant-length vectors,
since it should be slightly cheaper in terms of compile time.

Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
and x86_64-linux-gnu.  OK to install?

Richard


2018-08-23  Richard Sandiford  

gcc/
* tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
the case in which the permute needs only a single element and
repeats for every vector of the result.  Extend that case to
handle variable-length vectors.
* tree-vect-stmts.c (vectorizable_load): Update accordingly.

gcc/testsuite/
* gcc.target/aarch64/sve/slp_perm_1.c: New test.
* gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_7.c: Likewise.

Index: gcc/tree-vect-slp.c
===
*** gcc/tree-vect-slp.c 2018-08-21 14:47:08.339163256 +0100
--- gcc/tree-vect-slp.c 2018-08-23 09:59:35.245682525 +0100
*** vect_transform_slp_perm_load (slp_tree n
*** 3606,3618 
  {
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
vec_info *vinfo = stmt_info->vinfo;
-   tree mask_element_type = NULL_TREE, mask_type;
int vec_index = 0;
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
!   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
unsigned int mask_element;
machine_mode mode;
-   unsigned HOST_WIDE_INT nunits, const_vf;
  
if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
  return false;
--- 3606,3616 
  {
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
vec_info *vinfo = stmt_info->vinfo;
int vec_index = 0;
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
!   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
unsigned int mask_element;
machine_mode mode;
  
if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
  return false;
*** vect_transform_slp_perm_load (slp_tree n
*** 3620,3641 
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
  
mode = TYPE_MODE (vectype);
! 
!   /* At the moment, all permutations are represented using per-element
!  indices, so we can't cope with variable vector lengths or
!  vectorization factors.  */
!   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
!   || !vf.is_constant (&const_vf))
! return false;
! 
!   /* The generic VEC_PERM_EXPR code always uses an integral type of the
!  same size as the vector element being permuted.  */
!   mask_element_type = lang_hooks.types.type_for_mode
! (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
!   mask_type = get_vectype_for_scalar_type (mask_element_type);
!   vec_perm_builder mask (nunits, nunits, 1);
!   mask.quick_grow (nunits);
!   vec_perm_indices indices;
  
/* Initialize the vect stmts of NODE to properly insert the generated
   stmts later.  */
--- 3618,3624 
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
  
mode = TYPE_MODE (vectype);
!   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  
/* Initialize the vect stmts of NODE to properly insert the generated
   stmts later.  */
*** vect_transform_slp_perm_load (slp_tree n
*** 3669,3682 
bool noop_p = true;
*n_perms = 0;
  
!   for (unsigned int j = 0; j < const_vf; j++)
  {
!   for (int k = 0; k < group_size; k++)
{
! unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
!   + j * DR_GROUP_SIZE (stmt_info));
! vec_index = i / nunits;
! mask_element = i % nunits;
  if (vec_index == first_vec_index
  || first_vec_index == -1)
{
--- 3652,3704 
bool noop_p = true;
*n_perms = 0;
  
!   vec_perm_builder mask;
!   unsigned int nelts_to_build;
!   unsigned int nvectors_per_build;
!   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
! && multiple_p (nunits, group_size));
!   if (repeating_p)
! {
!   /* A single vector contains a whole number of copies of the node, so:
!(a) all permutes can use the same mask; and
!(b) the permutes only need a single vector input.  */
!   mask.new_vector (nunits, group_size, 3);
!   nelts_to_build = mask.encoded_nelts ();
!   nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
! }
!   else
! {
!   /* We need to construct a separate mask for each vector statement.  */
!