Re: Handle SLP permutations for variable-length vectors
On Thu, Aug 23, 2018 at 11:08 AM Richard Sandiford wrote: > > The SLP code currently punts for all variable-length permutes. > This patch makes it handle the easy case of N->N permutes in which > the number of vector lanes is a multiple of N. Every permute then > uses the same mask, and that mask repeats (with a stride) every > N elements. > > The patch uses the same path for constant-length vectors, > since it should be slightly cheaper in terms of compile time. > > Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf > and x86_64-linux-gnu. OK to install? OK. Thanks, Richard. > Richard > > > 2018-08-23 Richard Sandiford > > gcc/ > * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out > the case in which the permute needs only a single element and > repeats for every vector of the result. Extend that case to > handle variable-length vectors. > * tree-vect-stmts.c (vectorizable_load): Update accordingly. > > gcc/testsuite/ > * gcc.target/aarch64/sve/slp_perm_1.c: New test. > * gcc.target/aarch64/sve/slp_perm_2.c: Likewise. > * gcc.target/aarch64/sve/slp_perm_3.c: Likewise. > * gcc.target/aarch64/sve/slp_perm_4.c: Likewise. > * gcc.target/aarch64/sve/slp_perm_5.c: Likewise. > * gcc.target/aarch64/sve/slp_perm_6.c: Likewise. > * gcc.target/aarch64/sve/slp_perm_7.c: Likewise. > > Index: gcc/tree-vect-slp.c > === > *** gcc/tree-vect-slp.c 2018-08-21 14:47:08.339163256 +0100 > --- gcc/tree-vect-slp.c 2018-08-23 09:59:35.245682525 +0100 > *** vect_transform_slp_perm_load (slp_tree n > *** 3606,3618 > { > stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; > vec_info *vinfo = stmt_info->vinfo; > - tree mask_element_type = NULL_TREE, mask_type; > int vec_index = 0; > tree vectype = STMT_VINFO_VECTYPE (stmt_info); > ! 
int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); > unsigned int mask_element; > machine_mode mode; > - unsigned HOST_WIDE_INT nunits, const_vf; > > if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) > return false; > --- 3606,3616 > { > stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; > vec_info *vinfo = stmt_info->vinfo; > int vec_index = 0; > tree vectype = STMT_VINFO_VECTYPE (stmt_info); > ! unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); > unsigned int mask_element; > machine_mode mode; > > if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) > return false; > *** vect_transform_slp_perm_load (slp_tree n > *** 3620,3641 > stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); > > mode = TYPE_MODE (vectype); > ! > ! /* At the moment, all permutations are represented using per-element > ! indices, so we can't cope with variable vector lengths or > ! vectorization factors. */ > ! if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) > ! || !vf.is_constant (&const_vf)) > ! return false; > ! > ! /* The generic VEC_PERM_EXPR code always uses an integral type of the > ! same size as the vector element being permuted. */ > ! mask_element_type = lang_hooks.types.type_for_mode > ! (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1); > ! mask_type = get_vectype_for_scalar_type (mask_element_type); > ! vec_perm_builder mask (nunits, nunits, 1); > ! mask.quick_grow (nunits); > ! vec_perm_indices indices; > > /* Initialize the vect stmts of NODE to properly insert the generated >stmts later. */ > --- 3618,3624 > stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); > > mode = TYPE_MODE (vectype); > ! poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); > > /* Initialize the vect stmts of NODE to properly insert the generated >stmts later. */ > *** vect_transform_slp_perm_load (slp_tree n > *** 3669,3682 > bool noop_p = true; > *n_perms = 0; > > ! for (unsigned int j = 0; j < const_vf; j++) > { > ! for (int k = 0; k < group_size; k++) > { > ! 
unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k] > ! + j * DR_GROUP_SIZE (stmt_info)); > ! vec_index = i / nunits; > ! mask_element = i % nunits; > if (vec_index == first_vec_index > || first_vec_index == -1) > { > --- 3652,3704 > bool noop_p = true; > *n_perms = 0; > > ! vec_perm_builder mask; > ! unsigned int nelts_to_build; > ! unsigned int nvectors_per_build; > ! bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info) > ! && multiple_p (nunits, group_size)); > ! if (repeating_p) > ! { > ! /* A single vector contains a whole number of copies of the node, so: > ! (a) all permutes can use the same mask; and > ! (b) the permutes only
Handle SLP permutations for variable-length vectors
The SLP code currently punts for all variable-length permutes. This patch makes it handle the easy case of N->N permutes in which the number of vector lanes is a multiple of N. Every permute then uses the same mask, and that mask repeats (with a stride) every N elements. The patch uses the same path for constant-length vectors, since it should be slightly cheaper in terms of compile time. Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf and x86_64-linux-gnu. OK to install? Richard 2018-08-23 Richard Sandiford gcc/ * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out the case in which the permute needs only a single element and repeats for every vector of the result. Extend that case to handle variable-length vectors. * tree-vect-stmts.c (vectorizable_load): Update accordingly. gcc/testsuite/ * gcc.target/aarch64/sve/slp_perm_1.c: New test. * gcc.target/aarch64/sve/slp_perm_2.c: Likewise. * gcc.target/aarch64/sve/slp_perm_3.c: Likewise. * gcc.target/aarch64/sve/slp_perm_4.c: Likewise. * gcc.target/aarch64/sve/slp_perm_5.c: Likewise. * gcc.target/aarch64/sve/slp_perm_6.c: Likewise. * gcc.target/aarch64/sve/slp_perm_7.c: Likewise. Index: gcc/tree-vect-slp.c === *** gcc/tree-vect-slp.c 2018-08-21 14:47:08.339163256 +0100 --- gcc/tree-vect-slp.c 2018-08-23 09:59:35.245682525 +0100 *** vect_transform_slp_perm_load (slp_tree n *** 3606,3618 { stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; vec_info *vinfo = stmt_info->vinfo; - tree mask_element_type = NULL_TREE, mask_type; int vec_index = 0; tree vectype = STMT_VINFO_VECTYPE (stmt_info); ! int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); unsigned int mask_element; machine_mode mode; - unsigned HOST_WIDE_INT nunits, const_vf; if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) return false; --- 3606,3616 { stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; vec_info *vinfo = stmt_info->vinfo; int vec_index = 0; tree vectype = STMT_VINFO_VECTYPE (stmt_info); ! 
unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); unsigned int mask_element; machine_mode mode; if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) return false; *** vect_transform_slp_perm_load (slp_tree n *** 3620,3641 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); mode = TYPE_MODE (vectype); ! ! /* At the moment, all permutations are represented using per-element ! indices, so we can't cope with variable vector lengths or ! vectorization factors. */ ! if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) ! || !vf.is_constant (&const_vf)) ! return false; ! ! /* The generic VEC_PERM_EXPR code always uses an integral type of the ! same size as the vector element being permuted. */ ! mask_element_type = lang_hooks.types.type_for_mode ! (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1); ! mask_type = get_vectype_for_scalar_type (mask_element_type); ! vec_perm_builder mask (nunits, nunits, 1); ! mask.quick_grow (nunits); ! vec_perm_indices indices; /* Initialize the vect stmts of NODE to properly insert the generated stmts later. */ --- 3618,3624 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); mode = TYPE_MODE (vectype); ! poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); /* Initialize the vect stmts of NODE to properly insert the generated stmts later. */ *** vect_transform_slp_perm_load (slp_tree n *** 3669,3682 bool noop_p = true; *n_perms = 0; ! for (unsigned int j = 0; j < const_vf; j++) { ! for (int k = 0; k < group_size; k++) { ! unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k] ! + j * DR_GROUP_SIZE (stmt_info)); ! vec_index = i / nunits; ! mask_element = i % nunits; if (vec_index == first_vec_index || first_vec_index == -1) { --- 3652,3704 bool noop_p = true; *n_perms = 0; ! vec_perm_builder mask; ! unsigned int nelts_to_build; ! unsigned int nvectors_per_build; ! bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info) ! && multiple_p (nunits, group_size)); ! if (repeating_p) ! { ! 
/* A single vector contains a whole number of copies of the node, so: ! (a) all permutes can use the same mask; and ! (b) the permutes only need a single vector input. */ ! mask.new_vector (nunits, group_size, 3); ! nelts_to_build = mask.encoded_nelts (); ! nvectors_per_build = SLP_TREE_VEC_STMTS (node).length (); ! } ! else ! { ! /* We need to construct a separate mask for each vector statement. */ ! 