On Thu, Aug 23, 2018 at 11:08 AM Richard Sandiford <richard.sandif...@arm.com> wrote: > > The SLP code currently punts for all variable-length permutes. > This patch makes it handle the easy case of N->N permutes in which > the number of vector lanes is a multiple of N. Every permute then > uses the same mask, and that mask repeats (with a stride) every > N elements. > > The patch uses the same path for constant-length vectors, > since it should be slightly cheaper in terms of compile time. > > Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf > and x86_64-linux-gnu. OK to install?
OK. Thanks, Richard. > Richard > > > 2018-08-23 Richard Sandiford <richard.sandif...@arm.com> > > gcc/ > * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out > the case in which the permute needs only a single element and > repeats for every vector of the result. Extend that case to > handle variable-length vectors. > * tree-vect-stmts.c (vectorizable_load): Update accordingly. > > gcc/testsuite/ > * gcc.target/aarch64/sve/slp_perm_1.c: New test. > * gcc.target/aarch64/sve/slp_perm_2.c: Likewise. > * gcc.target/aarch64/sve/slp_perm_3.c: Likewise. > * gcc.target/aarch64/sve/slp_perm_4.c: Likewise. > * gcc.target/aarch64/sve/slp_perm_5.c: Likewise. > * gcc.target/aarch64/sve/slp_perm_6.c: Likewise. > * gcc.target/aarch64/sve/slp_perm_7.c: Likewise. > > Index: gcc/tree-vect-slp.c > =================================================================== > *** gcc/tree-vect-slp.c 2018-08-21 14:47:08.339163256 +0100 > --- gcc/tree-vect-slp.c 2018-08-23 09:59:35.245682525 +0100 > *************** vect_transform_slp_perm_load (slp_tree n > *** 3606,3618 **** > { > stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; > vec_info *vinfo = stmt_info->vinfo; > - tree mask_element_type = NULL_TREE, mask_type; > int vec_index = 0; > tree vectype = STMT_VINFO_VECTYPE (stmt_info); > ! int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); > unsigned int mask_element; > machine_mode mode; > - unsigned HOST_WIDE_INT nunits, const_vf; > > if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) > return false; > --- 3606,3616 ---- > { > stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; > vec_info *vinfo = stmt_info->vinfo; > int vec_index = 0; > tree vectype = STMT_VINFO_VECTYPE (stmt_info); > ! unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); > unsigned int mask_element; > machine_mode mode; > > if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) > return false; > *************** vect_transform_slp_perm_load (slp_tree n > *** 3620,3641 **** > stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); > > mode = TYPE_MODE (vectype); > ! > ! /* At the moment, all permutations are represented using per-element > ! indices, so we can't cope with variable vector lengths or > ! vectorization factors. */ > ! if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) > ! || !vf.is_constant (&const_vf)) > ! return false; > ! > ! /* The generic VEC_PERM_EXPR code always uses an integral type of the > ! same size as the vector element being permuted. */ > ! mask_element_type = lang_hooks.types.type_for_mode > ! (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1); > ! mask_type = get_vectype_for_scalar_type (mask_element_type); > ! vec_perm_builder mask (nunits, nunits, 1); > ! mask.quick_grow (nunits); > ! vec_perm_indices indices; > > /* Initialize the vect stmts of NODE to properly insert the generated > stmts later. */ > --- 3618,3624 ---- > stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); > > mode = TYPE_MODE (vectype); > ! poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); > > /* Initialize the vect stmts of NODE to properly insert the generated > stmts later. */ > *************** vect_transform_slp_perm_load (slp_tree n > *** 3669,3682 **** > bool noop_p = true; > *n_perms = 0; > > ! for (unsigned int j = 0; j < const_vf; j++) > { > ! for (int k = 0; k < group_size; k++) > { > ! unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k] > ! + j * DR_GROUP_SIZE (stmt_info)); > ! vec_index = i / nunits; > ! mask_element = i % nunits; > if (vec_index == first_vec_index > || first_vec_index == -1) > { > --- 3652,3704 ---- > bool noop_p = true; > *n_perms = 0; > > ! vec_perm_builder mask; > ! unsigned int nelts_to_build; > ! unsigned int nvectors_per_build; > ! bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info) > ! && multiple_p (nunits, group_size)); > ! if (repeating_p) > ! { > ! /* A single vector contains a whole number of copies of the node, so: > ! (a) all permutes can use the same mask; and > ! (b) the permutes only need a single vector input. */ > ! mask.new_vector (nunits, group_size, 3); > ! nelts_to_build = mask.encoded_nelts (); > ! nvectors_per_build = SLP_TREE_VEC_STMTS (node).length (); > ! } > ! else > ! { > ! /* We need to construct a separate mask for each vector statement. */ > ! unsigned HOST_WIDE_INT const_nunits, const_vf; > ! if (!nunits.is_constant (&const_nunits) > ! || !vf.is_constant (&const_vf)) > ! return false; > ! mask.new_vector (const_nunits, const_nunits, 1); > ! nelts_to_build = const_vf * group_size; > ! nvectors_per_build = 1; > ! } > ! > ! unsigned int count = mask.encoded_nelts (); > ! mask.quick_grow (count); > ! vec_perm_indices indices; > ! > ! for (unsigned int j = 0; j < nelts_to_build; j++) > { > ! unsigned int iter_num = j / group_size; > ! unsigned int stmt_num = j % group_size; > ! unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) > ! + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]); > ! if (repeating_p) > { > ! first_vec_index = 0; > ! mask_element = i; > ! } > ! else > ! { > ! /* Enforced before the loop when !repeating_p. */ > ! unsigned int const_nunits = nunits.to_constant (); > ! vec_index = i / const_nunits; > ! mask_element = i % const_nunits; > if (vec_index == first_vec_index > || first_vec_index == -1) > { > *************** vect_transform_slp_perm_load (slp_tree n > *** 3686,3692 **** > || second_vec_index == -1) > { > second_vec_index = vec_index; > ! mask_element += nunits; > } > else > { > --- 3708,3714 ---- > || second_vec_index == -1) > { > second_vec_index = vec_index; > ! mask_element += const_nunits; > } > else > { > *************** vect_transform_slp_perm_load (slp_tree n > *** 3702,3751 **** > return false; > } > > ! gcc_assert (mask_element < 2 * nunits); > ! if (mask_element != index) > ! noop_p = false; > ! mask[index++] = mask_element; > > ! if (index == nunits && !noop_p) > { > ! indices.new_vector (mask, 2, nunits); > ! if (!can_vec_perm_const_p (mode, indices)) > { > ! if (dump_enabled_p ()) > { > ! dump_printf_loc (MSG_MISSED_OPTIMIZATION, > ! vect_location, > ! "unsupported vect permute { "); > ! for (i = 0; i < nunits; ++i) > ! { > ! dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); > ! dump_printf (MSG_MISSED_OPTIMIZATION, " "); > ! } > ! dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); > } > ! gcc_assert (analyze_only); > ! return false; > } > ! > ! ++*n_perms; > } > > ! if (index == nunits) > { > ! if (!analyze_only) > ! { > ! tree mask_vec = NULL_TREE; > > ! if (! noop_p) > ! mask_vec = vec_perm_indices_to_tree (mask_type, indices); > > ! if (second_vec_index == -1) > ! second_vec_index = first_vec_index; > > /* Generate the permute statement if necessary. */ > ! tree first_vec = dr_chain[first_vec_index]; > ! tree second_vec = dr_chain[second_vec_index]; > stmt_vec_info perm_stmt_info; > if (! noop_p) > { > --- 3724,3777 ---- > return false; > } > > ! gcc_assert (mask_element < 2 * const_nunits); > ! } > ! > ! if (mask_element != index) > ! noop_p = false; > ! mask[index++] = mask_element; > > ! if (index == count && !noop_p) > ! { > ! indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits); > ! if (!can_vec_perm_const_p (mode, indices)) > { > ! if (dump_enabled_p ()) > { > ! dump_printf_loc (MSG_MISSED_OPTIMIZATION, > ! vect_location, > ! "unsupported vect permute { "); > ! for (i = 0; i < count; ++i) > { > ! dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); > ! dump_printf (MSG_MISSED_OPTIMIZATION, " "); > } > ! dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); > } > ! gcc_assert (analyze_only); > ! return false; > } > > ! ++*n_perms; > ! } > ! > ! if (index == count) > ! { > ! if (!analyze_only) > { > ! tree mask_vec = NULL_TREE; > > ! if (! noop_p) > ! mask_vec = vect_gen_perm_mask_checked (vectype, indices); > > ! if (second_vec_index == -1) > ! second_vec_index = first_vec_index; > > + for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) > + { > /* Generate the permute statement if necessary. */ > ! tree first_vec = dr_chain[first_vec_index + ri]; > ! tree second_vec = dr_chain[second_vec_index + ri]; > stmt_vec_info perm_stmt_info; > if (! noop_p) > { > *************** vect_transform_slp_perm_load (slp_tree n > *** 3771,3782 **** > SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] > = perm_stmt_info; > } > - > - index = 0; > - first_vec_index = -1; > - second_vec_index = -1; > - noop_p = true; > } > } > } > > --- 3797,3808 ---- > SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] > = perm_stmt_info; > } > } > + > + index = 0; > + first_vec_index = -1; > + second_vec_index = -1; > + noop_p = true; > } > } > > Index: gcc/tree-vect-stmts.c > =================================================================== > *** gcc/tree-vect-stmts.c 2018-08-22 13:58:45.652208084 +0100 > --- gcc/tree-vect-stmts.c 2018-08-23 09:59:35.245682525 +0100 > *************** vectorizable_load (stmt_vec_info stmt_in > *** 8003,8015 **** > if (slp) > { > grouped_load = false; > ! /* For SLP permutation support we need to load the whole group, > ! not only the number of vector stmts the permutation result > ! fits in. */ > ! if (slp_perm) > { > ! /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for > ! variable VF. */ > unsigned int const_vf = vf.to_constant (); > unsigned int const_nunits = nunits.to_constant (); > vec_num = CEIL (group_size * const_vf, const_nunits); > --- 8003,8020 ---- > if (slp) > { > grouped_load = false; > ! /* If an SLP permutation is from N elements to N elements, > ! and if one vector holds a whole number of N, we can load > ! the inputs to the permutation in the same way as an > ! unpermuted sequence. In other cases we need to load the > ! whole group, not only the number of vector stmts the > ! permutation result fits in. */ > ! if (slp_perm > ! && (group_size != SLP_INSTANCE_GROUP_SIZE (slp_node_instance) > ! || !multiple_p (nunits, group_size))) > { > ! /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for > ! variable VF; see vect_transform_slp_perm_load. */ > unsigned int const_vf = vf.to_constant (); > unsigned int const_nunits = nunits.to_constant (); > vec_num = CEIL (group_size * const_vf, const_nunits); > Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c > =================================================================== > *** /dev/null 2018-07-26 10:26:13.137955424 +0100 > --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c 2018-08-23 > 09:59:35.245682525 +0100 > *************** > *** 0 **** > --- 1,22 ---- > + /* { dg-do compile } */ > + /* { dg-options "-O2 -ftree-vectorize" } */ > + > + #include <stdint.h> > + > + void > + f (uint8_t *restrict a, uint8_t *restrict b) > + { > + for (int i = 0; i < 100; ++i) > + { > + a[i * 8] = b[i * 8 + 7] + 1; > + a[i * 8 + 1] = b[i * 8 + 6] + 2; > + a[i * 8 + 2] = b[i * 8 + 5] + 3; > + a[i * 8 + 3] = b[i * 8 + 4] + 4; > + a[i * 8 + 4] = b[i * 8 + 3] + 5; > + a[i * 8 + 5] = b[i * 8 + 2] + 6; > + a[i * 8 + 6] = b[i * 8 + 1] + 7; > + a[i * 8 + 7] = b[i * 8 + 0] + 8; > + } > + } > + > + /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d\n} 1 } } */ > Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c > =================================================================== > *** /dev/null 2018-07-26 10:26:13.137955424 +0100 > --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c 2018-08-23 > 09:59:35.245682525 +0100 > *************** > *** 0 **** > --- 1,22 ---- > + /* { dg-do compile } */ > + /* { dg-options "-O2 -ftree-vectorize" } */ > + > + #include <stdint.h> > + > + void > + f (uint8_t *restrict a, uint8_t *restrict b) > + { > + for (int i = 0; i < 100; ++i) > + { > + a[i * 8] = b[i * 8 + 3] + 1; > + a[i * 8 + 1] = b[i * 8 + 2] + 2; > + a[i * 8 + 2] = b[i * 8 + 1] + 3; > + a[i * 8 + 3] = b[i * 8 + 0] + 4; > + a[i * 8 + 4] = b[i * 8 + 7] + 5; > + a[i * 8 + 5] = b[i * 8 + 6] + 6; > + a[i * 8 + 6] = b[i * 8 + 5] + 7; > + a[i * 8 + 7] = b[i * 8 + 4] + 8; > + } > + } > + > + /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s\n} 1 } } */ > Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c > =================================================================== > *** /dev/null 2018-07-26 10:26:13.137955424 +0100 > --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c 2018-08-23 > 09:59:35.245682525 +0100 > *************** > *** 0 **** > --- 1,22 ---- > + /* { dg-do compile } */ > + /* { dg-options "-O2 -ftree-vectorize" } */ > + > + #include <stdint.h> > + > + void > + f (uint8_t *restrict a, uint8_t *restrict b) > + { > + for (int i = 0; i < 100; ++i) > + { > + a[i * 8] = b[i * 8 + 1] + 1; > + a[i * 8 + 1] = b[i * 8 + 0] + 2; > + a[i * 8 + 2] = b[i * 8 + 3] + 3; > + a[i * 8 + 3] = b[i * 8 + 2] + 4; > + a[i * 8 + 4] = b[i * 8 + 5] + 5; > + a[i * 8 + 5] = b[i * 8 + 4] + 6; > + a[i * 8 + 6] = b[i * 8 + 7] + 7; > + a[i * 8 + 7] = b[i * 8 + 6] + 8; > + } > + } > + > + /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h\n} 1 } } */ > Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c > =================================================================== > *** /dev/null 2018-07-26 10:26:13.137955424 +0100 > --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c 2018-08-23 > 09:59:35.245682525 +0100 > *************** > *** 0 **** > --- 1,22 ---- > + /* { dg-do compile } */ > + /* { dg-options "-O2 -ftree-vectorize" } */ > + > + #include <stdint.h> > + > + void > + f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c) > + { > + for (int i = 0; i < 100; ++i) > + { > + a[i * 8] = b[i * 8] + c[i * 8]; > + a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1]; > + a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2]; > + a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3]; > + a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4]; > + a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5]; > + a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6]; > + a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7]; > + } > + } > + > + /* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, > z[0-9]+\.b\n} } } */ > Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c > =================================================================== > *** /dev/null 2018-07-26 10:26:13.137955424 +0100 > --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c 2018-08-23 > 09:59:35.245682525 +0100 > *************** > *** 0 **** > --- 1,32 ---- > + /* { dg-do compile } */ > + /* { dg-options "-O2 -ftree-vectorize" } */ > + > + #include <stdint.h> > + > + void > + f (uint8_t *restrict a, uint8_t *restrict b, > + uint8_t *restrict c, uint8_t *restrict d) > + { > + for (int i = 0; i < 100; ++i) > + { > + a[i * 8] = c[i * 8] + d[i * 8]; > + a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1]; > + a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2]; > + a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3]; > + a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4]; > + a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5]; > + a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6]; > + a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7]; > + b[i * 8] = c[i * 8 + 1] + d[i * 8]; > + b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1]; > + b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2]; > + b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3]; > + b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4]; > + b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5]; > + b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6]; > + b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7]; > + } > + } > + > + /* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, > z[0-9]+\.b\n} } } */ > + /* { dg-final { scan-assembler {\ttrn2\tz[0-9]+\.b, z[0-9]+\.b, > z[0-9]+\.b\n} } } */ > Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c > =================================================================== > *** /dev/null 2018-07-26 10:26:13.137955424 +0100 > --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c 2018-08-23 > 09:59:35.245682525 +0100 > *************** > *** 0 **** > --- 1,22 ---- > + /* { dg-do compile } */ > + /* { dg-options "-O2 -ftree-vectorize" } */ > + > + #include <stdint.h> > + > + void > + f (uint8_t *restrict a, uint8_t *restrict b) > + { > + for (int i = 0; i < 100; ++i) > + { > + a[i * 8] = b[i * 8 + 3] + 1; > + a[i * 8 + 1] = b[i * 8 + 6] + 1; > + a[i * 8 + 2] = b[i * 8 + 0] + 1; > + a[i * 8 + 3] = b[i * 8 + 2] + 1; > + a[i * 8 + 4] = b[i * 8 + 1] + 1; > + a[i * 8 + 5] = b[i * 8 + 7] + 1; > + a[i * 8 + 6] = b[i * 8 + 5] + 1; > + a[i * 8 + 7] = b[i * 8 + 4] + 1; > + } > + } > + > + /* { dg-final { scan-assembler-times {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, > z[0-9]+\.b\n} 1 } } */ > Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c > =================================================================== > *** /dev/null 2018-07-26 10:26:13.137955424 +0100 > --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c 2018-08-23 > 09:59:35.245682525 +0100 > *************** > *** 0 **** > --- 1,22 ---- > + /* { dg-do compile } */ > + /* { dg-options "-O2 -ftree-vectorize" } */ > + > + #include <stdint.h> > + > + void > + f (uint8_t *restrict a, uint8_t *restrict b) > + { > + for (int i = 0; i < 100; ++i) > + { > + a[i * 8] = b[i * 8 + 1] + 1; > + a[i * 8 + 1] = b[i * 8 + 7] + 2; > + a[i * 8 + 2] = b[i * 8 + 1] + 3; > + a[i * 8 + 3] = b[i * 8 + 7] + 4; > + a[i * 8 + 4] = b[i * 8 + 1] + 5; > + a[i * 8 + 5] = b[i * 8 + 7] + 6; > + a[i * 8 + 6] = b[i * 8 + 1] + 7; > + a[i * 8 + 7] = b[i * 8 + 7] + 8; > + } > + } > + > + /* { dg-final { scan-assembler {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, > z[0-9]+\.b\n} } } */