Re: Handle SLP permutations for variable-length vectors

Richard Biener Fri, 24 Aug 2018 05:51:17 -0700

On Thu, Aug 23, 2018 at 11:08 AM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> The SLP code currently punts for all variable-length permutes.
> This patch makes it handle the easy case of N->N permutes in which
> the number of vector lanes is a multiple of N.  Every permute then
> uses the same mask, and that mask repeats (with a stride) every
> N elements.
>
> The patch uses the same path for constant-length vectors,
> since it should be slightly cheaper in terms of compile time.
>
> Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
> and x86_64-linux-gnu.  OK to install?


OK.

Thanks,
Richard.

> Richard
>
>
> 2018-08-23  Richard Sandiford  <richard.sandif...@arm.com>
>
> gcc/
>         * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
>         the case in which the permute needs only a single element and
>         repeats for every vector of the result.  Extend that case to
>         handle variable-length vectors.
>         * tree-vect-stmts.c (vectorizable_load): Update accordingly.
>
> gcc/testsuite/
>         * gcc.target/aarch64/sve/slp_perm_1.c: New test.
>         * gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
>         * gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
>         * gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
>         * gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
>         * gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
>         * gcc.target/aarch64/sve/slp_perm_7.c: Likewise.
>
> Index: gcc/tree-vect-slp.c
> ===================================================================
> *** gcc/tree-vect-slp.c 2018-08-21 14:47:08.339163256 +0100
> --- gcc/tree-vect-slp.c 2018-08-23 09:59:35.245682525 +0100
> *************** vect_transform_slp_perm_load (slp_tree n
> *** 3606,3618 ****
>   {
>     stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
>     vec_info *vinfo = stmt_info->vinfo;
> -   tree mask_element_type = NULL_TREE, mask_type;
>     int vec_index = 0;
>     tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> !   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
>     unsigned int mask_element;
>     machine_mode mode;
> -   unsigned HOST_WIDE_INT nunits, const_vf;
>
>     if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
>       return false;
> --- 3606,3616 ----
>   {
>     stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
>     vec_info *vinfo = stmt_info->vinfo;
>     int vec_index = 0;
>     tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> !   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
>     unsigned int mask_element;
>     machine_mode mode;
>
>     if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
>       return false;
> *************** vect_transform_slp_perm_load (slp_tree n
> *** 3620,3641 ****
>     stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
>
>     mode = TYPE_MODE (vectype);
> !
> !   /* At the moment, all permutations are represented using per-element
> !      indices, so we can't cope with variable vector lengths or
> !      vectorization factors.  */
> !   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> !       || !vf.is_constant (&const_vf))
> !     return false;
> !
> !   /* The generic VEC_PERM_EXPR code always uses an integral type of the
> !      same size as the vector element being permuted.  */
> !   mask_element_type = lang_hooks.types.type_for_mode
> !     (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
> !   mask_type = get_vectype_for_scalar_type (mask_element_type);
> !   vec_perm_builder mask (nunits, nunits, 1);
> !   mask.quick_grow (nunits);
> !   vec_perm_indices indices;
>
>     /* Initialize the vect stmts of NODE to properly insert the generated
>        stmts later.  */
> --- 3618,3624 ----
>     stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
>
>     mode = TYPE_MODE (vectype);
> !   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
>
>     /* Initialize the vect stmts of NODE to properly insert the generated
>        stmts later.  */
> *************** vect_transform_slp_perm_load (slp_tree n
> *** 3669,3682 ****
>     bool noop_p = true;
>     *n_perms = 0;
>
> !   for (unsigned int j = 0; j < const_vf; j++)
>       {
> !       for (int k = 0; k < group_size; k++)
>         {
> !         unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
> !                           + j * DR_GROUP_SIZE (stmt_info));
> !         vec_index = i / nunits;
> !         mask_element = i % nunits;
>           if (vec_index == first_vec_index
>               || first_vec_index == -1)
>             {
> --- 3652,3704 ----
>     bool noop_p = true;
>     *n_perms = 0;
>
> !   vec_perm_builder mask;
> !   unsigned int nelts_to_build;
> !   unsigned int nvectors_per_build;
> !   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> !                     && multiple_p (nunits, group_size));
> !   if (repeating_p)
> !     {
> !       /* A single vector contains a whole number of copies of the node, so:
> !        (a) all permutes can use the same mask; and
> !        (b) the permutes only need a single vector input.  */
> !       mask.new_vector (nunits, group_size, 3);
> !       nelts_to_build = mask.encoded_nelts ();
> !       nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
> !     }
> !   else
> !     {
> !       /* We need to construct a separate mask for each vector statement.  */
> !       unsigned HOST_WIDE_INT const_nunits, const_vf;
> !       if (!nunits.is_constant (&const_nunits)
> !         || !vf.is_constant (&const_vf))
> !       return false;
> !       mask.new_vector (const_nunits, const_nunits, 1);
> !       nelts_to_build = const_vf * group_size;
> !       nvectors_per_build = 1;
> !     }
> !
> !   unsigned int count = mask.encoded_nelts ();
> !   mask.quick_grow (count);
> !   vec_perm_indices indices;
> !
> !   for (unsigned int j = 0; j < nelts_to_build; j++)
>       {
> !       unsigned int iter_num = j / group_size;
> !       unsigned int stmt_num = j % group_size;
> !       unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
> !                       + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
> !       if (repeating_p)
>         {
> !         first_vec_index = 0;
> !         mask_element = i;
> !       }
> !       else
> !       {
> !         /* Enforced before the loop when !repeating_p.  */
> !         unsigned int const_nunits = nunits.to_constant ();
> !         vec_index = i / const_nunits;
> !         mask_element = i % const_nunits;
>           if (vec_index == first_vec_index
>               || first_vec_index == -1)
>             {
> *************** vect_transform_slp_perm_load (slp_tree n
> *** 3686,3692 ****
>                    || second_vec_index == -1)
>             {
>               second_vec_index = vec_index;
> !             mask_element += nunits;
>             }
>           else
>             {
> --- 3708,3714 ----
>                    || second_vec_index == -1)
>             {
>               second_vec_index = vec_index;
> !             mask_element += const_nunits;
>             }
>           else
>             {
> *************** vect_transform_slp_perm_load (slp_tree n
> *** 3702,3751 ****
>               return false;
>             }
>
> !         gcc_assert (mask_element < 2 * nunits);
> !         if (mask_element != index)
> !           noop_p = false;
> !         mask[index++] = mask_element;
>
> !         if (index == nunits && !noop_p)
>             {
> !             indices.new_vector (mask, 2, nunits);
> !             if (!can_vec_perm_const_p (mode, indices))
>                 {
> !                 if (dump_enabled_p ())
>                     {
> !                     dump_printf_loc (MSG_MISSED_OPTIMIZATION,
> !                                      vect_location,
> !                                      "unsupported vect permute { ");
> !                     for (i = 0; i < nunits; ++i)
> !                       {
> !                         dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
> !                         dump_printf (MSG_MISSED_OPTIMIZATION, " ");
> !                       }
> !                     dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
>                     }
> !                 gcc_assert (analyze_only);
> !                 return false;
>                 }
> !
> !             ++*n_perms;
>             }
>
> !         if (index == nunits)
>             {
> !             if (!analyze_only)
> !               {
> !                 tree mask_vec = NULL_TREE;
>
> !                 if (! noop_p)
> !                   mask_vec = vec_perm_indices_to_tree (mask_type, indices);
>
> !                 if (second_vec_index == -1)
> !                   second_vec_index = first_vec_index;
>
>                   /* Generate the permute statement if necessary.  */
> !                 tree first_vec = dr_chain[first_vec_index];
> !                 tree second_vec = dr_chain[second_vec_index];
>                   stmt_vec_info perm_stmt_info;
>                   if (! noop_p)
>                     {
> --- 3724,3777 ----
>               return false;
>             }
>
> !         gcc_assert (mask_element < 2 * const_nunits);
> !       }
> !
> !       if (mask_element != index)
> !       noop_p = false;
> !       mask[index++] = mask_element;
>
> !       if (index == count && !noop_p)
> !       {
> !         indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
> !         if (!can_vec_perm_const_p (mode, indices))
>             {
> !             if (dump_enabled_p ())
>                 {
> !                 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
> !                                  vect_location,
> !                                  "unsupported vect permute { ");
> !                 for (i = 0; i < count; ++i)
>                     {
> !                     dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
> !                     dump_printf (MSG_MISSED_OPTIMIZATION, " ");
>                     }
> !                 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
>                 }
> !             gcc_assert (analyze_only);
> !             return false;
>             }
>
> !         ++*n_perms;
> !       }
> !
> !       if (index == count)
> !       {
> !         if (!analyze_only)
>             {
> !             tree mask_vec = NULL_TREE;
>
> !             if (! noop_p)
> !               mask_vec = vect_gen_perm_mask_checked (vectype, indices);
>
> !             if (second_vec_index == -1)
> !               second_vec_index = first_vec_index;
>
> +             for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
> +               {
>                   /* Generate the permute statement if necessary.  */
> !                 tree first_vec = dr_chain[first_vec_index + ri];
> !                 tree second_vec = dr_chain[second_vec_index + ri];
>                   stmt_vec_info perm_stmt_info;
>                   if (! noop_p)
>                     {
> *************** vect_transform_slp_perm_load (slp_tree n
> *** 3771,3782 ****
>                   SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
>                     = perm_stmt_info;
>                 }
> -
> -             index = 0;
> -             first_vec_index = -1;
> -             second_vec_index = -1;
> -             noop_p = true;
>             }
>         }
>       }
>
> --- 3797,3808 ----
>                   SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
>                     = perm_stmt_info;
>                 }
>             }
> +
> +         index = 0;
> +         first_vec_index = -1;
> +         second_vec_index = -1;
> +         noop_p = true;
>         }
>       }
>
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> *** gcc/tree-vect-stmts.c       2018-08-22 13:58:45.652208084 +0100
> --- gcc/tree-vect-stmts.c       2018-08-23 09:59:35.245682525 +0100
> *************** vectorizable_load (stmt_vec_info stmt_in
> *** 8003,8015 ****
>         if (slp)
>         {
>           grouped_load = false;
> !         /* For SLP permutation support we need to load the whole group,
> !            not only the number of vector stmts the permutation result
> !            fits in.  */
> !         if (slp_perm)
>             {
> !             /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
> !                variable VF.  */
>               unsigned int const_vf = vf.to_constant ();
>               unsigned int const_nunits = nunits.to_constant ();
>               vec_num = CEIL (group_size * const_vf, const_nunits);
> --- 8003,8020 ----
>         if (slp)
>         {
>           grouped_load = false;
> !         /* If an SLP permutation is from N elements to N elements,
> !            and if one vector holds a whole number of N, we can load
> !            the inputs to the permutation in the same way as an
> !            unpermuted sequence.  In other cases we need to load the
> !            whole group, not only the number of vector stmts the
> !            permutation result fits in.  */
> !         if (slp_perm
> !             && (group_size != SLP_INSTANCE_GROUP_SIZE (slp_node_instance)
> !                 || !multiple_p (nunits, group_size)))
>             {
> !             /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for
> !                variable VF; see vect_transform_slp_perm_load.  */
>               unsigned int const_vf = vf.to_constant ();
>               unsigned int const_nunits = nunits.to_constant ();
>               vec_num = CEIL (group_size * const_vf, const_nunits);
> Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c
> ===================================================================
> *** /dev/null   2018-07-26 10:26:13.137955424 +0100
> --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c   2018-08-23 
> 09:59:35.245682525 +0100
> ***************
> *** 0 ****
> --- 1,22 ----
> + /* { dg-do compile } */
> + /* { dg-options "-O2 -ftree-vectorize" } */
> +
> + #include <stdint.h>
> +
> + void
> + f (uint8_t *restrict a, uint8_t *restrict b)
> + {
> +   for (int i = 0; i < 100; ++i)
> +     {
> +       a[i * 8] = b[i * 8 + 7] + 1;
> +       a[i * 8 + 1] = b[i * 8 + 6] + 2;
> +       a[i * 8 + 2] = b[i * 8 + 5] + 3;
> +       a[i * 8 + 3] = b[i * 8 + 4] + 4;
> +       a[i * 8 + 4] = b[i * 8 + 3] + 5;
> +       a[i * 8 + 5] = b[i * 8 + 2] + 6;
> +       a[i * 8 + 6] = b[i * 8 + 1] + 7;
> +       a[i * 8 + 7] = b[i * 8 + 0] + 8;
> +     }
> + }
> +
> + /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, 
> z[0-9]+\.d\n} 1 } } */
> Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c
> ===================================================================
> *** /dev/null   2018-07-26 10:26:13.137955424 +0100
> --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c   2018-08-23 
> 09:59:35.245682525 +0100
> ***************
> *** 0 ****
> --- 1,22 ----
> + /* { dg-do compile } */
> + /* { dg-options "-O2 -ftree-vectorize" } */
> +
> + #include <stdint.h>
> +
> + void
> + f (uint8_t *restrict a, uint8_t *restrict b)
> + {
> +   for (int i = 0; i < 100; ++i)
> +     {
> +       a[i * 8] = b[i * 8 + 3] + 1;
> +       a[i * 8 + 1] = b[i * 8 + 2] + 2;
> +       a[i * 8 + 2] = b[i * 8 + 1] + 3;
> +       a[i * 8 + 3] = b[i * 8 + 0] + 4;
> +       a[i * 8 + 4] = b[i * 8 + 7] + 5;
> +       a[i * 8 + 5] = b[i * 8 + 6] + 6;
> +       a[i * 8 + 6] = b[i * 8 + 5] + 7;
> +       a[i * 8 + 7] = b[i * 8 + 4] + 8;
> +     }
> + }
> +
> + /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s\n} 1 } } */
> Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c
> ===================================================================
> *** /dev/null   2018-07-26 10:26:13.137955424 +0100
> --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c   2018-08-23 
> 09:59:35.245682525 +0100
> ***************
> *** 0 ****
> --- 1,22 ----
> + /* { dg-do compile } */
> + /* { dg-options "-O2 -ftree-vectorize" } */
> +
> + #include <stdint.h>
> +
> + void
> + f (uint8_t *restrict a, uint8_t *restrict b)
> + {
> +   for (int i = 0; i < 100; ++i)
> +     {
> +       a[i * 8] = b[i * 8 + 1] + 1;
> +       a[i * 8 + 1] = b[i * 8 + 0] + 2;
> +       a[i * 8 + 2] = b[i * 8 + 3] + 3;
> +       a[i * 8 + 3] = b[i * 8 + 2] + 4;
> +       a[i * 8 + 4] = b[i * 8 + 5] + 5;
> +       a[i * 8 + 5] = b[i * 8 + 4] + 6;
> +       a[i * 8 + 6] = b[i * 8 + 7] + 7;
> +       a[i * 8 + 7] = b[i * 8 + 6] + 8;
> +     }
> + }
> +
> + /* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h\n} 1 } } */
> Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c
> ===================================================================
> *** /dev/null   2018-07-26 10:26:13.137955424 +0100
> --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c   2018-08-23 
> 09:59:35.245682525 +0100
> ***************
> *** 0 ****
> --- 1,22 ----
> + /* { dg-do compile } */
> + /* { dg-options "-O2 -ftree-vectorize" } */
> +
> + #include <stdint.h>
> +
> + void
> + f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
> + {
> +   for (int i = 0; i < 100; ++i)
> +     {
> +       a[i * 8] = b[i * 8] + c[i * 8];
> +       a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
> +       a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
> +       a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
> +       a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
> +       a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
> +       a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
> +       a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
> +     }
> + }
> +
> + /* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, 
> z[0-9]+\.b\n} } } */
> Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c
> ===================================================================
> *** /dev/null   2018-07-26 10:26:13.137955424 +0100
> --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c   2018-08-23 
> 09:59:35.245682525 +0100
> ***************
> *** 0 ****
> --- 1,32 ----
> + /* { dg-do compile } */
> + /* { dg-options "-O2 -ftree-vectorize" } */
> +
> + #include <stdint.h>
> +
> + void
> + f (uint8_t *restrict a, uint8_t *restrict b,
> +    uint8_t *restrict c, uint8_t *restrict d)
> + {
> +   for (int i = 0; i < 100; ++i)
> +     {
> +       a[i * 8] = c[i * 8] + d[i * 8];
> +       a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1];
> +       a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2];
> +       a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3];
> +       a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4];
> +       a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5];
> +       a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6];
> +       a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7];
> +       b[i * 8] = c[i * 8 + 1] + d[i * 8];
> +       b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1];
> +       b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2];
> +       b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3];
> +       b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4];
> +       b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5];
> +       b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6];
> +       b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7];
> +     }
> + }
> +
> + /* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, 
> z[0-9]+\.b\n} } } */
> + /* { dg-final { scan-assembler {\ttrn2\tz[0-9]+\.b, z[0-9]+\.b, 
> z[0-9]+\.b\n} } } */
> Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c
> ===================================================================
> *** /dev/null   2018-07-26 10:26:13.137955424 +0100
> --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c   2018-08-23 
> 09:59:35.245682525 +0100
> ***************
> *** 0 ****
> --- 1,22 ----
> + /* { dg-do compile } */
> + /* { dg-options "-O2 -ftree-vectorize" } */
> +
> + #include <stdint.h>
> +
> + void
> + f (uint8_t *restrict a, uint8_t *restrict b)
> + {
> +   for (int i = 0; i < 100; ++i)
> +     {
> +       a[i * 8] = b[i * 8 + 3] + 1;
> +       a[i * 8 + 1] = b[i * 8 + 6] + 1;
> +       a[i * 8 + 2] = b[i * 8 + 0] + 1;
> +       a[i * 8 + 3] = b[i * 8 + 2] + 1;
> +       a[i * 8 + 4] = b[i * 8 + 1] + 1;
> +       a[i * 8 + 5] = b[i * 8 + 7] + 1;
> +       a[i * 8 + 6] = b[i * 8 + 5] + 1;
> +       a[i * 8 + 7] = b[i * 8 + 4] + 1;
> +     }
> + }
> +
> + /* { dg-final { scan-assembler-times {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, 
> z[0-9]+\.b\n} 1 } } */
> Index: gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c
> ===================================================================
> *** /dev/null   2018-07-26 10:26:13.137955424 +0100
> --- gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c   2018-08-23 
> 09:59:35.245682525 +0100
> ***************
> *** 0 ****
> --- 1,22 ----
> + /* { dg-do compile } */
> + /* { dg-options "-O2 -ftree-vectorize" } */
> +
> + #include <stdint.h>
> +
> + void
> + f (uint8_t *restrict a, uint8_t *restrict b)
> + {
> +   for (int i = 0; i < 100; ++i)
> +     {
> +       a[i * 8] = b[i * 8 + 1] + 1;
> +       a[i * 8 + 1] = b[i * 8 + 7] + 2;
> +       a[i * 8 + 2] = b[i * 8 + 1] + 3;
> +       a[i * 8 + 3] = b[i * 8 + 7] + 4;
> +       a[i * 8 + 4] = b[i * 8 + 1] + 5;
> +       a[i * 8 + 5] = b[i * 8 + 7] + 6;
> +       a[i * 8 + 6] = b[i * 8 + 1] + 7;
> +       a[i * 8 + 7] = b[i * 8 + 7] + 8;
> +     }
> + }
> +
> + /* { dg-final { scan-assembler {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, 
> z[0-9]+\.b\n} } } */

Re: Handle SLP permutations for variable-length vectors

Reply via email to