Hi!

The following patch implements what I've talked about, i.e. it no longer
forces the operands of vec_perm_const into registers in the generic code,
but instead lets each of the (currently 8) targets force them into
registers individually, giving the targets better control over whether
and when to do that and allowing them to handle particular operands
specially.  It then adds define_insn_and_split patterns for the 256-bit
and 512-bit permutations into vpmovzx* (only the bw, wd and dq cases; in
theory we could also add define_insn_and_split patterns for the bd, bq
and wq cases).
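
For example (this is f1 from the new pr95905-3.c test below), interleaving
a vector with a zero vector can now be emitted as a single vpmovzxbw,
because the even selector indices pick the low 16 bytes of x and the odd
ones pick zeros:

typedef unsigned char V1 __attribute__((vector_size (32)));

V1
f1 (V1 x)
{
  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35,
                                               4, 36, 5, 37, 6, 38, 7, 39,
                                               8, 40, 9, 41, 10, 42, 11, 43,
                                               12, 44, 13, 45, 14, 46, 15, 47 });
}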

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-01-13  Jakub Jelinek  <ja...@redhat.com>

        PR target/95905
        * optabs.c (expand_vec_perm_const): Don't force v0 and v1 into
        registers before calling targetm.vectorize.vec_perm_const, only after
        that.
        * config/i386/i386-expand.c (ix86_vectorize_vec_perm_const): Handle
        two-argument permutations where one operand is a zero vector, and
        only force the operands into registers after that.
        * config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_1,
        *avx512bw_zero_extendv32qiv32hi2_1, *avx512f_zero_extendv16hiv16si2_1,
        *avx2_zero_extendv8hiv8si2_1, *avx512f_zero_extendv8siv8di2_1,
        *avx2_zero_extendv4siv4di2_1): New define_insn_and_split patterns.
        * config/mips/mips.c (mips_vectorize_vec_perm_const): Force operands
        into registers.
        * config/arm/arm.c (arm_vectorize_vec_perm_const): Likewise.
        * config/sparc/sparc.c (sparc_vectorize_vec_perm_const): Likewise.
        * config/ia64/ia64.c (ia64_vectorize_vec_perm_const): Likewise.
        * config/aarch64/aarch64.c (aarch64_vectorize_vec_perm_const): Likewise.
        * config/rs6000/rs6000.c (rs6000_vectorize_vec_perm_const): Likewise.
        * config/gcn/gcn.c (gcn_vectorize_vec_perm_const): Likewise.
        Use std::swap.

        * gcc.target/i386/pr95905-2.c: Use scan-assembler-times instead of
        scan-assembler.  Add tests with zero vector as first __builtin_shuffle
        operand.
        * gcc.target/i386/pr95905-3.c: New test.
        * gcc.target/i386/pr95905-4.c: New test.
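
As a side note on the ix86_vectorize_vec_perm_const change: when the zero
vector is the first operand, the operands are swapped and each selector
index is XORed with nelt, so that e.g. the selector { 4, 0, 5, 1 } from
the pr95905-2.c f9 test becomes { 0, 4, 1, 5 } with the zero vector as the
second operand, which then matches the pmovzx form.  A minimal standalone
sketch of that remapping (plain C, just for illustration, not part of the
patch):

#include <stdio.h>

int
main (void)
{
  const unsigned nelt = 4;
  /* Selector for __builtin_shuffle ((V3) {}, x, ...): indices >= nelt
     refer to x, indices < nelt refer to the zero vector.  */
  unsigned perm[4] = { 4, 0, 5, 1 };
  /* After swapping the operands so that the zero vector comes second,
     flip each index to the other half of the concatenation.  */
  for (unsigned i = 0; i < nelt; ++i)
    perm[i] ^= nelt;
  for (unsigned i = 0; i < nelt; ++i)
    printf ("%u ", perm[i]);   /* prints: 0 4 1 5  */
  printf ("\n");
  return 0;
}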

--- gcc/optabs.c.jj     2021-01-04 10:25:38.632236100 +0100
+++ gcc/optabs.c        2021-01-12 14:46:44.719557815 +0100
@@ -6070,11 +6070,8 @@ expand_vec_perm_const (machine_mode mode
 
   if (targetm.vectorize.vec_perm_const != NULL)
     {
-      v0 = force_reg (mode, v0);
       if (single_arg_p)
        v1 = v0;
-      else
-       v1 = force_reg (mode, v1);
 
       if (targetm.vectorize.vec_perm_const (mode, target, v0, v1, indices))
        return target;
@@ -6095,6 +6092,11 @@ expand_vec_perm_const (machine_mode mode
        return gen_lowpart (mode, target_qi);
     }
 
+  v0 = force_reg (mode, v0);
+  if (single_arg_p)
+    v1 = v0;
+  v1 = force_reg (mode, v1);
+
   /* Otherwise expand as a fully variable permuation.  */
 
   /* The optabs are only defined for selectors with the same width
--- gcc/config/i386/i386-expand.c.jj    2021-01-12 11:01:51.189386077 +0100
+++ gcc/config/i386/i386-expand.c       2021-01-12 15:43:55.673095807 +0100
@@ -19929,6 +19929,33 @@ ix86_vectorize_vec_perm_const (machine_m
 
   two_args = canonicalize_perm (&d);
 
+  /* If one of the operands is a zero vector, try to match pmovzx.  */
+  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
+    {
+      struct expand_vec_perm_d dzero = d;
+      if (d.op0 == CONST0_RTX (vmode))
+       {
+         d.op1 = dzero.op1 = force_reg (vmode, d.op1);
+         std::swap (dzero.op0, dzero.op1);
+         for (i = 0; i < nelt; ++i)
+           dzero.perm[i] ^= nelt;
+       }
+      else
+       d.op0 = dzero.op0 = force_reg (vmode, d.op0);
+
+      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
+                                 dzero.perm, nelt, dzero.testing_p))
+       return true;
+    }
+
+  /* Force operands into registers.  */
+  rtx nop0 = force_reg (vmode, d.op0);
+  if (d.op0 == d.op1)
+    d.op1 = nop0;
+  d.op0 = nop0;
+  if (d.op0 != d.op1)
+    d.op1 = force_reg (vmode, d.op1);
+
   if (ix86_expand_vec_perm_const_1 (&d))
     return true;
 
--- gcc/config/i386/sse.md.jj   2021-01-12 14:30:32.688546846 +0100
+++ gcc/config/i386/sse.md      2021-01-12 15:40:29.018402527 +0100
@@ -17611,6 +17611,23 @@ (define_insn "avx2_<code>v16qiv16hi2<mas
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "OI")])
 
+(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_1"
+  [(set (match_operand:V32QI 0 "register_operand" "=v")
+       (vec_select:V32QI
+         (vec_concat:V64QI
+           (match_operand:V32QI 1 "nonimmediate_operand" "vm")
+           (match_operand:V32QI 2 "const0_operand" "C"))
+         (match_parallel 3 "pmovzx_parallel"
+           [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
+  operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
+})
+
 (define_expand "<insn>v16qiv16hi2"
   [(set (match_operand:V16HI 0 "register_operand")
        (any_extend:V16HI
@@ -17628,6 +17645,23 @@ (define_insn "avx512bw_<code>v32qiv32hi2
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])
 
+(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_1"
+  [(set (match_operand:V64QI 0 "register_operand" "=v")
+       (vec_select:V64QI
+         (vec_concat:V128QI
+           (match_operand:V64QI 1 "nonimmediate_operand" "vm")
+           (match_operand:V64QI 2 "const0_operand" "C"))
+         (match_parallel 3 "pmovzx_parallel"
+           [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
+  operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
+})
+
 (define_expand "<insn>v32qiv32hi2"
   [(set (match_operand:V32HI 0 "register_operand")
        (any_extend:V32HI
@@ -17883,6 +17917,23 @@ (define_expand "<insn>v16hiv16si2"
          (match_operand:V16HI 1 "nonimmediate_operand")))]
   "TARGET_AVX512F")
 
+(define_insn_and_split "avx512f_zero_extendv16hiv16si2_1"
+  [(set (match_operand:V32HI 0 "register_operand" "=v")
+       (vec_select:V32HI
+         (vec_concat:V64HI
+           (match_operand:V32HI 1 "nonimmediate_operand" "vm")
+           (match_operand:V32HI 2 "const0_operand" "C"))
+         (match_parallel 3 "pmovzx_parallel"
+           [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode);
+  operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode);
+})
+
 (define_insn "avx2_<code>v8hiv8si2<mask_name>"
   [(set (match_operand:V8SI 0 "register_operand" "=v")
        (any_extend:V8SI
@@ -17900,6 +17951,23 @@ (define_expand "<insn>v8hiv8si2"
          (match_operand:V8HI 1 "nonimmediate_operand")))]
   "TARGET_AVX2")
 
+(define_insn_and_split "avx2_zero_extendv8hiv8si2_1"
+  [(set (match_operand:V16HI 0 "register_operand" "=v")
+       (vec_select:V16HI
+         (vec_concat:V32HI
+           (match_operand:V16HI 1 "nonimmediate_operand" "vm")
+           (match_operand:V16HI 2 "const0_operand" "C"))
+         (match_parallel 3 "pmovzx_parallel"
+           [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode);
+  operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode);
+})
+
 (define_insn "sse4_1_<code>v4hiv4si2<mask_name>"
   [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
        (any_extend:V4SI
@@ -18275,6 +18343,23 @@ (define_insn "avx512f_<code>v8siv8di2<ma
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])
 
+(define_insn_and_split "*avx512f_zero_extendv8siv8di2_1"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+       (vec_select:V16SI
+         (vec_concat:V32SI
+           (match_operand:V16SI 1 "nonimmediate_operand" "vm")
+           (match_operand:V16SI 2 "const0_operand" "C"))
+         (match_parallel 3 "pmovzx_parallel"
+           [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode);
+  operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode);
+})
+
 (define_expand "<insn>v8siv8di2"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
        (any_extend:V8DI
@@ -18292,6 +18377,23 @@ (define_insn "avx2_<code>v4siv4di2<mask_
    (set_attr "prefix_extra" "1")
    (set_attr "mode" "OI")])
 
+(define_insn_and_split "*avx2_zero_extendv4siv4di2_1"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+       (vec_select:V8SI
+         (vec_concat:V16SI
+           (match_operand:V8SI 1 "nonimmediate_operand" "vm")
+           (match_operand:V8SI 2 "const0_operand" "C"))
+         (match_parallel 3 "pmovzx_parallel"
+           [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode);
+  operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode);
+})
+
 (define_expand "<insn>v4siv4di2"
   [(set (match_operand:V4DI 0 "register_operand" "=v")
        (any_extend:V4DI
--- gcc/config/mips/mips.c.jj   2021-01-04 10:25:41.592202583 +0100
+++ gcc/config/mips/mips.c      2021-01-12 15:06:07.608535692 +0100
@@ -21624,6 +21624,15 @@ mips_vectorize_vec_perm_const (machine_m
   bool ok;
 
   d.target = target;
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1 && op0 != op1)
+    op1 = force_reg (vmode, op1);
   d.op0 = op0;
   d.op1 = op1;
 
--- gcc/config/arm/arm.c.jj     2021-01-04 10:25:44.469170006 +0100
+++ gcc/config/arm/arm.c        2021-01-12 15:02:24.333038536 +0100
@@ -31482,6 +31482,15 @@ arm_vectorize_vec_perm_const (machine_mo
     return false;
 
   d.target = target;
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1 && op0 != op1)
+    op1 = force_reg (vmode, op1);
   d.op0 = op0;
   d.op1 = op1;
 
--- gcc/config/sparc/sparc.c.jj 2021-01-04 10:25:45.662156497 +0100
+++ gcc/config/sparc/sparc.c    2021-01-12 15:10:43.491443165 +0100
@@ -12942,6 +12942,13 @@ sparc_vectorize_vec_perm_const (machine_
   if (vmode != V8QImode)
     return false;
 
+  rtx nop0 = force_reg (vmode, op0);
+  if (op0 == op1)
+    op1 = nop0;
+  op0 = nop0;
+  if (op0 != op1)
+    op1 = force_reg (vmode, op1);
+
   unsigned int i, mask;
   for (i = mask = 0; i < 8; ++i)
     mask |= (sel[i] & 0xf) << (28 - i*4);
--- gcc/config/ia64/ia64.c.jj   2021-01-04 10:25:45.808154844 +0100
+++ gcc/config/ia64/ia64.c      2021-01-12 15:03:26.704339360 +0100
@@ -11759,6 +11759,15 @@ ia64_vectorize_vec_perm_const (machine_m
   unsigned int i, nelt, which;
 
   d.target = target;
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1 && op0 != op1)
+    op1 = force_reg (vmode, op1);
   d.op0 = op0;
   d.op1 = op1;
 
--- gcc/config/aarch64/aarch64.c.jj     2021-01-05 13:53:53.291683826 +0100
+++ gcc/config/aarch64/aarch64.c        2021-01-12 14:51:26.645401653 +0100
@@ -21020,8 +21020,11 @@ aarch64_vectorize_vec_perm_const (machin
   d.vmode = vmode;
   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
   d.target = target;
-  d.op0 = op0;
-  d.op1 = op1;
+  d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
+  if (op0 == op1)
+    d.op1 = d.op0;
+  else
+    d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
   d.testing_p = !target;
 
   if (!d.testing_p)
--- gcc/config/rs6000/rs6000.c.jj       2021-01-04 10:25:47.037140928 +0100
+++ gcc/config/rs6000/rs6000.c  2021-01-12 15:09:32.866234841 +0100
@@ -22946,6 +22946,16 @@ rs6000_vectorize_vec_perm_const (machine
   if (TARGET_ALTIVEC && testing_p)
     return true;
 
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1 && op0 != op1)
+    op1 = force_reg (vmode, op1);
+
   /* Check for ps_merge* or xxpermdi insns.  */
   if ((vmode == V2DFmode || vmode == V2DImode) && VECTOR_MEM_VSX_P (vmode))
     {
--- gcc/config/gcn/gcn.c.jj     2021-01-04 10:25:45.939153361 +0100
+++ gcc/config/gcn/gcn.c        2021-01-12 14:56:17.394146737 +0100
@@ -3986,13 +3986,14 @@ gcn_vectorize_vec_perm_const (machine_mo
   for (unsigned int i = 0; i < nelt; ++i)
     perm[i] = sel[i] & (2 * nelt - 1);
 
+  src0 = force_reg (vmode, src0);
+  src1 = force_reg (vmode, src1);
+
   /* Make life a bit easier by swapping operands if necessary so that
      the first element always comes from src0.  */
   if (perm[0] >= nelt)
     {
-      rtx temp = src0;
-      src0 = src1;
-      src1 = temp;
+      std::swap (src0, src1);
 
       for (unsigned int i = 0; i < nelt; ++i)
        if (perm[i] < nelt)
--- gcc/testsuite/gcc.target/i386/pr95905-2.c.jj        2021-01-12 13:58:39.820222075 +0100
+++ gcc/testsuite/gcc.target/i386/pr95905-2.c   2021-01-12 15:50:05.796964412 +0100
@@ -1,9 +1,9 @@
 /* PR target/95905 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -msse4.1" } */
-/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */
-/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */
-/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxdq\t" 4 } } */
 
 typedef unsigned char V1 __attribute__((vector_size (16)));
 typedef unsigned short V2 __attribute__((vector_size (16)));
@@ -44,3 +44,39 @@ f6 (V3 *x)
 {
   return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 });
 }
+
+V1
+f7 (V1 x)
+{
+  return __builtin_shuffle ((V1) {}, x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V2
+f8 (V2 x)
+{
+  return __builtin_shuffle ((V2) {}, x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V3
+f9 (V3 x)
+{
+  return __builtin_shuffle ((V3) {}, x, (V3) { 4, 0, 5, 1 });
+}
+
+V1
+f10 (V1 *x)
+{
+  return __builtin_shuffle ((V1) {}, *x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V2
+f11 (V2 *x)
+{
+  return __builtin_shuffle ((V2) {}, *x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V3
+f12 (V3 *x)
+{
+  return __builtin_shuffle ((V3) {}, *x, (V3) { 4, 0, 5, 1 });
+}
--- gcc/testsuite/gcc.target/i386/pr95905-3.c.jj        2021-01-12 15:53:05.627957108 +0100
+++ gcc/testsuite/gcc.target/i386/pr95905-3.c   2021-01-12 15:52:32.393328070 +0100
@@ -0,0 +1,82 @@
+/* PR target/95905 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
+
+typedef unsigned char V1 __attribute__((vector_size (32)));
+typedef unsigned short V2 __attribute__((vector_size (32)));
+typedef unsigned int V3 __attribute__((vector_size (32)));
+
+V1
+f1 (V1 x)
+{
+  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V2
+f2 (V2 x)
+{
+  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V3
+f3 (V3 x)
+{
+  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+V1
+f4 (V1 *x)
+{
+  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V2
+f5 (V2 *x)
+{
+  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V3
+f6 (V3 *x)
+{
+  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+V1
+f7 (V1 x)
+{
+  return __builtin_shuffle ((V1) {}, x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V2
+f8 (V2 x)
+{
+  return __builtin_shuffle ((V2) {}, x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V3
+f9 (V3 x)
+{
+  return __builtin_shuffle ((V3) {}, x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V1
+f10 (V1 *x)
+{
+  return __builtin_shuffle ((V1) {}, *x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V2
+f11 (V2 *x)
+{
+  return __builtin_shuffle ((V2) {}, *x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V3
+f12 (V3 *x)
+{
+  return __builtin_shuffle ((V3) {}, *x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
--- gcc/testsuite/gcc.target/i386/pr95905-4.c.jj        2021-01-12 15:55:30.065343628 +0100
+++ gcc/testsuite/gcc.target/i386/pr95905-4.c   2021-01-12 15:55:01.957657667 +0100
@@ -0,0 +1,82 @@
+/* PR target/95905 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
+
+typedef unsigned char V1 __attribute__((vector_size (64)));
+typedef unsigned short V2 __attribute__((vector_size (64)));
+typedef unsigned int V3 __attribute__((vector_size (64)));
+
+V1
+f1 (V1 x)
+{
+  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
+}
+
+V2
+f2 (V2 x)
+{
+  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V3
+f3 (V3 x)
+{
+  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V1
+f4 (V1 *x)
+{
+  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
+}
+
+V2
+f5 (V2 *x)
+{
+  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V3
+f6 (V3 *x)
+{
+  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V1
+f7 (V1 x)
+{
+  return __builtin_shuffle ((V1) {}, x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
+}
+
+V2
+f8 (V2 x)
+{
+  return __builtin_shuffle ((V2) {}, x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V3
+f9 (V3 x)
+{
+  return __builtin_shuffle ((V3) {}, x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V1
+f10 (V1 *x)
+{
+  return __builtin_shuffle ((V1) {}, *x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
+}
+
+V2
+f11 (V2 *x)
+{
+  return __builtin_shuffle ((V2) {}, *x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V3
+f12 (V3 *x)
+{
+  return __builtin_shuffle ((V3) {}, *x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}

        Jakub
