Hi: Here's the updated patch, which does 3 things: 1. Support vpermw/vpermb in ix86_expand_vec_one_operand_perm_avx512. 2. Support 256/128-bit vpermi2b in ix86_expand_vec_perm_vpermt2. 3. Add define_insn_and_split patterns to optimize specific vector permutations to vpmov{dw,wb,qd}.
Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: PR target/101846 * config/i386/i386-expand.c (ix86_expand_vec_perm_vpermt2): Support vpermi2b for V32QI/V16QImode. (ix86_extract_perm_from_pool_constant): New function. (ix86_expand_vec_one_operand_perm_avx512): Support vpermw/vpermb under TARGET_AVX512BW/TARGET_AVX512VBMI. (expand_vec_perm_1): Adjust comments for upper. * config/i386/i386-protos.h (ix86_extract_perm_from_pool_constant): New declare. * config/i386/predicates.md (permvar_truncate_operand): New predicate. (pshufb_truncv4siv4hi_operand): Ditto. (pshufb_truncv8hiv8qi_operand): Ditto. * config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1): New pre_reload define_insn_and_split. (*avx512f_permvar_truncv8siv8hi_1): Ditto. (*avx512f_vpermvar_truncv8div8si_1): Ditto. (*avx512f_permvar_truncv32hiv32qi_1): Ditto. (*avx512f_permvar_truncv16hiv16qi_1): Ditto. (*avx512f_permvar_truncv4div4si_1): Ditto. (*avx512f_pshufb_truncv8hiv8qi_1): Ditto. (*avx512f_pshufb_truncv4siv4hi_1): Ditto. (*avx512f_pshufd_truncv2div2si_1): Ditto. gcc/testsuite/ChangeLog: PR target/101846 * gcc.target/i386/pr101846-2.c: New test. * gcc.target/i386/pr101846-3.c: New test. * gcc.target/i386/pr101846-4.c: New test. 
--- gcc/config/i386/i386-expand.c | 89 +++++++++- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/predicates.md | 90 ++++++++++ gcc/config/i386/sse.md | 190 +++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr101846-2.c | 81 +++++++++ gcc/testsuite/gcc.target/i386/pr101846-3.c | 73 ++++++++ gcc/testsuite/gcc.target/i386/pr101846-4.c | 40 +++++ 7 files changed, 559 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-4.c diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index a652b25f534..56319cb6f6a 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -4778,6 +4778,18 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, switch (mode) { + case E_V16QImode: + if (TARGET_AVX512VL && TARGET_AVX512VBMI) + gen = gen_avx512vl_vpermt2varv16qi3; + break; + case E_V32QImode: + if (TARGET_AVX512VL && TARGET_AVX512VBMI) + gen = gen_avx512vl_vpermt2varv32qi3; + break; + case E_V64QImode: + if (TARGET_AVX512VBMI) + gen = gen_avx512bw_vpermt2varv64qi3; + break; case E_V8HImode: if (TARGET_AVX512VL && TARGET_AVX512BW) gen = gen_avx512vl_vpermt2varv8hi3; @@ -4786,10 +4798,6 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, if (TARGET_AVX512VL && TARGET_AVX512BW) gen = gen_avx512vl_vpermt2varv16hi3; break; - case E_V64QImode: - if (TARGET_AVX512VBMI) - gen = gen_avx512bw_vpermt2varv64qi3; - break; case E_V32HImode: if (TARGET_AVX512BW) gen = gen_avx512bw_vpermt2varv32hi3; @@ -5487,6 +5495,45 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) } } +/* Return true if mem is pool constant which contains a const_vector + perm index, assign the index to PERM. 
*/ +bool +ix86_extract_perm_from_pool_constant (int* perm, rtx mem) +{ + machine_mode mode = GET_MODE (mem); + int nelt = GET_MODE_NUNITS (mode); + + if (!INTEGRAL_MODE_P (mode)) + return false; + + /* Needs to be constant pool. */ + if (!(MEM_P (mem)) + || !SYMBOL_REF_P (XEXP (mem, 0)) + || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0))) + return false; + + rtx constant = get_pool_constant (XEXP (mem, 0)); + + if (GET_CODE (constant) != CONST_VECTOR) + return false; + + /* There could be some rtx like + (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) + but with "*.LC1" refer to V2DI constant vector. */ + if (GET_MODE (constant) != mode) + { + constant = simplify_subreg (mode, constant, GET_MODE (constant), 0); + + if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR) + return false; + } + + for (int i = 0; i != nelt; i++) + perm[i] = UINTVAL (XVECEXP (constant, 0, i)); + + return true; +} + /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, but works for floating pointer parameters and nonoffsetable memories. For pushes, it returns just stack offsets; the values will be saved @@ -18086,6 +18133,7 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) { machine_mode mode = GET_MODE (d->op0); machine_mode maskmode = mode; + unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode)); rtx (*gen) (rtx, rtx, rtx) = NULL; rtx target, op0, mask; rtx vec[64]; @@ -18096,6 +18144,18 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) if (!TARGET_AVX512F) return false; + /* Accept VNxHImode and VNxQImode now. */ + if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64) + return false; + + /* vpermw. */ + if (!TARGET_AVX512BW && inner_size == 2) + return false; + + /* vpermb. 
*/ + if (!TARGET_AVX512VBMI && inner_size == 1) + return false; + switch (mode) { case E_V16SImode: @@ -18112,6 +18172,25 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) gen = gen_avx512f_permvarv8df; maskmode = V8DImode; break; + case E_V32HImode: + gen = gen_avx512bw_permvarv32hi; + break; + case E_V16HImode: + gen = gen_avx512vl_permvarv16hi; + break; + case E_V8HImode: + gen = gen_avx512vl_permvarv8hi; + break; + case E_V64QImode: + gen = gen_avx512bw_permvarv64qi; + break; + case E_V32QImode: + gen = gen_avx512vl_permvarv32qi; + break; + case E_V16QImode: + gen = gen_avx512vl_permvarv16qi; + break; + default: return false; } @@ -18301,7 +18380,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_palignr (d, true)) return true; - /* Try the AVX512F vperm{s,d} instructions. */ + /* Try the AVX512F vperm{w,b,s,d} and instructions */ if (ix86_expand_vec_one_operand_perm_avx512 (d)) return true; diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 07ac02aff69..2fd13074c81 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -260,6 +260,7 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); extern void ix86_expand_sse2_abs (rtx, rtx); extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, rtx); +extern bool ix86_extract_perm_from_pool_constant (int*, rtx); /* In i386-c.c */ extern void ix86_target_macros (void); diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 129205ac3a7..650d6354de9 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1713,6 +1713,96 @@ (define_predicate "addsub_vs_parallel" return true; }) +;; Return true if OP is a constant pool in perm{w,d,b} which constains index +;; match pmov{dw,wb,qd}. 
+(define_predicate "permvar_truncate_operand" + (match_code "mem") +{ + int nelt = GET_MODE_NUNITS (mode); + int perm[128]; + int id; + + if (!INTEGRAL_MODE_P (mode) || !VECTOR_MODE_P (mode)) + return false; + + if (nelt < 2) + return false; + + if (!ix86_extract_perm_from_pool_constant (&perm[0], op)) + return false; + + id = exact_log2 (nelt); + + /* Check that the permutation is suitable for pmovz{bw,wd,dq}. + For example V16HImode to V8HImode + { 0 2 4 6 8 10 12 14 * * * * * * * * }. */ + for (int i = 0; i != nelt/2; i++) + if ((perm[i] & ((1 << id) - 1)) != i * 2) + return false; + + return true; +}) + +;; Return true if OP is a constant pool in shufb which constains index +;; match pmovdw. +(define_predicate "pshufb_truncv4siv4hi_operand" + (match_code "mem") +{ + int perm[128]; + + if (mode != E_V16QImode) + return false; + + if (!ix86_extract_perm_from_pool_constant (&perm[0], op)) + return false; + + /* Check that the permutation is suitable for pmovwd. + For example V16HImode to V8HImode + { 0 1 4 5 8 9 12 13 * * * * * * * * }. + index = i % 2 + (i / 2) * 4. */ + for (int i = 0; i != 8; i++) + { + /* if (SRC2[(i * 8)+7] = 1) then DEST[(i*8)+7..(i*8)+0] := 0; */ + if (perm[i] & 128) + return false; + + if ((perm[i] & 15) != ((i & 1) + (i & 0xFE) * 2)) + return false; + } + + return true; +}) + +;; Return true if OP is a constant pool in shufb which constains index +;; match pmovdw. +(define_predicate "pshufb_truncv8hiv8qi_operand" + (match_code "mem") +{ + int perm[128]; + + if (mode != E_V16QImode) + return false; + + if (!ix86_extract_perm_from_pool_constant (&perm[0], op)) + return false; + + /* Check that the permutation is suitable for pmovwd. + For example V16HImode to V8HImode + { 0 2 4 6 8 10 12 14 * * * * * * * * }. + index = i % 2 + (i / 2) * 4. 
*/ + for (int i = 0; i != 8; i++) + { + /* if (SRC2[(i * 8)+7] = 1) then DEST[(i*8)+7..(i*8)+0] := 0; */ + if (perm[i] & 128) + return false; + + if ((perm[i] & 15) != i * 2) + return false; + } + + return true; +}) + ;; Return true if OP is a parallel for an pmovz{bw,wd,dq} vec_select, ;; where one of the two operands of the vec_concat is const0_operand. (define_predicate "pmovzx_parallel" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 3a7bbaec7af..c9f21082beb 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -10978,6 +10978,64 @@ (define_insn "*avx512f_<code><pmov_src_lower><mode>2" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1" + [(set (match_operand:V16HI 0 "nonimmediate_operand") + (vec_select:V16HI + (unspec:V32HI + [(match_operand:V32HI 1 "register_operand") + (match_operand:V32HI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_AVX512BW && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V16HI (match_dup 1)))] + "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);") + +(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1" + [(set (match_operand:V8HI 0 "nonimmediate_operand") + (vec_select:V8HI + (unspec:V16HI + [(match_operand:V16HI 1 "register_operand") + (match_operand:V16HI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V8HI (match_dup 1)))] + "operands[1] = lowpart_subreg 
(V8SImode, operands[1], V16HImode);") + +(define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1" + [(set (match_operand:V8SI 0 "nonimmediate_operand") + (vec_select:V8SI + (unspec:V16SI + [(match_operand:V16SI 1 "register_operand") + (match_operand:V16SI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_AVX512F && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V8SI (match_dup 1)))] + "operands[1] = lowpart_subreg (V8DImode, operands[1], V16SImode);") + (define_insn "avx512f_<code><pmov_src_lower><mode>2_mask" [(set (match_operand:PMOV_DST_MODE_1 0 "nonimmediate_operand" "=v,m") (vec_merge:PMOV_DST_MODE_1 @@ -11018,6 +11076,36 @@ (define_insn "avx512bw_<code>v32hiv32qi2" (set_attr "prefix" "evex") (set_attr "mode" "XI")]) +(define_insn_and_split "*avx512f_permvar_truncv32hiv32qi_1" + [(set (match_operand:V32QI 0 "nonimmediate_operand") + (vec_select:V32QI + (unspec:V64QI + [(match_operand:V64QI 1 "register_operand") + (match_operand:V64QI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15) + (const_int 16) (const_int 17) + (const_int 18) (const_int 19) + (const_int 20) (const_int 21) + (const_int 22) (const_int 23) + (const_int 24) (const_int 25) + (const_int 26) (const_int 27) + (const_int 28) (const_int 29) + (const_int 30) (const_int 31)])))] + "TARGET_AVX512VBMI && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V32QI (match_dup 1)))] + "operands[1] = lowpart_subreg (V32HImode, operands[1], V64QImode);") + (define_insn "avx512bw_<code>v32hiv32qi2_mask" [(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m") 
(vec_merge:V32QI @@ -11063,6 +11151,45 @@ (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*avx512f_permvar_truncv16hiv16qi_1" + [(set (match_operand:V16QI 0 "nonimmediate_operand") + (vec_select:V16QI + (unspec:V32QI + [(match_operand:V32QI 1 "register_operand") + (match_operand:V32QI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_AVX512VL && TARGET_AVX512VBMI + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V16QI (match_dup 1)))] + "operands[1] = lowpart_subreg (V16HImode, operands[1], V32QImode);") + +(define_insn_and_split "*avx512f_permvar_truncv4div4si_1" + [(set (match_operand:V4SI 0 "nonimmediate_operand") + (vec_select:V4SI + (unspec:V8SI + [(match_operand:V8SI 1 "register_operand") + (match_operand:V8SI 2 "permvar_truncate_operand")] + UNSPEC_VPERMVAR) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])))] + "TARGET_AVX512VL && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (truncate:V4SI (match_dup 1)))] + "operands[1] = lowpart_subreg (V4DImode, operands[1], V8SImode);") + (define_insn "<avx512>_<code><ssedoublemodelower><mode>2_mask" [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m") (vec_merge:PMOV_DST_MODE_2 @@ -11121,6 +11248,27 @@ (define_insn "avx512vl_<code><mode>v<ssescalarnum>qi2" (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_insn_and_split "*avx512f_pshufb_truncv8hiv8qi_1" + [(set (match_operand:DI 0 "register_operand") + (vec_select:DI + (subreg:V2DI + (unspec:V16QI + [(match_operand:V16QI 1 "register_operand") + (match_operand:V16QI 2 "pshufb_truncv8hiv8qi_operand")] + 
UNSPEC_PSHUFB) 0) + (parallel [(const_int 0)])))] + "TARGET_AVX512VL && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx op1 = gen_reg_rtx (V8QImode); + operands[1] = lowpart_subreg (V8HImode, operands[1], V16QImode); + emit_insn (gen_truncv8hiv8qi2 (op1, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V8QImode)); + DONE; +}) + (define_insn "*avx512vl_<code>v2div2qi2_store_1" [(set (match_operand:V2QI 0 "memory_operand" "=m") (any_truncate:V2QI @@ -11476,6 +11624,27 @@ (define_insn "avx512vl_<code><mode>v<ssescalarnum>hi2" (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_insn_and_split "*avx512f_pshufb_truncv4siv4hi_1" + [(set (match_operand:DI 0 "register_operand") + (vec_select:DI + (subreg:V2DI + (unspec:V16QI + [(match_operand:V16QI 1 "register_operand") + (match_operand:V16QI 2 "pshufb_truncv4siv4hi_operand")] + UNSPEC_PSHUFB) 0) + (parallel [(const_int 0)])))] + "TARGET_AVX512VL && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx op1 = gen_reg_rtx (V4HImode); + operands[1] = lowpart_subreg (V4SImode, operands[1], V16QImode); + emit_insn (gen_truncv4siv4hi2 (op1, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V4HImode)); + DONE; +}) + (define_insn "*avx512vl_<code><mode>v4hi2_store_1" [(set (match_operand:V4HI 0 "memory_operand" "=m") (any_truncate:V4HI @@ -11699,6 +11868,27 @@ (define_insn "avx512vl_<code>v2div2si2" (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_insn_and_split "*avx512f_pshufd_truncv2div2si_1" + [(set (match_operand:DI 0 "register_operand") + (vec_select:DI + (subreg:V2DI + (vec_select:V4SI + (match_operand:V4SI 1 "register_operand") + (parallel [(const_int 0) (const_int 2) + (const_int 2) (const_int 3)])) 0) + (parallel [(const_int 0)])))] + "TARGET_AVX512VL && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx op1 = gen_reg_rtx (V2SImode); + operands[1] = lowpart_subreg (V2DImode, operands[1], V4SImode); + 
emit_insn (gen_truncv2div2si2 (op1, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V2SImode)); + DONE; +}) + (define_insn "*avx512vl_<code>v2div2si2_store_1" [(set (match_operand:V2SI 0 "memory_operand" "=m") (any_truncate:V2SI diff --git a/gcc/testsuite/gcc.target/i386/pr101846-2.c b/gcc/testsuite/gcc.target/i386/pr101846-2.c new file mode 100644 index 00000000000..26c9ed511e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101846-2.c @@ -0,0 +1,81 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */ +/* { dg-final { scan-assembler-times "vpmovwb" "3" } } */ +/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */ +/* { dg-final { scan-assembler-times "vpmovqd" "3" } } */ + +typedef short v4hi __attribute__((vector_size (8))); +typedef short v8hi __attribute__((vector_size (16))); +typedef short v16hi __attribute__((vector_size (32))); +typedef short v32hi __attribute__((vector_size (64))); +typedef char v8qi __attribute__((vector_size (8))); +typedef char v16qi __attribute__((vector_size (16))); +typedef char v32qi __attribute__((vector_size (32))); +typedef char v64qi __attribute__((vector_size (64))); +typedef int v2si __attribute__((vector_size (8))); +typedef int v4si __attribute__((vector_size (16))); +typedef int v8si __attribute__((vector_size (32))); +typedef int v16si __attribute__((vector_size (64))); + +v16hi +foo_dw_512 (v32hi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30); +} + +v8hi +foo_dw_256 (v16hi x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14); +} + +v4hi +foo_dw_128 (v8hi x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6); +} + +v8si +foo_qd_512 (v16si x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14); +} + +v4si +foo_qd_256 (v8si x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6); +} + +v2si +foo_qd_128 (v4si x) +{ + return __builtin_shufflevector (x, x, 0, 2); 
+} + +v32qi +foo_wb_512 (v64qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62); +} + +v16qi +foo_wb_256 (v32qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30); +} + +v8qi +foo_wb_128 (v16qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14); +} diff --git a/gcc/testsuite/gcc.target/i386/pr101846-3.c b/gcc/testsuite/gcc.target/i386/pr101846-3.c new file mode 100644 index 00000000000..f774018a382 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101846-3.c @@ -0,0 +1,73 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */ +/* { dg-final { scan-assembler-times "vpermb" "2" } } */ +/* { dg-final { scan-assembler-times "vpermw" "2" } } */ +/* { dg-final { scan-assembler-times "vpermd" "2" } } */ + +typedef short v4hi __attribute__((vector_size (8))); +typedef short v8hi __attribute__((vector_size (16))); +typedef short v16hi __attribute__((vector_size (32))); +typedef short v32hi __attribute__((vector_size (64))); +typedef char v8qi __attribute__((vector_size (8))); +typedef char v16qi __attribute__((vector_size (16))); +typedef char v32qi __attribute__((vector_size (32))); +typedef char v64qi __attribute__((vector_size (64))); +typedef int v2si __attribute__((vector_size (8))); +typedef int v4si __attribute__((vector_size (16))); +typedef int v8si __attribute__((vector_size (32))); +typedef int v16si __attribute__((vector_size (64))); + +v32hi +foow_512 (v32hi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); +} + +v16hi +foow_256 (v16hi x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14, + 8, 9, 10, 11, 12, 13, 14, 15); +} + + +v16si +food_512 (v16si x) +{ + return 
__builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14, + 8, 9, 10, 11, 12, 13, 14, 15); +} + +v8si +food_256 (v8si x) +{ + return __builtin_shufflevector (x, x, 0, 2, 4, 6, 4, 5, 6, 7); +} + +v64qi +foob_512 (v64qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63); +} + +v32qi +foob_256 (v32qi x) +{ + return __builtin_shufflevector (x, x, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); +} diff --git a/gcc/testsuite/gcc.target/i386/pr101846-4.c b/gcc/testsuite/gcc.target/i386/pr101846-4.c new file mode 100644 index 00000000000..2a6163c4d72 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101846-4.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */ +/* { dg-final { scan-assembler-times "vpermi2b" "3" } } */ + +typedef char v16qi __attribute__((vector_size (16))); +typedef char v32qi __attribute__((vector_size (32))); +typedef char v64qi __attribute__((vector_size (64))); + + +v64qi +foob_512 (v64qi x, v64qi y) +{ + return __builtin_shufflevector (x, y, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 77, 79, 74, 72, 70, + 89, 88, 78, 86, 85, 75, 83, 82, + 112, 108, 101, 100, 86, 96, 97, 95); +} + +v32qi +foob_256 (v32qi x, v32qi y) +{ + return __builtin_shufflevector (x, y, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62); +} + +v16qi +foob_128 (v16qi x, v16qi y) +{ + return __builtin_shufflevector (x, y, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30); 
+} -- 2.27.0