https://gcc.gnu.org/g:7a527754fdb61597b6a4c3289b63af3c86b2aa9d
commit r16-3677-g7a527754fdb61597b6a4c3289b63af3c86b2aa9d Author: liuhongt <hongtao....@intel.com> Date: Mon Sep 1 01:12:49 2025 -0700 Use vpermil{ps,pd} instead of vperm{d,q} when permutation is in-lane. gcc/ChangeLog: * config/i386/i386-expand.cc (expand_vec_perm_vpermil): Extend to handle V8SImode. * config/i386/i386.cc (avx_vpermilp_parallel): Extend to handle vector integer modes with same vector size and same component size. * config/i386/sse.md (<sse2_avx_avx512f>_vpermilp<mode><mask_name>): Ditto. (V48_AVX): New mode iterator. (ssefltmodesuffix): Extend for V16SI/V8DI/V16SF/V8DF. gcc/testsuite/ChangeLog: * gcc.target/i386/avx256_avoid_vec_perm-3.c: New test. * gcc.target/i386/avx256_avoid_vec_perm-4.c: New test. * gcc.target/i386/avx512bw-vpalignr-4.c: Adjust testcase. * gcc.target/i386/avx512vl-vpalignr-4.c: Ditto. Diff: --- gcc/config/i386/i386-expand.cc | 13 ++++++++++-- gcc/config/i386/i386.cc | 6 ++++++ gcc/config/i386/sse.md | 22 ++++++++++++-------- .../gcc.target/i386/avx256_avoid_vec_perm-3.c | 24 ++++++++++++++++++++++ .../gcc.target/i386/avx256_avoid_vec_perm-4.c | 21 +++++++++++++++++++ .../gcc.target/i386/avx512bw-vpalignr-4.c | 4 +--- .../gcc.target/i386/avx512vl-vpalignr-4.c | 2 +- 7 files changed, 78 insertions(+), 14 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 3278f1fea251..dc26b3452cb1 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -20826,7 +20826,8 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d) rtx rperm[8], vperm; unsigned i; - if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p) + if (!TARGET_AVX || !d->one_operand_p + || (d->vmode != V8SImode && d->vmode != V8SFmode)) return false; /* We can only permute within the 128-bit lane. */ @@ -20856,7 +20857,15 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d) vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm)); vperm = force_reg (V8SImode, vperm); - emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm)); + rtx target = d->target; + rtx op0 = d->op0; + if (d->vmode == V8SImode) + { + target = lowpart_subreg (V8SFmode, target, V8SImode); + op0 = lowpart_subreg (V8SFmode, op0, V8SImode); + } + + emit_insn (gen_avx_vpermilvarv8sf3 (target, op0, vperm)); return true; } diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index d71975a42bea..5311d8c43342 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -20603,6 +20603,7 @@ avx_vpermilp_parallel (rtx par, machine_mode mode) switch (mode) { case E_V8DFmode: + case E_V8DImode: /* In the 512-bit DFmode case, we can only move elements within a 128-bit lane. First fill the second part of the mask, then fallthru. */ @@ -20621,6 +20622,7 @@ avx_vpermilp_parallel (rtx par, machine_mode mode) /* FALLTHRU */ case E_V4DFmode: + case E_V4DImode: /* In the 256-bit DFmode case, we can only move elements within a 128-bit lane. */ for (i = 0; i < 2; ++i) @@ -20638,6 +20640,7 @@ avx_vpermilp_parallel (rtx par, machine_mode mode) break; case E_V16SFmode: + case E_V16SImode: /* In 512 bit SFmode case, permutation in the upper 256 bits must mirror the permutation in the lower 256-bits. */ for (i = 0; i < 8; ++i) @@ -20646,6 +20649,7 @@ avx_vpermilp_parallel (rtx par, machine_mode mode) /* FALLTHRU */ case E_V8SFmode: + case E_V8SImode: /* In 256 bit SFmode case, we have full freedom of movement within the low 128-bit lane, but the high 128-bit lane must mirror the exact same pattern. */ @@ -20656,7 +20660,9 @@ avx_vpermilp_parallel (rtx par, machine_mode mode) /* FALLTHRU */ case E_V2DFmode: + case E_V2DImode: case E_V4SFmode: + case E_V4SImode: /* In the 128-bit case, we've full freedom in the placement of the elements from the source operand. */ for (i = 0; i < nelt; ++i) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 73906b85d899..e87c26fcc072 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -302,6 +302,12 @@ V16SF (V8SF "TARGET_AVX512VL") V8DF (V4DF "TARGET_AVX512VL")]) +(define_mode_iterator V48_AVX + [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI + (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2") + (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) + ;; All AVX-512{F,VL} vector modes. Supposed TARGET_AVX512F baseline. (define_mode_iterator V48H_AVX512VL [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") @@ -1428,6 +1434,10 @@ (define_mode_attr DOUBLEMASKMODE [(HI "SI") (SI "DI")]) +;; Float mode suffix used for instructions like vpermilpd with integer modes. +(define_mode_attr ssefltmodesuffix + [(V2DI "pd") (V4DI "pd") (V8DI "pd") (V4SI "ps") (V8SI "ps") (V16SI "ps") + (V2DF "pd") (V4DF "pd") (V8DF "pd") (V4SF "ps") (V8SF "ps") (V16SF "ps")]) ;; Include define_subst patterns for instructions with mask (include "subst.md") @@ -23655,10 +23665,6 @@ (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<MODE>")]) -(define_mode_attr ssefltmodesuffix - [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps") - (V2DF "pd") (V4DF "pd") (V4SF "ps") (V8SF "ps")]) - (define_mode_attr ssefltvecmode [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")]) @@ -27615,9 +27621,9 @@ ;; being a subset of what vpermp* can do), but vpermilp* has shorter ;; latency as it never crosses lanes. (define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>" - [(set (match_operand:VF 0 "register_operand" "=v") - (vec_select:VF - (match_operand:VF 1 "nonimmediate_operand" "vm") + [(set (match_operand:V48_AVX 0 "register_operand" "=v") + (vec_select:V48_AVX + (match_operand:V48_AVX 1 "nonimmediate_operand" "vm") (match_parallel 2 "" [(match_operand 3 "const_int_operand")])))] "TARGET_AVX && <mask_mode512bit_condition> @@ -27625,7 +27631,7 @@ { int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1; operands[2] = GEN_INT (mask); - return "vpermil<ssemodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}"; + return "vpermil<ssefltmodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}"; } [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-3.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-3.c new file mode 100644 index 000000000000..cb1328ce2d0b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-3.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=sierraforest -fdump-tree-vect-details" } */ +/* { dg-final { scan-tree-dump "loop vectorized using 32 byte vectors" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler "vpermilps" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not "vpermd" } } */ + +int a[256], b[256]; + +void __attribute__((noinline)) +foo (void) +{ + int i; + for (i = 0; i < 32; ++i) + { + b[i*8+0] = a[i*8+0]; + b[i*8+1] = a[i*8+0]; + b[i*8+2] = a[i*8+3]; + b[i*8+3] = a[i*8+3]; + b[i*8+4] = a[i*8+4]; + b[i*8+5] = a[i*8+6]; + b[i*8+6] = a[i*8+4]; + b[i*8+7] = a[i*8+6]; + } +} diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-4.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-4.c new file mode 100644 index 000000000000..016771ab743c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-4.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx2 -O2" } */ +/* { dg-final { scan-assembler-times {(?n)vpermilp[ds]} 2} } */ +/* { dg-final { scan-assembler-not {(?n)vperm[dq]} } } */ + + +typedef long long v4di __attribute__((vector_size(32))); +typedef int v8si __attribute__((vector_size(32))); + +v4di +foo (v4di a) +{ + return __builtin_shufflevector (a, a, 1, 0, 3, 2); +} + +v8si +foo1 (v8si a) +{ + return __builtin_shufflevector (a, a, 1, 0, 3, 2, 7, 6, 5, 4); +} + diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-4.c b/gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-4.c index 50a2a3522139..cdd08f5be63f 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-4.c +++ b/gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-4.c @@ -55,8 +55,6 @@ f4 (V4 x) asm volatile ("" : "+v" (a)); } -/* { dg-final { scan-assembler-times "vpalignr\[^\n\r]*\\\$8\[^\n\r]*%xmm16\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */ - typedef float V5 __attribute__((vector_size (16))); void @@ -83,4 +81,4 @@ f6 (V6 x) asm volatile ("" : "+v" (a)); } -/* { dg-final { scan-assembler-times "vpermilpd\[^\n\r]*\\\$1\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */ +/* { dg-final { scan-assembler-times "vpermilpd\[^\n\r]*\\\$1\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vpalignr-4.c b/gcc/testsuite/gcc.target/i386/avx512vl-vpalignr-4.c index 4936d2f4c5b0..3076fb020a7c 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vpalignr-4.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vpalignr-4.c @@ -83,4 +83,4 @@ f6 (V6 x) asm volatile ("" : "+v" (a)); } -/* { dg-final { scan-assembler-times "vpermilpd\[^\n\r]*\\\$1\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */ +/* { dg-final { scan-assembler-times "vpermilpd\[^\n\r]*\\\$1\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 2 } } */