Re: [PATCH] i386: Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm
On Tue, Apr 18, 2023 at 2:52 PM Hu, Lin1 via Gcc-patches wrote: > > Hi, all > > The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128. > And it has regtested on x86_64-pc-linux-gnu. OK for trunk? Ok. > > Thanks. > Lin > > vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk. > We can optimize them to vblend, vmovaps when there's no cross-lane. > > gcc/ChangeLog: > > * config/i386/sse.md: Modify insn vperm{i,f} > and vshuf{i,f}. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test. > * gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto. > * gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto. > * gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto. > * gcc.target/i386/opt-vperm-vshuf-1.c: New test. > * gcc.target/i386/opt-vperm-vshuf-2.c: Ditto. > * gcc.target/i386/opt-vperm-vshuf-3.c: Ditto. > --- > gcc/config/i386/sse.md| 36 -- > .../gcc.target/i386/avx512vl-vshuff32x4-1.c | 2 +- > .../gcc.target/i386/avx512vl-vshuff64x2-1.c | 2 +- > .../gcc.target/i386/avx512vl-vshufi32x4-1.c | 2 +- > .../gcc.target/i386/avx512vl-vshufi64x2-1.c | 2 +- > .../gcc.target/i386/opt-vperm-vshuf-1.c | 51 ++ > .../gcc.target/i386/opt-vperm-vshuf-2.c | 68 +++ > .../gcc.target/i386/opt-vperm-vshuf-3.c | 63 + > 8 files changed, 218 insertions(+), 8 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 513960e8f33..5b6b2427460 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -18437,6 +18437,8 @@ >mask = INTVAL (operands[3]) / 2; >mask |= (INTVAL (operands[5]) - 4) / 2 << 1; >operands[3] = GEN_INT (mask); > + if (INTVAL (operands[3]) == 2 && !) 
> +return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; >return "vshuf64x2\t{%3, %2, %1, > %0|%0, %1, %2, %3}"; > } >[(set_attr "type" "sselog") > @@ -18595,6 +18597,9 @@ >mask |= (INTVAL (operands[7]) - 8) / 4 << 1; >operands[3] = GEN_INT (mask); > > + if (INTVAL (operands[3]) == 2 && !) > +return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; > + >return "vshuf32x4\t{%3, %2, %1, > %0|%0, %1, %2, %3}"; > } >[(set_attr "type" "sselog") > @@ -25663,7 +25668,28 @@ >(match_operand:SI 3 "const_0_to_255_operand")] > UNSPEC_VPERMTI))] >"TARGET_AVX2" > - "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + { > +int mask = INTVAL (operands[3]); > +if ((mask & 0xbb) == 16) > + { > + if (rtx_equal_p (operands[0], operands[1])) > + return ""; > + else > + return "vmovaps\t{%1, %0|%0, %1}"; > + } > +if ((mask & 0xbb) == 50) > + { > + if (rtx_equal_p (operands[0], operands[2])) > + return ""; > + else > + return "vmovaps\t{%2, %0|%0, %2}"; > + } > +if ((mask & 0xbb) == 18) > + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; > +if ((mask & 0xbb) == 48) > + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; > +return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"; > + } >[(set_attr "type" "sselog") > (set_attr "prefix" "vex") > (set_attr "mode" "OI")]) > @@ -26226,9 +26252,11 @@ > && avx_vperm2f128_parallel (operands[3], mode)" > { >int mask = avx_vperm2f128_parallel (operands[3], mode) - 1; > - if (mask == 0x12) > -return "vinsert\t{$0, %x2, %1, %0|%0, %1, %x2, 0}"; > - if (mask == 0x20) > + if ((mask & 0xbb) == 0x12) > +return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; > + if ((mask & 0xbb) == 0x30) > +return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; > + if ((mask & 0xbb) == 0x20) > return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}"; >operands[3] = GEN_INT (mask); >return "vperm2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c > b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c > index 
6c2fb2f184a..02aecf4edce 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c > @@ -12,7 +12,7 @@ volatile __mmask8 m; > void extern > avx512vl_test (void) > { > - x = _mm256_shuffle_f32x4 (x, x, 2); > + x = _mm256_shuffle_f32x4 (x, x, 3); >x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2); >x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2); > } > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c > b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c > index 1191b400134..563ded5d9df 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c > @@
[PATCH] i386: Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm
Hi, all The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128. And it has regtested on x86_64-pc-linux-gnu. OK for trunk? Thanks. Lin vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk. We can optimize them to vblend, vmovaps when there's no cross-lane. gcc/ChangeLog: * config/i386/sse.md: Modify insn vperm{i,f} and vshuf{i,f}. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test. * gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto. * gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto. * gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto. * gcc.target/i386/opt-vperm-vshuf-1.c: New test. * gcc.target/i386/opt-vperm-vshuf-2.c: Ditto. * gcc.target/i386/opt-vperm-vshuf-3.c: Ditto. --- gcc/config/i386/sse.md| 36 -- .../gcc.target/i386/avx512vl-vshuff32x4-1.c | 2 +- .../gcc.target/i386/avx512vl-vshuff64x2-1.c | 2 +- .../gcc.target/i386/avx512vl-vshufi32x4-1.c | 2 +- .../gcc.target/i386/avx512vl-vshufi64x2-1.c | 2 +- .../gcc.target/i386/opt-vperm-vshuf-1.c | 51 ++ .../gcc.target/i386/opt-vperm-vshuf-2.c | 68 +++ .../gcc.target/i386/opt-vperm-vshuf-3.c | 63 + 8 files changed, 218 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 513960e8f33..5b6b2427460 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -18437,6 +18437,8 @@ mask = INTVAL (operands[3]) / 2; mask |= (INTVAL (operands[5]) - 4) / 2 << 1; operands[3] = GEN_INT (mask); + if (INTVAL (operands[3]) == 2 && !) +return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; return "vshuf64x2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -18595,6 +18597,9 @@ mask |= (INTVAL (operands[7]) - 8) / 4 << 1; operands[3] = GEN_INT (mask); + if (INTVAL (operands[3]) == 2 && !) 
+return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + return "vshuf32x4\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -25663,7 +25668,28 @@ (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_VPERMTI))] "TARGET_AVX2" - "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}" + { +int mask = INTVAL (operands[3]); +if ((mask & 0xbb) == 16) + { + if (rtx_equal_p (operands[0], operands[1])) + return ""; + else + return "vmovaps\t{%1, %0|%0, %1}"; + } +if ((mask & 0xbb) == 50) + { + if (rtx_equal_p (operands[0], operands[2])) + return ""; + else + return "vmovaps\t{%2, %0|%0, %2}"; + } +if ((mask & 0xbb) == 18) + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; +if ((mask & 0xbb) == 48) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; +return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + } [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -26226,9 +26252,11 @@ && avx_vperm2f128_parallel (operands[3], mode)" { int mask = avx_vperm2f128_parallel (operands[3], mode) - 1; - if (mask == 0x12) -return "vinsert\t{$0, %x2, %1, %0|%0, %1, %x2, 0}"; - if (mask == 0x20) + if ((mask & 0xbb) == 0x12) +return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; + if ((mask & 0xbb) == 0x30) +return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + if ((mask & 0xbb) == 0x20) return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}"; operands[3] = GEN_INT (mask); return "vperm2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c index 6c2fb2f184a..02aecf4edce 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_f32x4 (x, x, 2); + x = _mm256_shuffle_f32x4 (x, x, 3); x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2); x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2); } diff --git 
a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c index 1191b400134..563ded5d9df 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_f64x2 (x, x, 2); + x = _mm256_shuffle_f64x2 (x, x, 3); x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2); x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2); } diff --git