Hi!

The following patch adds patterns (in the end I went with define_insn rather
than the combiner define_split + define_insn I initially hoped for, or a
define_insn_and_split) to represent (so far 128-bit only) permutations
like { 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23 }, where the second
operand is a CONST0_RTX CONST_VECTOR, as pmovzx.
A define_split didn't work (due to the combiner not trying combine_split_insn
when i1 is NULL), and a define_insn_and_split in the end was quite large.
The reason for not trying to split this afterwards is the different vector
mode of the output: lowpart_subreg on the result is undesirable,
so we'd need to split it into two instructions and hope some later pass
optimizes the move into just rewriting the uses using lowpart_subreg.
While I initially wrote also avx2 and avx512{bw,f} patterns, turns out that
without further changes they aren't useful, such permutations are expanded
into multiple permutation instructions and while combiner in theory could
be able to undo that, what stops it is that the const_vector of 0 has
multiple uses.  So either we'd need to allow pre-reload const0_operand
next to registers (and sometimes memory) in some permutations, or perhaps
change the vec_perm_const target hook calling code to not force zero vectors
into registers and change each vec_perm_const hook to force it into register
itself if it can't expand such a permutation smartly.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-01-12  Jakub Jelinek  <ja...@redhat.com>

        PR target/95905
        * config/i386/predicates.md (pmovzx_parallel): New predicate.
        * config/i386/sse.md (*sse4_1_zero_extendv8qiv8hi2_3,
        *sse4_1_zero_extendv4hiv4si2_3, *sse4_1_zero_extendv2siv2di2_3): New
        define_insn patterns.

        * gcc.target/i386/pr95905-1.c: New test.
        * gcc.target/i386/pr95905-2.c: New test.

--- gcc/config/i386/predicates.md.jj    2021-01-04 10:25:45.248161185 +0100
+++ gcc/config/i386/predicates.md       2021-01-11 19:40:04.793433826 +0100
@@ -1600,6 +1600,38 @@ (define_predicate "addsub_vs_parallel"
   return true;
 })
 
+;; Return true if OP is a parallel for a pmovz{bw,wd,dq} vec_select,
+;; where one of the two operands of the vec_concat is const0_operand.
+(define_predicate "pmovzx_parallel"
+  (and (match_code "parallel")
+       (match_code "const_int" "a"))
+{
+  int nelt = XVECLEN (op, 0);
+  int elt, i;
+
+  if (nelt < 2)
+    return false;
+
+  /* Check that the permutation is suitable for pmovz{bw,wd,dq}.
+     For example { 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23 }.  */
+  elt = INTVAL (XVECEXP (op, 0, 0));
+  if (elt == 0)
+    {
+      for (i = 1; i < nelt; ++i)
+       if ((i & 1) != 0)
+         {
+           if (INTVAL (XVECEXP (op, 0, i)) < nelt)
+             return false;
+         }
+       else if (INTVAL (XVECEXP (op, 0, i)) != i / 2)
+         return false;
+    }
+  else
+    return false;
+
+  return true;
+})
+
 ;; Return true if OP is a parallel for a vbroadcast permute.
 (define_predicate "avx_vbroadcast_operand"
   (and (match_code "parallel")
--- gcc/config/i386/sse.md.jj   2021-01-07 15:29:52.604974544 +0100
+++ gcc/config/i386/sse.md      2021-01-11 21:19:06.247346965 +0100
@@ -17683,6 +17683,27 @@ (define_insn_and_split "*sse4_1_<code>v8
        (any_extend:V8HI (match_dup 1)))]
   "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);")
 
+(define_insn "*sse4_1_zero_extendv8qiv8hi2_3"
+  [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,v")
+       (vec_select:V16QI
+         (vec_concat:V32QI
+           (match_operand:V16QI 1 "vector_operand" "Yrm,*xm,vm")
+           (match_operand:V16QI 2 "const0_operand" "C,C,C"))
+         (match_parallel 3 "pmovzx_parallel"
+           [(match_operand 4 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+{
+  if (MEM_P (operands[1]))
+    return "%vpmovzxbw\t{%1, %0|%0, %1}";
+  else
+    return "%vpmovzxbw\t{%1, %0|%0, %q1}";
+}
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
 (define_expand "<insn>v8qiv8hi2"
   [(set (match_operand:V8HI 0 "register_operand")
        (any_extend:V8HI
@@ -17929,6 +17950,27 @@ (define_expand "<insn>v4hiv4si2"
     }
 })
 
+(define_insn "*sse4_1_zero_extendv4hiv4si2_3"
+  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
+       (vec_select:V8HI
+         (vec_concat:V16HI
+           (match_operand:V8HI 1 "vector_operand" "Yrm,*xm,vm")
+           (match_operand:V8HI 2 "const0_operand" "C,C,C"))
+         (match_parallel 3 "pmovzx_parallel"
+           [(match_operand 4 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+{
+  if (MEM_P (operands[1]))
+    return "%vpmovzxwd\t{%1, %0|%0, %1}";
+  else
+    return "%vpmovzxwd\t{%1, %0|%0, %q1}";
+}
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
 (define_insn "avx512f_<code>v8qiv8di2<mask_name>"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
        (any_extend:V8DI
@@ -18283,6 +18325,27 @@ (define_insn_and_split "*sse4_1_<code>v2
        (any_extend:V2DI (match_dup 1)))]
   "operands[1] = adjust_address_nv (operands[1], V2SImode, 0);")
 
+(define_insn "*sse4_1_zero_extendv2siv2di2_3"
+  [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
+       (vec_select:V4SI
+         (vec_concat:V8SI
+           (match_operand:V4SI 1 "vector_operand" "Yrm,*xm,vm")
+           (match_operand:V4SI 2 "const0_operand" "C,C,C"))
+         (match_parallel 3 "pmovzx_parallel"
+           [(match_operand 4 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+{
+  if (MEM_P (operands[1]))
+    return "%vpmovzxdq\t{%1, %0|%0, %1}";
+  else
+    return "%vpmovzxdq\t{%1, %0|%0, %q1}";
+}
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
 (define_expand "<insn>v2siv2di2"
   [(set (match_operand:V2DI 0 "register_operand")
        (any_extend:V2DI
--- gcc/testsuite/gcc.target/i386/pr95905-1.c.jj        2021-01-11 21:19:46.657890361 +0100
+++ gcc/testsuite/gcc.target/i386/pr95905-1.c   2021-01-11 20:41:25.189872347 +0100
@@ -0,0 +1,26 @@
+/* PR target/95905 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1 -mno-avx" } */
+/* { dg-final { scan-assembler "\tpmovzxbw\t" } } */
+/* { dg-final { scan-assembler "\tpmovzxwd\t" } } */
+/* { dg-final { scan-assembler "\tpmovzxdq\t" } } */
+
+#include <x86intrin.h>
+
+__m128i
+f1 (__m128i a)
+{
+  return _mm_unpacklo_epi8 (a, _mm_setzero_si128 ());
+}
+
+__m128i
+f2 (__m128i a)
+{
+  return _mm_unpacklo_epi16 (a, _mm_setzero_si128 ());
+}
+
+__m128i
+f3 (__m128i a)
+{
+  return _mm_unpacklo_epi32 (a, _mm_setzero_si128 ());
+}
--- gcc/testsuite/gcc.target/i386/pr95905-2.c.jj        2021-01-11 21:20:10.075625762 +0100
+++ gcc/testsuite/gcc.target/i386/pr95905-2.c   2021-01-11 21:20:44.126241029 +0100
@@ -0,0 +1,46 @@
+/* PR target/95905 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */
+/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */
+/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */
+
+typedef unsigned char V1 __attribute__((vector_size (16)));
+typedef unsigned short V2 __attribute__((vector_size (16)));
+typedef unsigned int V3 __attribute__((vector_size (16)));
+
+V1
+f1 (V1 x)
+{
+  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V2
+f2 (V2 x)
+{
+  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+V3
+f3 (V3 x)
+{
+  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 4, 1, 5 });
+}
+
+V1
+f4 (V1 *x)
+{
+  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V2
+f5 (V2 *x)
+{
+  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+V3
+f6 (V3 *x)
+{
+  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 });
+}

        Jakub

Reply via email to