https://gcc.gnu.org/g:2f759fa9f4dd78ae8d86482ccda72a335aaac404

commit r15-2758-g2f759fa9f4dd78ae8d86482ccda72a335aaac404
Author: Roger Sayle <ro...@nextmovesoftware.com>
Date:   Tue Aug 6 17:19:29 2024 +0100

    i386: Refactor V2DI arithmetic right shift expansion for STV.
    
    This patch refactors ashrv2di RTL expansion into a function so that it may
    be reused by a pre-reload splitter, such that DImode right shifts may be
    considered candidates during the Scalar-To-Vector (STV) pass.  Currently
    DImode arithmetic right shifts are not considered potential candidates
    during STV, so for the following testcase:
    
    long long m;
    typedef long long v2di __attribute__((vector_size (16)));
    void foo(v2di x) { m = x[0]>>63; }
    
    We currently see the following warning/error during STV2:
    >  r101 use in insn 7 isn't convertible
    
    And end up generating scalar code with an interunit move:
    
    foo:    movq    %xmm0, %rax
            sarq    $63, %rax
            movq    %rax, m(%rip)
            ret
    
    With this patch, we can reuse the RTL expansion logic and produce:
    
    foo:    psrad   $31, %xmm0
            pshufd  $245, %xmm0, %xmm0
            movq    %xmm0, m(%rip)
            ret
    
    Or with the addition of -mavx2, the equivalent:
    
    foo:    vpxor   %xmm1, %xmm1, %xmm1
            vpcmpgtq        %xmm0, %xmm1, %xmm0
            vmovq   %xmm0, m(%rip)
            ret
    
    The only design decision of note is the choice to continue lowering V2DI
    into vector sequences during RTL expansion, to enable combine to optimize
    things if possible.  Using just define_insn_and_split potentially misses
    optimizations, such as reusing the zero vector produced by vpxor above.
    It may be necessary to tweak STV's compute gain at some point, but this
    patch controls what's possible (rather than what's beneficial).
    
    2024-08-06  Roger Sayle  <ro...@nextmovesoftware.com>
    
    gcc/ChangeLog
            * config/i386/i386-expand.cc (ix86_expand_v2di_ashiftrt): New
            function refactored from define_expand ashrv2di3.
            * config/i386/i386-features.cc (general_scalar_to_vector_candidate_p)
            <case ASHIFTRT>: Handle like other shifts and rotates.
            * config/i386/i386-protos.h (ix86_expand_v2di_ashiftrt): Prototype.
            * config/i386/sse.md (ashrv2di3): Call ix86_expand_v2di_ashiftrt.
            (*ashrv2di3): New define_insn_and_split to enable creation by stv2
            pass, and splitting during split1 reusing ix86_expand_v2di_ashiftrt.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/sse2-stv-2.c: New test case.

Diff:
---
 gcc/config/i386/i386-expand.cc             | 156 ++++++++++++++++++++++++++++
 gcc/config/i386/i386-features.cc           |   6 +-
 gcc/config/i386/i386-protos.h              |   1 +
 gcc/config/i386/sse.md                     | 159 +++--------------------------
 gcc/testsuite/gcc.target/i386/sse2-stv-2.c |  10 ++
 5 files changed, 180 insertions(+), 152 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index d9ad06264aaf..bdbc14232679 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -7471,6 +7471,162 @@ ix86_expand_v1ti_ashiftrt (rtx operands[])
     }
 }
 
+/* Expand V2DI mode ashiftrt.  */
+void
+ix86_expand_v2di_ashiftrt (rtx operands[])
+{
+  if (operands[2] == const0_rtx)
+    {
+      emit_move_insn (operands[0], operands[1]);
+      return;
+    }
+
+  if (TARGET_SSE4_2
+      && CONST_INT_P (operands[2])
+      && UINTVAL (operands[2]) >= 63
+      && !optimize_insn_for_size_p ())
+    {
+      rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
+      emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
+      return;
+    }
+
+  if (CONST_INT_P (operands[2])
+      && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
+    {
+      vec_perm_builder sel (4, 4, 1);
+      sel.quick_grow (4);
+      rtx arg0, arg1;
+      rtx op1 = lowpart_subreg (V4SImode,
+                               force_reg (V2DImode, operands[1]),
+                               V2DImode);
+      rtx target = gen_reg_rtx (V4SImode);
+      if (UINTVAL (operands[2]) >= 63)
+       {
+         arg0 = arg1 = gen_reg_rtx (V4SImode);
+         emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
+         sel[0] = 1;
+         sel[1] = 1;
+         sel[2] = 3;
+         sel[3] = 3;
+       }
+      else if (INTVAL (operands[2]) > 32)
+       {
+         arg0 = gen_reg_rtx (V4SImode);
+         arg1 = gen_reg_rtx (V4SImode);
+         emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
+         emit_insn (gen_ashrv4si3 (arg0, op1,
+                                   GEN_INT (INTVAL (operands[2]) - 32)));
+         sel[0] = 1;
+         sel[1] = 5;
+         sel[2] = 3;
+         sel[3] = 7;
+       }
+      else if (INTVAL (operands[2]) == 32)
+       {
+         arg0 = op1;
+         arg1 = gen_reg_rtx (V4SImode);
+         emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
+         sel[0] = 1;
+         sel[1] = 5;
+         sel[2] = 3;
+         sel[3] = 7;
+       }
+      else
+       {
+         arg0 = gen_reg_rtx (V2DImode);
+         arg1 = gen_reg_rtx (V4SImode);
+         emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2]));
+         emit_insn (gen_ashrv4si3 (arg1, op1, operands[2]));
+         arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
+         sel[0] = 0;
+         sel[1] = 5;
+         sel[2] = 2;
+         sel[3] = 7;
+       }
+      vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
+      rtx op0 = operands[0];
+      bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode,
+                                                 target, arg0, arg1,
+                                                 indices);
+      gcc_assert (ok);
+      emit_move_insn (op0, lowpart_subreg (V2DImode, target, V4SImode));
+      return;
+    }
+  if (!TARGET_XOP)
+    {
+      rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
+      rtx zero_or_all_ones;
+      if (TARGET_SSE4_2)
+       {
+         zero_or_all_ones = gen_reg_rtx (V2DImode);
+         emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
+                                        operands[1]));
+       }
+      else
+       {
+         rtx temp = gen_reg_rtx (V4SImode);
+         emit_insn (gen_ashrv4si3 (temp,
+                                   lowpart_subreg (V4SImode,
+                                                   force_reg (V2DImode,
+                                                              operands[1]),
+                                                   V2DImode),
+                                   GEN_INT (31)));
+         zero_or_all_ones = gen_reg_rtx (V4SImode);
+         emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
+                                       const1_rtx, const1_rtx,
+                                       GEN_INT (3), GEN_INT (3)));
+         zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
+                                            V4SImode);
+       }
+      rtx lshr_res = gen_reg_rtx (V2DImode);
+      emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
+      rtx ashl_res = gen_reg_rtx (V2DImode);
+      rtx amount;
+      if (TARGET_64BIT)
+       {
+         amount = gen_reg_rtx (DImode);
+         emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
+                                operands[2]));
+       }
+      else
+       {
+         rtx temp = gen_reg_rtx (SImode);
+         emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
+                                lowpart_subreg (SImode, operands[2],
+                                                DImode)));
+         amount = gen_reg_rtx (V4SImode);
+         emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
+                                       temp));
+       }
+      amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
+      emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
+      emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res));
+      return;
+    }
+
+  rtx reg = gen_reg_rtx (V2DImode);
+  rtx par;
+  bool negate = false;
+  int i;
+
+  if (CONST_INT_P (operands[2]))
+    operands[2] = GEN_INT (-INTVAL (operands[2]));
+  else
+    negate = true;
+
+  par = gen_rtx_PARALLEL (V2DImode, rtvec_alloc (2));
+  for (i = 0; i < 2; i++)
+    XVECEXP (par, 0, i) = operands[2];
+
+  emit_insn (gen_vec_initv2didi (reg, par));
+
+  if (negate)
+    emit_insn (gen_negv2di2 (reg, reg));
+
+  emit_insn (gen_xop_shav2di3 (operands[0], operands[1], reg));
+}
+
 /* Replace all occurrences of REG FROM with REG TO in X, including
    occurrences with different modes.  */
 
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 3da56ddbdccd..c36d181f2d64 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -2131,13 +2131,9 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
 
   switch (GET_CODE (src))
     {
-    case ASHIFTRT:
-      if (mode == DImode && !TARGET_AVX512VL)
-       return false;
-      /* FALLTHRU */
-
     case ASHIFT:
     case LSHIFTRT:
+    case ASHIFTRT:
     case ROTATE:
     case ROTATERT:
       if (!CONST_INT_P (XEXP (src, 1))
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index a3629b32a019..a80432b3742a 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -181,6 +181,7 @@ extern void ix86_split_rshift_ndd (enum rtx_code, rtx *, rtx);
 extern void ix86_expand_v1ti_shift (enum rtx_code, rtx[]);
 extern void ix86_expand_v1ti_rotate (enum rtx_code, rtx[]);
 extern void ix86_expand_v1ti_ashiftrt (rtx[]);
+extern void ix86_expand_v2di_ashiftrt (rtx[]);
 extern rtx ix86_replace_reg_with_reg (rtx, rtx, rtx);
 extern rtx ix86_find_base_term (rtx);
 extern bool ix86_check_movabs (rtx, int);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index baaec6897496..d1010bc56821 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -26655,157 +26655,22 @@
 {
   if (!TARGET_AVX512VL)
     {
-      if (TARGET_SSE4_2
-         && CONST_INT_P (operands[2])
-         && UINTVAL (operands[2]) >= 63)
-       {
-         rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
-         emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
-         DONE;
-       }
-      if (operands[2] == const0_rtx)
-       {
-         emit_move_insn (operands[0], operands[1]);
-         DONE;
-       }
-      if (CONST_INT_P (operands[2])
-         && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
-       {
-         vec_perm_builder sel (4, 4, 1);
-         sel.quick_grow (4);
-         rtx arg0, arg1;
-         rtx op1 = lowpart_subreg (V4SImode,
-                                   force_reg (V2DImode, operands[1]),
-                                   V2DImode);
-         rtx target = gen_reg_rtx (V4SImode);
-         if (UINTVAL (operands[2]) >= 63)
-           {
-             arg0 = arg1 = gen_reg_rtx (V4SImode);
-             emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
-             sel[0] = 1;
-             sel[1] = 1;
-             sel[2] = 3;
-             sel[3] = 3;
-           }
-         else if (INTVAL (operands[2]) > 32)
-           {
-             arg0 = gen_reg_rtx (V4SImode);
-             arg1 = gen_reg_rtx (V4SImode);
-             emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
-             emit_insn (gen_ashrv4si3 (arg0, op1,
-                                       GEN_INT (INTVAL (operands[2]) - 32)));
-             sel[0] = 1;
-             sel[1] = 5;
-             sel[2] = 3;
-             sel[3] = 7;
-           }
-         else if (INTVAL (operands[2]) == 32)
-           {
-             arg0 = op1;
-             arg1 = gen_reg_rtx (V4SImode);
-             emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
-             sel[0] = 1;
-             sel[1] = 5;
-             sel[2] = 3;
-             sel[3] = 7;
-           }
-         else
-           {
-             arg0 = gen_reg_rtx (V2DImode);
-             arg1 = gen_reg_rtx (V4SImode);
-             emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2]));
-             emit_insn (gen_ashrv4si3 (arg1, op1, operands[2]));
-             arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
-             sel[0] = 0;
-             sel[1] = 5;
-             sel[2] = 2;
-             sel[3] = 7;
-           }
-         vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
-         bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode,
-                                                     target, arg0, arg1,
-                                                     indices);
-         gcc_assert (ok);
-         emit_move_insn (operands[0],
-                         lowpart_subreg (V2DImode, target, V4SImode));
-         DONE;
-       }
-      if (!TARGET_XOP)
-       {
-         rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
-         rtx zero_or_all_ones;
-         if (TARGET_SSE4_2)
-           {
-             zero_or_all_ones = gen_reg_rtx (V2DImode);
-             emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
-                                            operands[1]));
-           }
-         else
-           {
-             rtx temp = gen_reg_rtx (V4SImode);
-             emit_insn (gen_ashrv4si3 (temp,
-                                       lowpart_subreg (V4SImode,
-                                                       force_reg (V2DImode,
-                                                                  operands[1]),
-                                                       V2DImode),
-                                       GEN_INT (31)));
-             zero_or_all_ones = gen_reg_rtx (V4SImode);
-             emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
-                                           const1_rtx, const1_rtx,
-                                           GEN_INT (3), GEN_INT (3)));
-             zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
-                                                V4SImode);
-           }
-         rtx lshr_res = gen_reg_rtx (V2DImode);
-         emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
-         rtx ashl_res = gen_reg_rtx (V2DImode);
-         rtx amount;
-         if (TARGET_64BIT)
-           {
-             amount = gen_reg_rtx (DImode);
-             emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
-                                    operands[2]));
-           }
-         else
-           {
-             rtx temp = gen_reg_rtx (SImode);
-             emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
-                                    lowpart_subreg (SImode, operands[2],
-                                                    DImode)));
-             amount = gen_reg_rtx (V4SImode);
-             emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
-                                           temp));
-           }
-         amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
-         emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
-         emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res));
-         DONE;
-       }
-
-      rtx reg = gen_reg_rtx (V2DImode);
-      rtx par;
-      bool negate = false;
-      int i;
-
-      if (CONST_INT_P (operands[2]))
-       operands[2] = GEN_INT (-INTVAL (operands[2]));
-      else
-       negate = true;
-
-      par = gen_rtx_PARALLEL (V2DImode, rtvec_alloc (2));
-      for (i = 0; i < 2; i++)
-       XVECEXP (par, 0, i) = operands[2];
-
-      emit_insn (gen_vec_initv2didi (reg, par));
-
-      if (negate)
-       emit_insn (gen_negv2di2 (reg, reg));
-
-      emit_insn (gen_xop_shav2di3 (operands[0], operands[1], reg));
+      ix86_expand_v2di_ashiftrt (operands);
       DONE;
     }
 })
 
+(define_insn_and_split "*ashrv2di3"
+  [(set (match_operand:V2DI 0 "register_operand")
+       (ashiftrt:V2DI
+         (match_operand:V2DI 1 "register_operand")
+         (match_operand:DI 2 "nonmemory_operand")))]
+  "TARGET_SSE2 && !TARGET_AVX512VL && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  "ix86_expand_v2di_ashiftrt (operands); DONE;")
+
 ;; XOP FRCZ support
 (define_insn "xop_frcz<mode>2"
   [(set (match_operand:FMAMODE 0 "register_operand" "=x")
diff --git a/gcc/testsuite/gcc.target/i386/sse2-stv-2.c b/gcc/testsuite/gcc.target/i386/sse2-stv-2.c
new file mode 100644
index 000000000000..9f10b9be8a94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-stv-2.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+
+long long m;
+
+void foo(v2di x) { m = x[0]>>63; }
+
+/* { dg-final { scan-assembler-not "ax" } } */

Reply via email to