Hi Richard,
I have changed the condition as you suggested below. OK for trunk?
Jackson.
On 08/11/2017 02:56 PM, Richard Earnshaw (lists) wrote:
On 10/08/17 14:12, Jackson Woodruff wrote:
Hi all,
This patch changes patterns in aarch64-simd.md to replace
movi v0.4s, 0
str q0, [x0, 16]
With:
stp xzr, xzr, [x0, 16]
When we are storing zeros to vectors like this:
void f(uint32x4_t *p) {
uint32x4_t x = { 0, 0, 0, 0};
p[1] = x;
}
Bootstrapped and regtested on aarch64 with no regressions.
OK for trunk?
Jackson
gcc/
2017-08-09 Jackson Woodruff <jackson.woodr...@arm.com>
* config/aarch64/aarch64-simd.md (mov<mode>): No longer force zero
immediate into register.
(*aarch64_simd_mov<mode>): Add new case for stp
using zero immediate.
gcc/testsuite/
2017-08-09 Jackson Woodruff <jackson.woodr...@arm.com>
* gcc.target/aarch64/simd/neon_str_zero.c: New.
patchfile
diff --git a/gcc/config/aarch64/aarch64-simd.md
b/gcc/config/aarch64/aarch64-simd.md
index
74de9b8c89dd5e4e3d87504594c969de0e0128ce..0149a742d34ae4fd5b3fd705b03c845f94aa1d59
100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -23,7 +23,10 @@
(match_operand:VALL_F16 1 "general_operand" ""))]
"TARGET_SIMD"
"
- if (GET_CODE (operands[0]) == MEM)
+ if (GET_CODE (operands[0]) == MEM
+ && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
+ && aarch64_legitimate_address_p (<MODE>mode, operands[0],
+ PARALLEL, 1)))
operands[1] = force_reg (<MODE>mode, operands[1]);
"
)
@@ -94,63 +97,70 @@
(define_insn "*aarch64_simd_mov<mode>"
[(set (match_operand:VD 0 "nonimmediate_operand"
- "=w, m, w, ?r, ?w, ?r, w")
+ "=w, m, m, w, ?r, ?w, ?r, w")
(match_operand:VD 1 "general_operand"
- "m, w, w, w, r, r, Dn"))]
+ "m, Dz, w, w, w, r, r, Dn"))]
"TARGET_SIMD
- && (register_operand (operands[0], <MODE>mode)
- || register_operand (operands[1], <MODE>mode))"
+ && ((register_operand (operands[0], <MODE>mode)
+ || register_operand (operands[1], <MODE>mode))
+ || (memory_operand (operands[0], <MODE>mode)
+ && immediate_operand (operands[1], <MODE>mode)))"
Allowing any immediate here seems too lax - it allows any immediate
value which then could cause reload operations to be inserted (that in
turn might cause register pressure calculations to be incorrect).
Wouldn't it be better to use something like aarch64_simd_reg_or_zero?
Similarly below.
R.
{
switch (which_alternative)
{
case 0: return "ldr\\t%d0, %1";
- case 1: return "str\\t%d1, %0";
- case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
- case 3: return "umov\t%0, %1.d[0]";
- case 4: return "fmov\t%d0, %1";
- case 5: return "mov\t%0, %1";
- case 6:
+ case 1: return "str\\txzr, %0";
+ case 2: return "str\\t%d1, %0";
+ case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+ case 4: return "umov\t%0, %1.d[0]";
+ case 5: return "fmov\t%d0, %1";
+ case 6: return "mov\t%0, %1";
+ case 7:
return aarch64_output_simd_mov_immediate (operands[1],
<MODE>mode, 64);
default: gcc_unreachable ();
}
}
- [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
+ [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
neon_logic<q>, neon_to_gp<q>, f_mcr,\
mov_reg, neon_move<q>")]
)
(define_insn "*aarch64_simd_mov<mode>"
[(set (match_operand:VQ 0 "nonimmediate_operand"
- "=w, m, w, ?r, ?w, ?r, w")
+ "=w, Ump, m, w, ?r, ?w, ?r, w")
(match_operand:VQ 1 "general_operand"
- "m, w, w, w, r, r, Dn"))]
+ "m, Dz, w, w, w, r, r, Dn"))]
"TARGET_SIMD
- && (register_operand (operands[0], <MODE>mode)
- || register_operand (operands[1], <MODE>mode))"
+ && ((register_operand (operands[0], <MODE>mode)
+ || register_operand (operands[1], <MODE>mode))
+ || (memory_operand (operands[0], <MODE>mode)
+ && immediate_operand (operands[1], <MODE>mode)))"
{
switch (which_alternative)
{
case 0:
return "ldr\\t%q0, %1";
case 1:
- return "str\\t%q1, %0";
+ return "stp\\txzr, xzr, %0";
case 2:
- return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+ return "str\\t%q1, %0";
case 3:
+ return "mov\t%0.<Vbtype>, %1.<Vbtype>";
case 4:
case 5:
- return "#";
case 6:
+ return "#";
+ case 7:
return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
default:
gcc_unreachable ();
}
}
[(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
- neon_logic<q>, multiple, multiple, multiple,\
- neon_move<q>")
- (set_attr "length" "4,4,4,8,8,8,4")]
+ neon_stp, neon_logic<q>, multiple, multiple,\
+ multiple, neon_move<q>")
+ (set_attr "length" "4,4,4,4,8,8,8,4")]
)
;; When storing lane zero we can use the normal STR and its more permissive
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
new file mode 100644
index
0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+void
+f (uint32x4_t *p)
+{
+ uint32x4_t x = { 0, 0, 0, 0};
+ p[1] = x;
+
+ /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
+}
+
+void
+g (float32x2_t *p)
+{
+ float32x2_t x = {0.0, 0.0};
+ p[0] = x;
+
+ /* { dg-final { scan-assembler "str\txzr, " } } */
+}
diff --git a/gcc/config/aarch64/aarch64-simd.md
b/gcc/config/aarch64/aarch64-simd.md
index
74de9b8c89dd5e4e3d87504594c969de0e0128ce..ce1b981fc005edf48a401a456def2a37cf9d9022
100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -23,7 +23,10 @@
(match_operand:VALL_F16 1 "general_operand" ""))]
"TARGET_SIMD"
"
- if (GET_CODE (operands[0]) == MEM)
+ if (GET_CODE (operands[0]) == MEM
+ && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
+ && aarch64_legitimate_address_p (<MODE>mode, operands[0],
+ PARALLEL, 1)))
operands[1] = force_reg (<MODE>mode, operands[1]);
"
)
@@ -94,63 +97,66 @@
(define_insn "*aarch64_simd_mov<mode>"
[(set (match_operand:VD 0 "nonimmediate_operand"
- "=w, m, w, ?r, ?w, ?r, w")
+ "=w, m, m, w, ?r, ?w, ?r, w")
(match_operand:VD 1 "general_operand"
- "m, w, w, w, r, r, Dn"))]
+ "m, Dz, w, w, w, r, r, Dn"))]
"TARGET_SIMD
&& (register_operand (operands[0], <MODE>mode)
- || register_operand (operands[1], <MODE>mode))"
+ || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
{
switch (which_alternative)
{
- case 0: return "ldr\\t%d0, %1";
- case 1: return "str\\t%d1, %0";
- case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
- case 3: return "umov\t%0, %1.d[0]";
- case 4: return "fmov\t%d0, %1";
- case 5: return "mov\t%0, %1";
- case 6:
+ case 0: return "ldr\t%d0, %1";
+ case 1: return "str\txzr, %0";
+ case 2: return "str\t%d1, %0";
+ case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+ case 4: return "umov\t%0, %1.d[0]";
+ case 5: return "fmov\t%d0, %1";
+ case 6: return "mov\t%0, %1";
+ case 7:
return aarch64_output_simd_mov_immediate (operands[1],
<MODE>mode, 64);
default: gcc_unreachable ();
}
}
- [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
+ [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
neon_logic<q>, neon_to_gp<q>, f_mcr,\
mov_reg, neon_move<q>")]
)
(define_insn "*aarch64_simd_mov<mode>"
[(set (match_operand:VQ 0 "nonimmediate_operand"
- "=w, m, w, ?r, ?w, ?r, w")
+ "=w, Ump, m, w, ?r, ?w, ?r, w")
(match_operand:VQ 1 "general_operand"
- "m, w, w, w, r, r, Dn"))]
+ "m, Dz, w, w, w, r, r, Dn"))]
"TARGET_SIMD
&& (register_operand (operands[0], <MODE>mode)
- || register_operand (operands[1], <MODE>mode))"
+ || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
{
switch (which_alternative)
{
case 0:
- return "ldr\\t%q0, %1";
+ return "ldr\t%q0, %1";
case 1:
- return "str\\t%q1, %0";
+ return "stp\txzr, xzr, %0";
case 2:
- return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+ return "str\t%q1, %0";
case 3:
+ return "mov\t%0.<Vbtype>, %1.<Vbtype>";
case 4:
case 5:
- return "#";
case 6:
+ return "#";
+ case 7:
return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
default:
gcc_unreachable ();
}
}
[(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
- neon_logic<q>, multiple, multiple, multiple,\
- neon_move<q>")
- (set_attr "length" "4,4,4,8,8,8,4")]
+ neon_stp, neon_logic<q>, multiple, multiple,\
+ multiple, neon_move<q>")
+ (set_attr "length" "4,4,4,4,8,8,8,4")]
)
;; When storing lane zero we can use the normal STR and its more permissive
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
new file mode 100644
index
0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+void
+f (uint32x4_t *p)
+{
+ uint32x4_t x = { 0, 0, 0, 0};
+ p[1] = x;
+
+ /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
+}
+
+void
+g (float32x2_t *p)
+{
+ float32x2_t x = {0.0, 0.0};
+ p[0] = x;
+
+ /* { dg-final { scan-assembler "str\txzr, " } } */
+}