Hi Richard,
I have changed the condition as you suggested below. OK for trunk?
Jackson.
On 08/11/2017 02:56 PM, Richard Earnshaw (lists) wrote:
On 10/08/17 14:12, Jackson Woodruff wrote:
Hi all,
This patch changes patterns in aarch64-simd.md to replace
movi v0.4s, 0
str q0, [x0, 16]
With:
stp xzr, xzr, [x0, 16]
When we are storing zeros to vectors like this:
void f(uint32x4_t *p) {
uint32x4_t x = { 0, 0, 0, 0};
p[1] = x;
}
Bootstrapped and regtested on aarch64 with no regressions.
OK for trunk?
Jackson
gcc/
2017-08-09 Jackson Woodruff <jackson.woodr...@arm.com>
* config/aarch64/aarch64-simd.md (mov<mode>): No longer force zero
immediate into register.
(*aarch64_simd_mov<mode>): Add new case for stp
using zero immediate.
gcc/testsuite/
2017-08-09 Jackson Woodruff <jackson.woodr...@arm.com>
* gcc.target/aarch64/simd/neon_str_zero.c: New.
patchfile
diff --git a/gcc/config/aarch64/aarch64-simd.md
b/gcc/config/aarch64/aarch64-simd.md
index
74de9b8c89dd5e4e3d87504594c969de0e0128ce..0149a742d34ae4fd5b3fd705b03c845f94aa1d59
100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -23,7 +23,10 @@
(match_operand:VALL_F16 1 "general_operand" ""))]
"TARGET_SIMD"
"
- if (GET_CODE (operands[0]) == MEM)
+ if (GET_CODE (operands[0]) == MEM
+ && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
+ && aarch64_legitimate_address_p (<MODE>mode, operands[0],
+ PARALLEL, 1)))
operands[1] = force_reg (<MODE>mode, operands[1]);
"
)
@@ -94,63 +97,70 @@
(define_insn "*aarch64_simd_mov<mode>"
[(set (match_operand:VD 0 "nonimmediate_operand"
- "=w, m, w, ?r, ?w, ?r, w")
+ "=w, m, m, w, ?r, ?w, ?r, w")
(match_operand:VD 1 "general_operand"
- "m, w, w, w, r, r, Dn"))]
+ "m, Dz, w, w, w, r, r, Dn"))]
"TARGET_SIMD
- && (register_operand (operands[0], <MODE>mode)
- || register_operand (operands[1], <MODE>mode))"
+ && ((register_operand (operands[0], <MODE>mode)
+ || register_operand (operands[1], <MODE>mode))
+ || (memory_operand (operands[0], <MODE>mode)
+ && immediate_operand (operands[1], <MODE>mode)))"
Allowing any immediate here seems too lax - it allows any immediate
value which then could cause reload operations to be inserted (that in
turn might cause register pressure calculations to be incorrect).
Wouldn't it be better to use something like aarch64_simd_reg_or_zero?
Similarly below.
R.
{
switch (which_alternative)
{
case 0: return "ldr\\t%d0, %1";
- case 1: return "str\\t%d1, %0";
- case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
- case 3: return "umov\t%0, %1.d[0]";
- case 4: return "fmov\t%d0, %1";
- case 5: return "mov\t%0, %1";
- case 6:
+ case 1: return "str\\txzr, %0";
+ case 2: return "str\\t%d1, %0";
+ case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+ case 4: return "umov\t%0, %1.d[0]";
+ case 5: return "fmov\t%d0, %1";
+ case 6: return "mov\t%0, %1";
+ case 7:
return aarch64_output_simd_mov_immediate (operands[1],
<MODE>mode, 64);
default: gcc_unreachable ();
}
}
- [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
+ [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
neon_logic<q>, neon_to_gp<q>, f_mcr,\
mov_reg, neon_move<q>")]
)
(define_insn "*aarch64_simd_mov<mode>"
[(set (match_operand:VQ 0 "nonimmediate_operand"
- "=w, m, w, ?r, ?w, ?r, w")
+ "=w, Ump, m, w, ?r, ?w, ?r, w")
(match_operand:VQ 1 "general_operand"
- "m, w, w, w, r, r, Dn"))]
+ "m, Dz, w, w, w, r, r, Dn"))]
"TARGET_SIMD
- && (register_operand (operands[0], <MODE>mode)
- || register_operand (operands[1], <MODE>mode))"
+ && ((register_operand (operands[0], <MODE>mode)
+ || register_operand (operands[1], <MODE>mode))
+ || (memory_operand (operands[0], <MODE>mode)
+ && immediate_operand (operands[1], <MODE>mode)))"
{
switch (which_alternative)
{
case 0:
return "ldr\\t%q0, %1";
case 1:
- return "str\\t%q1, %0";
+ return "stp\\txzr, xzr, %0";
case 2:
- return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+ return "str\\t%q1, %0";
case 3:
+ return "mov\t%0.<Vbtype>, %1.<Vbtype>";
case 4:
case 5:
- return "#";
case 6:
+ return "#";
+ case 7:
return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
default:
gcc_unreachable ();
}
}
[(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
- neon_logic<q>, multiple, multiple, multiple,\
- neon_move<q>")
- (set_attr "length" "4,4,4,8,8,8,4")]
+ neon_stp, neon_logic<q>, multiple, multiple,\
+ multiple, neon_move<q>")
+ (set_attr "length" "4,4,4,4,8,8,8,4")]
)
;; When storing lane zero we can use the normal STR and its more permissive
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
new file mode 100644
index
0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+void
+f (uint32x4_t *p)
+{
+ uint32x4_t x = { 0, 0, 0, 0};
+ p[1] = x;
+
+ /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
+}
+
+void
+g (float32x2_t *p)
+{
+ float32x2_t x = {0.0, 0.0};
+ p[0] = x;
+
+ /* { dg-final { scan-assembler "str\txzr, " } } */
+}
diff --git a/gcc/config/aarch64/aarch64-simd.md
b/gcc/config/aarch64/aarch64-simd.md
index
74de9b8c89dd5e4e3d87504594c969de0e0128ce..ce1b981fc005edf48a401a456def2a37cf9d9022
100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -23,7 +23,10 @@
(match_operand:VALL_F16 1 "general_operand" ""))]
"TARGET_SIMD"
"
- if (GET_CODE (operands[0]) == MEM)
+ if (GET_CODE (operands[0]) == MEM
+ && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
+ && aarch64_legitimate_address_p (<MODE>mode, operands[0],
+ PARALLEL, 1)))
operands[1] = force_reg (<MODE>mode, operands[1]);
"
)
@@ -94,63 +97,66 @@
(define_insn "*aarch64_simd_mov<mode>"
[(set (match_operand:VD 0 "nonimmediate_operand"
- "=w, m, w, ?r, ?w, ?r, w")
+ "=w, m, m, w, ?r, ?w, ?r, w")
(match_operand:VD 1 "general_operand"
- "m, w, w, w, r, r, Dn"))]
+ "m, Dz, w, w, w, r, r, Dn"))]
"TARGET_SIMD
&& (register_operand (operands[0], <MODE>mode)
- || register_operand (operands[1], <MODE>mode))"
+ || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
{
switch (which_alternative)
{
- case 0: return "ldr\\t%d0, %1";
- case 1: return "str\\t%d1, %0";
- case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
- case 3: return "umov\t%0, %1.d[0]";
- case 4: return "fmov\t%d0, %1";
- case 5: return "mov\t%0, %1";
- case 6:
+ case 0: return "ldr\t%d0, %1";
+ case 1: return "str\txzr, %0";
+ case 2: return "str\t%d1, %0";
+ case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+ case 4: return "umov\t%0, %1.d[0]";
+ case 5: return "fmov\t%d0, %1";
+ case 6: return "mov\t%0, %1";
+ case 7:
return aarch64_output_simd_mov_immediate (operands[1],
<MODE>mode, 64);
default: gcc_unreachable ();
}
}
- [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
+ [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
neon_logic<q>, neon_to_gp<q>, f_mcr,\
mov_reg, neon_move<q>")]
)
(define_insn "*aarch64_simd_mov<mode>"
[(set (match_operand:VQ 0 "nonimmediate_operand"
- "=w, m, w, ?r, ?w, ?r, w")
+ "=w, Ump, m, w, ?r, ?w, ?r, w")
(match_operand:VQ 1 "general_operand"
- "m, w, w, w, r, r, Dn"))]
+ "m, Dz, w, w, w, r, r, Dn"))]
"TARGET_SIMD
&& (register_operand (operands[0], <MODE>mode)
- || register_operand (operands[1], <MODE>mode))"
+ || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
{
switch (which_alternative)
{
case 0:
- return "ldr\\t%q0, %1";
+ return "ldr\t%q0, %1";
case 1:
- return "str\\t%q1, %0";
+ return "stp\txzr, xzr, %0";
case 2:
- return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+ return "str\t%q1, %0";
case 3:
+ return "mov\t%0.<Vbtype>, %1.<Vbtype>";
case 4:
case 5:
- return "#";
case 6:
+ return "#";
+ case 7:
return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
default:
gcc_unreachable ();
}
}
[(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
- neon_logic<q>, multiple, multiple, multiple,\
- neon_move<q>")
- (set_attr "length" "4,4,4,8,8,8,4")]
+ neon_stp, neon_logic<q>, multiple, multiple,\
+ multiple, neon_move<q>")
+ (set_attr "length" "4,4,4,4,8,8,8,4")]
)
;; When storing lane zero we can use the normal STR and its more permissive
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
new file mode 100644
index
0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+void
+f (uint32x4_t *p)
+{
+ uint32x4_t x = { 0, 0, 0, 0};
+ p[1] = x;
+
+ /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
+}
+
+void
+g (float32x2_t *p)
+{
+ float32x2_t x = {0.0, 0.0};
+ p[0] = x;
+
+ /* { dg-final { scan-assembler "str\txzr, " } } */
+}