On 10/30/18 8:32 PM, James Greenhalgh wrote:
> On Tue, Oct 02, 2018 at 11:19:09AM -0500, Richard Henderson wrote:
>> When the result of an operation is not used, we can ignore the
>> result by storing to XZR. For two of the memory models, using
>> XZR with LD<op> has a preferred assembler alias, ST<op>.
>
> ST<op> has different semantics to LD<op>, in particular, ST<op> is not
> ordered by a DMB LD; so this could weaken the LDADD and break C11 semantics.
>
> The relevant Arm Arm text is:
>
> If the destination register is not one of WZR or XZR, LDADDA and
> LDADDAL load from memory with acquire semantics
>
> LDADDL and LDADDAL store to memory with release semantics.
>
> LDADD has no memory ordering requirements.
>
> I'm taking this to mean that even if the result is unused, using XZR is not
> a valid transformation; it weakens the expected acquire semantics to
> unordered.
>
> The example I have from Will Deacon on an internal bug database is:
>
> P0 (atomic_int* y,atomic_int* x) {
> atomic_store_explicit(x,1,memory_order_relaxed);
> atomic_thread_fence(memory_order_release);
> atomic_store_explicit(y,1,memory_order_relaxed);
> }
>
> P1 (atomic_int* y,atomic_int* x) {
> int r0 = atomic_fetch_add_explicit(y,1,memory_order_relaxed);
> atomic_thread_fence(memory_order_acquire);
> int r1 = atomic_load_explicit(x,memory_order_relaxed);
> }
>
> The outcome where y == 2 and P1 has r0 = 1 and r1 = 0 is illegal.
>
> This example comes from a while back in my memory; so copying Will for
> any more detailed questions.
>
> My impression is that this transformation is not safe, and so the patch is
> not OK.
Here's a revised patch.
Use ST<op> for relaxed and release orderings, retain the (non-xzr) scratch
register for other orderings. But the scratch need not be early-clobber, since
there's no mid-point of half-consumed inputs.
r~
From 31a9f2fa6b73ecbca39c8d0b1a09f269a9d67f8d Mon Sep 17 00:00:00 2001
From: Richard Henderson <[email protected]>
Date: Wed, 19 Sep 2018 22:18:09 +0000
Subject: aarch64: Emit LSE st<op> instructions
* config/aarch64/atomics.md (aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse):
Use ST<op> for relaxed and release models; the scratch register need
not be early-clobber.
* gcc.target/aarch64/atomic-inst-ldadd.c: Expect stadd{,l}.
* gcc.target/aarch64/atomic-inst-ldlogic.c: Similarly.
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 2198649b1be..00536d34dfd 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -270,14 +270,14 @@
(match_operand:ALLI 1 "register_operand" "r")
(match_operand:SI 2 "const_int_operand")]
ATOMIC_LDOP))
- (clobber (match_scratch:ALLI 3 "=&r"))]
+ (clobber (match_scratch:ALLI 3 "=r"))]
"TARGET_LSE"
{
enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
if (is_mm_relaxed (model))
- return "ld<atomic_ldop><atomic_sfx>\t%<w>1, %<w>3, %0";
+ return "st<atomic_ldop><atomic_sfx>\t%<w>1, %0";
else if (is_mm_release (model))
- return "ld<atomic_ldop>l<atomic_sfx>\t%<w>1, %<w>3, %0";
+ return "st<atomic_ldop>l<atomic_sfx>\t%<w>1, %0";
else if (is_mm_acquire (model) || is_mm_consume (model))
return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, %<w>3, %0";
else
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
index 4b2282c6861..db2206186b4 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
@@ -67,20 +67,26 @@ TEST (add_load_notreturn, ADD_LOAD_NORETURN)
TEST (sub_load, SUB_LOAD)
TEST (sub_load_notreturn, SUB_LOAD_NORETURN)
-/* { dg-final { scan-assembler-times "ldaddb\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddb\t" 8} } */
/* { dg-final { scan-assembler-times "ldaddab\t" 32} } */
-/* { dg-final { scan-assembler-times "ldaddlb\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddlb\t" 8} } */
/* { dg-final { scan-assembler-times "ldaddalb\t" 32} } */
+/* { dg-final { scan-assembler-times "staddb\t" 8} } */
+/* { dg-final { scan-assembler-times "staddlb\t" 8} } */
-/* { dg-final { scan-assembler-times "ldaddh\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddh\t" 8} } */
/* { dg-final { scan-assembler-times "ldaddah\t" 32} } */
-/* { dg-final { scan-assembler-times "ldaddlh\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddlh\t" 8} } */
/* { dg-final { scan-assembler-times "ldaddalh\t" 32} } */
+/* { dg-final { scan-assembler-times "staddh\t" 8} } */
+/* { dg-final { scan-assembler-times "staddlh\t" 8} } */
-/* { dg-final { scan-assembler-times "ldadd\t" 32} } */
+/* { dg-final { scan-assembler-times "ldadd\t" 16} } */
/* { dg-final { scan-assembler-times "ldadda\t" 64} } */
-/* { dg-final { scan-assembler-times "ldaddl\t" 32} } */
+/* { dg-final { scan-assembler-times "ldaddl\t" 16} } */
/* { dg-final { scan-assembler-times "ldaddal\t" 64} } */
+/* { dg-final { scan-assembler-times "stadd\t" 16} } */
+/* { dg-final { scan-assembler-times "staddl\t" 16} } */
/* { dg-final { scan-assembler-not "ldaxr\t" } } */
/* { dg-final { scan-assembler-not "stlxr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
index 4879d52b9b4..b8a53e0a676 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
@@ -101,54 +101,72 @@ TEST (xor_load_notreturn, XOR_LOAD_NORETURN)
/* Load-OR. */
-/* { dg-final { scan-assembler-times "ldsetb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetb\t" 4} } */
/* { dg-final { scan-assembler-times "ldsetab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldsetlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetlb\t" 4} } */
/* { dg-final { scan-assembler-times "ldsetalb\t" 16} } */
+/* { dg-final { scan-assembler-times "stsetb\t" 4} } */
+/* { dg-final { scan-assembler-times "stsetlb\t" 4} } */
-/* { dg-final { scan-assembler-times "ldseth\t" 8} } */
+/* { dg-final { scan-assembler-times "ldseth\t" 4} } */
/* { dg-final { scan-assembler-times "ldsetah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldsetlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetlh\t" 4} } */
/* { dg-final { scan-assembler-times "ldsetalh\t" 16} } */
+/* { dg-final { scan-assembler-times "stseth\t" 4} } */
+/* { dg-final { scan-assembler-times "stsetlh\t" 4} } */
-/* { dg-final { scan-assembler-times "ldset\t" 16} } */
+/* { dg-final { scan-assembler-times "ldset\t" 8} } */
/* { dg-final { scan-assembler-times "ldseta\t" 32} } */
-/* { dg-final { scan-assembler-times "ldsetl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldsetl\t" 8} } */
/* { dg-final { scan-assembler-times "ldsetal\t" 32} } */
+/* { dg-final { scan-assembler-times "stset\t" 8} } */
+/* { dg-final { scan-assembler-times "stsetl\t" 8} } */
/* Load-AND. */
-/* { dg-final { scan-assembler-times "ldclrb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrb\t" 4} } */
/* { dg-final { scan-assembler-times "ldclrab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldclrlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrlb\t" 4} } */
/* { dg-final { scan-assembler-times "ldclralb\t" 16} } */
+/* { dg-final { scan-assembler-times "stclrb\t" 4} } */
+/* { dg-final { scan-assembler-times "stclrlb\t" 4} } */
-/* { dg-final { scan-assembler-times "ldclrh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrh\t" 4} } */
/* { dg-final { scan-assembler-times "ldclrah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldclrlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrlh\t" 4} } */
/* { dg-final { scan-assembler-times "ldclralh\t" 16} } */
+/* { dg-final { scan-assembler-times "stclrh\t" 4} } */
+/* { dg-final { scan-assembler-times "stclrlh\t" 4} } */
-/* { dg-final { scan-assembler-times "ldclr\t" 16} */
+/* { dg-final { scan-assembler-times "ldclr\t" 8} */
/* { dg-final { scan-assembler-times "ldclra\t" 32} } */
-/* { dg-final { scan-assembler-times "ldclrl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldclrl\t" 8} } */
/* { dg-final { scan-assembler-times "ldclral\t" 32} } */
+/* { dg-final { scan-assembler-times "stclr\t" 8} */
+/* { dg-final { scan-assembler-times "stclrl\t" 8} } */
/* Load-XOR. */
-/* { dg-final { scan-assembler-times "ldeorb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorb\t" 4} } */
/* { dg-final { scan-assembler-times "ldeorab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldeorlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorlb\t" 4} } */
/* { dg-final { scan-assembler-times "ldeoralb\t" 16} } */
+/* { dg-final { scan-assembler-times "steorb\t" 4} } */
+/* { dg-final { scan-assembler-times "steorlb\t" 4} } */
-/* { dg-final { scan-assembler-times "ldeorh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorh\t" 4} } */
/* { dg-final { scan-assembler-times "ldeorah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldeorlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorlh\t" 4} } */
/* { dg-final { scan-assembler-times "ldeoralh\t" 16} } */
+/* { dg-final { scan-assembler-times "steorh\t" 4} } */
+/* { dg-final { scan-assembler-times "steorlh\t" 4} } */
-/* { dg-final { scan-assembler-times "ldeor\t" 16} */
+/* { dg-final { scan-assembler-times "ldeor\t" 8} */
/* { dg-final { scan-assembler-times "ldeora\t" 32} } */
-/* { dg-final { scan-assembler-times "ldeorl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldeorl\t" 8} } */
/* { dg-final { scan-assembler-times "ldeoral\t" 32} } */
+/* { dg-final { scan-assembler-times "steor\t" 8} */
+/* { dg-final { scan-assembler-times "steorl\t" 8} } */
/* { dg-final { scan-assembler-not "ldaxr\t" } } */
/* { dg-final { scan-assembler-not "stlxr\t" } } */