In the test below, we cannot cache either [x] or [y] neither before the
load of flag1 nor the load of flag2. This is because the corresponding
store/release can flush a different value of x or y:
+ if (__atomic_load_n (&flag1, __ATOMIC_ACQUIRE))
+ i = x + y;
+
+ if (__atomic_load_n (&flag2, __ATOMIC_ACQUIRE))
+ a = 10;
+ j = x + y;
For example, on x86-64, we are hoisting "x" and "y" before the load of
flag2:
movl flag1(%rip), %eax
movl x(%rip), %edx <-- hoist of X
testl %eax, %eax
movl y(%rip), %eax <-- hoist of Y
je .L2
leal (%edx,%eax), %ecx
movl %ecx, i(%rip)
.L2:
movl flag2(%rip), %ecx
testl %ecx, %ecx
je .L3
movl $10, a(%rip)
.L3:
addl %edx, %eax <-- x/y may have changed by the
acquire of flag2.
movl %eax, j(%rip)
ret
(For that matter, we are also hoisting "x" before the actual test of
flag1 as well, but I believe this is allowed since flag1 has already
been loaded.)
The pass at fault here is the combine stack adjustment RTL pass. I have
not looked into why this is happening, but I wanted to get this test
into the branch lest we forget about it.
Is this OK for the branch? Is my understanding correct?
Aldy
Index: atomic-hoist-1.c
===================================================================
--- atomic-hoist-1.c (revision 0)
+++ atomic-hoist-1.c (revision 0)
@@ -0,0 +1,96 @@
+/* { dg-do link } */
+/* { dg-require-effective-target sync_int_long } */
+/* { dg-final { simulate-thread } } */
+
+/* Test that a hoist is not performed across an acquire barrier. */
+
+#include <stdio.h>
+#include "simulate-thread.h"
+
+int flag1=1, flag2=1;
+
+unsigned int x=1, y=2, i=0x1234, j=0x5678, a;
+
+/* These two tables are random numbers such that there are no two
+ pairs between the both tables that yield the same sum. */
+
+unsigned int table1[16] = {
+ 24747, 19512, 3692, 25985,
+ 25079, 24, 3310, 22073,
+ 4026, 25641, 35240, 35542,
+ 24783, 17378, 12184, 23755
+};
+
+unsigned int table2[16] = {
+ 2467, 37461, 14064, 36460,
+ 46434, 8387, 42174, 36763,
+ 49205, 48759, 10526, 3446,
+ 14035, 2195, 6798, 38782
+};
+
+int table_cycle_size = 16;
+
+/* At each instruction, get a new X and Y from the tables to later
+ verify that we have not reused a value incorrectly. */
+void simulate_thread_other_threads ()
+{
+ static int current = 0;
+
+ if (++current >= table_cycle_size)
+ current = 0;
+ x = table1[current];
+ y = table2[current];
+}
+
+/* Return true if error, otherwise 0. */
+int verify_result ()
+{
+ /* [i] should not equal [j], because that would mean that we hoisted
+ [x] and [y] instead of loading them again. */
+ int fail = i == j;
+ if (fail)
+ printf("FAIL: i (%u) should not equal j (%u)\n", i, j);
+ return fail;
+}
+
+int simulate_thread_step_verify ()
+{
+ return verify_result ();
+}
+
+int simulate_thread_final_verify ()
+{
+ return verify_result ();
+}
+
+__attribute__((noinline))
+void simulate_thread_main()
+{
+ /* The values of x or y should not be hoisted across reads of
+ flag[12].
+
+ For example, when the second load below synchronizes with another
+ thread, the synchronization is with a release, and that release
+ may cause a stored value of x/y to be flushed and become visible.
+ So, for this case, it is incorrect for CSE/CSA/and-others to
+ hoist x or y above the load of flag2. */
+
+ /* Execute loads with value changing at various cyclic values. */
+ if (__atomic_load_n (&flag1, __ATOMIC_ACQUIRE))
+ i = x + y;
+
+ if (__atomic_load_n (&flag2, __ATOMIC_ACQUIRE))
+ a = 10;
+ j = x + y;
+
+ /* Since x and y have been changing at each instruction above, i and j
+ should be different. If they are the same, we have hoisted
+ something incorrectly. */
+}
+
+main()
+{
+ simulate_thread_main ();
+ simulate_thread_done ();
+ return 0;
+}