Recently I was hunting for bugs in __cxa_guard_acquire/release that might result in PR 52839. No bugs found, but I did notice that these functions use excessive memory barriers, no doubt from their conversion from __sync to __atomic builtins.
Making the second __atomic_compare_exchange_n in __cxa_guard_acquire __ATOMIC_RELAXED is easy to justify: This atomic op is in a loop, the entirety of which just attempts to modify the guard. No other shared memory is touched, and the loop terminates after the first atomic op. So only the first atomic op requires a memory barrier.

That the barrier on the first atomic op should be __ATOMIC_ACQUIRE can be seen from gcc/cp/decl.c:expand_static_init code and comments. We need an acquire (or read) barrier so that a thread that doesn't perform the initialization won't have stale data cached from before the guard was set to say that the guarded static var has been successfully initialized. A read barrier is sufficient because no thread has written anything to the guarded var before that point.

The atomic op in __cxa_guard_release needs to be __ATOMIC_RELEASE (a release or write barrier) so that all writes to the guarded static var are flushed before the guard state changes.

Similar reasoning for __cxa_guard_abort says we need a write barrier so that intermediate or partial writes to the guarded variable from the aborted thread are flushed before some other thread starts the initialization. We wouldn't want any writes from the aborted thread corrupting a good initialization. Quite obviously we don't need a read barrier at these points since only the initialization thread gets there.

Bootstrapped and regression tested on power7. OK to apply?

2012-04-08  Alan Modra  <amo...@gmail.com>

	* libsupc++/guard.cc: Whitespace cleanup.
	(__cxa_guard_acquire): Use acquire barrier on first atomic
	compare exchange, no barriers on second.
	(__cxa_guard_abort, __cxa_guard_release): Use release barrier
	on atomic exchange.
Index: libstdc++-v3/libsupc++/guard.cc
===================================================================
--- libstdc++-v3/libsupc++/guard.cc	(revision 186130)
+++ libstdc++-v3/libsupc++/guard.cc	(working copy)
@@ -1,6 +1,6 @@
 // Copyright (C) 2002, 2004, 2006, 2008, 2009, 2010, 2011, 2012
 // Free Software Foundation, Inc.
-// 
+//
 // This file is part of GCC.
 //
 // GCC is free software; you can redistribute it and/or modify
@@ -50,7 +50,7 @@
 namespace
 {
   // A single mutex controlling all static initializations.
-  static __gnu_cxx::__recursive_mutex* static_mutex;  
+  static __gnu_cxx::__recursive_mutex* static_mutex;
 
   typedef char fake_recursive_mutex[sizeof(__gnu_cxx::__recursive_mutex)]
   __attribute__ ((aligned(__alignof__(__gnu_cxx::__recursive_mutex))));
@@ -87,7 +87,7 @@
 namespace
 {
   // A single condition variable controlling all static initializations.
-  static __gnu_cxx::__cond* static_cond;  
+  static __gnu_cxx::__cond* static_cond;
 
   // using a fake type to avoid initializing a static class.
   typedef char fake_cond_t[sizeof(__gnu_cxx::__cond)]
@@ -179,7 +179,7 @@
 // | _GLIBCXX_GUARD_WAITING_BIT) and some other threads are waiting until
 // it is initialized.
 
-namespace __cxxabiv1 
+namespace __cxxabiv1
 {
 #ifdef _GLIBCXX_USE_FUTEX
 namespace
@@ -229,7 +229,7 @@
   }
 
   extern "C"
-  int __cxa_guard_acquire (__guard *g) 
+  int __cxa_guard_acquire (__guard *g)
   {
 #ifdef __GTHREADS
     // If the target can reorder loads, we need to insert a read memory
@@ -252,26 +252,26 @@
 	while (1)
 	  {
 	    if (__atomic_compare_exchange_n(gi, &expected, pending_bit, false,
-					    __ATOMIC_ACQ_REL,
+					    __ATOMIC_ACQUIRE,
 					    __ATOMIC_RELAXED))
 	      {
 		// This thread should do the initialization.
 		return 1;
 	      }
-	      
+
 	    if (expected == guard_bit)
 	      {
 		// Already initialized.
-		return 0;	
+		return 0;
 	      }
 	    if (expected == pending_bit)
 	      {
 		int newv = expected | waiting_bit;
 		if (!__atomic_compare_exchange_n(gi, &expected, newv, false,
-						 __ATOMIC_ACQ_REL,
+						 __ATOMIC_RELAXED,
 						 __ATOMIC_RELAXED))
 		  continue;
-	      
+
 		expected = newv;
 	      }
 
@@ -331,7 +331,7 @@
       {
 	int *gi = (int *) (void *) g;
 	const int waiting_bit = _GLIBCXX_GUARD_WAITING_BIT;
-	int old = __atomic_exchange_n (gi, 0, __ATOMIC_ACQ_REL);
+	int old = __atomic_exchange_n (gi, 0, __ATOMIC_RELEASE);
 
 	if ((old & waiting_bit) != 0)
 	  syscall (SYS_futex, gi, _GLIBCXX_FUTEX_WAKE, INT_MAX);
@@ -339,16 +339,16 @@
       }
 #elif defined(__GTHREAD_HAS_COND)
     if (__gthread_active_p())
-      {	
+      {
 	mutex_wrapper mw;
 
 	set_init_in_progress_flag(g, 0);
 
 	// If we abort, we still need to wake up all other threads waiting for
 	// the condition variable.
-	get_static_cond().broadcast();	
+	get_static_cond().broadcast();
 
 	return;
-      }	
+      }
 #endif
 
     set_init_in_progress_flag(g, 0);
@@ -371,7 +371,7 @@
 	int *gi = (int *) (void *) g;
 	const int guard_bit = _GLIBCXX_GUARD_BIT;
 	const int waiting_bit = _GLIBCXX_GUARD_WAITING_BIT;
-	int old = __atomic_exchange_n (gi, guard_bit, __ATOMIC_ACQ_REL);
+	int old = __atomic_exchange_n (gi, guard_bit, __ATOMIC_RELEASE);
 
 	if ((old & waiting_bit) != 0)
 	  syscall (SYS_futex, gi, _GLIBCXX_FUTEX_WAKE, INT_MAX);
@@ -385,9 +385,9 @@
 	set_init_in_progress_flag(g, 0);
 	_GLIBCXX_GUARD_SET_AND_RELEASE(g);
 
-	get_static_cond().broadcast();	
+	get_static_cond().broadcast();
 
 	return;
-      }	
+      }
 #endif
 
     set_init_in_progress_flag(g, 0);

-- 
Alan Modra
Australia Development Lab, IBM