Author: Remi Meier <remi.me...@gmail.com>
Branch: nogil-unsafe-2
Changeset: r90444:65be98dc2aee
Date: 2017-03-01 18:27 +0100
http://bitbucket.org/pypy/pypy/changeset/65be98dc2aee/
Log:	(arigo, remi) implement a synchronisation scheme for safepoints (WIP)

diff --git a/rpython/memory/gc/incminimark.py b/rpython/memory/gc/incminimark.py
--- a/rpython/memory/gc/incminimark.py
+++ b/rpython/memory/gc/incminimark.py
@@ -72,7 +72,7 @@
 from rpython.rlib.rarithmetic import LONG_BIT_SHIFT
 from rpython.rlib.debug import ll_assert, debug_print, debug_start, debug_stop
 from rpython.rlib.objectmodel import specialize, we_are_translated
-from rpython.rlib import rthread
+from rpython.rlib import rgil, rthread
 from rpython.memory.gc.minimarkpage import out_of_memory
 #
@@ -191,7 +191,7 @@
 
 NURSERY_FREE = rthread.ThreadLocalField(llmemory.Address, 'nursery_free')
 NURSERY_TOP  = rthread.ThreadLocalField(llmemory.Address, 'nursery_top')
-NEXT_NUBLOCK = rthread.ThreadLocalField(llmemory.Address, 'next_nublock')
+
 
 # ____________________________________________________________
 
@@ -438,11 +438,11 @@
         self.old_objects_pointing_to_pinned = self.AddressStack()
         self.updated_old_objects_pointing_to_pinned = False
         #
-        # Allocate lock(s)
-        ll_lock = lltype.malloc(rthread.TLOCKP.TO, flavor='raw',
-                                track_allocation=False)
-        rthread.c_thread_lock_init(ll_lock)
-        self.ll_lock = ll_lock
+        # # Allocate lock(s)
+        # ll_lock = lltype.malloc(rthread.TLOCKP.TO, flavor='raw',
+        #                         track_allocation=False)
+        # rthread.c_thread_lock_init(ll_lock)
+        # self.ll_lock = ll_lock
         #
         # Allocate a nursery. In case of auto_nursery_size, start by
         # allocating a very small nursery, enough to do things like look
@@ -650,9 +650,6 @@
     get_nursery_top = staticmethod(NURSERY_TOP.getraw)
     set_nursery_top = staticmethod(NURSERY_TOP.setraw)
 
-    get_next_nublock = staticmethod(NEXT_NUBLOCK.getraw)
-    set_next_nublock = staticmethod(NEXT_NUBLOCK.setraw)
-
     @property
     def nursery_top(self):
         XXX   # fix caller
@@ -859,7 +856,8 @@
         major collection, and finally reserve totalsize bytes.
         """
-        rthread.acquire_NOAUTO(self.ll_lock, 1)
+        # rthread.acquire_NOAUTO(self.ll_lock, 1)
+        rgil.enter_master_section()
         minor_collection_count = 0
         while True:
@@ -898,6 +896,8 @@
                 self.set_nursery_free(self.nursery_barriers.popleft())
                 self.set_nursery_top(self.nursery_barriers.popleft())
             else:
+                rgil.master_request_safepoint()
+
                 minor_collection_count += 1
                 if minor_collection_count == 1:
                     self.minor_collection_with_major_progress()
@@ -936,7 +936,8 @@
             self.set_nursery_free(self.get_nursery_top() -
                                   self.debug_tiny_nursery)
         #
-        rthread.release_NOAUTO(self.ll_lock)
+        rgil.leave_master_section()
+        # rthread.release_NOAUTO(self.ll_lock)
         return result
     collect_and_reserve._dont_inline_ = True
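The change to collect_and_reserve() above is the heart of the scheme on
the GC side: the old per-GC lock is gone, and the allocation slow path
instead enters a "master section"; only when it really has to run a
collection does it stop all other threads.  The fragment below restates
that calling pattern against the C entry points added later in this
diff; gc_slow_path() and run_collection() are illustrative names of
ours, not part of the changeset:

    /* Sketch of the calling pattern used by collect_and_reserve();
       names gc_slow_path/run_collection are hypothetical. */
    #include "src/thread.h"   /* RPyGilEnterMasterSection() etc. */

    static void run_collection(void) { /* placeholder for the GC work */ }

    static void gc_slow_path(void)
    {
        /* become the single "master": release our synclock and take
           master_mutex, so at most one master section runs at a time */
        RPyGilEnterMasterSection();

        /* stop the world: returns once every other thread is parked in
           a safepoint, or is in C code and unable to re-acquire */
        RPyGilMasterRequestSafepoint();

        run_collection();     /* world is stopped here */

        /* drop master_mutex and re-acquire our own synclock */
        RPyGilLeaveMasterSection();
    }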
""" - rthread.acquire_NOAUTO(self.ll_lock, 1) + # rthread.acquire_NOAUTO(self.ll_lock, 1) + rgil.enter_master_section() minor_collection_count = 0 while True: @@ -898,6 +896,8 @@ self.set_nursery_free(self.nursery_barriers.popleft()) self.set_nursery_top(self.nursery_barriers.popleft()) else: + rgil.master_request_safepoint() + minor_collection_count += 1 if minor_collection_count == 1: self.minor_collection_with_major_progress() @@ -936,7 +936,8 @@ self.set_nursery_free(self.get_nursery_top() - self.debug_tiny_nursery) # - rthread.release_NOAUTO(self.ll_lock) + rgil.leave_master_section() + # rthread.release_NOAUTO(self.ll_lock) return result collect_and_reserve._dont_inline_ = True diff --git a/rpython/memory/gctransform/shadowstack.py b/rpython/memory/gctransform/shadowstack.py --- a/rpython/memory/gctransform/shadowstack.py +++ b/rpython/memory/gctransform/shadowstack.py @@ -227,9 +227,11 @@ tl_shadowstack = rthread.ThreadLocalField(llmemory.Address, 'shadowstack') + tl_synclock = rthread.ThreadLocalField(lltype.Signed, 'synclock') def thread_setup(): allocate_shadow_stack() + tl_synclock.get_or_make_raw() def thread_run(): # If it's the first time we see this thread, allocate diff --git a/rpython/rlib/rgil.py b/rpython/rlib/rgil.py --- a/rpython/rlib/rgil.py +++ b/rpython/rlib/rgil.py @@ -22,7 +22,7 @@ _nowrapper=True, sandboxsafe=True, compilation_info=eci) -_gil_yield_thread = llexternal('RPyGilYieldThread', [], lltype.Signed, +_gil_yield_thread = llexternal('RPyGilYieldThread', [], lltype.Void, _nowrapper=True, sandboxsafe=True, compilation_info=eci) @@ -38,6 +38,20 @@ _nowrapper=True, sandboxsafe=True, compilation_info=eci) +enter_master_section = llexternal( + 'RPyGilEnterMasterSection', [], lltype.Void, + _nowrapper=True, sandboxsafe=True, + compilation_info=eci) + +leave_master_section = llexternal( + 'RPyGilLeaveMasterSection', [], lltype.Void, + _nowrapper=True, sandboxsafe=True, + compilation_info=eci) + +master_request_safepoint = llexternal( + 'RPyGilMasterRequestSafepoint', [], lltype.Void, + _nowrapper=True, sandboxsafe=True, + compilation_info=eci) # ____________________________________________________________ @@ -133,10 +147,11 @@ # explicitly release the gil, in a way that tries to give more # priority to other threads (as opposed to continuing to run in # the same thread). 
diff --git a/rpython/rlib/rgil.py b/rpython/rlib/rgil.py
--- a/rpython/rlib/rgil.py
+++ b/rpython/rlib/rgil.py
@@ -22,7 +22,7 @@
                            _nowrapper=True, sandboxsafe=True,
                            compilation_info=eci)
 
-_gil_yield_thread = llexternal('RPyGilYieldThread', [], lltype.Signed,
+_gil_yield_thread = llexternal('RPyGilYieldThread', [], lltype.Void,
                                _nowrapper=True, sandboxsafe=True,
                                compilation_info=eci)
 
@@ -38,6 +38,20 @@
                               _nowrapper=True, sandboxsafe=True,
                               compilation_info=eci)
 
+enter_master_section = llexternal(
+    'RPyGilEnterMasterSection', [], lltype.Void,
+    _nowrapper=True, sandboxsafe=True,
+    compilation_info=eci)
+
+leave_master_section = llexternal(
+    'RPyGilLeaveMasterSection', [], lltype.Void,
+    _nowrapper=True, sandboxsafe=True,
+    compilation_info=eci)
+
+master_request_safepoint = llexternal(
+    'RPyGilMasterRequestSafepoint', [], lltype.Void,
+    _nowrapper=True, sandboxsafe=True,
+    compilation_info=eci)
 
 # ____________________________________________________________
 
@@ -133,10 +147,11 @@
     # explicitly release the gil, in a way that tries to give more
     # priority to other threads (as opposed to continuing to run in
     # the same thread).
-    if _gil_yield_thread():
-        from rpython.rlib import rthread
-        rthread.gc_thread_run()
-        _after_thread_switch()
+    # if _gil_yield_thread():
+    #     from rpython.rlib import rthread
+    #     rthread.gc_thread_run()
+    #     _after_thread_switch()
+    _gil_yield_thread()
 yield_thread._gctransformer_hint_close_stack_ = True
 yield_thread._dont_reach_me_in_del_ = True
 yield_thread._dont_inline_ = True
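With _gil_yield_thread() now returning void, yield_thread() reduces to
the inline check performed by the RPyGilYieldThread macro in thread.h
below: poll the thread's own synclock and take a slow path only if the
master has requested a safepoint (state 3).  A standalone model of
ours, with a global in place of the real thread-local field:

    /* Model only, not PyPy code: slow_path() stands in for
       RPyGilYieldThreadSlowPath(). */
    #include <assert.h>
    #include <stdio.h>

    static long synclock = 1;       /* 1 = running, we hold our lock */

    static void slow_path(void)     /* models RPyGilYieldThreadSlowPath */
    {
        printf("checking in with the master, then re-acquiring\n");
    }

    static void yield_thread(void)  /* models the RPyGilYieldThread macro */
    {
        assert(synclock & 1L);      /* only legal while holding the lock */
        if (synclock == 3L)
            slow_path();
    }

    int main(void)
    {
        yield_thread();             /* synclock == 1: no-op, stays fast */
        synclock = 3L;              /* master requests a safepoint */
        yield_thread();             /* now the slow path runs */
        return 0;
    }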
diff --git a/rpython/translator/c/src/thread.c b/rpython/translator/c/src/thread.c
--- a/rpython/translator/c/src/thread.c
+++ b/rpython/translator/c/src/thread.c
@@ -9,11 +9,9 @@
 #include "common_header.h"
 #endif
 
-#ifdef PYPY_USE_ASMGCC
 # include "common_header.h"
 # include "structdef.h"
 # include "forwarddecl.h"
-#endif
 
 #ifdef _WIN32
 #include "src/thread_nt.c"
diff --git a/rpython/translator/c/src/thread.h b/rpython/translator/c/src/thread.h
--- a/rpython/translator/c/src/thread.h
+++ b/rpython/translator/c/src/thread.h
@@ -30,8 +30,15 @@
 #endif /* !_WIN32 */
 
 RPY_EXTERN void RPyGilAllocate(void);
-RPY_EXTERN long RPyGilYieldThread(void);
-RPY_EXTERN void RPyGilAcquireSlowPath(long);
+RPY_EXTERN void RPyGilYieldThreadSlowPath(void);
+RPY_EXTERN void RPyGilAcquireSlowPath(void);
+RPY_EXTERN void RPyGilReleaseSlowPath(void);
+
+RPY_EXTERN void RPyGilEnterMasterSection(void);
+RPY_EXTERN void RPyGilLeaveMasterSection(void);
+RPY_EXTERN void RPyGilMasterRequestSafepoint(void);
+
+
 #define RPyGilAcquire _RPyGilAcquire
 #define RPyGilRelease _RPyGilRelease
 #define RPyFetchFastGil _RPyFetchFastGil
@@ -43,21 +50,33 @@
 #endif
 
 //RPY_EXTERN long rpy_fastgil;
+#include "threadlocal.h"
 
-static inline void _RPyGilAcquire(void) {
-//    long old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
-//    if (old_fastgil != 0)
-//        RPyGilAcquireSlowPath(old_fastgil);
-}
-static inline void _RPyGilRelease(void) {
-//    assert(RPY_FASTGIL_LOCKED(rpy_fastgil));
-//    pypy_lock_release(&rpy_fastgil);
-}
+#define _RPyGilAcquire() do {                                   \
+        if (!__sync_bool_compare_and_swap(                      \
+                &RPY_THREADLOCALREF_GET(synclock), 0L, 1L))     \
+            RPyGilAcquireSlowPath();                            \
+    } while (0)
+
+#define _RPyGilRelease() do {                                   \
+        assert(RPY_THREADLOCALREF_GET(synclock) != 0L);         \
+        if (!__sync_bool_compare_and_swap(                      \
+                &RPY_THREADLOCALREF_GET(synclock), 1L, 0L))     \
+            RPyGilReleaseSlowPath();                            \
+    } while (0)
+
 static inline long *_RPyFetchFastGil(void) {
     abort();
 //    return &rpy_fastgil;
 }
 
+#define RPyGilYieldThread() do {                                \
+        assert(RPY_THREADLOCALREF_GET(synclock) & 1L);          \
+        if (RPY_THREADLOCALREF_GET(synclock) == 3L) {           \
+            RPyGilYieldThreadSlowPath();                        \
+        }                                                       \
+    } while (0)
+
 typedef unsigned char rpy_spinlock_t;
 static inline void rpy_spinlock_acquire(rpy_spinlock_t *p) {
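The two macros above make the uncontended acquire/release a single
compare-and-swap on the thread's own synclock; any failed CAS means the
master has changed our word and a slow path must run.  A self-contained
demo of that fast path, using the same GCC __sync builtin (the global
again stands in for the thread-local field; not PyPy code):

    #include <stdio.h>

    static long synclock = 0;

    int main(void)
    {
        /* fast acquire: 0 -> 1 */
        if (__sync_bool_compare_and_swap(&synclock, 0L, 1L))
            printf("acquired on the fast path (synclock: 0 -> 1)\n");
        else
            printf("synclock == %ld: RPyGilAcquireSlowPath() would run\n",
                   synclock);

        /* fast release: 1 -> 0; fails if the master set it to 3 meanwhile */
        if (__sync_bool_compare_and_swap(&synclock, 1L, 0L))
            printf("released on the fast path (synclock: 1 -> 0)\n");
        else
            printf("synclock == %ld: RPyGilReleaseSlowPath() would run\n",
                   synclock);
        return 0;
    }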
diff --git a/rpython/translator/c/src/thread_gil.c b/rpython/translator/c/src/thread_gil.c
--- a/rpython/translator/c/src/thread_gil.c
+++ b/rpython/translator/c/src/thread_gil.c
@@ -1,239 +1,142 @@
+#include <pthread.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "threadlocal.h"
 
-/* Idea:
+static pthread_mutex_t master_mutex;
+static pthread_mutex_t sync_mutex;
+static pthread_cond_t sync_cond;
 
-   - "The GIL" is a composite concept.  There are two locks, and "the
-     GIL is locked" when both are locked.
+static long counter_of_threes = 0;
 
-   - The first lock is a simple global variable 'rpy_fastgil'.  With
-     shadowstack, we use the most portable definition: 0 means unlocked
-     and != 0 means locked.  With asmgcc, 0 means unlocked but only 1
-     means locked.  A different value means unlocked too, but the value
-     is used by the JIT to contain the stack top for stack root scanning.
-
-   - The second lock is a regular mutex.  In the fast path, it is never
-     unlocked.  Remember that "the GIL is unlocked" means that either
-     the first or the second lock is unlocked.  It should never be the
-     case that both are unlocked at the same time.
-
-   - Let's call "thread 1" the thread with the GIL.  Whenever it does an
-     external function call, it sets 'rpy_fastgil' to 0 (unlocked).
-     This is the cheapest way to release the GIL.  When it returns from
-     the function call, this thread attempts to atomically change
-     'rpy_fastgil' to 1.  In the common case where it works, thread 1
-     has got the GIL back and so continues to run.
-
-   - Say "thread 2" is eagerly waiting for thread 1 to become blocked in
-     some long-running call.  Regularly, it checks if 'rpy_fastgil' is 0
-     and tries to atomically change it to 1.  If it succeeds, it means
-     that the GIL was not previously locked.  Thread 2 has now got the GIL.
-
-   - If there are more than 2 threads, the rest is really sleeping by
-     waiting on the 'mutex_gil_stealer' held by thread 2.
-
-   - An additional mechanism is used for when thread 1 wants to
-     explicitly yield the GIL to thread 2: it does so by releasing
-     'mutex_gil' (which is otherwise not released) but keeping the
-     value of 'rpy_fastgil' to 1.
-*/
-
-
-/* The GIL is initially released; see pypy_main_function(), which calls
-   RPyGilAcquire/RPyGilRelease.  The point is that when building
-   RPython libraries, they can be a collection of regular functions that
-   also call RPyGilAcquire/RPyGilRelease; see test_standalone.TestShared.
-*/
-long rpy_fastgil = 0;
-static long rpy_waiting_threads = -42;    /* GIL not initialized */
-static volatile int rpy_early_poll_n = 0;
-static mutex1_t mutex_gil_stealer;
-static mutex2_t mutex_gil;
+static long rpy_initialize = -42;
 
 
 static void rpy_init_mutexes(void)
 {
-    mutex1_init(&mutex_gil_stealer);
-    mutex2_init_locked(&mutex_gil);
-    rpy_waiting_threads = 0;
+    int err = pthread_mutex_init(&master_mutex, NULL);
+    if (err)
+        abort();
+
+    err = pthread_mutex_init(&sync_mutex, NULL);
+    if (err)
+        abort();
+
+    err = pthread_cond_init(&sync_cond, NULL);
+    if (err)
+        abort();
+
+    counter_of_threes = 0; // XXX: fork?
+    rpy_initialize = 0;
 }
 
 void RPyGilAllocate(void)
 {
-//    if (rpy_waiting_threads < 0) {
-//        assert(rpy_waiting_threads == -42);
-//        rpy_init_mutexes();
+    if (rpy_initialize < 0) {
+        assert(rpy_initialize == -42);
+        rpy_init_mutexes();
 #ifdef HAVE_PTHREAD_ATFORK
-//        pthread_atfork(NULL, NULL, rpy_init_mutexes);
+        pthread_atfork(NULL, NULL, rpy_init_mutexes);
 #endif
-//    }
+    }
 }
 
-static void check_and_save_old_fastgil(long old_fastgil)
+
+void RPyGilAcquireSlowPath(void)
 {
-    assert(RPY_FASTGIL_LOCKED(rpy_fastgil));
+    assert(RPY_THREADLOCALREF_GET(synclock) == 2);
 
-#ifdef PYPY_USE_ASMGCC
-    if (old_fastgil != 0) {
-        /* this case only occurs from the JIT compiler */
-        struct pypy_ASM_FRAMEDATA_HEAD0 *new =
-            (struct pypy_ASM_FRAMEDATA_HEAD0 *)old_fastgil;
-        struct pypy_ASM_FRAMEDATA_HEAD0 *root = &pypy_g_ASM_FRAMEDATA_HEAD;
-        struct pypy_ASM_FRAMEDATA_HEAD0 *next = root->as_next;
-        new->as_next = next;
-        new->as_prev = root;
-        root->as_next = new;
-        next->as_prev = new;
-    }
-#else
-    assert(old_fastgil == 0);
-#endif
+    /* wait until the master leaves the safe point */
+    pthread_mutex_lock(&master_mutex);
+    RPY_THREADLOCALREF_GET(synclock) = 1;
+    pthread_mutex_unlock(&master_mutex);
 }
-#define RPY_GIL_POKE_MIN   40
-#define RPY_GIL_POKE_MAX  400
+void RPyGilReleaseSlowPath(void)
+{
+    assert(RPY_THREADLOCALREF_GET(synclock) == 3);
 
-void RPyGilAcquireSlowPath(long old_fastgil)
+    pthread_mutex_lock(&sync_mutex);
+
+    /* we are one of the THREES that the master is waiting for.  Decrease the
+     * counter and signal the master if we are the last. */
+    counter_of_threes--;
+    if (counter_of_threes == 0)
+        pthread_cond_signal(&sync_cond);
+
+    /* set to TWO, so that Acquire above will wait until the master is finished
+     * with its safe point */
+    RPY_THREADLOCALREF_GET(synclock) = 2;
+    pthread_mutex_unlock(&sync_mutex);
+    // continue without GIL
+}
+
+void RPyGilYieldThreadSlowPath(void)
 {
-    /* Acquires the GIL.  This assumes that we already did:
+    RPyGilRelease();
+    RPyGilAcquire();
+}
 
-          old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
-    */
-    if (!RPY_FASTGIL_LOCKED(old_fastgil)) {
-        /* The fastgil was not previously locked: success.
-           'mutex_gil' should still be locked at this point.
-        */
-    }
-    else {
-        /* Otherwise, another thread is busy with the GIL. */
-        int n;
-        long old_waiting_threads;
+void RPyGilEnterMasterSection(void)
+{
+    RPyGilRelease();
+    pthread_mutex_lock(&master_mutex);
+}
 
-        if (rpy_waiting_threads < 0) {
-            /* <arigo> I tried to have RPyGilAllocate() called from
-             * here, but it fails occasionally on an example
-             * (2.7/test/test_threading.py).  I think what occurs is
-             * that if one thread runs RPyGilAllocate(), it still
-             * doesn't have the GIL; then the other thread might fork()
-             * at precisely this moment, killing the first thread.
-             */
-            fprintf(stderr, "Fatal RPython error: a thread is trying to wait "
-                            "for the GIL, but the GIL was not initialized\n"
-                            "(For PyPy, see "
-                            "https://bitbucket.org/pypy/pypy/issues/2274)\n");
+void RPyGilLeaveMasterSection(void)
+{
+    pthread_mutex_unlock(&master_mutex);
+    RPyGilAcquire();
+}
+
+void RPyGilMasterRequestSafepoint(void)
+{
+    pthread_mutex_lock(&sync_mutex);
+    assert(counter_of_threes == 0);
+
+    /* signal all threads to enter safepoints */
+    OP_THREADLOCALREF_ACQUIRE(/* */);
+
+    struct pypy_threadlocal_s *t = NULL;
+    while (1) {
+        OP_THREADLOCALREF_ENUM(t, t);
+        if (t == NULL)
+            break;
+
+    retry:
+        switch (t->synclock) {
+        case 3:
+            assert(!"unexpected synclock=3 found");
             abort();
-        }
-
-        /* Register me as one of the threads that is actively waiting
-           for the GIL.  The number of such threads is found in
-           rpy_waiting_threads. */
-        old_waiting_threads = atomic_increment(&rpy_waiting_threads);
-
-        /* Early polling: before entering the waiting queue, we check
-           a certain number of times if the GIL becomes free.  The
-           motivation for this is issue #2341.  Note that we do this
-           polling even if there are already other threads in the
-           queue, and one of thesee threads is the stealer.  This is
-           because the stealer is likely sleeping right now.  There
-           are use cases where the GIL will really be released very
-           soon after RPyGilAcquireSlowPath() is called, so it's worth
-           always doing this check.
-
-           To avoid falling into bad cases, we "randomize" the number
-           of iterations: we loop N times, where N is choosen between
-           RPY_GIL_POKE_MIN and RPY_GIL_POKE_MAX.
-        */
-        n = rpy_early_poll_n * 2 + 1;
-        while (n >= RPY_GIL_POKE_MAX)
-            n -= (RPY_GIL_POKE_MAX - RPY_GIL_POKE_MIN);
-        rpy_early_poll_n = n;
-        while (n >= 0) {
-            n--;
-            if (old_waiting_threads != rpy_waiting_threads) {
-                /* If the number changed, it is because another thread
-                   entered or left this function.  In that case, stop
-                   this loop: if another thread left it means the GIL
-                   has been acquired by that thread; if another thread
-                   entered there is no point in running the present
-                   loop twice. */
+        case 2:
+            /* thread running in C code, already knows we want a safepoint */
+            break;
+        case 0:
+            /* thread running in C code, make sure it checks for and enters
+             * the safepoint before acquiring the "gil" again */
+            if (__sync_bool_compare_and_swap(&t->synclock, 0, 2))
+                break;
+            goto retry;
+        case 1:
+            /* thread running normally, place request to enter safepoint */
+            if (__sync_bool_compare_and_swap(&t->synclock, 1, 3)) {
+                counter_of_threes++;
+                t->nursery_top = NULL;
                 break;
             }
-            RPy_YieldProcessor();
-            RPy_CompilerMemoryBarrier();
+            goto retry;
+        }
+    }
+    OP_THREADLOCALREF_RELEASE(/* */);
 
-            if (!RPY_FASTGIL_LOCKED(rpy_fastgil)) {
-                old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
-                if (!RPY_FASTGIL_LOCKED(old_fastgil)) {
-                    /* We got the gil before entering the waiting
-                       queue.  In case there are other threads waiting
-                       for the GIL, wake up the stealer thread now and
-                       go to the waiting queue anyway, for fairness.
-                       This will fall through if there are no other
-                       threads waiting.
-                    */
-                    check_and_save_old_fastgil(old_fastgil);
-                    mutex2_unlock(&mutex_gil);
-                    break;
-                }
-            }
-        }
+    /* wait until all THREES entered their safepoints */
+    while (counter_of_threes > 0) {
+        pthread_cond_wait(&sync_cond, &sync_mutex);
+    }
 
-        /* Enter the waiting queue from the end.  Assuming a roughly
-           first-in-first-out order, this will nicely give the threads
-           a round-robin chance.
-        */
-        mutex1_lock(&mutex_gil_stealer);
-        mutex2_loop_start(&mutex_gil);
+    pthread_mutex_unlock(&sync_mutex);
 
-        /* We are now the stealer thread.  Steals! */
-        while (1) {
-            /* Busy-looping here.  Try to look again if 'rpy_fastgil' is
-               released.
-            */
-            if (!RPY_FASTGIL_LOCKED(rpy_fastgil)) {
-                old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
-                if (!RPY_FASTGIL_LOCKED(old_fastgil))
-                    /* yes, got a non-held value!  Now we hold it. */
-                    break;
-            }
-            /* Sleep for one interval of time.  We may be woken up earlier
-               if 'mutex_gil' is released.
-            */
-            if (mutex2_lock_timeout(&mutex_gil, 0.0001)) {   /* 0.1 ms... */
-                /* We arrive here if 'mutex_gil' was recently released
-                   and we just relocked it.
-                */
-                old_fastgil = 0;
-                break;
-            }
-            /* Loop back. */
-        }
-        atomic_decrement(&rpy_waiting_threads);
-        mutex2_loop_stop(&mutex_gil);
-        mutex1_unlock(&mutex_gil_stealer);
-    }
-    check_and_save_old_fastgil(old_fastgil);
-}
-
-long RPyGilYieldThread(void)
-{
-    /* can be called even before RPyGilAllocate(), but in this case,
-       'rpy_waiting_threads' will be -42. */
-    assert(RPY_FASTGIL_LOCKED(rpy_fastgil));
-    if (rpy_waiting_threads <= 0)
-        return 0;
-
-    /* Explicitly release the 'mutex_gil'.
-    */
-    mutex2_unlock(&mutex_gil);
-
-    /* Now nobody has got the GIL, because 'mutex_gil' is released (but
-       rpy_fastgil is still locked).  Call RPyGilAcquire().  It will
-       enqueue ourselves at the end of the 'mutex_gil_stealer' queue.
-       If there is no other waiting thread, it will fall through both
-       its mutex_lock() and mutex_lock_timeout() now.  But that's
-       unlikely, because we tested above that 'rpy_waiting_threads > 0'.
-    */
-    RPyGilAcquire();
-    return 1;
+    /* caller can continue; all threads in safepoints */
 }
 
 /********** for tests only **********/
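For reference, the handshake in RPyGilMasterRequestSafepoint above can
be exercised outside PyPy.  In this sketch of ours, the master CASes
each worker's flag from 1 (running) to 3 (safepoint requested), counts
the successful requests in counter_of_threes, and sleeps on the
condition variable until every flagged worker has checked in.  It
deliberately simplifies the diff's protocol: there is no state 2, and
workers just exit at their safepoint instead of blocking on
master_mutex.  Build with: gcc -pthread.

    #include <pthread.h>
    #include <stdio.h>

    #define NWORKERS 4

    static pthread_mutex_t sync_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  sync_cond  = PTHREAD_COND_INITIALIZER;
    static long flags[NWORKERS];   /* 1 = running, 3 = safepoint requested */
    static long counter_of_threes = 0;

    static void *worker(void *arg)
    {
        long i = (long)arg;
        /* "run" until the master requests a safepoint (atomic read
           via a no-op compare-and-swap) */
        while (!__sync_bool_compare_and_swap(&flags[i], 3L, 3L))
            ;
        /* safepoint reached: check in and, if last, wake the master */
        pthread_mutex_lock(&sync_mutex);
        if (--counter_of_threes == 0)
            pthread_cond_signal(&sync_cond);
        pthread_mutex_unlock(&sync_mutex);
        return NULL;
    }

    int main(void)
    {
        pthread_t th[NWORKERS];
        long i;

        for (i = 0; i < NWORKERS; i++) {
            flags[i] = 1;
            pthread_create(&th[i], NULL, worker, (void *)i);
        }

        /* master side: request a safepoint from every running thread */
        pthread_mutex_lock(&sync_mutex);
        for (i = 0; i < NWORKERS; i++)
            if (__sync_bool_compare_and_swap(&flags[i], 1L, 3L))
                counter_of_threes++;
        while (counter_of_threes > 0)      /* wait for all THREES */
            pthread_cond_wait(&sync_cond, &sync_mutex);
        pthread_mutex_unlock(&sync_mutex);

        printf("all %d workers reached their safepoints\n", NWORKERS);
        for (i = 0; i < NWORKERS; i++)
            pthread_join(th[i], NULL);
        return 0;
    }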