[Python-checkins] gh-114746: Avoid quadratic behavior in free-threaded GC (GH-114817)

encukou Thu, 01 Feb 2024 01:26:53 -0800

https://github.com/python/cpython/commit/e6d6d5dcc00af50446761b0c4d20bd6e92380135
commit: e6d6d5dcc00af50446761b0c4d20bd6e92380135
branch: main
author: Sam Gross <[email protected]>
committer: encukou <[email protected]>
date: 2024-02-01T10:26:23+01:00
summary:


gh-114746: Avoid quadratic behavior in free-threaded GC (GH-114817)

The free-threaded build's GC implementation is non-generational, but was
scheduled as if it were collecting a young generation leading to
quadratic behavior. This increases the minimum threshold and scales it
to the number of live objects as we do for the old generation in the
default build.

Note that the scheduling is still not thread-safe without the GIL. Those
changes will come in later PRs.

A few tests, like "test_sneaky_frame_object" rely on prompt scheduling
of the GC. For now, to keep that test passing, we disable the scaled
threshold after calls like `gc.set_threshold(1, 0, 0)`.

files:
M Python/gc_free_threading.c

diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index f2cd84981461a4..a6513a2c4aba2a 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -46,6 +46,7 @@ struct collection_state {
     GCState *gcstate;
     Py_ssize_t collected;
     Py_ssize_t uncollectable;
+    Py_ssize_t long_lived_total;
     struct worklist unreachable;
     struct worklist legacy_finalizers;
     struct worklist wrcb_to_call;
@@ -443,7 +444,7 @@ scan_heap_visitor(const mi_heap_t *heap, const 
mi_heap_area_t *area,
     else {
         // object is reachable, restore `ob_tid`; we're done with these objects
         gc_restore_tid(op);
-        state->gcstate->long_lived_total++;
+        state->long_lived_total++;
     }
 
     return true;
@@ -605,6 +606,8 @@ get_gc_state(void)
 void
 _PyGC_InitState(GCState *gcstate)
 {
+    // TODO: move to pycore_runtime_init.h once the incremental GC lands.
+    gcstate->generations[0].threshold = 2000;
 }
 
 
@@ -885,62 +888,6 @@ invoke_gc_callback(PyThreadState *tstate, const char 
*phase,
     assert(!_PyErr_Occurred(tstate));
 }
 
-
-/* Find the oldest generation (highest numbered) where the count
- * exceeds the threshold.  Objects in the that generation and
- * generations younger than it will be collected. */
-static int
-gc_select_generation(GCState *gcstate)
-{
-    for (int i = NUM_GENERATIONS-1; i >= 0; i--) {
-        if (gcstate->generations[i].count > gcstate->generations[i].threshold) 
{
-            /* Avoid quadratic performance degradation in number
-               of tracked objects (see also issue #4074):
-
-               To limit the cost of garbage collection, there are two 
strategies;
-                 - make each collection faster, e.g. by scanning fewer objects
-                 - do less collections
-               This heuristic is about the latter strategy.
-
-               In addition to the various configurable thresholds, we only 
trigger a
-               full collection if the ratio
-
-                long_lived_pending / long_lived_total
-
-               is above a given value (hardwired to 25%).
-
-               The reason is that, while "non-full" collections (i.e., 
collections of
-               the young and middle generations) will always examine roughly 
the same
-               number of objects -- determined by the aforementioned 
thresholds --,
-               the cost of a full collection is proportional to the total 
number of
-               long-lived objects, which is virtually unbounded.
-
-               Indeed, it has been remarked that doing a full collection every
-               <constant number> of object creations entails a dramatic 
performance
-               degradation in workloads which consist in creating and storing 
lots of
-               long-lived objects (e.g. building a large list of GC-tracked 
objects would
-               show quadratic performance, instead of linear as expected: see 
issue #4074).
-
-               Using the above ratio, instead, yields amortized linear 
performance in
-               the total number of objects (the effect of which can be 
summarized
-               thusly: "each full garbage collection is more and more costly 
as the
-               number of objects grows, but we do fewer and fewer of them").
-
-               This heuristic was suggested by Martin von Löwis on python-dev 
in
-               June 2008. His original analysis and proposal can be found at:
-               
http://mail.python.org/pipermail/python-dev/2008-June/080579.html
-            */
-            if (i == NUM_GENERATIONS - 1
-                && gcstate->long_lived_pending < gcstate->long_lived_total / 4)
-            {
-                continue;
-            }
-            return i;
-        }
-    }
-    return -1;
-}
-
 static void
 cleanup_worklist(struct worklist *worklist)
 {
@@ -952,6 +899,21 @@ cleanup_worklist(struct worklist *worklist)
     }
 }
 
+static bool
+gc_should_collect(GCState *gcstate)
+{
+    int count = _Py_atomic_load_int_relaxed(&gcstate->generations[0].count);
+    int threshold = gcstate->generations[0].threshold;
+    if (count <= threshold || threshold == 0 || !gcstate->enabled) {
+        return false;
+    }
+    // Avoid quadratic behavior by scaling threshold to the number of live
+    // objects. A few tests rely on immediate scheduling of the GC so we ignore
+    // the scaled threshold if generations[1].threshold is set to zero.
+    return (count > gcstate->long_lived_total / 4 ||
+            gcstate->generations[1].threshold == 0);
+}
+
 static void
 gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
 {
@@ -1029,15 +991,10 @@ gc_collect_main(PyThreadState *tstate, int generation, 
_PyGC_Reason reason)
         return 0;
     }
 
-    if (generation == GENERATION_AUTO) {
-        // Select the oldest generation that needs collecting. We will collect
-        // objects from that generation and all generations younger than it.
-        generation = gc_select_generation(gcstate);
-        if (generation < 0) {
-            // No generation needs to be collected.
-            _Py_atomic_store_int(&gcstate->collecting, 0);
-            return 0;
-        }
+    if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(gcstate)) {
+        // Don't collect if the threshold is not exceeded.
+        _Py_atomic_store_int(&gcstate->collecting, 0);
+        return 0;
     }
 
     assert(generation >= 0 && generation < NUM_GENERATIONS);
@@ -1082,6 +1039,7 @@ gc_collect_main(PyThreadState *tstate, int generation, 
_PyGC_Reason reason)
 
     m = state.collected;
     n = state.uncollectable;
+    gcstate->long_lived_total = state.long_lived_total;
 
     if (gcstate->debug & _PyGC_DEBUG_STATS) {
         double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1);
@@ -1523,12 +1481,10 @@ _PyObject_GC_Link(PyObject *op)
 {
     PyThreadState *tstate = _PyThreadState_GET();
     GCState *gcstate = &tstate->interp->gc;
-    gcstate->generations[0].count++; /* number of allocated GC objects */
-    if (gcstate->generations[0].count > gcstate->generations[0].threshold &&
-        gcstate->enabled &&
-        gcstate->generations[0].threshold &&
-        !_Py_atomic_load_int_relaxed(&gcstate->collecting) &&
-        !_PyErr_Occurred(tstate))
+    gcstate->generations[0].count++;
+
+    if (gc_should_collect(gcstate) &&
+        !_Py_atomic_load_int_relaxed(&gcstate->collecting))
     {
         _Py_ScheduleGC(tstate->interp);
     }
@@ -1537,7 +1493,7 @@ _PyObject_GC_Link(PyObject *op)
 void
 _Py_RunGC(PyThreadState *tstate)
 {
-    gc_collect_main(tstate, GENERATION_AUTO, _Py_GC_REASON_HEAP);
+    gc_collect_main(tstate, 0, _Py_GC_REASON_HEAP);
 }
 
 static PyObject *

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

[Python-checkins] gh-114746: Avoid quadratic behavior in free-threaded GC (GH-114817)

Reply via email to