https://github.com/python/cpython/commit/72eca2af59043c78647b0e6be3777a947ea9ef0f
commit: 72eca2af59043c78647b0e6be3777a947ea9ef0f
branch: main
author: Sam Gross <[email protected]>
committer: colesbury <[email protected]>
date: 2026-02-27T14:09:05-05:00
summary:
gh-145230: Update lockbench (gh-145231)
Remove PyThread_type_lock (now uses PyMutex internally).
Add new benchmark options:
- work_inside/work_outside: control work inside and outside the critical
section to vary contention levels
- num_locks: use multiple independent locks with threads assigned round-robin
- total_iters: fixed iteration count per thread instead of time-based, useful
for measuring fairness
- num_acquisitions: lock acquisitions per loop iteration
- random_locks: acquire random lock each iteration
Also return elapsed time from benchmark_locks() and switch lockbench.py to use
argparse.
files:
M Modules/_testinternalcapi/clinic/test_lock.c.h
M Modules/_testinternalcapi/test_lock.c
M Tools/lockbench/lockbench.py
diff --git a/Modules/_testinternalcapi/clinic/test_lock.c.h b/Modules/_testinternalcapi/clinic/test_lock.c.h
index 86875767343cd2..6e989a777ac7f0 100644
--- a/Modules/_testinternalcapi/clinic/test_lock.c.h
+++ b/Modules/_testinternalcapi/clinic/test_lock.c.h
@@ -6,8 +6,9 @@ preserve
#include "pycore_modsupport.h" // _PyArg_CheckPositional()
PyDoc_STRVAR(_testinternalcapi_benchmark_locks__doc__,
-"benchmark_locks($module, num_threads, use_pymutex=True,\n"
-" critical_section_length=1, time_ms=1000, /)\n"
+"benchmark_locks($module, num_threads, work_inside=1, work_outside=0,\n"
+" time_ms=1000, num_acquisitions=1, total_iters=0,\n"
+" num_locks=1, random_locks=False, /)\n"
"--\n"
"\n");
@@ -17,20 +18,26 @@ PyDoc_STRVAR(_testinternalcapi_benchmark_locks__doc__,
static PyObject *
_testinternalcapi_benchmark_locks_impl(PyObject *module,
Py_ssize_t num_threads,
- int use_pymutex,
- int critical_section_length,
- int time_ms);
+ int work_inside, int work_outside,
+ int time_ms, int num_acquisitions,
+ Py_ssize_t total_iters,
+ Py_ssize_t num_locks,
+ int random_locks);
static PyObject *
_testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args,
Py_ssize_t nargs)
{
PyObject *return_value = NULL;
Py_ssize_t num_threads;
- int use_pymutex = 1;
- int critical_section_length = 1;
+ int work_inside = 1;
+ int work_outside = 0;
int time_ms = 1000;
+ int num_acquisitions = 1;
+ Py_ssize_t total_iters = 0;
+ Py_ssize_t num_locks = 1;
+ int random_locks = 0;
- if (!_PyArg_CheckPositional("benchmark_locks", nargs, 1, 4)) {
+ if (!_PyArg_CheckPositional("benchmark_locks", nargs, 1, 8)) {
goto exit;
}
{
@@ -48,15 +55,15 @@ _testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args, Py_ss
if (nargs < 2) {
goto skip_optional;
}
- use_pymutex = PyObject_IsTrue(args[1]);
- if (use_pymutex < 0) {
+ work_inside = PyLong_AsInt(args[1]);
+ if (work_inside == -1 && PyErr_Occurred()) {
goto exit;
}
if (nargs < 3) {
goto skip_optional;
}
- critical_section_length = PyLong_AsInt(args[2]);
- if (critical_section_length == -1 && PyErr_Occurred()) {
+ work_outside = PyLong_AsInt(args[2]);
+ if (work_outside == -1 && PyErr_Occurred()) {
goto exit;
}
if (nargs < 4) {
@@ -66,10 +73,54 @@ _testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args, Py_ss
if (time_ms == -1 && PyErr_Occurred()) {
goto exit;
}
+ if (nargs < 5) {
+ goto skip_optional;
+ }
+ num_acquisitions = PyLong_AsInt(args[4]);
+ if (num_acquisitions == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
+ if (nargs < 6) {
+ goto skip_optional;
+ }
+ {
+ Py_ssize_t ival = -1;
+ PyObject *iobj = _PyNumber_Index(args[5]);
+ if (iobj != NULL) {
+ ival = PyLong_AsSsize_t(iobj);
+ Py_DECREF(iobj);
+ }
+ if (ival == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
+ total_iters = ival;
+ }
+ if (nargs < 7) {
+ goto skip_optional;
+ }
+ {
+ Py_ssize_t ival = -1;
+ PyObject *iobj = _PyNumber_Index(args[6]);
+ if (iobj != NULL) {
+ ival = PyLong_AsSsize_t(iobj);
+ Py_DECREF(iobj);
+ }
+ if (ival == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
+ num_locks = ival;
+ }
+ if (nargs < 8) {
+ goto skip_optional;
+ }
+ random_locks = PyObject_IsTrue(args[7]);
+ if (random_locks < 0) {
+ goto exit;
+ }
skip_optional:
-    return_value = _testinternalcapi_benchmark_locks_impl(module, num_threads, use_pymutex, critical_section_length, time_ms);
+    return_value = _testinternalcapi_benchmark_locks_impl(module, num_threads, work_inside, work_outside, time_ms, num_acquisitions, total_iters, num_locks, random_locks);
exit:
return return_value;
}
-/*[clinic end generated code: output=105105d759c0c271 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=6cfed9fc081313ef input=a9049054013a1b77]*/
diff --git a/Modules/_testinternalcapi/test_lock.c b/Modules/_testinternalcapi/test_lock.c
index ded76ca9fe6819..596120ef275196 100644
--- a/Modules/_testinternalcapi/test_lock.c
+++ b/Modules/_testinternalcapi/test_lock.c
@@ -194,65 +194,101 @@ test_lock_counter_slow(PyObject *self, PyObject *obj)
Py_RETURN_NONE;
}
-struct bench_data_locks {
- int stop;
- int use_pymutex;
- int critical_section_length;
+struct bench_lock {
char padding[200];
- PyThread_type_lock lock;
PyMutex m;
double value;
- Py_ssize_t total_iters;
+};
+
+struct bench_config {
+ int stop;
+ int work_inside;
+ int work_outside;
+ int num_acquisitions;
+ int random_locks;
+ Py_ssize_t target_iters;
+ Py_ssize_t num_locks;
+ struct bench_lock *locks;
};
struct bench_thread_data {
- struct bench_data_locks *bench_data;
+ struct bench_config *config;
+ struct bench_lock *lock;
+ uint64_t rng_state;
Py_ssize_t iters;
PyEvent done;
};
+static uint64_t
+splitmix64(uint64_t *state)
+{
+ uint64_t z = (*state += 0x9e3779b97f4a7c15);
+ z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
+ z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
+ return z ^ (z >> 31);
+}
+
static void
thread_benchmark_locks(void *arg)
{
- struct bench_thread_data *thread_data = arg;
- struct bench_data_locks *bench_data = thread_data->bench_data;
- int use_pymutex = bench_data->use_pymutex;
- int critical_section_length = bench_data->critical_section_length;
-
+ struct bench_thread_data *td = arg;
+ struct bench_config *config = td->config;
+ int work_inside = config->work_inside;
+ int work_outside = config->work_outside;
+ int num_acquisitions = config->num_acquisitions;
+ Py_ssize_t target_iters = config->target_iters;
+ uint64_t rng_state = td->rng_state;
+
+ double local_value = 0.0;
double my_value = 1.0;
Py_ssize_t iters = 0;
- while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) {
- if (use_pymutex) {
- PyMutex_Lock(&bench_data->m);
- for (int i = 0; i < critical_section_length; i++) {
- bench_data->value += my_value;
- my_value = bench_data->value;
+ for (;;) {
+ if (target_iters > 0) {
+ if (iters >= target_iters) {
+ break;
}
- PyMutex_Unlock(&bench_data->m);
}
- else {
- PyThread_acquire_lock(bench_data->lock, 1);
- for (int i = 0; i < critical_section_length; i++) {
- bench_data->value += my_value;
- my_value = bench_data->value;
+ else if (_Py_atomic_load_int_relaxed(&config->stop)) {
+ break;
+ }
+ struct bench_lock *lock = td->lock;
+ if (config->random_locks) {
+ uint32_t r = (uint32_t)splitmix64(&rng_state);
+ // Fast modulo reduction to pick a random lock, adapted from:
+ //
https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+ Py_ssize_t idx = ((uint64_t)r * (uint32_t)config->num_locks) >> 32;
+ lock = &config->locks[idx];
+ }
+ for (int acq = 0; acq < num_acquisitions; acq++) {
+ PyMutex_Lock(&lock->m);
+ for (int i = 0; i < work_inside; i++) {
+ lock->value += my_value;
+ my_value = lock->value;
}
- PyThread_release_lock(bench_data->lock);
+ PyMutex_Unlock(&lock->m);
}
- iters++;
+ for (int i = 0; i < work_outside; i++) {
+ local_value += my_value;
+ my_value = local_value;
+ }
+ iters += num_acquisitions;
}
- thread_data->iters = iters;
- _Py_atomic_add_ssize(&bench_data->total_iters, iters);
- _PyEvent_Notify(&thread_data->done);
+ td->iters = iters;
+ _PyEvent_Notify(&td->done);
}
/*[clinic input]
_testinternalcapi.benchmark_locks
num_threads: Py_ssize_t
- use_pymutex: bool = True
- critical_section_length: int = 1
+ work_inside: int = 1
+ work_outside: int = 0
time_ms: int = 1000
+ num_acquisitions: int = 1
+ total_iters: Py_ssize_t = 0
+ num_locks: Py_ssize_t = 1
+ random_locks: bool = False
/
[clinic start generated code]*/
@@ -260,10 +296,12 @@ _testinternalcapi.benchmark_locks
static PyObject *
_testinternalcapi_benchmark_locks_impl(PyObject *module,
Py_ssize_t num_threads,
- int use_pymutex,
- int critical_section_length,
- int time_ms)
-/*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/
+ int work_inside, int work_outside,
+ int time_ms, int num_acquisitions,
+ Py_ssize_t total_iters,
+ Py_ssize_t num_locks,
+ int random_locks)
+/*[clinic end generated code: output=6258dc9de8cb9af1 input=d622cf4e1c4d008b]*/
{
// Run from Tools/lockbench/lockbench.py
// Based on the WebKit lock benchmarks:
@@ -271,24 +309,28 @@ _testinternalcapi_benchmark_locks_impl(PyObject *module,
// See also https://webkit.org/blog/6161/locking-in-webkit/
PyObject *thread_iters = NULL;
PyObject *res = NULL;
+ struct bench_thread_data *thread_data = NULL;
- struct bench_data_locks bench_data;
- memset(&bench_data, 0, sizeof(bench_data));
- bench_data.use_pymutex = use_pymutex;
- bench_data.critical_section_length = critical_section_length;
-
- bench_data.lock = PyThread_allocate_lock();
- if (bench_data.lock == NULL) {
- return PyErr_NoMemory();
+ struct bench_config config = {
+ .work_inside = work_inside,
+ .work_outside = work_outside,
+ .num_acquisitions = num_acquisitions,
+ .target_iters = total_iters,
+ .num_locks = num_locks,
+ .random_locks = random_locks,
+ };
+
+ config.locks = PyMem_Calloc(num_locks, sizeof(*config.locks));
+ if (config.locks == NULL) {
+ PyErr_NoMemory();
+ goto exit;
}
- struct bench_thread_data *thread_data = NULL;
thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data));
if (thread_data == NULL) {
PyErr_NoMemory();
goto exit;
}
-
thread_iters = PyList_New(num_threads);
if (thread_iters == NULL) {
goto exit;
@@ -300,40 +342,43 @@ _testinternalcapi_benchmark_locks_impl(PyObject *module,
}
for (Py_ssize_t i = 0; i < num_threads; i++) {
- thread_data[i].bench_data = &bench_data;
+ thread_data[i].config = &config;
+ thread_data[i].lock = &config.locks[i % num_locks];
+ thread_data[i].rng_state = (uint64_t)i + 1;
PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]);
}
- // Let the threads run for `time_ms` milliseconds
- pysleep(time_ms);
- _Py_atomic_store_int(&bench_data.stop, 1);
+ if (total_iters == 0) {
+ pysleep(time_ms);
+ _Py_atomic_store_int(&config.stop, 1);
+ }
- // Wait for the threads to finish
for (Py_ssize_t i = 0; i < num_threads; i++) {
PyEvent_Wait(&thread_data[i].done);
}
- Py_ssize_t total_iters = bench_data.total_iters;
if (PyTime_PerfCounter(&end) < 0) {
goto exit;
}
- // Return the total number of acquisitions and the number of acquisitions
- // for each thread.
+ Py_ssize_t sum_iters = 0;
for (Py_ssize_t i = 0; i < num_threads; i++) {
PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters);
if (iter == NULL) {
goto exit;
}
PyList_SET_ITEM(thread_iters, i, iter);
+ sum_iters += thread_data[i].iters;
}
assert(end != start);
- double rate = total_iters * 1e9 / (end - start);
- res = Py_BuildValue("(dO)", rate, thread_iters);
+ PyTime_t elapsed_ns = end - start;
+ double rate = sum_iters * 1e9 / elapsed_ns;
+ res = Py_BuildValue("(dOL)", rate, thread_iters,
+ (long long)elapsed_ns);
exit:
- PyThread_free_lock(bench_data.lock);
+ PyMem_Free(config.locks);
PyMem_Free(thread_data);
Py_XDECREF(thread_iters);
return res;
@@ -344,7 +389,7 @@ test_lock_benchmark(PyObject *module, PyObject *obj)
{
// Just make sure the benchmark runs without crashing
PyObject *res = _testinternalcapi_benchmark_locks_impl(
- module, 1, 1, 1, 100);
+ module, 1, 1, 0, 100, 1, 0, 1, 0);
if (res == NULL) {
return NULL;
}
diff --git a/Tools/lockbench/lockbench.py b/Tools/lockbench/lockbench.py
index 9833d703e00cbb..d2608797f3a4d5 100644
--- a/Tools/lockbench/lockbench.py
+++ b/Tools/lockbench/lockbench.py
@@ -1,14 +1,28 @@
-# Measure the performance of PyMutex and PyThread_type_lock locks
-# with short critical sections.
+# Measure the performance of PyMutex locks with short critical sections.
#
-# Usage: python Tools/lockbench/lockbench.py [CRITICAL_SECTION_LENGTH]
+# Usage: python Tools/lockbench/lockbench.py [options]
+#
+# Options:
+# --work-inside N Units of work inside the critical section (default: 1).
+# --work-outside N Units of work outside the critical section (default: 0).
+# Each unit of work is a dependent floating-point
+# addition, which takes about 0.4 ns on modern
+# Intel / AMD processors.
+# --num-locks N Number of independent locks (default: 1). Threads are
+# assigned to locks round-robin.
+# --random-locks Each thread picks a random lock per acquisition instead
+# of using a fixed assignment. Requires --num-locks > 1.
+# --acquisitions N Lock acquisitions per loop iteration (default: 1).
+# --total-iters N Fixed iterations per thread (default: 0 = time-based).
+# Useful for measuring fairness: the benchmark runs until
+# the slowest thread finishes.
#
# How to interpret the results:
#
# Acquisitions (kHz): Reports the total number of lock acquisitions in
# thousands of acquisitions per second. This is the most important metric,
# particularly for the 1 thread case because even in multithreaded programs,
-# most locks acquisitions are not contended. Values for 2+ threads are
+# most lock acquisitions are not contended. Values for 2+ threads are
# only meaningful for `--disable-gil` builds, because the GIL prevents most
# situations where there is lock contention with short critical sections.
#
@@ -19,14 +33,15 @@
# See https://en.wikipedia.org/wiki/Fairness_measure#Jain's_fairness_index
from _testinternalcapi import benchmark_locks
-import sys
-
-# Max number of threads to test
-MAX_THREADS = 10
+import argparse
-# How much "work" to do while holding the lock
-CRITICAL_SECTION_LENGTH = 1
+def parse_threads(value):
+ if '-' in value:
+ lo, hi = value.split('-', 1)
+ lo, hi = int(lo), int(hi)
+ return range(lo, hi + 1)
+ return range(int(value), int(value) + 1)
def jains_fairness(values):
# Jain's fairness index
@@ -34,20 +49,44 @@ def jains_fairness(values):
return (sum(values) ** 2) / (len(values) * sum(x ** 2 for x in values))
def main():
- print("Lock Type Threads Acquisitions (kHz)
Fairness")
- for lock_type in ["PyMutex", "PyThread_type_lock"]:
- use_pymutex = (lock_type == "PyMutex")
- for num_threads in range(1, MAX_THREADS + 1):
- acquisitions, thread_iters = benchmark_locks(
- num_threads, use_pymutex, CRITICAL_SECTION_LENGTH)
+ parser = argparse.ArgumentParser(description="Benchmark PyMutex locks")
+ parser.add_argument("--work-inside", type=int, default=1,
+ help="units of work inside the critical section")
+ parser.add_argument("--work-outside", type=int, default=0,
+ help="units of work outside the critical section")
+ parser.add_argument("--acquisitions", type=int, default=1,
+ help="lock acquisitions per loop iteration")
+ parser.add_argument("--total-iters", type=int, default=0,
+ help="fixed iterations per thread (0 = time-based)")
+ parser.add_argument("--num-locks", type=int, default=1,
+ help="number of independent locks (round-robin
assignment)")
+ parser.add_argument("--random-locks", action="store_true",
+ help="pick a random lock per acquisition")
+ parser.add_argument("threads", type=parse_threads, nargs='?',
+ default=range(1, 11),
+ help="Number of threads: N or MIN-MAX (default: 1-10)")
+ args = parser.parse_args()
+
+ header = f"{'Threads': <10}{'Acq (kHz)': >12}{'Fairness': >10}"
+ if args.total_iters:
+ header += f"{'Wall (ms)': >12}"
+ print(header)
+ for num_threads in args.threads:
+ acquisitions, thread_iters, elapsed_ns = \
+ benchmark_locks(
+ num_threads, args.work_inside, args.work_outside,
+ 1000, args.acquisitions, args.total_iters,
+ args.num_locks, args.random_locks)
- acquisitions /= 1000 # report in kHz for readability
- fairness = jains_fairness(thread_iters)
+ wall_ms = elapsed_ns / 1e6
+ acquisitions /= 1000 # report in kHz for readability
+ fairness = jains_fairness(thread_iters)
-            print(f"{lock_type: <20}{num_threads: <18}{acquisitions: >5.0f}{fairness: >20.2f}")
+ line = f"{num_threads: <10}{acquisitions: >12.0f}{fairness: >10.2f}"
+ if args.total_iters:
+ line += f"{wall_ms: >12.1f}"
+ print(line)
if __name__ == "__main__":
- if len(sys.argv) > 1:
- CRITICAL_SECTION_LENGTH = int(sys.argv[1])
main()
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]