https://github.com/python/cpython/commit/72eca2af59043c78647b0e6be3777a947ea9ef0f
commit: 72eca2af59043c78647b0e6be3777a947ea9ef0f
branch: main
author: Sam Gross <[email protected]>
committer: colesbury <[email protected]>
date: 2026-02-27T14:09:05-05:00
summary:
gh-145230: Update lockbench (gh-145231)
Remove PyThread_type_lock (now uses PyMutex internally).
Add new benchmark options:
- work_inside/work_outside: control work inside and outside the critical
section to vary contention levels
- num_locks: use multiple independent locks with threads assigned round-robin
- total_iters: fixed iteration count per thread instead of time-based, useful
for measuring fairness
- num_acquisitions: lock acquisitions per loop iteration
- random_locks: acquire random lock each iteration
Also return elapsed time from benchmark_locks() and switch lockbench.py to use
argparse.
files:
M Modules/_testinternalcapi/clinic/test_lock.c.h
M Modules/_testinternalcapi/test_lock.c
M Tools/lockbench/lockbench.py
diff --git a/Modules/_testinternalcapi/clinic/test_lock.c.h b/Modules/_testinternalcapi/clinic/test_lock.c.h
index 86875767343cd2..6e989a777ac7f0 100644
--- a/Modules/_testinternalcapi/clinic/test_lock.c.h
+++ b/Modules/_testinternalcapi/clinic/test_lock.c.h
@@ -6,8 +6,9 @@ preserve
#include "pycore_modsupport.h" // _PyArg_CheckPositional()
PyDoc_STRVAR(_testinternalcapi_benchmark_locks__doc__,
-"benchmark_locks($module, num_threads, use_pymutex=True,\n"
-" critical_section_length=1, time_ms=1000, /)\n"
+"benchmark_locks($module, num_threads, work_inside=1, work_outside=0,\n"
+" time_ms=1000, num_acquisitions=1, total_iters=0,\n"
+" num_locks=1, random_locks=False, /)\n"
"--\n"
"\n");
@@ -17,20 +18,26 @@ PyDoc_STRVAR(_testinternalcapi_benchmark_locks__doc__,
static PyObject *
_testinternalcapi_benchmark_locks_impl(PyObject *module,
Py_ssize_t num_threads,
- int use_pymutex,
- int critical_section_length,
- int time_ms);
+ int work_inside, int work_outside,
+ int time_ms, int num_acquisitions,
+ Py_ssize_t total_iters,
+ Py_ssize_t num_locks,
+ int random_locks);
static PyObject *
_testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args,
Py_ssize_t nargs)
{
PyObject *return_value = NULL;
Py_ssize_t num_threads;
- int use_pymutex = 1;
- int critical_section_length = 1;
+ int work_inside = 1;
+ int work_outside = 0;
int time_ms = 1000;
+ int num_acquisitions = 1;
+ Py_ssize_t total_iters = 0;
+ Py_ssize_t num_locks = 1;
+ int random_locks = 0;
- if (!_PyArg_CheckPositional("benchmark_locks", nargs, 1, 4)) {
+ if (!_PyArg_CheckPositional("benchmark_locks", nargs, 1, 8)) {
goto exit;
}
{
@@ -48,15 +55,15 @@ _testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args, Py_ss
if (nargs < 2) {
goto skip_optional;
}
- use_pymutex = PyObject_IsTrue(args[1]);
- if (use_pymutex < 0) {
+ work_inside = PyLong_AsInt(args[1]);
+ if (work_inside == -1 && PyErr_Occurred()) {
goto exit;
}
if (nargs < 3) {
goto skip_optional;
}
- critical_section_length = PyLong_AsInt(args[2]);
- if (critical_section_length == -1 && PyErr_Occurred()) {
+ work_outside = PyLong_AsInt(args[2]);
+ if (work_outside == -1 && PyErr_Occurred()) {
goto exit;
}
if (nargs < 4) {
@@ -66,10 +73,54 @@ _testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args, Py_ss
if (time_ms == -1 && PyErr_Occurred()) {
goto exit;
}
+ if (nargs < 5) {
+ goto skip_optional;
+ }
+ num_acquisitions = PyLong_AsInt(args[4]);
+ if (num_acquisitions == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
+ if (nargs < 6) {
+ goto skip_optional;
+ }
+ {
+ Py_ssize_t ival = -1;
+ PyObject *iobj = _PyNumber_Index(args[5]);
+ if (iobj != NULL) {
+ ival = PyLong_AsSsize_t(iobj);
+ Py_DECREF(iobj);
+ }
+ if (ival == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
+ total_iters = ival;
+ }
+ if (nargs < 7) {
+ goto skip_optional;
+ }
+ {
+ Py_ssize_t ival = -1;
+ PyObject *iobj = _PyNumber_Index(args[6]);
+ if (iobj != NULL) {
+ ival = PyLong_AsSsize_t(iobj);
+ Py_DECREF(iobj);
+ }
+ if (ival == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
+ num_locks = ival;
+ }
+ if (nargs < 8) {
+ goto skip_optional;
+ }
+ random_locks = PyObject_IsTrue(args[7]);
+ if (random_locks < 0) {
+ goto exit;
+ }
skip_optional:
-    return_value = _testinternalcapi_benchmark_locks_impl(module, num_threads, use_pymutex, critical_section_length, time_ms);
+    return_value = _testinternalcapi_benchmark_locks_impl(module, num_threads, work_inside, work_outside, time_ms, num_acquisitions, total_iters, num_locks, random_locks);
exit:
return return_value;
}
-/*[clinic end generated code: output=105105d759c0c271 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=6cfed9fc081313ef input=a9049054013a1b77]*/
diff --git a/Modules/_testinternalcapi/test_lock.c b/Modules/_testinternalcapi/test_lock.c
index ded76ca9fe6819..596120ef275196 100644
--- a/Modules/_testinternalcapi/test_lock.c
+++ b/Modules/_testinternalcapi/test_lock.c
@@ -194,65 +194,101 @@ test_lock_counter_slow(PyObject *self, PyObject *obj)
Py_RETURN_NONE;
}
-struct bench_data_locks {
- int stop;
- int use_pymutex;
- int critical_section_length;
+struct bench_lock {
char padding[200];
- PyThread_type_lock lock;
PyMutex m;
double value;
- Py_ssize_t total_iters;
+};
+
+struct bench_config {
+ int stop;
+ int work_inside;
+ int work_outside;
+ int num_acquisitions;
+ int random_locks;
+ Py_ssize_t target_iters;
+ Py_ssize_t num_locks;
+ struct bench_lock *locks;
};
struct bench_thread_data {
- struct bench_data_locks *bench_data;
+ struct bench_config *config;
+ struct bench_lock *lock;
+ uint64_t rng_state;
Py_ssize_t iters;
PyEvent done;
};
+static uint64_t
+splitmix64(uint64_t *state)
+{
+ uint64_t z = (*state += 0x9e3779b97f4a7c15);
+ z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
+ z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
+ return z ^ (z >> 31);
+}
+
static void
thread_benchmark_locks(void *arg)
{
- struct bench_thread_data *thread_data = arg;
- struct bench_data_locks *bench_data = thread_data->bench_data;
- int use_pymutex = bench_data->use_pymutex;
- int critical_section_length = bench_data->critical_section_length;
-
+ struct bench_thread_data *td = arg;
+ struct bench_config *config = td->config;
+ int work_inside = config->work_inside;
+ int work_outside = config->work_outside;
+ int num_acquisitions = config->num_acquisitions;
+ Py_ssize_t target_iters = config->target_iters;
+ uint64_t rng_state = td->rng_state;
+
+ double local_value = 0.0;
double my_value = 1.0;
Py_ssize_t iters = 0;
- while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) {
- if (use_pymutex) {
- PyMutex_Lock(&bench_data->m);
- for (int i = 0; i < critical_section_length; i++) {
- bench_data->value += my_value;
- my_value = bench_data->value;
+ for (;;) {
+ if (target_iters > 0) {
+ if (iters >= target_iters) {
+ break;
}
- PyMutex_Unlock(&bench_data->m);
}
- else {
- PyThread_acquire_lock(bench_data->lock, 1);
- for (int i = 0; i < critical_section_length; i++) {
- bench_data->value += my_value;
- my_value = bench_data->value;
+ else if (_Py_atomic_load_int_relaxed(&config->stop)) {
+ break;
+ }
+ struct bench_lock *lock = td->lock;
+ if (config->random_locks) {
+ uint32_t r = (uint32_t)splitmix64(&rng_state);
+ // Fast modulo reduction to pick a random lock, adapted from:
+ //
https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+ Py_ssize_t idx = ((uint64_t)r * (uint32_t)config->num_locks) >> 32;
+ lock = &config->locks[idx];
+ }
+ for (int acq = 0; acq < num_acquisitions; acq++) {
+ PyMutex_Lock(&lock->m);
+ for (int i = 0; i < work_inside; i++) {
+ lock->value += my_value;
+ my_value = lock->value;
}
- PyThread_release_lock(bench_data->lock);
+ PyMutex_Unlock(&lock->m);
}
- iters++;
+ for (int i = 0; i < work_outside; i++) {
+ local_value += my_value;
+ my_value = local_value;
+ }
+ iters += num_acquisitions;
}
- thread_data->iters = iters;
- _Py_atomic_add_ssize(&bench_data->total_iters, iters);
- _PyEvent_Notify(&thread_data->done);
+ td->iters = iters;
+ _PyEvent_Notify(&td->done);
}
/*[clinic input]
_testinternalcapi.benchmark_locks
num_threads: Py_ssize_t
- use_pymutex: bool = True
- critical_section_length: int = 1
+ work_inside: int = 1
+ work_outside: int = 0
time_ms: int = 1000
+ num_acquisitions: int = 1
+ total_iters: Py_ssize_t = 0
+ num_locks: Py_ssize_t = 1
+ random_locks: bool = False
/
[clinic start generated code]*/
@@ -260,10 +296,12 @@ _testinternalcapi.benchmark_locks
static PyObject *
_testinternalcapi_benchmark_locks_impl(PyObject *module,
Py_ssize_t num_threads,
- int use_pymutex,
- int critical_section_length,
- int time_ms)
-/*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/
+ int work_inside, int work_outside,
+ int time_ms, int num_acquisitions,
+ Py_ssize_t total_iters,
+ Py_ssize_t num_locks,
+ int random_locks)
+/*[clinic end generated code: output=6258dc9de8cb9af1 input=d622cf4e1c4d008b]*/
{
// Run from Tools/lockbench/lockbench.py
// Based on the WebKit lock benchmarks:
@@ -271,24 +309,28 @@ _testinternalcapi_benchmark_locks_impl(PyObject *module,
// See also https://webkit.org/blog/6161/locking-in-webkit/
PyObject *thread_iters = NULL;
PyObject *res = NULL;
+ struct bench_thread_data *thread_data = NULL;
- struct bench_data_locks bench_data;
- memset(&bench_data, 0, sizeof(bench_data));
- bench_data.use_pymutex = use_pymutex;
- bench_data.critical_section_length = critical_section_length;
-
- bench_data.lock = PyThread_allocate_lock();
- if (bench_data.lock == NULL) {
- return PyErr_NoMemory();
+ struct bench_config config = {
+ .work_inside = work_inside,
+ .work_outside = work_outside,
+ .num_acquisitions = num_acquisitions,
+ .target_iters = total_iters,
+ .num_locks = num_locks,
+ .random_locks = random_locks,
+ };
+
+ config.locks = PyMem_Calloc(num_locks, sizeof(*config.locks));
+ if (config.locks == NULL) {
+ PyErr_NoMemory();
+ goto exit;
}
- struct bench_thread_data *thread_data = NULL;
thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data));
if (thread_data == NULL) {
PyErr_NoMemory();
goto exit;
}
-
thread_iters = PyList_New(num_threads);
if (thread_iters == NULL) {
goto exit;
@@ -300,40 +342,43 @@ _testinternalcapi_benchmark_locks_impl(PyObject *module,
}
for (Py_ssize_t i = 0; i < num_threads; i++) {
- thread_data[i].bench_data = &bench_data;
+ thread_data[i].config = &config;
+ thread_data[i].lock = &config.locks[i % num_locks];
+ thread_data[i].rng_state = (uint64_t)i + 1;
PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]);
}
- // Let the threads run for `time_ms` milliseconds
- pysleep(time_ms);
- _Py_atomic_store_int(&bench_data.stop, 1);
+ if (total_iters == 0) {
+ pysleep(time_ms);
+ _Py_atomic_store_int(&config.stop, 1);
+ }
- // Wait for the threads to finish
for (Py_ssize_t i = 0; i < num_threads; i++) {
PyEvent_Wait(&thread_data[i].done);
}
- Py_ssize_t total_iters = bench_data.total_iters;
if (PyTime_PerfCounter(&end) < 0) {
goto exit;
}
- // Return the total number of acquisitions and the number of acquisitions
- // for each thread.
+ Py_ssize_t sum_iters = 0;
for (Py_ssize_t i = 0; i < num_threads; i++) {
PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters);
if (iter == NULL) {
goto exit;
}
PyList_SET_ITEM(thread_iters, i, iter);
+ sum_iters += thread_data[i].iters;
}
assert(end != start);
- double rate = total_iters * 1e9 / (end - start);
- res = Py_BuildValue("(dO)", rate, thread_iters);
+ PyTime_t elapsed_ns = end - start;
+ double rate = sum_iters * 1e9 / elapsed_ns;
+ res = Py_BuildValue("(dOL)", rate, thread_iters,
+ (long long)elapsed_ns);
exit:
- PyThread_free_lock(bench_data.lock);
+ PyMem_Free(config.locks);
PyMem_Free(thread_data);
Py_XDECREF(thread_iters);
return res;
@@ -344,7 +389,7 @@ test_lock_benchmark(PyObject *module, PyObject *obj)
{
// Just make sure the benchmark runs without crashing
PyObject *res = _testinternalcapi_benchmark_locks_impl(
- module, 1, 1, 1, 100);
+ module, 1, 1, 0, 100, 1, 0, 1, 0);
if (res == NULL) {
return NULL;
}
diff --git a/Tools/lockbench/lockbench.py b/Tools/lockbench/lockbench.py
index 9833d703e00cbb..d2608797f3a4d5 100644
--- a/Tools/lockbench/lockbench.py
+++ b/Tools/lockbench/lockbench.py
@@ -1,14 +1,28 @@
-# Measure the performance of PyMutex and PyThread_type_lock locks
-# with short critical sections.
+# Measure the performance of PyMutex locks with short critical sections.
#
-# Usage: python Tools/lockbench/lockbench.py [CRITICAL_SECTION_LENGTH]
+# Usage: python Tools/lockbench/lockbench.py [options]
+#
+# Options:
+# --work-inside N Units of work inside the critical section (default: 1).
+# --work-outside N Units of work outside the critical section (default: 0).
+# Each unit of work is a dependent floating-point
+# addition, which takes about 0.4 ns on modern
+# Intel / AMD processors.
+# --num-locks N Number of independent locks (default: 1). Threads are
+# assigned to locks round-robin.
+# --random-locks Each thread picks a random lock per acquisition instead
+# of using a fixed assignment. Requires --num-locks > 1.
+# --acquisitions N Lock acquisitions per loop iteration (default: 1).
+# --total-iters N Fixed iterations per thread (default: 0 = time-based).
+# Useful for measuring fairness: the benchmark runs until
+# the slowest thread finishes.
#
# How to interpret the results:
#
# Acquisitions (kHz): Reports the total number of lock acquisitions in
# thousands of acquisitions per second. This is the most important metric,
# particularly for the 1 thread case because even in multithreaded programs,
-# most locks acquisitions are not contended. Values for 2+ threads are
+# most lock acquisitions are not contended. Values for 2+ threads are
# only meaningful for `--disable-gil` builds, because the GIL prevents most
# situations where there is lock contention with short critical sections.
#
@@ -19,14 +33,15 @@
# See https://en.wikipedia.org/wiki/Fairness_measure#Jain's_fairness_index
from _testinternalcapi import benchmark_locks
-import sys
-
-# Max number of threads to test
-MAX_THREADS = 10
+import argparse
-# How much "work" to do while holding the lock
-CRITICAL_SECTION_LENGTH = 1
+def parse_threads(value):
+ if '-' in value:
+ lo, hi = value.split('-', 1)
+ lo, hi = int(lo), int(hi)
+ return range(lo, hi + 1)
+ return range(int(value), int(value) + 1)
def jains_fairness(values):
# Jain's fairness index
@@ -34,20 +49,44 @@ def jains_fairness(values):
return (sum(values) ** 2) / (len(values) * sum(x ** 2 for x in values))
def main():
- print("Lock Type Threads Acquisitions (kHz)
Fairness")
- for lock_type in ["PyMutex", "PyThread_type_lock"]:
- use_pymutex = (lock_type == "PyMutex")
- for num_threads in range(1, MAX_THREADS + 1):
- acquisitions, thread_iters = benchmark_locks(
- num_threads, use_pymutex, CRITICAL_SECTION_LENGTH)
+ parser = argparse.ArgumentParser(description="Benchmark PyMutex locks")
+ parser.add_argument("--work-inside", type=int, default=1,
+ help="units of work inside the critical section")
+ parser.add_argument("--work-outside", type=int, default=0,
+ help="units of work outside the critical section")
+ parser.add_argument("--acquisitions", type=int, default=1,
+ help="lock acquisitions per loop iteration")
+ parser.add_argument("--total-iters", type=int, default=0,
+ help="fixed iterations per thread (0 = time-based)")
+ parser.add_argument("--num-locks", type=int, default=1,
+ help="number of independent locks (round-robin
assignment)")
+ parser.add_argument("--random-locks", action="store_true",
+ help="pick a random lock per acquisition")
+ parser.add_argument("threads", type=parse_threads, nargs='?',
+ default=range(1, 11),
+ help="Number of threads: N or MIN-MAX (default: 1-10)")
+ args = parser.parse_args()
+
+ header = f"{'Threads': <10}{'Acq (kHz)': >12}{'Fairness': >10}"
+ if args.total_iters:
+ header += f"{'Wall (ms)': >12}"
+ print(header)
+ for num_threads in args.threads:
+ acquisitions, thread_iters, elapsed_ns = \
+ benchmark_locks(
+ num_threads, args.work_inside, args.work_outside,
+ 1000, args.acquisitions, args.total_iters,
+ args.num_locks, args.random_locks)
- acquisitions /= 1000 # report in kHz for readability
- fairness = jains_fairness(thread_iters)
+ wall_ms = elapsed_ns / 1e6
+ acquisitions /= 1000 # report in kHz for readability
+ fairness = jains_fairness(thread_iters)
-            print(f"{lock_type: <20}{num_threads: <18}{acquisitions: >5.0f}{fairness: >20.2f}")
+ line = f"{num_threads: <10}{acquisitions: >12.0f}{fairness: >10.2f}"
+ if args.total_iters:
+ line += f"{wall_ms: >12.1f}"
+ print(line)
if __name__ == "__main__":
- if len(sys.argv) > 1:
- CRITICAL_SECTION_LENGTH = int(sys.argv[1])
main()
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]