Pranith Kumar <bobby.pr...@gmail.com> writes: > Using heaptrack, I found that quite a few of our temporary allocations > are coming from allocating work items. Instead of doing this > continously, we can cache the allocated items and reuse them instead > of freeing them. > > Stats from an ARM64 guest (boot+shutdown): > > heaptrack stats(before): > allocations: 1471317 > leaked allocations: 73824 > temporary allocations: 651293 > > heaptrack stats(after): > allocations: 1143130 > leaked allocations: 73693 > temporary allocations: 487342 > > The improvement in speedup is minor and within error margins, however I think > the > patch is still worth. We can also explore atomics instead of taking a lock for > the work item pool.
When we where doing the original MTTCG work I looked at using GArray for the work queue, see: http://lists.gnu.org/archive/html/qemu-devel/2016-08/msg00367.html specifically: Subject: [PATCH v5 13/13] cpu-exec: replace cpu->queued_work with GArray Date: Tue, 2 Aug 2016 18:27:44 +0100 Message-Id: <1470158864-17651-14-git-send-email-alex.ben...@linaro.org> which I personally think might yield better results than messing around with custom allocators and GSlice and the like. You still get the dynamic sizing of a malloc based array but for operations like insertion and iterating through the work queue should be cache friendly. Once the array has (transparently) reached a reasonable size to service all allocations in the usual servicing period the same memory can be used over and over again ;-) My fondness for arrays is informed by comments by Bjarne Stroustrup: https://www.youtube.com/watch?v=YQs6IC-vgmo Obviously this patch would need to be re-worked given how much the code has changes since it was merged. > > Signed-off-by: Pranith Kumar <bobby.pr...@gmail.com> > --- > cpus-common.c | 75 > +++++++++++++++++++++++++++++++++++++++++++++++------------ > 1 file changed, 60 insertions(+), 15 deletions(-) > > diff --git a/cpus-common.c b/cpus-common.c > index 59f751ecf9..ccf5f50e4e 100644 > --- a/cpus-common.c > +++ b/cpus-common.c > @@ -24,6 +24,7 @@ > #include "sysemu/cpus.h" > > static QemuMutex qemu_cpu_list_lock; > +static QemuMutex qemu_wi_pool_lock; > static QemuCond exclusive_cond; > static QemuCond exclusive_resume; > static QemuCond qemu_work_cond; > @@ -33,6 +34,49 @@ static QemuCond qemu_work_cond; > */ > static int pending_cpus; > > +typedef struct qemu_work_item { > + struct qemu_work_item *next; > + run_on_cpu_func func; > + run_on_cpu_data data; > + bool free, exclusive, done; > +} qemu_work_item; > + > +typedef struct qemu_wi_pool { > + qemu_work_item *head; > + int num_items; > +} qemu_wi_pool; > + > +qemu_wi_pool *wi_free_pool; > + > +static void qemu_init_workitem_pool(void) > +{ > + wi_free_pool = g_malloc0(sizeof(qemu_wi_pool)); > +} > + > +static void qemu_wi_pool_insert(qemu_work_item *item) > +{ > + qemu_mutex_lock(&qemu_wi_pool_lock); > + qemu_work_item *curr = atomic_read(&wi_free_pool->head); > + item->next = curr; > + wi_free_pool->head = item; > + qemu_mutex_unlock(&qemu_wi_pool_lock); > +} > + > +static qemu_work_item *qemu_wi_pool_remove(void) > +{ > + qemu_mutex_lock(&qemu_wi_pool_lock); > + qemu_work_item *curr = atomic_read(&wi_free_pool->head); > + if (curr == NULL) { > + goto out; > + } > + wi_free_pool->head = curr->next; > + curr->next = NULL; > + > + out: > + qemu_mutex_unlock(&qemu_wi_pool_lock); > + return curr; > +} > + > void qemu_init_cpu_list(void) > { > /* This is needed because qemu_init_cpu_list is also called by the > @@ -43,6 +87,9 @@ void qemu_init_cpu_list(void) > qemu_cond_init(&exclusive_cond); > qemu_cond_init(&exclusive_resume); > qemu_cond_init(&qemu_work_cond); > + > + qemu_init_workitem_pool(); > + qemu_mutex_init(&qemu_wi_pool_lock); > } > > void cpu_list_lock(void) > @@ -106,14 +153,7 @@ void cpu_list_remove(CPUState *cpu) > qemu_mutex_unlock(&qemu_cpu_list_lock); > } > > -struct qemu_work_item { > - struct qemu_work_item *next; > - run_on_cpu_func func; > - run_on_cpu_data data; > - bool free, exclusive, done; > -}; > - > -static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) > +static void queue_work_on_cpu(CPUState *cpu, qemu_work_item *wi) > { > qemu_mutex_lock(&cpu->work_mutex); > if (cpu->queued_work_first == NULL) { > @@ -132,7 +172,7 @@ static void queue_work_on_cpu(CPUState *cpu, struct > qemu_work_item *wi) > void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data, > QemuMutex *mutex) > { > - struct qemu_work_item wi; > + qemu_work_item wi; > > if (qemu_cpu_is_self(cpu)) { > func(cpu, data); > @@ -156,9 +196,11 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, > run_on_cpu_data data, > > void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data > data) > { > - struct qemu_work_item *wi; > + qemu_work_item *wi = qemu_wi_pool_remove(); > > - wi = g_malloc0(sizeof(struct qemu_work_item)); > + if (!wi) { > + wi = g_malloc0(sizeof(qemu_work_item)); > + } > wi->func = func; > wi->data = data; > wi->free = true; > @@ -299,9 +341,11 @@ void cpu_exec_end(CPUState *cpu) > void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, > run_on_cpu_data data) > { > - struct qemu_work_item *wi; > + qemu_work_item *wi = qemu_wi_pool_remove(); > > - wi = g_malloc0(sizeof(struct qemu_work_item)); > + if (!wi) { > + wi = g_malloc0(sizeof(qemu_work_item)); > + } > wi->func = func; > wi->data = data; > wi->free = true; > @@ -312,7 +356,7 @@ void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func > func, > > void process_queued_cpu_work(CPUState *cpu) > { > - struct qemu_work_item *wi; > + qemu_work_item *wi; > > if (cpu->queued_work_first == NULL) { > return; > @@ -343,7 +387,8 @@ void process_queued_cpu_work(CPUState *cpu) > } > qemu_mutex_lock(&cpu->work_mutex); > if (wi->free) { > - g_free(wi); > + memset(wi, 0, sizeof(qemu_work_item)); > + qemu_wi_pool_insert(wi); > } else { > atomic_mb_set(&wi->done, true); > } -- Alex Bennée