__kfree_rcu_sheaf() cannot invoke call_rcu() when spinning is not allowed and IRQs are disabled, so in that case it currently gives up and fails instead of submitting the full sheaf. To relax this limitation, extend the deferred-free fallback so that a full RCU sheaf can be handed to call_rcu() from the existing irq_work.
Since the deferred mechanism does more than deferred free of objects, rename the struct to deferred_percpu_work and adjust names accordingly. When a sheaf is queued on an IRQ work, it is detached from pcs->rcu_free but call_rcu() is not invoked until the irq_work runs. To keep the kvfree_rcu barrier's promise, call irq_work_sync() on each CPU before calling rcu_barrier(). In the meantime, remove the TODO item as apparently there is no simple and effective way to achieve that. Suggested-by: Alexei Starovoitov <[email protected]> Signed-off-by: Harry Yoo (Oracle) <[email protected]> --- mm/slab.h | 2 +- mm/slab_common.c | 7 ++--- mm/slub.c | 79 ++++++++++++++++++++++++++++++++++---------------------- 3 files changed, 51 insertions(+), 37 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index b1bd33a16544..961581e35ec8 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -744,7 +744,7 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); -void defer_free_barrier(void); +void deferred_work_barrier(void); static inline bool slub_debug_orig_size(struct kmem_cache *s) { diff --git a/mm/slab_common.c b/mm/slab_common.c index bc1a8ec938d9..55546b8385ff 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -551,7 +551,7 @@ void kmem_cache_destroy(struct kmem_cache *s) } /* Wait for deferred work from kmalloc/kfree_nolock() */ - defer_free_barrier(); + deferred_work_barrier(); cpus_read_lock(); mutex_lock(&slab_mutex); @@ -2113,13 +2113,10 @@ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) cpus_read_lock(); flush_rcu_sheaves_on_cache(s); cpus_read_unlock(); + deferred_work_barrier(); rcu_barrier(); } - /* - * TODO: Introduce a version of __kvfree_rcu_barrier() that works - * on a specific slab cache. 
- */ __kvfree_rcu_barrier(); } EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache); diff --git a/mm/slub.c b/mm/slub.c index 6a3552b70683..ba593c1c53d5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -418,6 +418,8 @@ struct slab_sheaf { union { struct rcu_head rcu_head; struct list_head barn_list; + /* only used to defer call_rcu() in unknown context */ + struct llist_node llnode; /* only used for prefilled sheafs */ struct { unsigned int capacity; @@ -4071,6 +4073,20 @@ static void flush_all(struct kmem_cache *s) cpus_read_unlock(); } +struct deferred_percpu_work { + struct llist_head objects; + struct llist_head rcu_sheaves; + struct irq_work work; +}; + +static void deferred_percpu_work_fn(struct irq_work *work); + +static DEFINE_PER_CPU(struct deferred_percpu_work, deferred_percpu_work) = { + .objects = LLIST_HEAD_INIT(objects), + .rcu_sheaves = LLIST_HEAD_INIT(rcu_sheaves), + .work = IRQ_WORK_INIT(deferred_percpu_work_fn), +}; + static void flush_rcu_sheaf(struct work_struct *w) { struct slub_percpu_sheaves *pcs; @@ -4142,6 +4158,7 @@ void flush_all_rcu_sheaves(void) mutex_unlock(&slab_mutex); cpus_read_unlock(); + deferred_work_barrier(); rcu_barrier(); } @@ -6158,12 +6175,6 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj, bool allow_spin) if (likely(rcu_sheaf->size < s->sheaf_capacity)) { rcu_sheaf = NULL; } else { - /* call_rcu() disables IRQs to protect percpu data structures */ - if (unlikely(!allow_spin && irqs_disabled())) { - rcu_sheaf->size--; - local_unlock(&s->cpu_sheaves->lock); - goto fail; - } pcs->rcu_free = NULL; rcu_sheaf->node = numa_node_id(); } @@ -6172,8 +6183,18 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj, bool allow_spin) * we flush before local_unlock to make sure a racing * flush_all_rcu_sheaves() doesn't miss this sheaf */ - if (rcu_sheaf) - call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf); + if (rcu_sheaf) { + /* call_rcu() disables IRQs to protect percpu data structures */ + if (unlikely(!allow_spin && irqs_disabled())) { 
+ struct deferred_percpu_work *dpw; + + dpw = this_cpu_ptr(&deferred_percpu_work); + if (llist_add(&rcu_sheaf->llnode, &dpw->rcu_sheaves)) + irq_work_queue(&dpw->work); + } else { + call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf); + } + } local_unlock(&s->cpu_sheaves->lock); @@ -6360,31 +6381,20 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) } } -struct defer_free { - struct llist_head objects; - struct irq_work work; -}; - -static void free_deferred_objects(struct irq_work *work); - -static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { - .objects = LLIST_HEAD_INIT(objects), - .work = IRQ_WORK_INIT(free_deferred_objects), -}; - /* * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe * to take sleeping spin_locks from __slab_free(). * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). */ -static void free_deferred_objects(struct irq_work *work) +static void deferred_percpu_work_fn(struct irq_work *work) { - struct defer_free *df = container_of(work, struct defer_free, work); - struct llist_head *objs = &df->objects; + struct deferred_percpu_work *dpw; + struct llist_head *objs, *rcu_sheaves; struct llist_node *llnode, *pos, *t; - if (llist_empty(objs)) - return; + dpw = container_of(work, struct deferred_percpu_work, work); + rcu_sheaves = &dpw->rcu_sheaves; + objs = &dpw->objects; llnode = llist_del_all(objs); llist_for_each_safe(pos, t, llnode) { @@ -6408,27 +6418,34 @@ static void free_deferred_objects(struct irq_work *work) __slab_free(s, slab, x, x, 1, _THIS_IP_); stat(s, FREE_SLOWPATH); } + + llnode = llist_del_all(rcu_sheaves); + llist_for_each_safe(pos, t, llnode) { + struct slab_sheaf *rcu_sheaf = llist_entry(pos, struct slab_sheaf, llnode); + + call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf); + } } static void defer_free(struct kmem_cache *s, void *head) { - struct defer_free *df; + struct deferred_percpu_work *dpw; guard(preempt)(); head = kasan_reset_tag(head); - df = 
this_cpu_ptr(&defer_free_objects); - if (llist_add(head + s->offset, &df->objects)) - irq_work_queue(&df->work); + dpw = this_cpu_ptr(&deferred_percpu_work); + if (llist_add(head + s->offset, &dpw->objects)) + irq_work_queue(&dpw->work); } -void defer_free_barrier(void) +void deferred_work_barrier(void) { int cpu; for_each_possible_cpu(cpu) - irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); + irq_work_sync(&per_cpu_ptr(&deferred_percpu_work, cpu)->work); } static __fastpath_inline -- 2.53.0

