struct rcu_head is overkill for kfree_rcu() because the callback
function is always kfree() and thus there is no need for a function
pointer.

It is enough to implement a linked list using a single pointer.
Introduce a new struct called kfree_rcu_head (the name was suggested
by Vlastimil Babka), which is similar to struct rcu_head but is only
a single pointer. Use it only in kfree_rcu_nolock() for now.

In most cases, kfree_rcu_nolock() is expected to go through the sheaves
path, where the field is not used at all. However, in the worst case,
where the trylock fails or no empty sheaves are available, add the
object to a global list whose entries are freed from a workqueue after
synchronize_rcu(). Since the fallback is intended to be a last resort,
it focuses on minimizing complexity and memory usage.

To avoid crashing the kernel in the theoretical case where the fallback
path is taken before kmem_cache_init_late(), do not queue the work until
the workqueue is actually allocated.

Factor the logic that calculates the object start address out of
kvfree_rcu_cb() into object_start_address() to avoid duplicating it.

Suggested-by: Alexei Starovoitov <[email protected]>
Signed-off-by: Harry Yoo (Oracle) <[email protected]>
---
 include/linux/rcupdate.h |  2 +-
 include/linux/types.h    |  4 +++
 mm/slab.h                |  2 +-
 mm/slab_common.c         |  2 +-
 mm/slub.c                | 92 ++++++++++++++++++++++++++++++++----------------
 5 files changed, 69 insertions(+), 33 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 3025249bfcb5..83e66585001f 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1099,7 +1099,7 @@ static inline void rcu_read_unlock_migrate(void)
  * In mm/slab_common.c, no suitable header to include here.
  */
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
-void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr);
+void kfree_call_rcu_nolock(struct kfree_rcu_head *head, void *ptr);
 
 /*
  * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
diff --git a/include/linux/types.h b/include/linux/types.h
index 93166b0b0617..695f2a71411e 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -255,6 +255,10 @@ struct callback_head {
 } __attribute__((aligned(sizeof(void *))));
 #define rcu_head callback_head
 
+struct kfree_rcu_head {
+       struct kfree_rcu_head *next;    /* sole member: list link, no callback pointer */
+};
+
 typedef void (*rcu_callback_t)(struct rcu_head *head);
 typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);
 
diff --git a/mm/slab.h b/mm/slab.h
index a493c5201e96..19a2a819fd13 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -745,7 +745,7 @@ void __check_heap_object(const void *ptr, unsigned long n,
                         const struct slab *slab, bool to_user);
 
 void deferred_work_barrier(void);
-void defer_kfree_rcu(struct rcu_head *head);
+void defer_kfree_rcu(struct kfree_rcu_head *head);
 
 static inline bool slub_debug_orig_size(struct kmem_cache *s)
 {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5a39e6225160..6b80b516c93f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1263,7 +1263,7 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
 EXPORT_TRACEPOINT_SYMBOL(kfree);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
 
-void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
+void kfree_call_rcu_nolock(struct kfree_rcu_head *head, void *ptr)
 {
        struct slab *slab;
        struct kmem_cache *s;
diff --git a/mm/slub.c b/mm/slub.c
index 19018a979445..853430c7fbe0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4075,20 +4075,25 @@ static void flush_all(struct kmem_cache *s)
 
 struct deferred_percpu_work {
        struct llist_head objects;
-       struct llist_head objects_by_rcu;
        struct llist_head rcu_sheaves;
        struct irq_work work;
 };
 
 static void deferred_percpu_work_fn(struct irq_work *work);
+static void deferred_irq_work_fn(struct irq_work *work);
+static void deferred_work_fn(struct work_struct *w);
 
 static DEFINE_PER_CPU(struct deferred_percpu_work, deferred_percpu_work) = {
        .objects = LLIST_HEAD_INIT(objects),
-       .objects_by_rcu = LLIST_HEAD_INIT(objects_by_rcu),
        .rcu_sheaves = LLIST_HEAD_INIT(rcu_sheaves),
        .work = IRQ_WORK_INIT(deferred_percpu_work_fn),
 };
 
+static LLIST_HEAD(deferred_free_by_rcu);
+static struct workqueue_struct *deferred_wq;
+static DEFINE_IRQ_WORK(deferred_irq_work, deferred_irq_work_fn);
+static DECLARE_WORK(deferred_work, deferred_work_fn);
+
 static void flush_rcu_sheaf(struct work_struct *w)
 {
        struct slub_percpu_sheaves *pcs;
@@ -6394,13 +6399,12 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 static void deferred_percpu_work_fn(struct irq_work *work)
 {
        struct deferred_percpu_work *dpw;
-       struct llist_head *objs, *objs_by_rcu, *rcu_sheaves;
+       struct llist_head *objs, *rcu_sheaves;
        struct llist_node *llnode, *pos, *t;
 
        dpw = container_of(work, struct deferred_percpu_work, work);
        rcu_sheaves = &dpw->rcu_sheaves;
        objs = &dpw->objects;
-       objs_by_rcu = &dpw->objects_by_rcu;
 
        llnode = llist_del_all(objs);
        llist_for_each_safe(pos, t, llnode) {
@@ -6431,12 +6435,50 @@ static void deferred_percpu_work_fn(struct irq_work *work)
 
                call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
        }
+}
+
+/* irq_work callback: queue deferred_work on deferred_wq. */
+static void deferred_irq_work_fn(struct irq_work *work)
+{
+       /* Nothing to queue on until deferred_wq has been allocated. */
+       if (deferred_wq)
+               queue_work(deferred_wq, &deferred_work);
+}
+
+/* Return the start address of the slab or kfence object containing @ptr. */
+static inline void *object_start_address(void *ptr)
+{
+       struct slab *slab = virt_to_slab(ptr);
+       struct kmem_cache *s;
+       unsigned int idx;
+
+       /* Warn before the first dereference of slab, not after it. */
+       VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !slab);
+
+       if (is_kfence_address(ptr))
+               return kfence_object_start(ptr);
+
+       s = slab->slab_cache;
+       idx = __obj_to_index(s, slab_address(slab), ptr);
+
+       /* Start of slot idx, adjusted by fixup_red_left(). */
+       return fixup_red_left(s, slab_address(slab) + s->size * idx);
+}
+
+static void deferred_work_fn(struct work_struct *w)
+{
+       struct llist_node *llnode, *pos, *t;
+
+       llnode = llist_del_all(&deferred_free_by_rcu);
+       if (!llnode)
+               return;
+
+       synchronize_rcu();
 
-       llnode = llist_del_all(objs_by_rcu);
        llist_for_each_safe(pos, t, llnode) {
-               struct rcu_head *head = (struct rcu_head *)pos;
+               void *obj = object_start_address(pos);
 
-               call_rcu(head, kvfree_rcu_cb);
+               kfree(obj);
        }
 }
 
@@ -6453,15 +6495,10 @@ static void defer_free(struct kmem_cache *s, void *head)
                irq_work_queue(&dpw->work);
 }
 
-void defer_kfree_rcu(struct rcu_head *head)
+void defer_kfree_rcu(struct kfree_rcu_head *head)
 {
-       struct deferred_percpu_work *dpw;
-
-       guard(preempt)();
-
-       dpw = this_cpu_ptr(&deferred_percpu_work);
-       if (llist_add((struct llist_node *)head, &dpw->objects_by_rcu))
-               irq_work_queue(&dpw->work);
+       if (llist_add((struct llist_node *)head, &deferred_free_by_rcu))
+               irq_work_queue(&deferred_irq_work);
 }
 
 void deferred_work_barrier(void)
@@ -6470,6 +6507,9 @@ void deferred_work_barrier(void)
 
        for_each_possible_cpu(cpu)
                irq_work_sync(&per_cpu_ptr(&deferred_percpu_work, cpu)->work);
+
+       irq_work_sync(&deferred_irq_work);
+       flush_work(&deferred_work);
 }
 
 static __fastpath_inline
@@ -6731,8 +6771,6 @@ void kvfree_rcu_cb(struct rcu_head *head)
        void *obj = head;
        struct page *page;
        struct slab *slab;
-       struct kmem_cache *s;
-       void *slab_addr;
 
        if (is_vmalloc_addr(obj)) {
                obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
@@ -6752,19 +6790,8 @@ void kvfree_rcu_cb(struct rcu_head *head)
                return;
        }
 
-       s = slab->slab_cache;
-       slab_addr = slab_address(slab);
-
-       if (is_kfence_address(obj)) {
-               obj = kfence_object_start(obj);
-       } else {
-               unsigned int idx = __obj_to_index(s, slab_addr, obj);
-
-               obj = slab_addr + s->size * idx;
-               obj = fixup_red_left(s, obj);
-       }
-
-       slab_free(s, slab, obj, _RET_IP_);
+       obj = object_start_address(obj);
+       slab_free(slab->slab_cache, slab, obj, _RET_IP_);
 }
 
 /**
@@ -8698,6 +8725,11 @@ void __init kmem_cache_init_late(void)
        flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM | WQ_PERCPU,
                                  0);
        WARN_ON(!flushwq);
+
+       deferred_wq = alloc_workqueue("slab_deferred_wq",
+                                     WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+       WARN_ON(!deferred_wq);
+       queue_work(deferred_wq, &deferred_work);
 #ifdef CONFIG_SLAB_FREELIST_RANDOM
        prandom_init_once(&slab_rnd_state);
 #endif

-- 
2.53.0


Reply via email to