On Fri, Feb 06, 2026 at 06:34:04PM +0900, Harry Yoo wrote:
> k[v]free_rcu() repurposes two fields of struct rcu_head: 'func' to store
> the start address of the object, and 'next' to link objects.
>
> However, using 'func' to store the start address is unnecessary:
>
> 1. slab can get the start address from the address of the struct rcu_head
>    field via nearest_obj(), and
>
> 2. vmalloc and large kmalloc can get the start address by aligning
>    down the address of the struct rcu_head field to the page boundary.
>
> Therefore, allow an 8-byte (on 64-bit) field (of a new type called
> struct rcu_ptr) to be used with k[v]free_rcu() with two arguments.
>
> Some users use both call_rcu() and k[v]free_rcu() to process callbacks
> (e.g., maple tree), so it makes sense to have a struct rcu_head field
> to handle both cases. However, many users that simply free objects via
> kvfree_rcu() can save one pointer by using struct rcu_ptr instead of
> struct rcu_head.
>
> Note that struct rcu_ptr is a single pointer only when
> CONFIG_KVFREE_RCU_BATCHED=y. To keep the kvfree_rcu() implementation
> minimal when CONFIG_KVFREE_RCU_BATCHED is disabled, struct rcu_ptr is
> the same size as struct rcu_head, and the implementation of kvfree_rcu()
> remains unchanged in that configuration.
>
> Suggested-by: Alexei Starovoitov <[email protected]>
> Signed-off-by: Harry Yoo <[email protected]>
> ---
>  include/linux/rcupdate.h | 61 +++++++++++++++++++++++++++-------------
>  include/linux/types.h    |  9 ++++++
>  mm/slab_common.c         | 40 +++++++++++++++-----------
>  3 files changed, 75 insertions(+), 35 deletions(-)
>
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index c5b30054cd01..8924edf7e8c1 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -1059,22 +1059,30 @@ static inline void rcu_read_unlock_migrate(void)
>  /**
>   * kfree_rcu() - kfree an object after a grace period.
>   * @ptr: pointer to kfree for double-argument invocations.
> - * @rhf: the name of the struct rcu_head within the type of @ptr.
> + * @rf: the name of the struct rcu_head or struct rcu_ptr within the type of @ptr.
>   *
>   * Many rcu callbacks functions just call kfree() on the base structure.
>   * These functions are trivial, but their size adds up, and furthermore
>   * when they are used in a kernel module, that module must invoke the
>   * high-latency rcu_barrier() function at module-unload time.
> + * The kfree_rcu() function handles this issue by batching.
>   *
> - * The kfree_rcu() function handles this issue. In order to have a universal
> - * callback function handling different offsets of rcu_head, the callback needs
> - * to determine the starting address of the freed object, which can be a large
> - * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to
> - * page boundary for those, only offsets up to 4095 bytes can be accommodated.
> - * If the offset is larger than 4095 bytes, a compile-time error will
> - * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
> - * either fall back to use of call_rcu() or rearrange the structure to
> - * position the rcu_head structure into the first 4096 bytes.
> + * Typically, struct rcu_head is used to process RCU callbacks, but it requires
> + * two pointers. However, since kfree_rcu() uses kfree() as the callback
> + * function, it can process callbacks with struct rcu_ptr, which is only
> + * one pointer in size (unless !CONFIG_KVFREE_RCU_BATCHED).
> + *
> + * The type of @rf can be either struct rcu_head or struct rcu_ptr, and when
> + * possible, it is recommended to use struct rcu_ptr due to its smaller size.
> + *
> + * In order to have a universal callback function handling different offsets
> + * of @rf, the callback needs to determine the starting address of the freed
> + * object, which can be a large kmalloc or vmalloc allocation. To allow simply
> + * aligning the pointer down to page boundary for those, only offsets up to
> + * 4095 bytes can be accommodated. If the offset is larger than 4095 bytes,
> + * a compile-time error will be generated in kvfree_rcu_arg_2().
> + * If this error is triggered, you can either fall back to use of call_rcu()
> + * or rearrange the structure to position @rf into the first 4096 bytes.
>   *
>   * The object to be freed can be allocated either by kmalloc() or
>   * kmem_cache_alloc().
> @@ -1084,8 +1092,8 @@ static inline void rcu_read_unlock_migrate(void)
>   * The BUILD_BUG_ON check must not involve any function calls, hence the
>   * checks are done in macros here.
>   */
> -#define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf)
> -#define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf)
> +#define kfree_rcu(ptr, rf) kvfree_rcu_arg_2(ptr, rf)
> +#define kvfree_rcu(ptr, rf) kvfree_rcu_arg_2(ptr, rf)
>
>  /**
>   * kfree_rcu_mightsleep() - kfree an object after a grace period.
> @@ -1107,22 +1115,37 @@ static inline void rcu_read_unlock_migrate(void)
>  #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)
>  #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)
>
> -/*
> - * In mm/slab_common.c, no suitable header to include here.
> - */
> -void kvfree_call_rcu(struct rcu_head *head, void *ptr);
> +
> +#ifdef CONFIG_KVFREE_RCU_BATCHED
> +void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr);
> +#define kvfree_call_rcu(head, ptr)				\
> +	_Generic((head),					\
> +		struct rcu_head *: kvfree_call_rcu_ptr,		\
> +		struct rcu_ptr *: kvfree_call_rcu_ptr,		\
> +		void *: kvfree_call_rcu_ptr			\
> +	)((struct rcu_ptr *)(head), (ptr))
> +#else
> +void kvfree_call_rcu_head(struct rcu_head *head, void *ptr);
> +static_assert(sizeof(struct rcu_head) == sizeof(struct rcu_ptr));
> +#define kvfree_call_rcu(head, ptr)				\
> +	_Generic((head),					\
> +		struct rcu_head *: kvfree_call_rcu_head,	\
> +		struct rcu_ptr *: kvfree_call_rcu_head,		\
> +		void *: kvfree_call_rcu_head			\
> +	)((struct rcu_head *)(head), (ptr))
> +#endif
>
>  /*
>   * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
>   * comment of kfree_rcu() for details.
> */ > -#define kvfree_rcu_arg_2(ptr, rhf) \ > +#define kvfree_rcu_arg_2(ptr, rf) \ > do { \ > typeof (ptr) ___p = (ptr); \ > \ > if (___p) { \ > - BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ > - kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ > + BUILD_BUG_ON(offsetof(typeof(*(ptr)), rf) >= 4096); \ > + kvfree_call_rcu(&((___p)->rf), (void *) (___p)); \ > } \ > } while (0) > > diff --git a/include/linux/types.h b/include/linux/types.h > index d4437e9c452c..e5596ebab29c 100644 > --- a/include/linux/types.h > +++ b/include/linux/types.h > @@ -245,6 +245,15 @@ struct callback_head { > } __attribute__((aligned(sizeof(void *)))); > #define rcu_head callback_head > > + > +struct rcu_ptr { > +#ifdef CONFIG_KVFREE_RCU_BATCHED > + struct rcu_ptr *next; > +#else > + struct callback_head; > +#endif > +} __attribute__((aligned(sizeof(void *)))); > + > typedef void (*rcu_callback_t)(struct rcu_head *head); > typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func); > > diff --git a/mm/slab_common.c b/mm/slab_common.c > index d5a70a831a2a..3ec99a5463d3 100644 > --- a/mm/slab_common.c > +++ b/mm/slab_common.c > @@ -1265,7 +1265,7 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); > > #ifndef CONFIG_KVFREE_RCU_BATCHED > > -void kvfree_call_rcu(struct rcu_head *head, void *ptr) > +void kvfree_call_rcu_head(struct rcu_head *head, void *ptr) > { > if (head) { > kasan_record_aux_stack(ptr); > @@ -1278,7 +1278,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) > synchronize_rcu(); > kvfree(ptr); > } > -EXPORT_SYMBOL_GPL(kvfree_call_rcu); > +EXPORT_SYMBOL_GPL(kvfree_call_rcu_head); > > void __init kvfree_rcu_init(void) > { > @@ -1346,7 +1346,7 @@ struct kvfree_rcu_bulk_data { > > struct kfree_rcu_cpu_work { > struct rcu_work rcu_work; > - struct rcu_head *head_free; > + struct rcu_ptr *head_free; > struct rcu_gp_oldstate head_free_gp_snap; > struct list_head bulk_head_free[FREE_N_CHANNELS]; > struct kfree_rcu_cpu *krcp; > @@ -1381,8 +1381,7 @@ struct kfree_rcu_cpu_work { > */ > struct kfree_rcu_cpu { > // Objects queued on a linked list > - // through their rcu_head structures. 
> -	struct rcu_head *head;
> +	struct rcu_ptr *head;
>  	unsigned long head_gp_snap;
>  	atomic_t head_count;
>
> @@ -1523,18 +1522,28 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
>  }
>
>  static void
> -kvfree_rcu_list(struct rcu_head *head)
> +kvfree_rcu_list(struct rcu_ptr *head)
>  {
> -	struct rcu_head *next;
> +	struct rcu_ptr *next;
>
>  	for (; head; head = next) {
> -		void *ptr = (void *) head->func;
> -		unsigned long offset = (void *) head - ptr;
> +		void *ptr;
> +		unsigned long offset;
> +		struct slab *slab;
> +
> +		slab = virt_to_slab(head);
> +		if (is_vmalloc_addr(head) || !slab)
> +			ptr = (void *)PAGE_ALIGN_DOWN((unsigned long)head);
> +		else
> +			ptr = nearest_obj(slab->slab_cache, slab, head);
> +		offset = (void *)head - ptr;
>
>  		next = head->next;
>  		debug_rcu_head_unqueue((struct rcu_head *)ptr);
>  		rcu_lock_acquire(&rcu_callback_map);
> -		trace_rcu_invoke_kvfree_callback("slab", head, offset);
> +		trace_rcu_invoke_kvfree_callback("slab",
> +						 (struct rcu_head *)head,
> +						 offset);
>
>  		kvfree(ptr);
>
> @@ -1552,7 +1561,7 @@ static void kfree_rcu_work(struct work_struct *work)
>  	unsigned long flags;
>  	struct kvfree_rcu_bulk_data *bnode, *n;
>  	struct list_head bulk_head[FREE_N_CHANNELS];
> -	struct rcu_head *head;
> +	struct rcu_ptr *head;
>  	struct kfree_rcu_cpu *krcp;
>  	struct kfree_rcu_cpu_work *krwp;
>  	struct rcu_gp_oldstate head_gp_snap;
> @@ -1675,7 +1684,7 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
>  {
>  	struct list_head bulk_ready[FREE_N_CHANNELS];
>  	struct kvfree_rcu_bulk_data *bnode, *n;
> -	struct rcu_head *head_ready = NULL;
> +	struct rcu_ptr *head_ready = NULL;
>  	unsigned long flags;
>  	int i;
>
> @@ -1938,7 +1947,7 @@ void __init kfree_rcu_scheduler_running(void)
>   * be free'd in workqueue context. This allows us to: batch requests together to
>   * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
>   */
> -void kvfree_call_rcu(struct rcu_head *head, void *ptr)
> +void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr)
>  {
>  	unsigned long flags;
>  	struct kfree_rcu_cpu *krcp;
> @@ -1960,7 +1969,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
>  	// Queue the object but don't yet schedule the batch.
>  	if (debug_rcu_head_queue(ptr)) {
>  		// Probable double kfree_rcu(), just leak.
> -		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
> +		WARN_ONCE(1, "%s(): Double-freed call. rcu_ptr %p\n",
>  			  __func__, head);
>
>  		// Mark as success and leave.
> @@ -1976,7 +1985,6 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
>  		// Inline if kvfree_rcu(one_arg) call.
>  		goto unlock_return;
>
> -	head->func = ptr;
>  	head->next = krcp->head;
>  	WRITE_ONCE(krcp->head, head);
>  	atomic_inc(&krcp->head_count);
> @@ -2012,7 +2020,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
>  		kvfree(ptr);
>  	}
>  }
> -EXPORT_SYMBOL_GPL(kvfree_call_rcu);
> +EXPORT_SYMBOL_GPL(kvfree_call_rcu_ptr);
>
>  static inline void __kvfree_rcu_barrier(void)
>  {
> --
> 2.43.0
>

If this is supposed to be invoked from NMI, should we rather just detect
such a context in kvfree_call_rcu()? There are a lot of "allow_spin"
checks which make it easy to get lost.
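i.e. detect it in one place, something like the below untested sketch? The
per-CPU "deferred_kvfree" llist, the irq_work and its handler are only
made-up names to illustrate the idea, they are not part of this patch:

static DEFINE_PER_CPU(struct llist_head, deferred_kvfree);
static DEFINE_PER_CPU(struct irq_work, deferred_kvfree_work);

void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr)
{
	/*
	 * Restricted context: we must not spin on krcp->lock from NMI.
	 * Stash the object on an NMI-safe llist and kick an irq_work,
	 * which re-enters this path later from a "normal" context.
	 */
	if (in_nmi()) {
		llist_add((struct llist_node *)head,
			  this_cpu_ptr(&deferred_kvfree));
		irq_work_queue(this_cpu_ptr(&deferred_kvfree_work));
		return;
	}

	/* ... existing batching path, spinning on krcp->lock is fine ... */
}

The irq_work handler would just llist_del_all() the deferred nodes and feed
them back through the normal path.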
As I see it, you maintain an llist, and the idea is simply to re-enter
kvfree_rcu() again with allow_spin=true, since then it will be a "normal"
context.

--
Uladzislau Rezki
