Hi, Maarten, Thanks for reviewing.
On Tue, 2026-05-26 at 10:27 +0200, Maarten Lankhorst wrote: > Hello, > > Den 2026-05-12 kl. 10:24, skrev Thomas Hellström: > > Add an optional reclaim callback to struct dmem_cgroup_region. When > > dmem.max is set below the current usage of a cgroup pool, the new > > limit > > is applied immediately (so that concurrent allocations are > > throttled > > while reclaim is in progress) and then the driver is asked to evict > > memory to bring usage back below the limit. > > > > Reclaim is attempted up to a bounded number of times. No error is > > returned to userspace if usage remains above the limit after > > reclaim, > > and a pending signal will abort the reclaim loop early. This > > matches > > the behavior of memory.max in the memory cgroup controller. > > > > Also honor O_NONBLOCK so that if that flag is set during the > > max value write, no reclaim is initiated. The idea is to avoid > > charging the reclaim cost to the writer of the max value. > > > > v2: > > - Write max before reclaim is attempted (Maarten) > > - Let signals abort the reclaim without error (Maarten) > > - If a new max value is written with the O_NONBLOCK flag, > > reclaim is not attempted (Maarten) > > - Extract region from the pool parameter rather than > > passing it explicitly to set_resource_xxx(). > > v3: > > - Use an rwsem to protect reclaim callback registration and > > region unregister against concurrent reclaim invocations, > > ensuring reclaim_priv is visible when the callback is > > invoked. (Sashiko-bot) > > > > Assisted-by: GitHub_Copilot:claude-sonnet-4.6 > > Signed-off-by: Thomas Hellström <[email protected]> > > --- > > include/linux/cgroup_dmem.h | 24 ++++++++ > > kernel/cgroup/dmem.c | 106 > > +++++++++++++++++++++++++++++++++--- > > 2 files changed, 121 insertions(+), 9 deletions(-) > > > > diff --git a/include/linux/cgroup_dmem.h > > b/include/linux/cgroup_dmem.h > > index dd4869f1d736..c3bce21cbe80 100644 > > --- a/include/linux/cgroup_dmem.h > > +++ b/include/linux/cgroup_dmem.h > > @@ -14,6 +14,21 @@ struct dmem_cgroup_pool_state; > > /* Opaque definition of a cgroup region, used internally */ > > struct dmem_cgroup_region; > > > > +/** > > + * typedef dmem_cgroup_reclaim_fn_t - Reclaim callback for a dmem > > cgroup region. > > + * @pool: The cgroup pool that needs memory reclaimed. > > + * @target_bytes: Minimum number of bytes the driver should > > attempt to free. > > + * @priv: Private data registered with > > dmem_cgroup_region_set_reclaim(). > > + * > > + * Called by the dmem cgroup controller when dmem.max is set below > > the current > > + * usage of @pool. The driver should evict at least @target_bytes > > of memory > > + * from @pool. May be called multiple times if usage remains above > > the limit. > > + * > > + * Return: 0 if progress was made, negative error code otherwise. > > + */ > > +typedef int (*dmem_cgroup_reclaim_fn_t)(struct > > dmem_cgroup_pool_state *pool, > > + u64 target_bytes, void > > *priv); > > + > > #if IS_ENABLED(CONFIG_CGROUP_DMEM) > > struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, > > const char *name_fmt, ...) __printf(2,3); > > void dmem_cgroup_unregister_region(struct dmem_cgroup_region > > *region); > > @@ -26,6 +41,9 @@ bool dmem_cgroup_state_evict_valuable(struct > > dmem_cgroup_pool_state *limit_pool, > > bool ignore_low, bool > > *ret_hit_low); > > > > void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state > > *pool); > > +void dmem_cgroup_region_set_reclaim(struct dmem_cgroup_region > > *region, > > + dmem_cgroup_reclaim_fn_t > > reclaim, > > + void *priv); > > #else > > static inline __printf(2,3) struct dmem_cgroup_region * > > dmem_cgroup_register_region(u64 size, const char *name_fmt, ...) > > @@ -62,5 +80,11 @@ bool dmem_cgroup_state_evict_valuable(struct > > dmem_cgroup_pool_state *limit_pool, > > static inline void dmem_cgroup_pool_state_put(struct > > dmem_cgroup_pool_state *pool) > > { } > > > > +static inline void > > +dmem_cgroup_region_set_reclaim(struct dmem_cgroup_region *region, > > + dmem_cgroup_reclaim_fn_t reclaim, > > + void *priv) > > +{ } > > + > > #endif > > #endif /* _CGROUP_DMEM_H */ > > diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c > > index 1ab1fb47f271..5fd5a1634d21 100644 > > --- a/kernel/cgroup/dmem.c > > +++ b/kernel/cgroup/dmem.c > > @@ -51,6 +51,20 @@ struct dmem_cgroup_region { > > * No new pools should be added to the region afterwards. > > */ > > bool unregistered; > > + > > + /** > > + * @reclaim: Optional callback invoked when dmem.max is > > set below the > > + * current usage of a pool. The driver should attempt to > > free at least > > + * @target_bytes from @pool. May be called multiple times > > if usage > > + * remains above the limit after returning. > > + */ > > + dmem_cgroup_reclaim_fn_t reclaim; > > + > > + /** @reclaim_priv: Private data passed to @reclaim. */ > > + void *reclaim_priv; > > + > > + /** @unregister_sem: Protect @reclaim while it is running. > > */ > > + struct rw_semaphore unregister_sem; > > }; > > > > struct dmemcg_state { > > @@ -145,21 +159,58 @@ static void free_cg_pool(struct > > dmem_cgroup_pool_state *pool) > > } > > > > static void > > -set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val) > > +set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val, > > bool nonblock) > > { > > page_counter_set_min(&pool->cnt, val); > > } > > > > static void > > -set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val) > > +set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val, > > bool nonblock) > > { > > page_counter_set_low(&pool->cnt, val); > > } > > > > static void > > -set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val) > > +set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val, > > bool nonblock) > > { > > - page_counter_set_max(&pool->cnt, val); > > + struct dmem_cgroup_region *region = pool->region; > > + > > + /* > > + * Always update the limit, even if usage currently > > exceeds it. > > + * Concurrent allocations will be throttled against the > > new limit > > + * while reclaim is in progress. > > + */ > > + xchg(&pool->cnt.max, (unsigned long)val); > > + > > + if (nonblock || !READ_ONCE(region->reclaim)) > > + return; > > + > > + for (int retries = 5; retries > 0; retries--) { > Where does 5 come from? This code should retry until no longer above > limit, otherwise you'll get some hard to debug issues. The memcg controller uses MAX_RECLAIM_RETRIES, although if that fails, it will invoke the OOM killer, although if the reclaim callback makes progress, it will not consume a retry. Perhaps we should adopt the same behaviour except the OOM killer, at least for now. Note that if a signal is pending, the reclaim attempt is abandoned both here and in memcg. > > > + u64 usage = page_counter_read(&pool->cnt); > > + int ret; > > + > > + if (usage <= val) > > + break; > > + > > + if (signal_pending(current)) > > + break; > > + > > + /* Block unregister until the reclaim callback > > completes. */ > > + if (down_read_interruptible(®ion- > > >unregister_sem)) > > + break; > > + > > + if (!region->reclaim) { > > + up_read(®ion->unregister_sem); > > + break; > > + } > > + > > + ret = region->reclaim(pool, usage - val, region- > > >reclaim_priv); > > + up_read(®ion->unregister_sem); > > + if (ret) > > + break; > > + > > + cond_resched(); > > + } > > } > > > > static u64 get_resource_low(struct dmem_cgroup_pool_state *pool) > > @@ -184,9 +235,9 @@ static u64 get_resource_current(struct > > dmem_cgroup_pool_state *pool) > > > > static void reset_all_resource_limits(struct > > dmem_cgroup_pool_state *rpool) > > { > > - set_resource_min(rpool, 0); > > - set_resource_low(rpool, 0); > > - set_resource_max(rpool, PAGE_COUNTER_MAX); > > + set_resource_min(rpool, 0, false); > > + set_resource_low(rpool, 0, false); > > + set_resource_max(rpool, PAGE_COUNTER_MAX, false); > > } > > > > static void dmemcs_offline(struct cgroup_subsys_state *css) > > @@ -491,6 +542,12 @@ void dmem_cgroup_unregister_region(struct > > dmem_cgroup_region *region) > > region->unregistered = true; > > spin_unlock(&dmemcg_lock); > > > > + /* Ensure all reclaim() callbacks have finished. */ > > + down_write(®ion->unregister_sem); > > + /* Pairs with READ_ONCE() in set_resource_max() */ > > + WRITE_ONCE(region->reclaim, NULL); > > + up_write(®ion->unregister_sem); > > + > > kref_put(®ion->ref, dmemcg_free_region); > > } > I've thought about it some more, Can we do the same as dma-buf init? > > DEFINE_DMEMCG_REGION_INFO(info); > info.size = size. > info.ops = &drm_ttm_dmem_region_ops; > info.region_priv = ttm_region; > info.device_priv = drm_dev; > > dmem_region = dmem_cgroup_register_region(&info); > > This way we don't need to have a typedef for function pointers, > no need for READ_ONCE() and/or additional locking, which was only > added because it wasn't set at init. > > If we can push the responsibility for serialization against unload > to the driver, we should also be able to use drm_dev_enter/exit here > for the reclaim loop? > > Something like below: > > if (!ops->device_begin(device_priv, &cookie)) > return 0; // Device gone > > while (true) { > ops->reclaim(region_priv, ...); > } > > ops->device_end(device_priv, cookie); > > Although we will additionally need to ensure that the region holds a > refcount on > reclaim_priv until dmemcg_free_region is called, otherwise this > breaks. > > So 4 ops needed: > - device_begin > - reclaim > - device_end > - free (called after region refcount drops to 0, called immediately > on !CONFIG_DMEMCG, drops device refcount) > > Relatedly, I believe perhaps we should also convert from drmm managed > to devm managed, > as all memory is already freed after the device is physically > detached. > > Hopefully this solves all lifetime issues, and this design allows for > additional callbacks into the device or region later on if needed. Let me take a look at this. /Thomas > > Kind regards, > ~Maarten Lankhorst > > > EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region); > > @@ -530,6 +587,7 @@ struct dmem_cgroup_region > > *dmem_cgroup_register_region(u64 size, const char *fmt > > INIT_LIST_HEAD(&ret->pools); > > ret->name = region_name; > > ret->size = size; > > + init_rwsem(&ret->unregister_sem); > > kref_init(&ret->ref); > > > > spin_lock(&dmemcg_lock); > > @@ -568,6 +626,34 @@ void dmem_cgroup_pool_state_put(struct > > dmem_cgroup_pool_state *pool) > > } > > EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put); > > > > +/** > > + * dmem_cgroup_region_set_reclaim() - Register a reclaim callback > > on a region. > > + * @region: The region to register the callback for. > > + * @reclaim: Callback to invoke when dmem.max is set below current > > usage. > > + * Called with the pool that needs reclaiming and the > > number of > > + * bytes to free. Returns 0 on progress, negative on > > failure. > > + * @priv: Opaque pointer passed back to @reclaim. > > + * > > + * When dmem.max is lowered below the current usage of a cgroup > > pool, the > > + * dmem controller will call @reclaim with a target number of > > bytes to free. > > + * After @reclaim returns the controller retries setting the > > limit; if usage > > + * is still too high it calls @reclaim again, up to a bounded > > retry count. > > + */ > > +void dmem_cgroup_region_set_reclaim(struct dmem_cgroup_region > > *region, > > + dmem_cgroup_reclaim_fn_t > > reclaim, > > + void *priv) > > +{ > > + if (!region) > > + return; > > + > > + down_write(®ion->unregister_sem); > > + region->reclaim_priv = priv; > > + /* Pairs with READ_ONCE() in set_resource_max() */ > > + WRITE_ONCE(region->reclaim, reclaim); > > + up_write(®ion->unregister_sem); > > +} > > +EXPORT_SYMBOL_GPL(dmem_cgroup_region_set_reclaim); > > + > > static struct dmem_cgroup_pool_state * > > get_cg_pool_unlocked(struct dmemcg_state *cg, struct > > dmem_cgroup_region *region) > > { > > @@ -725,9 +811,10 @@ static int dmemcg_parse_limit(char *options, > > u64 *new_limit) > > > > static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, > > char *buf, size_t nbytes, loff_t > > off, > > - void (*apply)(struct > > dmem_cgroup_pool_state *, u64)) > > + void (*apply)(struct > > dmem_cgroup_pool_state *, u64, bool)) > > { > > struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of)); > > + bool nonblock = of->file->f_flags & O_NONBLOCK; > > int err = 0; > > > > while (buf && !err) { > > @@ -772,7 +859,8 @@ static ssize_t dmemcg_limit_write(struct > > kernfs_open_file *of, > > } > > > > /* And commit */ > > - apply(pool, new_limit); > > + apply(pool, new_limit, nonblock); > > + > > dmemcg_pool_put(pool); > > > > out_put:
