On 15.09.25 14:36, Natalie Vock wrote:
> When the cgroup's memory usage is below the low/min limit and allocation
> fails, try evicting some unprotected buffers to make space. Otherwise,
> application buffers may be forced to go into GTT even though usage is
> below the corresponding low/min limit, if other applications filled VRAM
> with their allocations first.
>
> Signed-off-by: Natalie Vock <[email protected]>
> ---
> drivers/gpu/drm/ttm/ttm_bo.c | 57
> ++++++++++++++++++++++++++++++--------
> drivers/gpu/drm/ttm/ttm_resource.c | 48 +++++++++++++++++++++++---------
> include/drm/ttm/ttm_resource.h | 6 +++-
> 3 files changed, 86 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index
> f4d9e68b21e70cb25d0db5e79391233e1dc72221..d20ff41411c08cd97b4467f603751f483d1c7ff4
> 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -504,6 +504,8 @@ struct ttm_bo_evict_walk {
> /** @evicted: Number of successful evictions. */
> unsigned long evicted;
>
> + /** @charge_pool: The memory pool the resource is charged to */
> + struct dmem_cgroup_pool_state *charge_pool;
> /** @limit_pool: Which pool limit we should test against */
> struct dmem_cgroup_pool_state *limit_pool;
> /** @try_low: Whether we should attempt to evict BO's with low
> watermark threshold */
> @@ -539,7 +541,7 @@ static s64 ttm_bo_evict_cb(struct ttm_lru_walk *walk,
> struct ttm_buffer_object *
> evict_walk->evicted++;
> if (evict_walk->res)
> lret = ttm_resource_alloc(evict_walk->evictor,
> evict_walk->place,
> - evict_walk->res, NULL);
> + evict_walk->res,
> evict_walk->charge_pool);
> if (lret == 0)
> return 1;
> out:
> @@ -561,6 +563,8 @@ static int ttm_bo_evict_alloc(struct ttm_device *bdev,
> struct ttm_operation_ctx *ctx,
> struct ww_acquire_ctx *ticket,
> struct ttm_resource **res,
> + bool only_evict_unprotected,
> + struct dmem_cgroup_pool_state *charge_pool,
> struct dmem_cgroup_pool_state *limit_pool)
> {
> struct ttm_bo_evict_walk evict_walk = {
> @@ -574,6 +578,7 @@ static int ttm_bo_evict_alloc(struct ttm_device *bdev,
> .place = place,
> .evictor = evictor,
> .res = res,
> + .charge_pool = charge_pool,
> .limit_pool = limit_pool,
> };
> s64 lret;
> @@ -582,7 +587,7 @@ static int ttm_bo_evict_alloc(struct ttm_device *bdev,
> lret = ttm_lru_walk_for_evict(&evict_walk.walk, bdev, man, 1);
>
> /* One more attempt if we hit low limit? */
> - if (!lret && evict_walk.hit_low) {
> + if (!lret && evict_walk.hit_low && !only_evict_unprotected) {
> evict_walk.try_low = true;
> lret = ttm_lru_walk_for_evict(&evict_walk.walk, bdev, man, 1);
> }
> @@ -603,7 +608,8 @@ static int ttm_bo_evict_alloc(struct ttm_device *bdev,
> } while (!lret && evict_walk.evicted);
>
> /* We hit the low limit? Try once more */
> - if (!lret && evict_walk.hit_low && !evict_walk.try_low) {
> + if (!lret && evict_walk.hit_low && !evict_walk.try_low &&
> + !only_evict_unprotected) {
> evict_walk.try_low = true;
> goto retry;
> }
> @@ -724,9 +730,9 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object
> *bo,
>
> for (i = 0; i < placement->num_placement; ++i) {
> const struct ttm_place *place = &placement->placement[i];
> - struct dmem_cgroup_pool_state *limit_pool = NULL;
> + struct dmem_cgroup_pool_state *limit_pool = NULL, *charge_pool
> = NULL;
> struct ttm_resource_manager *man;
> - bool may_evict;
> + bool may_evict, is_protected = false;
>
> man = ttm_manager_type(bdev, place->mem_type);
> if (!man || !ttm_resource_manager_used(man))
> @@ -737,24 +743,53 @@ static int ttm_bo_alloc_resource(struct
> ttm_buffer_object *bo,
> continue;
>
> may_evict = (force_space && place->mem_type != TTM_PL_SYSTEM);
> - ret = ttm_resource_alloc(bo, place, res, force_space ?
> &limit_pool : NULL);
> + ret = ttm_resource_try_charge(bo, place, &charge_pool,
> + force_space ? &limit_pool : NULL);
> + if (ret) {
> + if (ret != -EAGAIN) {
> + dmem_cgroup_pool_state_put(limit_pool);
> + return ret;
> + } else if (!may_evict) {
> + dmem_cgroup_pool_state_put(limit_pool);
> + continue;
> + }
> + } else {
> + is_protected = dmem_cgroup_below_min(NULL, charge_pool)
> ||
> + dmem_cgroup_below_low(NULL, charge_pool);
> + ret = ttm_resource_alloc(bo, place, res, charge_pool);
> + }
> +
> if (ret) {
> if (ret != -ENOSPC && ret != -EAGAIN) {
> dmem_cgroup_pool_state_put(limit_pool);
> + if (charge_pool) {
> + dmem_cgroup_uncharge(charge_pool,
> bo->base.size);
> + dmem_cgroup_pool_state_put(charge_pool);
> + }
> return ret;
> }
> - if (!may_evict) {
> + if (!may_evict && !is_protected) {
> dmem_cgroup_pool_state_put(limit_pool);
> + if (charge_pool) {
> + dmem_cgroup_uncharge(charge_pool,
> bo->base.size);
> + dmem_cgroup_pool_state_put(charge_pool);
> + }
> continue;
> }
>
> ret = ttm_bo_evict_alloc(bdev, man, place, bo, ctx,
> - ticket, res, limit_pool);
> + ticket, res, !may_evict &&
> is_protected,
> + charge_pool, limit_pool);
> dmem_cgroup_pool_state_put(limit_pool);
> - if (ret == -EBUSY)
> - continue;
> - if (ret)
> + if (ret) {
> + if (charge_pool) {
> + dmem_cgroup_uncharge(charge_pool,
> bo->base.size);
> + dmem_cgroup_pool_state_put(charge_pool);
> + }
> + if (ret == -EBUSY)
> + continue;
> return ret;
> + }
Pushing that logic into ttm_bo_alloc_resource() is pretty much a no-go to start
with.
Thomas and I removed all of that quite recently and moved the forcing of the
higher-level placement logic into ttm_bo_mem_space().
You should probably start there instead.
Regards,
Christian.
> }
>
> ret = ttm_bo_add_move_fence(bo, man, ctx->no_wait_gpu);
> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c
> b/drivers/gpu/drm/ttm/ttm_resource.c
> index
> e2c82ad07eb44b5e88bf5b5db1ef54dd6d27823b..fcfa8b51b033745f46a01e40a9dc83e0c69165fc
> 100644
> --- a/drivers/gpu/drm/ttm/ttm_resource.c
> +++ b/drivers/gpu/drm/ttm/ttm_resource.c
> @@ -372,30 +372,52 @@ void ttm_resource_fini(struct ttm_resource_manager *man,
> }
> EXPORT_SYMBOL(ttm_resource_fini);
>
> +/**
> + * ttm_resource_try_charge - charge a resource manager's cgroup pool
> + * @bo: buffer for which an allocation should be charged
> + * @place: where the allocation is attempted to be placed
> + * @ret_pool: on charge success, the pool that was charged
> + * @ret_limit_pool: on charge failure, the pool responsible for the failure
> + *
> + * Should be used to charge cgroups before attempting resource allocation.
> + * When charging succeeds, the value of ret_pool should be passed to
> + * ttm_resource_alloc.
> + *
> + * Returns: 0 on charge success, negative errno on failure.
> + */
> +int ttm_resource_try_charge(struct ttm_buffer_object *bo,
> + const struct ttm_place *place,
> + struct dmem_cgroup_pool_state **ret_pool,
> + struct dmem_cgroup_pool_state **ret_limit_pool)
> +{
> + struct ttm_resource_manager *man =
> + ttm_manager_type(bo->bdev, place->mem_type);
> +
> + if (!man->cg) {
> + *ret_pool = NULL;
> + if (ret_limit_pool)
> + *ret_limit_pool = NULL;
> + return 0;
> + }
> +
> + return dmem_cgroup_try_charge(man->cg, bo->base.size, ret_pool,
> + ret_limit_pool);
> +}
> +
> int ttm_resource_alloc(struct ttm_buffer_object *bo,
> const struct ttm_place *place,
> struct ttm_resource **res_ptr,
> - struct dmem_cgroup_pool_state **ret_limit_pool)
> + struct dmem_cgroup_pool_state *charge_pool)
> {
> struct ttm_resource_manager *man =
> ttm_manager_type(bo->bdev, place->mem_type);
> - struct dmem_cgroup_pool_state *pool = NULL;
> int ret;
>
> - if (man->cg) {
> - ret = dmem_cgroup_try_charge(man->cg, bo->base.size, &pool,
> ret_limit_pool);
> - if (ret)
> - return ret;
> - }
> -
> ret = man->func->alloc(man, bo, place, res_ptr);
> - if (ret) {
> - if (pool)
> - dmem_cgroup_uncharge(pool, bo->base.size);
> + if (ret)
> return ret;
> - }
>
> - (*res_ptr)->css = pool;
> + (*res_ptr)->css = charge_pool;
>
> spin_lock(&bo->bdev->lru_lock);
> ttm_resource_add_bulk_move(*res_ptr, bo);
> diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h
> index
> e52bba15012f78e352f392232ac2e89a83afd311..3aef7efdd7cfb8fd93071db85e632b975b53cf81
> 100644
> --- a/include/drm/ttm/ttm_resource.h
> +++ b/include/drm/ttm/ttm_resource.h
> @@ -442,10 +442,14 @@ void ttm_resource_init(struct ttm_buffer_object *bo,
> void ttm_resource_fini(struct ttm_resource_manager *man,
> struct ttm_resource *res);
>
> +int ttm_resource_try_charge(struct ttm_buffer_object *bo,
> + const struct ttm_place *place,
> + struct dmem_cgroup_pool_state **ret_pool,
> + struct dmem_cgroup_pool_state **ret_limit_pool);
> int ttm_resource_alloc(struct ttm_buffer_object *bo,
> const struct ttm_place *place,
> struct ttm_resource **res,
> - struct dmem_cgroup_pool_state **ret_limit_pool);
> + struct dmem_cgroup_pool_state *charge_pool);
> void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource
> **res);
> bool ttm_resource_intersects(struct ttm_device *bdev,
> struct ttm_resource *res,
>