On 21/06/2022 11:42, Luben Tuikov wrote:
This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57.

We see that the bo validate list gets corrupted, in
amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on
the next line, references a NULL bo and we get a koops.

Bisecting leads to the commit being reverted as the cause of the list
corruption. See the link below for details of the corruption failure.

Cc: Christian König <christian.koe...@amd.com>
Cc: Andrey Grodzovsky <andrey.grodzov...@amd.com>
Cc: Alex Deucher <alexander.deuc...@amd.com>
Cc: Vitaly Prosyak <vitaly.pros...@amd.com>
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539
Signed-off-by: Luben Tuikov <luben.tui...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 16 +++++-----------
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  2 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
  3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 36ac1f1d11e6b4..e619031753b927 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser 
*p, union drm_amdgpu_cs
                goto free_chunk;
        }
+ mutex_lock(&p->ctx->lock);
+
        /* skip guilty context job */
        if (atomic_read(&p->ctx->guilty) == 1) {
                ret = -ECANCELED;
@@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                }
        }
- /* Move fence waiting after getting reservation lock of
-        * PD root. Then there is no need on a ctx mutex lock.
-        */
-       r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
-       if (unlikely(r != 0)) {
-               if (r != -ERESTARTSYS)
-                       DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
-               goto error_validate;
-       }
-
        amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
                                          &p->bytes_moved_vis_threshold);
        p->bytes_moved = 0;
@@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser 
*parser, int error,
        dma_fence_put(parser->fence);
if (parser->ctx) {
+               mutex_unlock(&parser->ctx->lock);
                amdgpu_ctx_put(parser->ctx);
        }
        if (parser->bo_list)
@@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
        if (parser->job->uf_addr && ring->funcs->no_user_fence)
                return -EINVAL;
- return 0;
+       return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity);
  }
static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
@@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
                goto out;
r = amdgpu_cs_submit(&parser, cs);
+
  out:
        amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 53f9268366f29e..2ef5296216d64d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, 
int32_t priority,
        kref_init(&ctx->refcount);
        ctx->mgr = mgr;
        spin_lock_init(&ctx->ring_lock);
+       mutex_init(&ctx->lock);
ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
        ctx->reset_counter_query = ctx->reset_counter;
@@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
                drm_dev_exit(idx);
        }
+ mutex_destroy(&ctx->lock);
        kfree(ctx);
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index 0fa0e56daf67e9..cc7c8afff4144c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -53,6 +53,7 @@ struct amdgpu_ctx {
        bool                            preamble_presented;
        int32_t                         init_priority;
        int32_t                         override_priority;
+       struct mutex                    lock;
        atomic_t                        guilty;
        unsigned long                   ras_counter_ce;
        unsigned long                   ras_counter_ue;

base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2

Hello,

Applied on amd-staging-drm-next (with two additional commits from torvalds/master to allow me to compile using GCC12: 52a9dab6, 82880283) and tested with IGT using a RX580*; the errors on the "amd_cs_nop" tests are gone!

* Advanced Micro Devices, Inc. [AMD/ATI] Ellesmere [Radeon RX 470/480/570/570X/580/580X/590] (rev e7)

The remaining IGT tests matching ".*amdgpu.*" were not affected. (FYI, amdgpu/amd_plane, amdgpu/amd_bypass, amdgpu/amd_plane, amdgpu/amd_vrr_range are failing in here, some should skip instead, but that's another thread to come)

Tested-by: Tales Aparecida <tales.aparec...@gmail.com>

Thanks for your time,
Tales

Reply via email to