Re: [PATCH] Revert "drm/amdgpu: remove ctx->lock"
On 2022-07-11 23:22, Tales Lelo da Aparecida wrote: > On 21/06/2022 11:42, Luben Tuikov wrote: >> This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57. >> >> We see that the bo validate list gets corrupted, in >> amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on >> the next line, references a NULL bo and we get a koops. >> >> Bisecting leads to the commit being reverted as the cause of the list >> corruption. See the link below for details of the corruption failure. >> >> Cc: Christian König >> Cc: Andrey Grodzovsky >> Cc: Alex Deucher >> Cc: Vitaly Prosyak >> Link: >> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fdrm%2Famd%2F-%2Fissues%2F2048%23note_1427539data=05%7C01%7Cluben.tuikov%40amd.com%7C32a7f3ea490fc69f08da63b5c925%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637931929684937036%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7Csdata=eXMfeyqaPhBIl9iJa4kbQtN1r4OhFZSGkZiGRJ3MQsQ%3Dreserved=0 >> Signed-off-by: Luben Tuikov >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 16 +--- >> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 ++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 + >> 3 files changed, 8 insertions(+), 11 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> index 36ac1f1d11e6b4..e619031753b927 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser >> *p, union drm_amdgpu_cs >> goto free_chunk; >> } >> >> +mutex_lock(>ctx->lock); >> + >> /* skip guilty context job */ >> if (atomic_read(>ctx->guilty) == 1) { >> ret = -ECANCELED; >> @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser >> *p, >> } >> } >> >> -/* Move fence waiting after getting reservation lock of >> - * PD root. Then there is no need on a ctx mutex lock. >> - */ >> -r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity); >> -if (unlikely(r != 0)) { >> -if (r != -ERESTARTSYS) >> -DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n"); >> -goto error_validate; >> -} >> - >> amdgpu_cs_get_threshold_for_moves(p->adev, >bytes_moved_threshold, >>>bytes_moved_vis_threshold); >> p->bytes_moved = 0; >> @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct >> amdgpu_cs_parser *parser, int error, >> dma_fence_put(parser->fence); >> >> if (parser->ctx) { >> +mutex_unlock(>ctx->lock); >> amdgpu_ctx_put(parser->ctx); >> } >> if (parser->bo_list) >> @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, >> if (parser->job->uf_addr && ring->funcs->no_user_fence) >> return -EINVAL; >> >> -return 0; >> +return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity); >> } >> >> static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, >> @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void >> *data, struct drm_file *filp) >> goto out; >> >> r = amdgpu_cs_submit(, cs); >> + >> out: >> amdgpu_cs_parser_fini(, r, reserved_buffers); >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c >> index 53f9268366f29e..2ef5296216d64d 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c >> @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, >> int32_t priority, >> kref_init(>refcount); >> ctx->mgr = mgr; >> spin_lock_init(>ring_lock); >> +mutex_init(>lock); >> >> ctx->reset_counter = atomic_read(>adev->gpu_reset_counter); >> ctx->reset_counter_query = ctx->reset_counter; >> @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref) >> drm_dev_exit(idx); >> } >> >> +mutex_destroy(>lock); >> kfree(ctx); >> } >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h >> index 0fa0e56daf67e9..cc7c8afff4144c 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h >> @@ -53,6 +53,7 @@ struct amdgpu_ctx { >> boolpreamble_presented; >> int32_t init_priority; >> int32_t override_priority; >> +struct mutexlock; >> atomic_tguilty; >> unsigned long ras_counter_ce; >> unsigned long ras_counter_ue; >> >> base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2 > > Hello, >
Re: [PATCH] Revert "drm/amdgpu: remove ctx->lock"
On 21/06/2022 11:42, Luben Tuikov wrote: This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57. We see that the bo validate list gets corrupted, in amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on the next line, references a NULL bo and we get a koops. Bisecting leads to the commit being reverted as the cause of the list corruption. See the link below for details of the corruption failure. Cc: Christian König Cc: Andrey Grodzovsky Cc: Alex Deucher Cc: Vitaly Prosyak Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539 Signed-off-by: Luben Tuikov --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 16 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 + 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 36ac1f1d11e6b4..e619031753b927 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs goto free_chunk; } + mutex_lock(>ctx->lock); + /* skip guilty context job */ if (atomic_read(>ctx->guilty) == 1) { ret = -ECANCELED; @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, } } - /* Move fence waiting after getting reservation lock of -* PD root. Then there is no need on a ctx mutex lock. -*/ - r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity); - if (unlikely(r != 0)) { - if (r != -ERESTARTSYS) - DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n"); - goto error_validate; - } - amdgpu_cs_get_threshold_for_moves(p->adev, >bytes_moved_threshold, >bytes_moved_vis_threshold); p->bytes_moved = 0; @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, dma_fence_put(parser->fence); if (parser->ctx) { + mutex_unlock(>ctx->lock); amdgpu_ctx_put(parser->ctx); } if (parser->bo_list) @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, if (parser->job->uf_addr && ring->funcs->no_user_fence) return -EINVAL; - return 0; + return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity); } static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) goto out; r = amdgpu_cs_submit(, cs); + out: amdgpu_cs_parser_fini(, r, reserved_buffers); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 53f9268366f29e..2ef5296216d64d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority, kref_init(>refcount); ctx->mgr = mgr; spin_lock_init(>ring_lock); + mutex_init(>lock); ctx->reset_counter = atomic_read(>adev->gpu_reset_counter); ctx->reset_counter_query = ctx->reset_counter; @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref) drm_dev_exit(idx); } + mutex_destroy(>lock); kfree(ctx); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h index 0fa0e56daf67e9..cc7c8afff4144c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h @@ -53,6 +53,7 @@ struct amdgpu_ctx { boolpreamble_presented; int32_t init_priority; int32_t override_priority; + struct mutexlock; atomic_tguilty; unsigned long ras_counter_ce; unsigned long ras_counter_ue; base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2 Hello, Applied on amd-staging-drm-next (with two additional commits from torvalds/master to allow me to compile using GCC12: 52a9dab6, 82880283) and tested with IGT using a RX580*; the errors on the "amd_cs_nop" tests are gone! * Advanced Micro Devices, Inc. [AMD/ATI] Ellesmere [Radeon RX 470/480/570/570X/580/580X/590] (rev e7) The remaining IGT tests matching ".*amdgpu.*" were not affected. (FYI, amdgpu/amd_plane, amdgpu/amd_bypass, amdgpu/amd_plane, amdgpu/amd_vrr_range are failing in here, some should skip instead, but that's another thread to come) Tested-by: Tales Aparecida Thanks for your time, Tales
Re: [PATCH] Revert "drm/amdgpu: remove ctx->lock"
On Tue, Jun 21, 2022 at 10:42 AM Luben Tuikov wrote: > > This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57. > > We see that the bo validate list gets corrupted, in > amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on > the next line, references a NULL bo and we get a koops. > > Bisecting leads to the commit being reverted as the cause of the list > corruption. See the link below for details of the corruption failure. > > Cc: Christian König > Cc: Andrey Grodzovsky > Cc: Alex Deucher > Cc: Vitaly Prosyak > Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539 Looks like this bug is also relevant: Bug: https://bugzilla.kernel.org/show_bug.cgi?id=216143 > Signed-off-by: Luben Tuikov > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 16 +--- > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 + > 3 files changed, 8 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index 36ac1f1d11e6b4..e619031753b927 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser > *p, union drm_amdgpu_cs > goto free_chunk; > } > > + mutex_lock(>ctx->lock); > + > /* skip guilty context job */ > if (atomic_read(>ctx->guilty) == 1) { > ret = -ECANCELED; > @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser > *p, > } > } > > - /* Move fence waiting after getting reservation lock of > -* PD root. Then there is no need on a ctx mutex lock. > -*/ > - r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity); > - if (unlikely(r != 0)) { > - if (r != -ERESTARTSYS) > - DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n"); > - goto error_validate; > - } > - > amdgpu_cs_get_threshold_for_moves(p->adev, >bytes_moved_threshold, > >bytes_moved_vis_threshold); > p->bytes_moved = 0; > @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser > *parser, int error, > dma_fence_put(parser->fence); > > if (parser->ctx) { > + mutex_unlock(>ctx->lock); > amdgpu_ctx_put(parser->ctx); > } > if (parser->bo_list) > @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, > if (parser->job->uf_addr && ring->funcs->no_user_fence) > return -EINVAL; > > - return 0; > + return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity); > } > > static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, > @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, > struct drm_file *filp) > goto out; > > r = amdgpu_cs_submit(, cs); > + > out: > amdgpu_cs_parser_fini(, r, reserved_buffers); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > index 53f9268366f29e..2ef5296216d64d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, > int32_t priority, > kref_init(>refcount); > ctx->mgr = mgr; > spin_lock_init(>ring_lock); > + mutex_init(>lock); > > ctx->reset_counter = atomic_read(>adev->gpu_reset_counter); > ctx->reset_counter_query = ctx->reset_counter; > @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref) > drm_dev_exit(idx); > } > > + mutex_destroy(>lock); > kfree(ctx); > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h > index 0fa0e56daf67e9..cc7c8afff4144c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h > @@ -53,6 +53,7 @@ struct amdgpu_ctx { > boolpreamble_presented; > int32_t init_priority; > int32_t override_priority; > + struct mutexlock; > atomic_tguilty; > unsigned long ras_counter_ce; > unsigned long ras_counter_ue; > > base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2 > -- > 2.36.1.74.g277cf0bc36 >
[PATCH] Revert "drm/amdgpu: remove ctx->lock"
This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57. We see that the bo validate list gets corrupted, in amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on the next line, references a NULL bo and we get a koops. Bisecting leads to the commit being reverted as the cause of the list corruption. See the link below for details of the corruption failure. Cc: Christian König Cc: Andrey Grodzovsky Cc: Alex Deucher Cc: Vitaly Prosyak Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539 Signed-off-by: Luben Tuikov --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 16 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 + 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 36ac1f1d11e6b4..e619031753b927 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs goto free_chunk; } + mutex_lock(>ctx->lock); + /* skip guilty context job */ if (atomic_read(>ctx->guilty) == 1) { ret = -ECANCELED; @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, } } - /* Move fence waiting after getting reservation lock of -* PD root. Then there is no need on a ctx mutex lock. -*/ - r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity); - if (unlikely(r != 0)) { - if (r != -ERESTARTSYS) - DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n"); - goto error_validate; - } - amdgpu_cs_get_threshold_for_moves(p->adev, >bytes_moved_threshold, >bytes_moved_vis_threshold); p->bytes_moved = 0; @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, dma_fence_put(parser->fence); if (parser->ctx) { + mutex_unlock(>ctx->lock); amdgpu_ctx_put(parser->ctx); } if (parser->bo_list) @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, if (parser->job->uf_addr && ring->funcs->no_user_fence) return -EINVAL; - return 0; + return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity); } static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) goto out; r = amdgpu_cs_submit(, cs); + out: amdgpu_cs_parser_fini(, r, reserved_buffers); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 53f9268366f29e..2ef5296216d64d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority, kref_init(>refcount); ctx->mgr = mgr; spin_lock_init(>ring_lock); + mutex_init(>lock); ctx->reset_counter = atomic_read(>adev->gpu_reset_counter); ctx->reset_counter_query = ctx->reset_counter; @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref) drm_dev_exit(idx); } + mutex_destroy(>lock); kfree(ctx); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h index 0fa0e56daf67e9..cc7c8afff4144c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h @@ -53,6 +53,7 @@ struct amdgpu_ctx { boolpreamble_presented; int32_t init_priority; int32_t override_priority; + struct mutexlock; atomic_tguilty; unsigned long ras_counter_ce; unsigned long ras_counter_ue; base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2 -- 2.36.1.74.g277cf0bc36