Re: [PATCH] Revert "drm/amdgpu: remove ctx->lock"

2022-07-11 Thread Luben Tuikov
On 2022-07-11 23:22, Tales Lelo da Aparecida wrote:
> On 21/06/2022 11:42, Luben Tuikov wrote:
>> This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57.
>>
>> We see that the bo validate list gets corrupted, in
>> amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on
>> the next line, references a NULL bo and we get a koops.
>>
>> Bisecting leads to the commit being reverted as the cause of the list
>> corruption. See the link below for details of the corruption failure.
>>
>> Cc: Christian König 
>> Cc: Andrey Grodzovsky 
>> Cc: Alex Deucher 
>> Cc: Vitaly Prosyak 
>> Link: 
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fdrm%2Famd%2F-%2Fissues%2F2048%23note_1427539data=05%7C01%7Cluben.tuikov%40amd.com%7C32a7f3ea490fc69f08da63b5c925%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637931929684937036%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7Csdata=eXMfeyqaPhBIl9iJa4kbQtN1r4OhFZSGkZiGRJ3MQsQ%3Dreserved=0
>> Signed-off-by: Luben Tuikov 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 16 +---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
>>   3 files changed, 8 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 36ac1f1d11e6b4..e619031753b927 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser 
>> *p, union drm_amdgpu_cs
>>  goto free_chunk;
>>  }
>>   
>> +mutex_lock(>ctx->lock);
>> +
>>  /* skip guilty context job */
>>  if (atomic_read(>ctx->guilty) == 1) {
>>  ret = -ECANCELED;
>> @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser 
>> *p,
>>  }
>>  }
>>   
>> -/* Move fence waiting after getting reservation lock of
>> - * PD root. Then there is no need on a ctx mutex lock.
>> - */
>> -r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
>> -if (unlikely(r != 0)) {
>> -if (r != -ERESTARTSYS)
>> -DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
>> -goto error_validate;
>> -}
>> -
>>  amdgpu_cs_get_threshold_for_moves(p->adev, >bytes_moved_threshold,
>>>bytes_moved_vis_threshold);
>>  p->bytes_moved = 0;
>> @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct 
>> amdgpu_cs_parser *parser, int error,
>>  dma_fence_put(parser->fence);
>>   
>>  if (parser->ctx) {
>> +mutex_unlock(>ctx->lock);
>>  amdgpu_ctx_put(parser->ctx);
>>  }
>>  if (parser->bo_list)
>> @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>  if (parser->job->uf_addr && ring->funcs->no_user_fence)
>>  return -EINVAL;
>>   
>> -return 0;
>> +return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity);
>>   }
>>   
>>   static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
>> @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void 
>> *data, struct drm_file *filp)
>>  goto out;
>>   
>>  r = amdgpu_cs_submit(, cs);
>> +
>>   out:
>>  amdgpu_cs_parser_fini(, r, reserved_buffers);
>>   
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index 53f9268366f29e..2ef5296216d64d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, 
>> int32_t priority,
>>  kref_init(>refcount);
>>  ctx->mgr = mgr;
>>  spin_lock_init(>ring_lock);
>> +mutex_init(>lock);
>>   
>>  ctx->reset_counter = atomic_read(>adev->gpu_reset_counter);
>>  ctx->reset_counter_query = ctx->reset_counter;
>> @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
>>  drm_dev_exit(idx);
>>  }
>>   
>> +mutex_destroy(>lock);
>>  kfree(ctx);
>>   }
>>   
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> index 0fa0e56daf67e9..cc7c8afff4144c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> @@ -53,6 +53,7 @@ struct amdgpu_ctx {
>>  boolpreamble_presented;
>>  int32_t init_priority;
>>  int32_t override_priority;
>> +struct mutexlock;
>>  atomic_tguilty;
>>  unsigned long   ras_counter_ce;
>>  unsigned long   ras_counter_ue;
>>
>> base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2
> 
> Hello,
> 

Re: [PATCH] Revert "drm/amdgpu: remove ctx->lock"

2022-07-11 Thread Tales Lelo da Aparecida

On 21/06/2022 11:42, Luben Tuikov wrote:

This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57.

We see that the bo validate list gets corrupted, in
amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on
the next line, references a NULL bo and we get a koops.

Bisecting leads to the commit being reverted as the cause of the list
corruption. See the link below for details of the corruption failure.

Cc: Christian König 
Cc: Andrey Grodzovsky 
Cc: Alex Deucher 
Cc: Vitaly Prosyak 
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539
Signed-off-by: Luben Tuikov 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 16 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  2 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
  3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 36ac1f1d11e6b4..e619031753b927 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser 
*p, union drm_amdgpu_cs
goto free_chunk;
}
  
+	mutex_lock(>ctx->lock);

+
/* skip guilty context job */
if (atomic_read(>ctx->guilty) == 1) {
ret = -ECANCELED;
@@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
}
}
  
-	/* Move fence waiting after getting reservation lock of

-* PD root. Then there is no need on a ctx mutex lock.
-*/
-   r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
-   if (unlikely(r != 0)) {
-   if (r != -ERESTARTSYS)
-   DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
-   goto error_validate;
-   }
-
amdgpu_cs_get_threshold_for_moves(p->adev, >bytes_moved_threshold,
  >bytes_moved_vis_threshold);
p->bytes_moved = 0;
@@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser 
*parser, int error,
dma_fence_put(parser->fence);
  
  	if (parser->ctx) {

+   mutex_unlock(>ctx->lock);
amdgpu_ctx_put(parser->ctx);
}
if (parser->bo_list)
@@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
if (parser->job->uf_addr && ring->funcs->no_user_fence)
return -EINVAL;
  
-	return 0;

+   return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity);
  }
  
  static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,

@@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
goto out;
  
  	r = amdgpu_cs_submit(, cs);

+
  out:
amdgpu_cs_parser_fini(, r, reserved_buffers);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

index 53f9268366f29e..2ef5296216d64d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, 
int32_t priority,
kref_init(>refcount);
ctx->mgr = mgr;
spin_lock_init(>ring_lock);
+   mutex_init(>lock);
  
  	ctx->reset_counter = atomic_read(>adev->gpu_reset_counter);

ctx->reset_counter_query = ctx->reset_counter;
@@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
drm_dev_exit(idx);
}
  
+	mutex_destroy(>lock);

kfree(ctx);
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h

index 0fa0e56daf67e9..cc7c8afff4144c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -53,6 +53,7 @@ struct amdgpu_ctx {
boolpreamble_presented;
int32_t init_priority;
int32_t override_priority;
+   struct mutexlock;
atomic_tguilty;
unsigned long   ras_counter_ce;
unsigned long   ras_counter_ue;

base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2


Hello,

Applied on amd-staging-drm-next (with two additional commits from 
torvalds/master to allow me to compile using GCC12: 52a9dab6, 82880283) 
and tested with IGT using a RX580*; the errors on the "amd_cs_nop" tests 
are gone!


* Advanced Micro Devices, Inc. [AMD/ATI] Ellesmere [Radeon RX 
470/480/570/570X/580/580X/590] (rev e7)


The remaining IGT tests matching ".*amdgpu.*" were not affected. (FYI, 
amdgpu/amd_plane, amdgpu/amd_bypass, amdgpu/amd_plane, 
amdgpu/amd_vrr_range are failing in here, some should skip instead, but 
that's another thread to come)


Tested-by: Tales Aparecida 

Thanks for your time,
Tales


Re: [PATCH] Revert "drm/amdgpu: remove ctx->lock"

2022-06-27 Thread Alex Deucher
On Tue, Jun 21, 2022 at 10:42 AM Luben Tuikov  wrote:
>
> This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57.
>
> We see that the bo validate list gets corrupted, in
> amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on
> the next line, references a NULL bo and we get a koops.
>
> Bisecting leads to the commit being reverted as the cause of the list
> corruption. See the link below for details of the corruption failure.
>
> Cc: Christian König 
> Cc: Andrey Grodzovsky 
> Cc: Alex Deucher 
> Cc: Vitaly Prosyak 
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539

Looks like this bug is also relevant:

Bug: https://bugzilla.kernel.org/show_bug.cgi?id=216143

> Signed-off-by: Luben Tuikov 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 16 +---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
>  3 files changed, 8 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 36ac1f1d11e6b4..e619031753b927 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser 
> *p, union drm_amdgpu_cs
> goto free_chunk;
> }
>
> +   mutex_lock(>ctx->lock);
> +
> /* skip guilty context job */
> if (atomic_read(>ctx->guilty) == 1) {
> ret = -ECANCELED;
> @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser 
> *p,
> }
> }
>
> -   /* Move fence waiting after getting reservation lock of
> -* PD root. Then there is no need on a ctx mutex lock.
> -*/
> -   r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
> -   if (unlikely(r != 0)) {
> -   if (r != -ERESTARTSYS)
> -   DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
> -   goto error_validate;
> -   }
> -
> amdgpu_cs_get_threshold_for_moves(p->adev, >bytes_moved_threshold,
>   >bytes_moved_vis_threshold);
> p->bytes_moved = 0;
> @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser 
> *parser, int error,
> dma_fence_put(parser->fence);
>
> if (parser->ctx) {
> +   mutex_unlock(>ctx->lock);
> amdgpu_ctx_put(parser->ctx);
> }
> if (parser->bo_list)
> @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
> if (parser->job->uf_addr && ring->funcs->no_user_fence)
> return -EINVAL;
>
> -   return 0;
> +   return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity);
>  }
>
>  static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
> @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
> struct drm_file *filp)
> goto out;
>
> r = amdgpu_cs_submit(, cs);
> +
>  out:
> amdgpu_cs_parser_fini(, r, reserved_buffers);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 53f9268366f29e..2ef5296216d64d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, 
> int32_t priority,
> kref_init(>refcount);
> ctx->mgr = mgr;
> spin_lock_init(>ring_lock);
> +   mutex_init(>lock);
>
> ctx->reset_counter = atomic_read(>adev->gpu_reset_counter);
> ctx->reset_counter_query = ctx->reset_counter;
> @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
> drm_dev_exit(idx);
> }
>
> +   mutex_destroy(>lock);
> kfree(ctx);
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index 0fa0e56daf67e9..cc7c8afff4144c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -53,6 +53,7 @@ struct amdgpu_ctx {
> boolpreamble_presented;
> int32_t init_priority;
> int32_t override_priority;
> +   struct mutexlock;
> atomic_tguilty;
> unsigned long   ras_counter_ce;
> unsigned long   ras_counter_ue;
>
> base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2
> --
> 2.36.1.74.g277cf0bc36
>


[PATCH] Revert "drm/amdgpu: remove ctx->lock"

2022-06-21 Thread Luben Tuikov
This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57.

We see that the bo validate list gets corrupted, in
amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on
the next line, references a NULL bo and we get a koops.

Bisecting leads to the commit being reverted as the cause of the list
corruption. See the link below for details of the corruption failure.

Cc: Christian König 
Cc: Andrey Grodzovsky 
Cc: Alex Deucher 
Cc: Vitaly Prosyak 
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539
Signed-off-by: Luben Tuikov 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 16 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 36ac1f1d11e6b4..e619031753b927 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser 
*p, union drm_amdgpu_cs
goto free_chunk;
}
 
+   mutex_lock(>ctx->lock);
+
/* skip guilty context job */
if (atomic_read(>ctx->guilty) == 1) {
ret = -ECANCELED;
@@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
}
}
 
-   /* Move fence waiting after getting reservation lock of
-* PD root. Then there is no need on a ctx mutex lock.
-*/
-   r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
-   if (unlikely(r != 0)) {
-   if (r != -ERESTARTSYS)
-   DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
-   goto error_validate;
-   }
-
amdgpu_cs_get_threshold_for_moves(p->adev, >bytes_moved_threshold,
  >bytes_moved_vis_threshold);
p->bytes_moved = 0;
@@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser 
*parser, int error,
dma_fence_put(parser->fence);
 
if (parser->ctx) {
+   mutex_unlock(>ctx->lock);
amdgpu_ctx_put(parser->ctx);
}
if (parser->bo_list)
@@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
if (parser->job->uf_addr && ring->funcs->no_user_fence)
return -EINVAL;
 
-   return 0;
+   return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity);
 }
 
 static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
@@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
goto out;
 
r = amdgpu_cs_submit(, cs);
+
 out:
amdgpu_cs_parser_fini(, r, reserved_buffers);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 53f9268366f29e..2ef5296216d64d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, 
int32_t priority,
kref_init(>refcount);
ctx->mgr = mgr;
spin_lock_init(>ring_lock);
+   mutex_init(>lock);
 
ctx->reset_counter = atomic_read(>adev->gpu_reset_counter);
ctx->reset_counter_query = ctx->reset_counter;
@@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
drm_dev_exit(idx);
}
 
+   mutex_destroy(>lock);
kfree(ctx);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index 0fa0e56daf67e9..cc7c8afff4144c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -53,6 +53,7 @@ struct amdgpu_ctx {
boolpreamble_presented;
int32_t init_priority;
int32_t override_priority;
+   struct mutexlock;
atomic_tguilty;
unsigned long   ras_counter_ce;
unsigned long   ras_counter_ue;

base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2
-- 
2.36.1.74.g277cf0bc36