RE: [PATCH 1/9] drm/amdgpu:Define the unified ras function pointers of each IP block

2021-11-25 Thread Chai, Thomas
Hi Lijo:
   I add my replay after your comment.

Thanks,
Thomas
-Original Message-
From: Lazar, Lijo  
Sent: Thursday, November 25, 2021 7:41 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas 
Subject: Re: [PATCH 1/9] drm/amdgpu:Define the unified ras function pointers of 
each IP block



On 11/25/2021 4:26 PM, yipechai wrote:
> Define an unified ras function pointers for each ip block to adapt.
> 
> Signed-off-by: yipechai 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 36 -
>   2 files changed, 37 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 90f0db3b4f65..dc6c8130e2d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2739,3 +2739,23 @@ static void 
> amdgpu_register_bad_pages_mca_notifier(void)
>   }
>   }
>   #endif
> +
> +/* check if ras is supported on block, say, sdma, gfx */ int 
> +amdgpu_ras_is_supported(struct amdgpu_device *adev,
> + unsigned int block)
> +{
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + if (block >= AMDGPU_RAS_BLOCK_COUNT)
> + return 0;
> + return ras && (adev->ras_enabled & (1 << block)); }
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) {
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> + schedule_work(&ras->recovery_work);
> + return 0;
> +}

>These changes look unrelated. Maybe as another patch to move from .h file to 
>.c file.
   When add amdgpu_ras.h  to other ip blocks .h file (such as amdgpu_gfx.h 
amdgpu_xgmi.h ...) for other block using 'struct amdgpu_ras_block_ops',  the 
code compilation will make an error:
“drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h:499:46: error: dereferencing 
pointer to incomplete type ‘struct amdgpu_device’
 499 | #define amdgpu_ras_get_context(adev)  
((adev)->psp.ras_context.ras)”
   The struct amdgpu_device has been defined in amdgpu.h file, and the amdgpu.h 
file has been included in amdgpu_ras.h, it seems that there are some problems 
for .h file cross-include. Due to the amdgpu_ras_get_context(adev)  has only 
been used in the functions of 'amdgpu_ras_is_supported' and ' 
amdgpu_ras_reset_gpu '. When move these two function to .c file, the code 
compilation becomes successful.

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index cdd0010a5389..4b7da40dd837 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -469,6 +469,19 @@ struct ras_debug_if {
>   };
>   int op;
>   };
> +
> +struct amdgpu_ras_block_ops {
> + int (*ras_late_init)(struct amdgpu_device *adev);
> + void (*ras_fini)(struct amdgpu_device *adev);
> + int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
> + void  (*query_ras_error_count)(struct amdgpu_device *adev,void 
> *ras_error_status);
> + void (*query_ras_error_status)(struct amdgpu_device *adev);
> + bool  (*query_ras_poison_mode)(struct amdgpu_device *adev);
> + void (*query_ras_error_address)(struct amdgpu_device *adev, void 
> *ras_error_status);
> + void (*reset_ras_error_count)(struct amdgpu_device *adev);
> + void (*reset_ras_error_status)(struct amdgpu_device *adev); };
> +

>Generic comment - Since all the operations are consolidated under _ops, it 
>makes sense to rename the _ras_funcs to _ras.

>Ex: amdgpu_gfx_ras_funcs => amdgpu_gfx_ras, amdgpu_xgmi_ras_funcs => 
>amdgpu_xgmi_ras and so forth.

>In future, these ras blocks may have data members to keep IP specific ras data.

OK, I will do it.

Thanks,
Lijo

>   /* work flow
>* vbios
>* 1: ras feature enable (enabled by default) @@ -486,16 +499,6 @@ 
> struct ras_debug_if {
>   #define amdgpu_ras_get_context(adev)
> ((adev)->psp.ras_context.ras)
>   #define amdgpu_ras_set_context(adev, ras_con)   
> ((adev)->psp.ras_context.ras = (ras_con))
>   
> -/* check if ras is supported on block, say, sdma, gfx */ -static 
> inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
> - unsigned int block)
> -{
> - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> - if (block >= AMDGPU_RAS_BLOCK_COUNT)
> - return 0;
> - return ras && (adev->ras_enabled & (1 << block));
> -}
>   
>   int amdgpu_ras_recover

Re: [PATCH 1/9] drm/amdgpu:Define the unified ras function pointers of each IP block

2021-11-25 Thread Lazar, Lijo




On 11/25/2021 4:26 PM, yipechai wrote:

Define an unified ras function pointers for each ip block to adapt.

Signed-off-by: yipechai 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 36 -
  2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 90f0db3b4f65..dc6c8130e2d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2739,3 +2739,23 @@ static void amdgpu_register_bad_pages_mca_notifier(void)
  }
  }
  #endif
+
+/* check if ras is supported on block, say, sdma, gfx */
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,
+   unsigned int block)
+{
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   if (block >= AMDGPU_RAS_BLOCK_COUNT)
+   return 0;
+   return ras && (adev->ras_enabled & (1 << block));
+}
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
+   schedule_work(&ras->recovery_work);
+   return 0;
+}


These changes look unrelated. Maybe as another patch to move from .h 
file to .c file.



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cdd0010a5389..4b7da40dd837 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -469,6 +469,19 @@ struct ras_debug_if {
};
int op;
  };
+
+struct amdgpu_ras_block_ops {
+   int (*ras_late_init)(struct amdgpu_device *adev);
+   void (*ras_fini)(struct amdgpu_device *adev);
+   int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
+   void  (*query_ras_error_count)(struct amdgpu_device *adev,void 
*ras_error_status);
+   void (*query_ras_error_status)(struct amdgpu_device *adev);
+   bool  (*query_ras_poison_mode)(struct amdgpu_device *adev);
+   void (*query_ras_error_address)(struct amdgpu_device *adev, void 
*ras_error_status);
+   void (*reset_ras_error_count)(struct amdgpu_device *adev);
+   void (*reset_ras_error_status)(struct amdgpu_device *adev);
+};
+


Generic comment - Since all the operations are consolidated under _ops, 
it makes sense to rename the _ras_funcs to _ras.


Ex: amdgpu_gfx_ras_funcs => amdgpu_gfx_ras, amdgpu_xgmi_ras_funcs => 
amdgpu_xgmi_ras and so forth.


In future, these ras blocks may have data members to keep IP specific 
ras data.


Thanks,
Lijo


  /* work flow
   * vbios
   * 1: ras feature enable (enabled by default)
@@ -486,16 +499,6 @@ struct ras_debug_if {
  #define amdgpu_ras_get_context(adev)  ((adev)->psp.ras_context.ras)
  #define amdgpu_ras_set_context(adev, ras_con) ((adev)->psp.ras_context.ras = 
(ras_con))
  
-/* check if ras is supported on block, say, sdma, gfx */

-static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
-   unsigned int block)
-{
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-   if (block >= AMDGPU_RAS_BLOCK_COUNT)
-   return 0;
-   return ras && (adev->ras_enabled & (1 << block));
-}
  
  int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
  
@@ -512,15 +515,6 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
  
  int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
  
-static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)

-{
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-   if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
-   schedule_work(&ras->recovery_work);
-   return 0;
-}
-
  static inline enum ta_ras_block
  amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
switch (block) {
@@ -652,4 +646,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block);
  
  bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
  
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,	unsigned int block);

+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
+
  #endif