RE: [PATCH] drm/amdgpu/jpeg5: reprogram doorbell setting after power up for each playback

2024-06-18 Thread Feng, Kenneth
[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Kenneth Feng 


-Original Message-
From: amd-gfx  On Behalf Of Sonny Jiang
Sent: Wednesday, June 19, 2024 12:40 AM
To: amd-gfx@lists.freedesktop.org
Cc: Jiang, Sonny 
Subject: [PATCH] drm/amdgpu/jpeg5: reprogram doorbell setting after power up 
for each playback

Caution: This message originated from an External Source. Use proper caution 
when opening attachments, clicking links, or responding.


From: Sonny Jiang 

Doorbell needs to be configured after power up during each playback

Signed-off-by: Sonny Jiang 
---
 drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c
index 68ef29bc70e2..e766b9463759 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c
@@ -137,10 +137,6 @@ static int jpeg_v5_0_0_hw_init(void *handle)
adev->nbio.funcs->vcn_doorbell_range(adev, ring->use_doorbell,
(adev->doorbell_index.vcn.vcn_ring0_1 << 1), 0);

-   WREG32_SOC15(VCN, 0, regVCN_JPEG_DB_CTRL,
-   ring->doorbell_index << VCN_JPEG_DB_CTRL__OFFSET__SHIFT 
|
-   VCN_JPEG_DB_CTRL__EN_MASK);
-
r = amdgpu_ring_test_helper(ring);
if (r)
return r;
@@ -314,6 +310,10 @@ static int jpeg_v5_0_0_start(struct amdgpu_device *adev)
JPEG_SYS_INT_EN__DJRBC0_MASK,
~JPEG_SYS_INT_EN__DJRBC0_MASK);

+   WREG32_SOC15(VCN, 0, regVCN_JPEG_DB_CTRL,
+   ring->doorbell_index << VCN_JPEG_DB_CTRL__OFFSET__SHIFT |
+   VCN_JPEG_DB_CTRL__EN_MASK);
+
WREG32_SOC15(JPEG, 0, regUVD_LMI_JRBC_RB_VMID, 0);
WREG32_SOC15(JPEG, 0, regUVD_JRBC_RB_CNTL, (0x0001L | 0x0002L));
WREG32_SOC15(JPEG, 0, regUVD_LMI_JRBC_RB_64BIT_BAR_LOW,
--
2.45.1



Re: [PATCH v2] drm/amdgpu: avoid using null object of framebuffer

2024-06-18 Thread Zhang, Julia

On 2024/6/19 12:53, Huang Rui wrote:

On Wed, Jun 19, 2024 at 12:46:25PM +0800, Huang Rui wrote:

On Wed, Jun 19, 2024 at 12:29:24PM +0800, Zhang, Julia wrote:

Instead of using state->fb->obj[0] directly, get object from framebuffer

by calling drm_gem_fb_get_obj() and return error code when object is

null to avoid using null object of framebuffer.



v2: Call drm_gem_fb_get_obj after check old_state->fb for NULL.



Signed-off-by: Julia Zhang 

---



Julia, you can add my RB during V1 in the next submission.



Reviewed-by: Huang Rui 



CC Fusheng who reported the issue.



Fusheng, may we have your reported-by?

Hi Fusheng,

Can you help to review this patch and may we have you reported-by? Thank you.

Julia







 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 18 --

 1 file changed, 16 insertions(+), 2 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c

index d60c4a2eeb0c..212f6522859d 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c

@@ -2,6 +2,7 @@



 #include 

 #include 

+#include 

 #include 



 #include "amdgpu.h"

@@ -311,7 +312,13 @@ static int amdgpu_vkms_prepare_fb(struct drm_plane *plane,

 return 0;

 }

 afb = to_amdgpu_framebuffer(new_state->fb);

-obj = new_state->fb->obj[0];

+

+obj = drm_gem_fb_get_obj(new_state->fb, 0);

+if (!obj) {

+DRM_ERROR("Failed to get obj from framebuffer\n");

+return -EINVAL;

+}

+

 rbo = gem_to_amdgpu_bo(obj);

 adev = amdgpu_ttm_adev(rbo->tbo.bdev);



@@ -365,12 +372,19 @@ static void amdgpu_vkms_cleanup_fb(struct drm_plane 
*plane,

   struct drm_plane_state *old_state)

 {

 struct amdgpu_bo *rbo;

+struct drm_gem_object *obj;

 int r;



 if (!old_state->fb)

 return;



-rbo = gem_to_amdgpu_bo(old_state->fb->obj[0]);

+obj = drm_gem_fb_get_obj(old_state->fb, 0);

+if (!obj) {

+DRM_ERROR("Failed to get obj from framebuffer\n");

+return;

+}

+

+rbo = gem_to_amdgpu_bo(obj);

 r = amdgpu_bo_reserve(rbo, false);

 if (unlikely(r)) {

 DRM_ERROR("failed to reserve rbo before unpin\n");

--

2.34.1




Re: [PATCH v2] drm/amdgpu: avoid using null object of framebuffer

2024-06-18 Thread Huang Rui
On Wed, Jun 19, 2024 at 12:46:25PM +0800, Huang Rui wrote:
> On Wed, Jun 19, 2024 at 12:29:24PM +0800, Zhang, Julia wrote:
> > Instead of using state->fb->obj[0] directly, get object from framebuffer
> > by calling drm_gem_fb_get_obj() and return error code when object is
> > null to avoid using null object of framebuffer.
> > 
> > v2: Call drm_gem_fb_get_obj after check old_state->fb for NULL.
> > 
> > Signed-off-by: Julia Zhang 
> > ---
> 
> Julia, you can add my RB during V1 in the next submission.
> 
> Reviewed-by: Huang Rui 

CC Fusheng who reported the issue.

Fusheng, may we have your reported-by?

> 
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 18 --
> >  1 file changed, 16 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
> > index d60c4a2eeb0c..212f6522859d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
> > @@ -2,6 +2,7 @@
> >  
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  
> >  #include "amdgpu.h"
> > @@ -311,7 +312,13 @@ static int amdgpu_vkms_prepare_fb(struct drm_plane 
> > *plane,
> > return 0;
> > }
> > afb = to_amdgpu_framebuffer(new_state->fb);
> > -   obj = new_state->fb->obj[0];
> > +
> > +   obj = drm_gem_fb_get_obj(new_state->fb, 0);
> > +   if (!obj) {
> > +   DRM_ERROR("Failed to get obj from framebuffer\n");
> > +   return -EINVAL;
> > +   }
> > +
> > rbo = gem_to_amdgpu_bo(obj);
> > adev = amdgpu_ttm_adev(rbo->tbo.bdev);
> >  
> > @@ -365,12 +372,19 @@ static void amdgpu_vkms_cleanup_fb(struct drm_plane 
> > *plane,
> >struct drm_plane_state *old_state)
> >  {
> > struct amdgpu_bo *rbo;
> > +   struct drm_gem_object *obj;
> > int r;
> >  
> > if (!old_state->fb)
> > return;
> >  
> > -   rbo = gem_to_amdgpu_bo(old_state->fb->obj[0]);
> > +   obj = drm_gem_fb_get_obj(old_state->fb, 0);
> > +   if (!obj) {
> > +   DRM_ERROR("Failed to get obj from framebuffer\n");
> > +   return;
> > +   }
> > +
> > +   rbo = gem_to_amdgpu_bo(obj);
> > r = amdgpu_bo_reserve(rbo, false);
> > if (unlikely(r)) {
> > DRM_ERROR("failed to reserve rbo before unpin\n");
> > -- 
> > 2.34.1
> > 


Re: [PATCH v2] drm/amdgpu: avoid using null object of framebuffer

2024-06-18 Thread Huang Rui
On Wed, Jun 19, 2024 at 12:29:24PM +0800, Zhang, Julia wrote:
> Instead of using state->fb->obj[0] directly, get object from framebuffer
> by calling drm_gem_fb_get_obj() and return error code when object is
> null to avoid using null object of framebuffer.
> 
> v2: Call drm_gem_fb_get_obj after check old_state->fb for NULL.
> 
> Signed-off-by: Julia Zhang 
> ---

Julia, you can add my RB during V1 in the next submission.

Reviewed-by: Huang Rui 

>  drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 18 --
>  1 file changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
> index d60c4a2eeb0c..212f6522859d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
> @@ -2,6 +2,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include "amdgpu.h"
> @@ -311,7 +312,13 @@ static int amdgpu_vkms_prepare_fb(struct drm_plane 
> *plane,
>   return 0;
>   }
>   afb = to_amdgpu_framebuffer(new_state->fb);
> - obj = new_state->fb->obj[0];
> +
> + obj = drm_gem_fb_get_obj(new_state->fb, 0);
> + if (!obj) {
> + DRM_ERROR("Failed to get obj from framebuffer\n");
> + return -EINVAL;
> + }
> +
>   rbo = gem_to_amdgpu_bo(obj);
>   adev = amdgpu_ttm_adev(rbo->tbo.bdev);
>  
> @@ -365,12 +372,19 @@ static void amdgpu_vkms_cleanup_fb(struct drm_plane 
> *plane,
>  struct drm_plane_state *old_state)
>  {
>   struct amdgpu_bo *rbo;
> + struct drm_gem_object *obj;
>   int r;
>  
>   if (!old_state->fb)
>   return;
>  
> - rbo = gem_to_amdgpu_bo(old_state->fb->obj[0]);
> + obj = drm_gem_fb_get_obj(old_state->fb, 0);
> + if (!obj) {
> + DRM_ERROR("Failed to get obj from framebuffer\n");
> + return;
> + }
> +
> + rbo = gem_to_amdgpu_bo(obj);
>   r = amdgpu_bo_reserve(rbo, false);
>   if (unlikely(r)) {
>   DRM_ERROR("failed to reserve rbo before unpin\n");
> -- 
> 2.34.1
> 


[PATCH v2] drm/amdgpu: avoid using null object of framebuffer

2024-06-18 Thread Julia Zhang
Instead of using state->fb->obj[0] directly, get object from framebuffer
by calling drm_gem_fb_get_obj() and return error code when object is
null to avoid using null object of framebuffer.

v2: Call drm_gem_fb_get_obj after check old_state->fb for NULL.

Signed-off-by: Julia Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index d60c4a2eeb0c..212f6522859d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -2,6 +2,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -311,7 +312,13 @@ static int amdgpu_vkms_prepare_fb(struct drm_plane *plane,
return 0;
}
afb = to_amdgpu_framebuffer(new_state->fb);
-   obj = new_state->fb->obj[0];
+
+   obj = drm_gem_fb_get_obj(new_state->fb, 0);
+   if (!obj) {
+   DRM_ERROR("Failed to get obj from framebuffer\n");
+   return -EINVAL;
+   }
+
rbo = gem_to_amdgpu_bo(obj);
adev = amdgpu_ttm_adev(rbo->tbo.bdev);
 
@@ -365,12 +372,19 @@ static void amdgpu_vkms_cleanup_fb(struct drm_plane 
*plane,
   struct drm_plane_state *old_state)
 {
struct amdgpu_bo *rbo;
+   struct drm_gem_object *obj;
int r;
 
if (!old_state->fb)
return;
 
-   rbo = gem_to_amdgpu_bo(old_state->fb->obj[0]);
+   obj = drm_gem_fb_get_obj(old_state->fb, 0);
+   if (!obj) {
+   DRM_ERROR("Failed to get obj from framebuffer\n");
+   return;
+   }
+
+   rbo = gem_to_amdgpu_bo(obj);
r = amdgpu_bo_reserve(rbo, false);
if (unlikely(r)) {
DRM_ERROR("failed to reserve rbo before unpin\n");
-- 
2.34.1



Re: [PATCH v3 2/2] drm/amd: Add power_saving_policy drm property to eDP connectors

2024-06-18 Thread Mario Limonciello

On 6/18/2024 17:36, Leo Li wrote:



On 2024-06-05 22:04, Mario Limonciello wrote:

When the `power_saving_policy` property is set to bit mask
"Require color accuracy" ABM should be disabled immediately and
any requests by sysfs to update will return an -EBUSY error.

When the `power_saving_policy` property is set to bit mask
"Require low latency" PSR should be disabled.

When the property is restored to an empty bit mask the previous
value of ABM and pSR will be restored.

Signed-off-by: Mario Limonciello 

Reviewed-by: Leo Li 


Thanks!  I don't have permissions, so can you (or someone else) please 
apply to drm-misc-next for me?


After it's merged I'll rebase and work on the feedback for the new IGT 
tests.




Thanks!


---
v2->v3:
  * Use `disallow_edp_enter_psr` instead
  * Drop case in dm_update_crtc_state()
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_display.c   |  4 ++
  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 50 +--
  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |  2 +
  3 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c

index 3ecc7ef95172..cfb5220cf182 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -1350,6 +1350,10 @@ int amdgpu_display_modeset_create_props(struct 
amdgpu_device *adev)

   "dither",
   amdgpu_dither_enum_list, sz);
+    if (adev->dc_enabled)
+    drm_mode_create_power_saving_policy_property(adev_to_drm(adev),
+ DRM_MODE_POWER_SAVING_POLICY_ALL);
+
  return 0;
  }
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c

index f1d67c6f4b98..5fd7210b2479 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -6436,6 +6436,13 @@ int 
amdgpu_dm_connector_atomic_set_property(struct drm_connector *connector,

  } else if (property == adev->mode_info.underscan_property) {
  dm_new_state->underscan_enable = val;
  ret = 0;
+    } else if (property == dev->mode_config.power_saving_policy) {
+    dm_new_state->abm_forbidden = val & 
DRM_MODE_REQUIRE_COLOR_ACCURACY;
+    dm_new_state->abm_level = (dm_new_state->abm_forbidden || 
!amdgpu_dm_abm_level) ?

+    ABM_LEVEL_IMMEDIATE_DISABLE :
+    amdgpu_dm_abm_level;
+    dm_new_state->psr_forbidden = val & 
DRM_MODE_REQUIRE_LOW_LATENCY;

+    ret = 0;
  }
  return ret;
@@ -6478,6 +6485,13 @@ int 
amdgpu_dm_connector_atomic_get_property(struct drm_connector *connector,

  } else if (property == adev->mode_info.underscan_property) {
  *val = dm_state->underscan_enable;
  ret = 0;
+    } else if (property == dev->mode_config.power_saving_policy) {
+    *val = 0;
+    if (dm_state->psr_forbidden)
+    *val |= DRM_MODE_REQUIRE_LOW_LATENCY;
+    if (dm_state->abm_forbidden)
+    *val |= DRM_MODE_REQUIRE_COLOR_ACCURACY;
+    ret = 0;
  }
  return ret;
@@ -6504,9 +6518,12 @@ static ssize_t panel_power_savings_show(struct 
device *device,

  u8 val;
  drm_modeset_lock(&dev->mode_config.connection_mutex, NULL);
-    val = to_dm_connector_state(connector->state)->abm_level ==
-    ABM_LEVEL_IMMEDIATE_DISABLE ? 0 :
-    to_dm_connector_state(connector->state)->abm_level;
+    if (to_dm_connector_state(connector->state)->abm_forbidden)
+    val = 0;
+    else
+    val = to_dm_connector_state(connector->state)->abm_level ==
+    ABM_LEVEL_IMMEDIATE_DISABLE ? 0 :
+    to_dm_connector_state(connector->state)->abm_level;
  drm_modeset_unlock(&dev->mode_config.connection_mutex);
  return sysfs_emit(buf, "%u\n", val);
@@ -6530,10 +6547,16 @@ static ssize_t 
panel_power_savings_store(struct device *device,

  return -EINVAL;
  drm_modeset_lock(&dev->mode_config.connection_mutex, NULL);
-    to_dm_connector_state(connector->state)->abm_level = val ?:
-    ABM_LEVEL_IMMEDIATE_DISABLE;
+    if (to_dm_connector_state(connector->state)->abm_forbidden)
+    ret = -EBUSY;
+    else
+    to_dm_connector_state(connector->state)->abm_level = val ?:
+    ABM_LEVEL_IMMEDIATE_DISABLE;
  drm_modeset_unlock(&dev->mode_config.connection_mutex);
+    if (ret)
+    return ret;
+
  drm_kms_helper_hotplug_event(dev);
  return count;
@@ -7704,6 +7727,13 @@ void amdgpu_dm_connector_init_helper(struct 
amdgpu_display_manager *dm,

  aconnector->base.state->max_bpc = 16;
  aconnector->base.state->max_requested_bpc = 
aconnector->base.state->max_bpc;

+    if (connector_type == DRM_MODE_CONNECTOR_eDP &&
+    (dc_is_dmcu_initialized(adev->dm.dc) ||
+ adev->dm.dc->ctx->dmub_srv)) {
+    drm_object_attach_property(&aconnector->base.base,
+  

RE: [PATCH] drm/amdgpu: Fix pci state save during mode-1 reset

2024-06-18 Thread Xu, Feifei
[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Feifei Xu 

-Original Message-
From: Zhang, Hawking 
Sent: Tuesday, June 18, 2024 7:34 PM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Kamal, Asad 
; Xu, Feifei 
Subject: RE: [PATCH] drm/amdgpu: Fix pci state save during mode-1 reset

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 18, 2024 16:44
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Xu, Feifei 

Subject: [PATCH] drm/amdgpu: Fix pci state save during mode-1 reset

Cache the PCI state before bus master is disabled. The saved state is later 
used for other cases like restoring config space after mode-2 reset.

Signed-off-by: Lijo Lazar 
Fixes: 5c03e5843e6b ("drm/amdgpu:add smu mode1/2 support for aldebaran")
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3fb02f5b91c9..6c2ab14ca102 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5224,11 +5224,14 @@ int amdgpu_device_mode1_reset(struct amdgpu_device 
*adev)

dev_info(adev->dev, "GPU mode1 reset\n");

+   /* Cache the state before bus master disable. The saved config space
+* values are used in other cases like restore after mode-2 reset.
+*/
+   amdgpu_device_cache_pci_state(adev->pdev);
+
/* disable BM */
pci_clear_master(adev->pdev);

-   amdgpu_device_cache_pci_state(adev->pdev);
-
if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
dev_info(adev->dev, "GPU smu mode1 reset\n");
ret = amdgpu_dpm_mode1_reset(adev);
--
2.25.1




RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to complete

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

-
Best Regards,
Thomas

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 18, 2024 8:00 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley 
Subject: Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to 
complete



On 6/18/2024 4:51 PM, Chai, Thomas wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> -
> Best Regards,
> Thomas
>
> -Original Message-
> From: Chai, Thomas
> Sent: Tuesday, June 18, 2024 7:09 PM
> To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Zhou1, Tao
> ; Li, Candice ; Wang,
> Yang(Kevin) ; Yang, Stanley
> 
> Subject: RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras
> reset to complete
>
>
>
>
> -
> Best Regards,
> Thomas
>
> -Original Message-
> From: Lazar, Lijo 
> Sent: Tuesday, June 18, 2024 6:09 PM
> To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Zhou1, Tao
> ; Li, Candice ; Wang,
> Yang(Kevin) ; Yang, Stanley
> 
> Subject: Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras
> reset to complete
>
>
>
> On 6/18/2024 12:03 PM, YiPeng Chai wrote:
>> Add completion to wait for ras reset to complete.
>>
>> Signed-off-by: YiPeng Chai 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>>  2 files changed, 12 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index 898889600771..7f8e6ca07957 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct
>> ras_common_if
>> *ras_block)
>>
>>  #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
>>
>> +#define MAX_RAS_RECOVERY_COMPLETION_TIME  12 //ms
>> +
>>  enum amdgpu_ras_retire_page_reservation {
>>   AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>>   AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -2518,6 +2520,8 @@ static
>> void amdgpu_ras_do_recovery(struct work_struct *work)
>>   atomic_set(&hive->ras_recovery, 0);
>>   amdgpu_put_xgmi_hive(hive);
>>   }
>> +
>> + complete_all(&ras->ras_recovery_completion);
>>  }
>>
>>  /* alloc/realloc bps array */
>> @@ -2911,10 +2915,16 @@ static int
>> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>>
>>   flush_delayed_work(&con->page_retirement_dwork);
>>
>> + reinit_completion(&con->ras_recovery_completion);
>> +
>>   con->gpu_reset_flags |= reset;
>>   amdgpu_ras_reset_gpu(adev);
>>
>>   *gpu_reset = reset;
>> + if (!wait_for_completion_timeout(&con->ras_recovery_completion,
>> + 
>> msecs_to_jiffies(MAX_RAS_RECOVERY_COMPLETION_TIME)))
>> + dev_err(adev->dev, "Waiting for GPU to complete ras 
>> reset timeout! reset:0x%x\n",
>> + reset);
>
>> If a mode-1 reset gets to execute first due to job timeout/hws detect cases 
>> in poison timeout, then the ras handler will never get executed.
>> Why this wait is required?
>
>> Thanks,
>> Lijo
>
> [Thomas]  "[PATCH 5/5] drm/amdgpu: add gpu reset check and exception 
> handling" add the check before ras gpu reset.
> Poison ras reset is different from reset triggered by other 
> fatal errors, and all poison RAS resets are triggered from here,
>  in order to distinguish other gpu resets and facilitate 
> subsequent  code processing, so add wait for gpu ras reset here.
>

> Reset mechanism resets the GPU state - whether it's triggered due to poison 
> or fatal errors. As soon as the device is reset successfully, GPU operations 
> can continue.

>So why there needs to be a special wait for poison triggred reset alone?
[Thomas] Different applications may randomly trigger poison errors before gpu 
reset.
 Since poison gpu reset is triggered asynchronously, new poison 
consumption interrupts may occur in the period after gpu reset request is sent 
and before the GPU reset is actually performed..
  In order to avoid performing a poison gpu reset again after 
completing the current poison gpu reset,  It need to stay here to wait for gpu 
to complete reset and then clear the cached poison consumption messages.

>Why not wait on the RAS recovery work object  rather than another completion 
>notification?
[Thomas] Yes, "wait on RAS recovery work object" is a good idea,  I will do it.


Thanks,
Lijo

>>   }
>>
>>   return 0;
>> @@ -3041,6 +3051,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device 
>> *adev)
>>   }
>>   }
>>
>> + init_completion(&con->ras_recovery_completion);
>>   mutex_init(&con->page_rsv_lock);
>>   INIT_KFIFO(con->poison

RE: [PATCH] drm/amdgpu/atomfirmware: fix parsing of vram_info

2024-06-18 Thread Zhang, Hawking
[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Alex Deucher
Sent: Saturday, June 15, 2024 01:55
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: [PATCH] drm/amdgpu/atomfirmware: fix parsing of vram_info

v3.x changed the how vram width was encoded.  The previous implementation 
actually worked correctly for most boards.
Fix the implementation to work correctly everywhere.

This fixes the vram width reported in the kernel log on some boards.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
index f932bec6e534..f873dd3cae16 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
@@ -433,7 +433,7 @@ amdgpu_atomfirmware_get_vram_info(struct amdgpu_device 
*adev,
mem_channel_number = 
vram_info->v30.channel_num;
mem_channel_width = 
vram_info->v30.channel_width;
if (vram_width)
-   *vram_width = 
mem_channel_number * (1 << mem_channel_width);
+   *vram_width = 
mem_channel_number * 16;
break;
default:
return -EINVAL;
--
2.45.1



Re: [PATCH 4/6] drm/amdgpu: add AMDGPU_INFO_GB_ADDR_CONFIG query

2024-06-18 Thread Marek Olšák
I would put this into drm_amdgpu_info_device. That structure can grow in size.

Marek

On Tue, Jun 18, 2024 at 11:30 AM Pierre-Eric Pelloux-Prayer
 wrote:
>
> libdrm_amdgpu uses AMDGPU_INFO_READ_MMR_REG to fill the dev->info.gb_addr_cfg
> value.
> Since this value is already known by the kernel, this commit implements a new
> query to return it.
>
> The libdrm MR to use this query is:
>https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/368
>
> Signed-off-by: Pierre-Eric Pelloux-Prayer 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 5 +
>  include/uapi/drm/amdgpu_drm.h   | 2 ++
>  3 files changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index f51f121d804e..403add7f05af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -117,9 +117,10 @@
>   * - 3.56.0 - Update IB start address and size alignment for decode and 
> encode
>   * - 3.57.0 - Compute tunneling on GFX10+
>   * - 3.58.0 - Add AMDGPU_IDS_FLAGS_MODE_PF, AMDGPU_IDS_FLAGS_MODE_VF & 
> AMDGPU_IDS_FLAGS_MODE_PT
> + * - 3.59.0 - Add AMDGPU_INFO_GB_ADDR_CONFIG support
>   */
>  #define KMS_DRIVER_MAJOR   3
> -#define KMS_DRIVER_MINOR   58
> +#define KMS_DRIVER_MINOR   59
>  #define KMS_DRIVER_PATCHLEVEL  0
>
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index b32ff6e1baaf..dbb05d51682b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -1256,6 +1256,10 @@ static int amdgpu_info(struct drm_device *dev, void 
> *data, struct drm_file *filp
> return copy_to_user(out, &gpuvm_fault,
> min((size_t)size, sizeof(gpuvm_fault))) ? 
> -EFAULT : 0;
> }
> +   case AMDGPU_INFO_GB_ADDR_CONFIG: {
> +   ui32 = adev->gfx.config.gb_addr_config;
> +   return copy_to_user(out, &ui32, min(size, 4u)) ? -EFAULT : 0;
> +   }
> default:
> DRM_DEBUG_KMS("Invalid request %d\n", info->query);
> return -EINVAL;
> @@ -1310,6 +1314,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
> *data, struct drm_file *filp)
> case AMDGPU_INFO_VIDEO_CAPS:
> case AMDGPU_INFO_MAX_IBS:
> case AMDGPU_INFO_GPUVM_FAULT:
> +   case AMDGPU_INFO_GB_ADDR_CONFIG:
> need_runtime_pm = false;
> break;
>
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 3e488b0119eb..680492cd73d8 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -933,6 +933,8 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow {
>  #define AMDGPU_INFO_MAX_IBS0x22
>  /* query last page fault info */
>  #define AMDGPU_INFO_GPUVM_FAULT0x23
> +/* Query GB_ADDR_CONFIG */
> +#define AMDGPU_INFO_GB_ADDR_CONFIG 0x24
>
>  #define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0
>  #define AMDGPU_INFO_MMR_SE_INDEX_MASK  0xff
> --
> 2.40.1
>


Re: [PATCH v2 8/8] drm/amdgpu: Call drm_atomic_helper_shutdown() at shutdown time

2024-06-18 Thread Doug Anderson
Hi,

On Tue, Jun 18, 2024 at 3:00 PM Alex Deucher  wrote:
>
> On Tue, Jun 18, 2024 at 5:40 PM Doug Anderson  wrote:
> >
> > Hi,
> >
> >
> > On Mon, Jun 17, 2024 at 8:01 AM Alex Deucher  wrote:
> > >
> > > On Wed, Jun 12, 2024 at 6:37 PM Douglas Anderson  
> > > wrote:
> > > >
> > > > Based on grepping through the source code this driver appears to be
> > > > missing a call to drm_atomic_helper_shutdown() at system shutdown
> > > > time. Among other things, this means that if a panel is in use that it
> > > > won't be cleanly powered off at system shutdown time.
> > > >
> > > > The fact that we should call drm_atomic_helper_shutdown() in the case
> > > > of OS shutdown/restart comes straight out of the kernel doc "driver
> > > > instance overview" in drm_drv.c.
> > > >
> > > > Suggested-by: Maxime Ripard 
> > > > Cc: Alex Deucher 
> > > > Cc: Christian König 
> > > > Cc: Xinhui Pan 
> > > > Signed-off-by: Douglas Anderson 
> > > > ---
> > > > This commit is only compile-time tested.
> > > >
> > > > ...and further, I'd say that this patch is more of a plea for help
> > > > than a patch I think is actually right. I'm _fairly_ certain that
> > > > drm/amdgpu needs this call at shutdown time but the logic is a bit
> > > > hard for me to follow. I'd appreciate if anyone who actually knows
> > > > what this should look like could illuminate me, or perhaps even just
> > > > post a patch themselves!
> > >
> > > I'm not sure this patch makes sense or not.  The driver doesn't really
> > > do a formal tear down in its shutdown routine, it just quiesces the
> > > hardware.  What are the actual requirements of the shutdown function?
> > > In the past when we did a full driver tear down in shutdown, it
> > > delayed the shutdown sequence and users complained.
> >
> > The "inspiration" for this patch is to handle panels properly.
> > Specifically, panels often have several power/enable signals going to
> > them and often have requirements that these signals are powered off in
> > the proper order with the proper delays between them. While we can't
> > always do so when the system crashes / reboots in an uncontrolled way,
> > panel manufacturers / HW Engineers get upset if we don't power things
> > off properly during an orderly shutdown/reboot. When panels are
> > powered off badly it can cause garbage on the screen and, so I've been
> > told, can even cause long term damage to the panels over time.
> >
> > In Linux, some panel drivers have tried to ensure a proper poweroff of
> > the panel by handling the shutdown() call themselves. However, this is
> > ugly and panel maintainers want panel drivers to stop doing it. We
> > have removed the code doing this from most panels now [1]. Instead the
> > assumption is that the DRM modeset drivers should be calling
> > drm_atomic_helper_shutdown() which will make sure panels get an
> > orderly shutdown.
> >
> > For a lot more details, see the cover letter [2] which then contains
> > links to even more discussions about the topic.
> >
> > [1] https://lore.kernel.org/r/20240605002401.2848541-1-diand...@chromium.org
> > [2] https://lore.kernel.org/r/2024061435.3188234-1-diand...@chromium.org
>
> I don't think it's an issue.  We quiesce the hardware as if we were
> about to suspend the system (e.g., S3).  For the display hardware we
> call drm_atomic_helper_suspend() as part of that sequence.

OK. It's no skin off my teeth and we can drop this patch if you're
convinced it's not needed. From the point of view of someone who has
no experience with this driver it seems weird to me that it would use
drm_atomic_helper_suspend() at shutdown time instead of the documented
drm_atomic_helper_shutdown(), but if it works for everyone then I'm
not gonna complain.

-Doug


Re: [PATCH v3 2/2] drm/amd: Add power_saving_policy drm property to eDP connectors

2024-06-18 Thread Leo Li




On 2024-06-05 22:04, Mario Limonciello wrote:

When the `power_saving_policy` property is set to bit mask
"Require color accuracy" ABM should be disabled immediately and
any requests by sysfs to update will return an -EBUSY error.

When the `power_saving_policy` property is set to bit mask
"Require low latency" PSR should be disabled.

When the property is restored to an empty bit mask the previous
value of ABM and pSR will be restored.

Signed-off-by: Mario Limonciello 

Reviewed-by: Leo Li 

Thanks!


---
v2->v3:
  * Use `disallow_edp_enter_psr` instead
  * Drop case in dm_update_crtc_state()
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_display.c   |  4 ++
  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 50 +--
  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |  2 +
  3 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index 3ecc7ef95172..cfb5220cf182 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -1350,6 +1350,10 @@ int amdgpu_display_modeset_create_props(struct 
amdgpu_device *adev)
 "dither",
 amdgpu_dither_enum_list, sz);
  
+	if (adev->dc_enabled)

+   drm_mode_create_power_saving_policy_property(adev_to_drm(adev),
+
DRM_MODE_POWER_SAVING_POLICY_ALL);
+
return 0;
  }
  
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c

index f1d67c6f4b98..5fd7210b2479 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -6436,6 +6436,13 @@ int amdgpu_dm_connector_atomic_set_property(struct 
drm_connector *connector,
} else if (property == adev->mode_info.underscan_property) {
dm_new_state->underscan_enable = val;
ret = 0;
+   } else if (property == dev->mode_config.power_saving_policy) {
+   dm_new_state->abm_forbidden = val & 
DRM_MODE_REQUIRE_COLOR_ACCURACY;
+   dm_new_state->abm_level = (dm_new_state->abm_forbidden || 
!amdgpu_dm_abm_level) ?
+   ABM_LEVEL_IMMEDIATE_DISABLE :
+   amdgpu_dm_abm_level;
+   dm_new_state->psr_forbidden = val & 
DRM_MODE_REQUIRE_LOW_LATENCY;
+   ret = 0;
}
  
  	return ret;

@@ -6478,6 +6485,13 @@ int amdgpu_dm_connector_atomic_get_property(struct 
drm_connector *connector,
} else if (property == adev->mode_info.underscan_property) {
*val = dm_state->underscan_enable;
ret = 0;
+   } else if (property == dev->mode_config.power_saving_policy) {
+   *val = 0;
+   if (dm_state->psr_forbidden)
+   *val |= DRM_MODE_REQUIRE_LOW_LATENCY;
+   if (dm_state->abm_forbidden)
+   *val |= DRM_MODE_REQUIRE_COLOR_ACCURACY;
+   ret = 0;
}
  
  	return ret;

@@ -6504,9 +6518,12 @@ static ssize_t panel_power_savings_show(struct device 
*device,
u8 val;
  
  	drm_modeset_lock(&dev->mode_config.connection_mutex, NULL);

-   val = to_dm_connector_state(connector->state)->abm_level ==
-   ABM_LEVEL_IMMEDIATE_DISABLE ? 0 :
-   to_dm_connector_state(connector->state)->abm_level;
+   if (to_dm_connector_state(connector->state)->abm_forbidden)
+   val = 0;
+   else
+   val = to_dm_connector_state(connector->state)->abm_level ==
+   ABM_LEVEL_IMMEDIATE_DISABLE ? 0 :
+   to_dm_connector_state(connector->state)->abm_level;
drm_modeset_unlock(&dev->mode_config.connection_mutex);
  
  	return sysfs_emit(buf, "%u\n", val);

@@ -6530,10 +6547,16 @@ static ssize_t panel_power_savings_store(struct device 
*device,
return -EINVAL;
  
  	drm_modeset_lock(&dev->mode_config.connection_mutex, NULL);

-   to_dm_connector_state(connector->state)->abm_level = val ?:
-   ABM_LEVEL_IMMEDIATE_DISABLE;
+   if (to_dm_connector_state(connector->state)->abm_forbidden)
+   ret = -EBUSY;
+   else
+   to_dm_connector_state(connector->state)->abm_level = val ?:
+   ABM_LEVEL_IMMEDIATE_DISABLE;
drm_modeset_unlock(&dev->mode_config.connection_mutex);
  
+	if (ret)

+   return ret;
+
drm_kms_helper_hotplug_event(dev);
  
  	return count;

@@ -7704,6 +7727,13 @@ void amdgpu_dm_connector_init_helper(struct 
amdgpu_display_manager *dm,
aconnector->base.state->max_bpc = 16;
aconnector->base.state->max_requested_bpc = 
aconnector->base.state->max_bpc;
  
+	if (connector_type == DRM_MODE_CONNECTOR_eDP &&

+   (dc

Re: [PATCH] drm/amdgpu/atomfirmware: fix parsing of vram_info

2024-06-18 Thread Alex Deucher
Ping?

Alex

On Fri, Jun 14, 2024 at 2:12 PM Alex Deucher  wrote:
>
> v3.x changed the how vram width was encoded.  The previous
> implementation actually worked correctly for most boards.
> Fix the implementation to work correctly everywhere.
>
> This fixes the vram width reported in the kernel log on
> some boards.
>
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> index f932bec6e534..f873dd3cae16 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> @@ -433,7 +433,7 @@ amdgpu_atomfirmware_get_vram_info(struct amdgpu_device 
> *adev,
> mem_channel_number = 
> vram_info->v30.channel_num;
> mem_channel_width = 
> vram_info->v30.channel_width;
> if (vram_width)
> -   *vram_width = 
> mem_channel_number * (1 << mem_channel_width);
> +   *vram_width = 
> mem_channel_number * 16;
> break;
> default:
> return -EINVAL;
> --
> 2.45.1
>


Re: [PATCH v2 8/8] drm/amdgpu: Call drm_atomic_helper_shutdown() at shutdown time

2024-06-18 Thread Alex Deucher
On Tue, Jun 18, 2024 at 5:40 PM Doug Anderson  wrote:
>
> Hi,
>
>
> On Mon, Jun 17, 2024 at 8:01 AM Alex Deucher  wrote:
> >
> > On Wed, Jun 12, 2024 at 6:37 PM Douglas Anderson  
> > wrote:
> > >
> > > Based on grepping through the source code this driver appears to be
> > > missing a call to drm_atomic_helper_shutdown() at system shutdown
> > > time. Among other things, this means that if a panel is in use that it
> > > won't be cleanly powered off at system shutdown time.
> > >
> > > The fact that we should call drm_atomic_helper_shutdown() in the case
> > > of OS shutdown/restart comes straight out of the kernel doc "driver
> > > instance overview" in drm_drv.c.
> > >
> > > Suggested-by: Maxime Ripard 
> > > Cc: Alex Deucher 
> > > Cc: Christian König 
> > > Cc: Xinhui Pan 
> > > Signed-off-by: Douglas Anderson 
> > > ---
> > > This commit is only compile-time tested.
> > >
> > > ...and further, I'd say that this patch is more of a plea for help
> > > than a patch I think is actually right. I'm _fairly_ certain that
> > > drm/amdgpu needs this call at shutdown time but the logic is a bit
> > > hard for me to follow. I'd appreciate if anyone who actually knows
> > > what this should look like could illuminate me, or perhaps even just
> > > post a patch themselves!
> >
> > I'm not sure this patch makes sense or not.  The driver doesn't really
> > do a formal tear down in its shutdown routine, it just quiesces the
> > hardware.  What are the actual requirements of the shutdown function?
> > In the past when we did a full driver tear down in shutdown, it
> > delayed the shutdown sequence and users complained.
>
> The "inspiration" for this patch is to handle panels properly.
> Specifically, panels often have several power/enable signals going to
> them and often have requirements that these signals are powered off in
> the proper order with the proper delays between them. While we can't
> always do so when the system crashes / reboots in an uncontrolled way,
> panel manufacturers / HW Engineers get upset if we don't power things
> off properly during an orderly shutdown/reboot. When panels are
> powered off badly it can cause garbage on the screen and, so I've been
> told, can even cause long term damage to the panels over time.
>
> In Linux, some panel drivers have tried to ensure a proper poweroff of
> the panel by handling the shutdown() call themselves. However, this is
> ugly and panel maintainers want panel drivers to stop doing it. We
> have removed the code doing this from most panels now [1]. Instead the
> assumption is that the DRM modeset drivers should be calling
> drm_atomic_helper_shutdown() which will make sure panels get an
> orderly shutdown.
>
> For a lot more details, see the cover letter [2] which then contains
> links to even more discussions about the topic.
>
> [1] https://lore.kernel.org/r/20240605002401.2848541-1-diand...@chromium.org
> [2] https://lore.kernel.org/r/2024061435.3188234-1-diand...@chromium.org

I don't think it's an issue.  We quiesce the hardware as if we were
about to suspend the system (e.g., S3).  For the display hardware we
call drm_atomic_helper_suspend() as part of that sequence.

Alex


Re: [PATCH v2 8/8] drm/amdgpu: Call drm_atomic_helper_shutdown() at shutdown time

2024-06-18 Thread Doug Anderson
Hi,


On Mon, Jun 17, 2024 at 8:01 AM Alex Deucher  wrote:
>
> On Wed, Jun 12, 2024 at 6:37 PM Douglas Anderson  
> wrote:
> >
> > Based on grepping through the source code this driver appears to be
> > missing a call to drm_atomic_helper_shutdown() at system shutdown
> > time. Among other things, this means that if a panel is in use that it
> > won't be cleanly powered off at system shutdown time.
> >
> > The fact that we should call drm_atomic_helper_shutdown() in the case
> > of OS shutdown/restart comes straight out of the kernel doc "driver
> > instance overview" in drm_drv.c.
> >
> > Suggested-by: Maxime Ripard 
> > Cc: Alex Deucher 
> > Cc: Christian König 
> > Cc: Xinhui Pan 
> > Signed-off-by: Douglas Anderson 
> > ---
> > This commit is only compile-time tested.
> >
> > ...and further, I'd say that this patch is more of a plea for help
> > than a patch I think is actually right. I'm _fairly_ certain that
> > drm/amdgpu needs this call at shutdown time but the logic is a bit
> > hard for me to follow. I'd appreciate if anyone who actually knows
> > what this should look like could illuminate me, or perhaps even just
> > post a patch themselves!
>
> I'm not sure this patch makes sense or not.  The driver doesn't really
> do a formal tear down in its shutdown routine, it just quiesces the
> hardware.  What are the actual requirements of the shutdown function?
> In the past when we did a full driver tear down in shutdown, it
> delayed the shutdown sequence and users complained.

The "inspiration" for this patch is to handle panels properly.
Specifically, panels often have several power/enable signals going to
them and often have requirements that these signals are powered off in
the proper order with the proper delays between them. While we can't
always do so when the system crashes / reboots in an uncontrolled way,
panel manufacturers / HW Engineers get upset if we don't power things
off properly during an orderly shutdown/reboot. When panels are
powered off badly it can cause garbage on the screen and, so I've been
told, can even cause long term damage to the panels over time.

In Linux, some panel drivers have tried to ensure a proper poweroff of
the panel by handling the shutdown() call themselves. However, this is
ugly and panel maintainers want panel drivers to stop doing it. We
have removed the code doing this from most panels now [1]. Instead the
assumption is that the DRM modeset drivers should be calling
drm_atomic_helper_shutdown() which will make sure panels get an
orderly shutdown.

For a lot more details, see the cover letter [2] which then contains
links to even more discussions about the topic.

[1] https://lore.kernel.org/r/20240605002401.2848541-1-diand...@chromium.org
[2] https://lore.kernel.org/r/2024061435.3188234-1-diand...@chromium.org


Re: [PATCH] drm/amd/display: Disable CONFIG_DRM_AMD_DC_FP for RISC-V with clang

2024-06-18 Thread Alex Deucher
Applied.  Thanks!

Alex

On Tue, Jun 18, 2024 at 10:17 AM Harry Wentland  wrote:
>
>
>
> On 2024-06-14 15:54, Nathan Chancellor wrote:
> > Commit 77acc6b55ae4 ("riscv: add support for kernel-mode FPU") and
> > commit a28e4b672f04 ("drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT")
> > enabled support for CONFIG_DRM_AMD_DC_FP with RISC-V. Unfortunately,
> > this exposed -Wframe-larger-than warnings (which become fatal with
> > CONFIG_WERROR=y) when building ARCH=riscv allmodconfig with clang:
> >
> >   
> > drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_32.c:58:13:
> >  error: stack frame size (2448) exceeds limit (2048) in 
> > 'DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation'
> >  [-Werror,-Wframe-larger-than]
> >  58 | static void 
> > DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation(
> > | ^
> >   1 error generated.
> >
> > Many functions in this file use a large number of parameters, which must
> > be passed on the stack at a certain pointer due to register exhaustion,
> > which can cause high stack usage when inlining and issues with stack
> > slot analysis get involved. While the compiler can and should do better
> > (as GCC uses less than half the amount of stack space for the same
> > function), it is not as simple as a fix as adjusting the functions not
> > to take a large number of parameters.
> >
> > Unfortunately, modifying these files to avoid the problem is a difficult
> > to justify approach because any revisions to the files in the kernel
> > tree never make it back to the original source (so copies of the code
> > for newer hardware revisions just reintroduce the issue) and the files
> > are hard to read/modify due to being "gcc-parsable HW gospel, coming
> > straight from HW engineers".
> >
> > Avoid building the problematic code for RISC-V by modifying the existing
> > condition for arm64 that exists for the same reason. Factor out the
> > logical not to make the condition a little more readable naturally.
> >
> > Fixes: a28e4b672f04 ("drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT")
> > Reported-by: Palmer Dabbelt 
> > Closes: https://lore.kernel.org/20240530145741.7506-2-pal...@rivosinc.com/
> > Signed-off-by: Nathan Chancellor 
> > ---
> >  drivers/gpu/drm/amd/display/Kconfig | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/display/Kconfig 
> > b/drivers/gpu/drm/amd/display/Kconfig
> > index 5fcd4f778dc3..47b8b49da8a7 100644
> > --- a/drivers/gpu/drm/amd/display/Kconfig
> > +++ b/drivers/gpu/drm/amd/display/Kconfig
> > @@ -8,7 +8,7 @@ config DRM_AMD_DC
> >   depends on BROKEN || !CC_IS_CLANG || ARM64 || RISCV || SPARC64 || 
> > X86_64
> >   select SND_HDA_COMPONENT if SND_HDA_CORE
> >   # !CC_IS_CLANG: https://github.com/ClangBuiltLinux/linux/issues/1752
> > - select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && (!ARM64 || 
> > !CC_IS_CLANG)
> > + select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && !(CC_IS_CLANG 
> > && (ARM64 || RISCV))
>
> Thanks for also making the logic easier to parse.
>
> Reviewed-by: Harry Wentland 
>
> Harry
>
> >   help
> > Choose this option if you want to use the new display engine
> > support for AMDGPU. This adds required support for Vega and
> >
> > ---
> > base-commit: c6c4dd54012551cce5cde408b35468f2c62b0cce
> > change-id: 20240614-amdgpu-disable-drm-amd-dc-fp-riscv-clang-31c84f6b990d
> >
> > Best regards,
>


[PATCH] drm/amdgpu: process RAS fatal error MB notification

2024-06-18 Thread Vignesh Chander
For RAS error scenario, VF guest driver will check mailbox
and set fed flag to avoid unnecessary HW accesses.
additionally, poll for reset completion message first
to avoid accidentally spamming multiple reset requests to host.

v2: add another mailbox check for handling case where kfd detects
timeout first

Signed-off-by: Vignesh Chander 
Change-Id: Ib501c653265883999c62a12a209ce5eb81c80846
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 25 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  4 +++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c| 22 +++--
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h|  4 +++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c| 22 +++--
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h|  3 ++-
 6 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 63f2286858c484..ccb3d041c2b249 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -229,6 +229,22 @@ void amdgpu_virt_free_mm_table(struct amdgpu_device *adev)
adev->virt.mm_table.gpu_addr = 0;
 }
 
+/**
+ * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt
+ * @adev:  amdgpu device.
+ * Check whether host sent RAS error message
+ * Return: true if found, otherwise false
+ */
+bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev)
+{
+   struct amdgpu_virt *virt = &adev->virt;
+
+   if (!virt->ops || !virt->ops->rcvd_ras_intr)
+   return false;
+
+   return virt->ops->rcvd_ras_intr(adev);
+}
+
 
 unsigned int amd_sriov_msg_checksum(void *obj,
unsigned long obj_size,
@@ -612,11 +628,14 @@ static void amdgpu_virt_update_vf2pf_work_item(struct 
work_struct *work)
ret = amdgpu_virt_read_pf2vf_data(adev);
if (ret) {
adev->virt.vf2pf_update_retry_cnt++;
-   if ((adev->virt.vf2pf_update_retry_cnt >= 
AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
-   amdgpu_sriov_runtime(adev)) {
+
+   if ((amdgpu_virt_rcvd_ras_interrupt(adev) ||
+   adev->virt.vf2pf_update_retry_cnt >= 
AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
+   amdgpu_sriov_runtime(adev)) {
+
amdgpu_ras_set_fed(adev, true);
if (amdgpu_reset_domain_schedule(adev->reset_domain,
- 
&adev->kfd.reset_work))
+   &adev->kfd.reset_work))
return;
else
dev_err(adev->dev, "Failed to queue work! at 
%s", __func__);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index f04cd1586c7220..b42a8854dca0cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -52,7 +52,7 @@
 /* tonga/fiji use this offset */
 #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
 
-#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5
+#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
 
 enum amdgpu_sriov_vf_mode {
SRIOV_VF_MODE_BARE_METAL = 0,
@@ -94,6 +94,7 @@ struct amdgpu_virt_ops {
  u32 data1, u32 data2, u32 data3);
void (*ras_poison_handler)(struct amdgpu_device *adev,
enum amdgpu_ras_block block);
+   bool (*rcvd_ras_intr)(struct amdgpu_device *adev);
 };
 
 /*
@@ -352,6 +353,7 @@ void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev);
 int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
+bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev);
 void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev);
 void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev);
 void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 65656afc6ed1c2..1bb8393ad6d358 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -196,11 +196,22 @@ static int xgpu_ai_request_reset(struct amdgpu_device 
*adev)
 {
int ret, i = 0;
 
-   while (i < AI_MAILBOX_POLL_MSG_REP_MAX) {
+   if (amdgpu_ras_get_fed_status(adev) || xgpu_ai_rcvd_ras_intr(adev)) {
+   dev_dbg(adev->dev, "ras flag is set, poll for 
IDH_FLR_NOTIFICATION_CMPL\n");
+
+   for (i = 0; i < AI_MAILBOX_POLL_MSG_REP_MAX; i++) {
+   ret = xgpu_ai_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL);
+   if (!ret)
+   break;
+
+   dev_dbg(adev->dev, "retries left = %d\n", 
AI_MAILBOX_POLL_MSG_REP_MAX - i);
+ 

Re: [PATCH v2 4/4] tests/amdgpu/amd_psr: Add support for `power saving policy` property

2024-06-18 Thread Mario Limonciello

On 6/18/2024 15:20, Leo Li wrote:



On 2024-05-22 18:08, Mario Limonciello wrote:

Verify that the property has disabled PSR
---
  tests/amdgpu/amd_psr.c | 74 ++
  1 file changed, 74 insertions(+)

diff --git a/tests/amdgpu/amd_psr.c b/tests/amdgpu/amd_psr.c
index 9da161a09..a9f4a6aa5 100644
--- a/tests/amdgpu/amd_psr.c
+++ b/tests/amdgpu/amd_psr.c
@@ -338,6 +338,78 @@ static void run_check_psr(data_t *data, bool 
test_null_crtc) {

  test_fini(data);
  }
+static void psr_forbidden(data_t *data)
+{
+    int edp_idx, ret, i, psr_state;
+    igt_fb_t ref_fb, ref_fb2;
+    igt_fb_t *flip_fb;
+    igt_output_t *output;
+
+    test_init(data);
+
+    edp_idx = check_conn_type(data, DRM_MODE_CONNECTOR_eDP);
+    igt_skip_on_f(edp_idx == -1, "no eDP connector found\n");
+
+    /* check if eDP support PSR1, PSR-SU.
+ */
+    igt_skip_on(!psr_mode_supported(data, PSR_MODE_1) && 
!psr_mode_supported(data, PSR_MODE_2));

+    for_each_connected_output(&data->display, output) {
+
+    if (output->config.connector->connector_type != 
DRM_MODE_CONNECTOR_eDP)

+    continue;
+    ret = clear_power_saving_policy(data->fd, output);
+    if (ret == -ENODEV) {
+    igt_skip("No power saving policy prop\n");
+    return;
+    }
+    igt_require(ret == 0);
+
+    /* try to engage PSR */
+    ret = set_panel_power_saving_policy(data->fd, output, 
DRM_MODE_REQUIRE_LOW_LATENCY);

+    igt_assert_eq(ret, 0);
+
+    igt_create_color_fb(data->fd, data->mode->hdisplay,
+    data->mode->vdisplay, DRM_FORMAT_XRGB, 0, 1.0,
+    0.0, 0.0, &ref_fb);
+    igt_create_color_fb(data->fd, data->mode->hdisplay,
+    data->mode->vdisplay, DRM_FORMAT_XRGB, 0, 0.0,
+    1.0, 0.0, &ref_fb2);
+
+    igt_plane_set_fb(data->primary, &ref_fb);
+
+    igt_display_commit_atomic(&data->display, 
DRM_MODE_ATOMIC_ALLOW_MODESET, 0);

+
+    for (i = 0; i < N_FLIPS; i++) {
+    if (i % 2 == 0)
+    flip_fb = &ref_fb2;
+    else
+    flip_fb = &ref_fb;
+
+    ret = drmModePageFlip(data->fd, 
output->config.crtc->crtc_id,
+  flip_fb->fb_id, DRM_MODE_PAGE_FLIP_EVENT, 
NULL);

+    igt_require(ret == 0);
+    kmstest_wait_for_pageflip(data->fd);
+    }
+
+    /* PSR state takes some time to settle its value on static 
screen */

+    sleep(PSR_SETTLE_DELAY);
+
+    psr_state =  igt_amd_read_psr_state(data->fd, output->name);
+    igt_require(psr_state == PSR_STATE3);


igt_fail_on* or igt_assert* should be used here, igt_require simply 
'skips' the

test if the condition evaluates to false.



Got it; thanks.

Should we be instead asserting psr_state == PSR_STATE0 here for 
disabled, since

we've set REQUIRE_LOW_LATENCY?


Yeah I think you're right.



I think as part of this test, we can also check that PSR re-enables after
clearing the power saving policy. Something like

ret = clear_power_saving_policy(data->fd, output);
... do some flipping ...
sleep(PSR_SETTLE_DELAY);
psr_state = igt_amd_read_psr_state(data->fd, output->name);
igt_assert_f(psr_state == PSR_STATE3,
  "Panel not in PSR after clearing power saving policy\n");



Agree, thanks.


Thanks,
Leo


+
+    igt_remove_fb(data->fd, &ref_fb);
+    igt_remove_fb(data->fd, &ref_fb2);
+
+    ret = clear_power_saving_policy(data->fd, output);
+    if (ret == -ENODEV) {
+    igt_skip("No power saving policy prop\n");
+    return;
+    }
+    igt_require(ret == 0);
+
+    }
+}
+
  static void run_check_psr_su_mpo(data_t *data, bool scaling, float 
scaling_ratio)

  {
  int edp_idx = check_conn_type(data, DRM_MODE_CONNECTOR_eDP);
@@ -756,6 +828,8 @@ igt_main_args("", long_options, help_str, 
opt_handler, NULL)
  igt_describe("Test to validate PSR SU enablement with Visual 
Confirm "
   "and to validate PSR SU disable/re-enable w/ primary 
scaling ratio 0.75");
  igt_subtest("psr_su_mpo_scaling_0_75") 
run_check_psr_su_mpo(&data, true, .75);

+    igt_describe("Test whether PSR can be forbidden");
+    igt_subtest("psr_forbidden") psr_forbidden(&data);
  igt_fixture
  {




Re: [PATCH v2 3/4] tests/amdgpu/amd_abm: Add support for panel_power_saving property

2024-06-18 Thread Mario Limonciello

On 6/18/2024 15:20, Leo Li wrote:


Thanks for the tests! FYI IGT patches should also cc 
igt-...@lists.freedesktop.org


Some comments inline:

On 2024-05-22 18:08, Mario Limonciello wrote:

From: Mario Limonciello 

When the "panel power saving" property is set to forbidden the
compositor has indicated that userspace prefers to have color
accuracy and fidelity instead of power saving.

Verify that the sysfs file behaves as expected in this situation.

Signed-off-by: Mario Limonciello 
---
  tests/amdgpu/amd_abm.c | 39 +++
  1 file changed, 39 insertions(+)

diff --git a/tests/amdgpu/amd_abm.c b/tests/amdgpu/amd_abm.c
index f74c3012c..3fa1366fa 100644
--- a/tests/amdgpu/amd_abm.c
+++ b/tests/amdgpu/amd_abm.c
@@ -365,6 +365,43 @@ static void abm_gradual(data_t *data)
  }
  }
+
+static void abm_forbidden(data_t *data)
+{
+    igt_output_t *output;
+    enum pipe pipe;
+    int target, r;
+
+    for_each_pipe_with_valid_output(&data->display, pipe, output) {
+    if (output->config.connector->connector_type != 
DRM_MODE_CONNECTOR_eDP)

+    continue;
+
+    r = clear_power_saving_policy(data->drm_fd, output);
+    if (r == -ENODEV) {
+    igt_skip("No power saving policy prop\n");
+    return;
+    }
+    igt_assert_eq(r, 0);
+
+    target = 3;
+    r = set_abm_level(data, output, target);
+    igt_assert_eq(r, 0);
+
+    r = set_panel_power_saving_policy(data->drm_fd, output, 
DRM_MODE_REQUIRE_COLOR_ACCURACY);

+    igt_assert_eq(r, 0);
+
+    target = 0;


Is there a purpose of setting target abm to 0 (disabled) here?

I suppose it should fail given that we've set REQUIRE_COLOR_ACCURACY. 
Though I'm

not sure why we can't keep target = 3.


Yes I think this would work as well to prove a failure.  I'll change it.



Thanks,
Leo


+    r = set_abm_level(data, output, target);
+    igt_assert_eq(r, -1);
+
+    r = clear_power_saving_policy(data->drm_fd, output);
+    igt_assert_eq(r, 0);
+
+    r = set_abm_level(data, output, target);
+    igt_assert_eq(r, 0);
+    }
+}
+
  igt_main
  {
  data_t data = {};
@@ -393,6 +430,8 @@ igt_main
  abm_enabled(&data);
  igt_subtest("abm_gradual")
  abm_gradual(&data);
+    igt_subtest("abm_forbidden")
+    abm_forbidden(&data);
  igt_fixture {
  igt_display_fini(&data.display);




Re: [PATCH v7 2/9] drm: Support per-plane async flip configuration

2024-06-18 Thread André Almeida

Em 18/06/2024 14:43, Dmitry Baryshkov escreveu:

On Tue, Jun 18, 2024 at 01:18:10PM GMT, André Almeida wrote:

Em 18/06/2024 07:07, Dmitry Baryshkov escreveu:

On Tue, 18 Jun 2024 at 12:38, Jani Nikula  wrote:


On Tue, 18 Jun 2024, André Almeida  wrote:

Drivers have different capabilities on what plane types they can or
cannot perform async flips. Create a plane::async_flip field so each
driver can choose which planes they allow doing async flips.

Signed-off-by: André Almeida 
---
   include/drm/drm_plane.h | 5 +
   1 file changed, 5 insertions(+)

diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index 9507542121fa..0bebc72af5c3 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -786,6 +786,11 @@ struct drm_plane {
 * @kmsg_panic: Used to register a panic notifier for this plane
 */
struct kmsg_dumper kmsg_panic;
+
+ /**
+  * @async_flip: indicates if a plane can do async flips
+  */


When is it okay to set or change the value of this member?

If you don't document it, people will find creative uses for this.


Maybe it's better to have a callback instead of a static field? This
way it becomes clear that it's only relevant at the time of the
atomic_check().



So we would have something like bool (*async_flip) for struct
drm_plane_funcs I suppose. Then each driver will implement this function and
check on runtime if it should flip or not, right?

I agree that it makes more clear, but as far as I can see this is not
something that is subject to being changed at runtime at all, so it seems a
bit overkill to me to encapsulate a static information like that. I prefer
to improve the documentation on the struct member to see if this solves the
problem. What do you think of the following comment:


It looks like I keep on mixing async_flips as handled via the
DRM_MODE_PAGE_FLIP_ASYNC and the plane flips that are governed by
.atomic_async_check / .atomic_async_update / drm_atomic_helper_check()
and which end up being used just for legacy cursor updates.

So, yes, those are two different code paths, but with your changes I
think it becomes even easier to get confused between
atomic_async_check() and .async_flip member.



I see, now that I read about atomic_async_check(), it got me confused as 
well :)


I see that drivers define atomic_async_check() to tell DRM whether or 
not such plane is able to do async flips... just like I'm trying to do 
here. amdgpu implementation for that function is almost the opposite of 
the restrictions that I've implemented in this patchset:


int amdgpu_dm_plane_atomic_async_check(...) {
/* Only support async updates on cursor planes. */
if (plane->type != DRM_PLANE_TYPE_CURSOR)
return -EINVAL;

return 0;
}

Anyway, I'll try to see if the legacy cursor path might be incorporated 
somehow in the DRM_MODE_PAGE_FLIP_ASYNC path, or to come up with 
something that makes them more distinguishable.


Thanks!




/**
  * @async_flip: indicates if a plane can perform async flips. The
  * driver should set this true only for planes that the hardware
  * supports flipping asynchronously. It may not be changed during
  * runtime. This field is checked inside drm_mode_atomic_ioctl() to
  * allow only the correct planes to go with DRM_MODE_PAGE_FLIP_ASYNC.
  */




Re: [PATCH v2 4/4] tests/amdgpu/amd_psr: Add support for `power saving policy` property

2024-06-18 Thread Leo Li




On 2024-05-22 18:08, Mario Limonciello wrote:

Verify that the property has disabled PSR
---
  tests/amdgpu/amd_psr.c | 74 ++
  1 file changed, 74 insertions(+)

diff --git a/tests/amdgpu/amd_psr.c b/tests/amdgpu/amd_psr.c
index 9da161a09..a9f4a6aa5 100644
--- a/tests/amdgpu/amd_psr.c
+++ b/tests/amdgpu/amd_psr.c
@@ -338,6 +338,78 @@ static void run_check_psr(data_t *data, bool 
test_null_crtc) {
test_fini(data);
  }
  
+static void psr_forbidden(data_t *data)

+{
+   int edp_idx, ret, i, psr_state;
+   igt_fb_t ref_fb, ref_fb2;
+   igt_fb_t *flip_fb;
+   igt_output_t *output;
+
+   test_init(data);
+
+   edp_idx = check_conn_type(data, DRM_MODE_CONNECTOR_eDP);
+   igt_skip_on_f(edp_idx == -1, "no eDP connector found\n");
+
+   /* check if eDP support PSR1, PSR-SU.
+*/
+   igt_skip_on(!psr_mode_supported(data, PSR_MODE_1) && 
!psr_mode_supported(data, PSR_MODE_2));
+   for_each_connected_output(&data->display, output) {
+
+   if (output->config.connector->connector_type != 
DRM_MODE_CONNECTOR_eDP)
+   continue;
+   ret = clear_power_saving_policy(data->fd, output);
+   if (ret == -ENODEV) {
+   igt_skip("No power saving policy prop\n");
+   return;
+   }
+   igt_require(ret == 0);
+
+   /* try to engage PSR */
+   ret = set_panel_power_saving_policy(data->fd, output, 
DRM_MODE_REQUIRE_LOW_LATENCY);
+   igt_assert_eq(ret, 0);
+
+   igt_create_color_fb(data->fd, data->mode->hdisplay,
+   data->mode->vdisplay, DRM_FORMAT_XRGB, 
0, 1.0,
+   0.0, 0.0, &ref_fb);
+   igt_create_color_fb(data->fd, data->mode->hdisplay,
+   data->mode->vdisplay, DRM_FORMAT_XRGB, 
0, 0.0,
+   1.0, 0.0, &ref_fb2);
+
+   igt_plane_set_fb(data->primary, &ref_fb);
+
+   igt_display_commit_atomic(&data->display, 
DRM_MODE_ATOMIC_ALLOW_MODESET, 0);
+
+   for (i = 0; i < N_FLIPS; i++) {
+   if (i % 2 == 0)
+   flip_fb = &ref_fb2;
+   else
+   flip_fb = &ref_fb;
+
+   ret = drmModePageFlip(data->fd, 
output->config.crtc->crtc_id,
+ flip_fb->fb_id, 
DRM_MODE_PAGE_FLIP_EVENT, NULL);
+   igt_require(ret == 0);
+   kmstest_wait_for_pageflip(data->fd);
+   }
+
+   /* PSR state takes some time to settle its value on static 
screen */
+   sleep(PSR_SETTLE_DELAY);
+
+   psr_state =  igt_amd_read_psr_state(data->fd, output->name);
+   igt_require(psr_state == PSR_STATE3);


igt_fail_on* or igt_assert* should be used here, igt_require simply 'skips' the
test if the condition evaluates to false.

Should we be instead asserting psr_state == PSR_STATE0 here for disabled, since
we've set REQUIRE_LOW_LATENCY?

I think as part of this test, we can also check that PSR re-enables after
clearing the power saving policy. Something like

ret = clear_power_saving_policy(data->fd, output);
... do some flipping ...
sleep(PSR_SETTLE_DELAY);
psr_state = igt_amd_read_psr_state(data->fd, output->name);
igt_assert_f(psr_state == PSR_STATE3,
 "Panel not in PSR after clearing power saving policy\n");

Thanks,
Leo


+
+   igt_remove_fb(data->fd, &ref_fb);
+   igt_remove_fb(data->fd, &ref_fb2);
+
+   ret = clear_power_saving_policy(data->fd, output);
+   if (ret == -ENODEV) {
+   igt_skip("No power saving policy prop\n");
+   return;
+   }
+   igt_require(ret == 0);
+
+   }
+}
+
  static void run_check_psr_su_mpo(data_t *data, bool scaling, float 
scaling_ratio)
  {
int edp_idx = check_conn_type(data, DRM_MODE_CONNECTOR_eDP);
@@ -756,6 +828,8 @@ igt_main_args("", long_options, help_str, opt_handler, NULL)
igt_describe("Test to validate PSR SU enablement with Visual Confirm "
 "and to validate PSR SU disable/re-enable w/ primary scaling 
ratio 0.75");
igt_subtest("psr_su_mpo_scaling_0_75") run_check_psr_su_mpo(&data, 
true, .75);
+   igt_describe("Test whether PSR can be forbidden");
+   igt_subtest("psr_forbidden") psr_forbidden(&data);
  
  	igt_fixture

{


Re: [PATCH v2 3/4] tests/amdgpu/amd_abm: Add support for panel_power_saving property

2024-06-18 Thread Leo Li



Thanks for the tests! FYI IGT patches should also cc 
igt-...@lists.freedesktop.org

Some comments inline:

On 2024-05-22 18:08, Mario Limonciello wrote:

From: Mario Limonciello 

When the "panel power saving" property is set to forbidden the
compositor has indicated that userspace prefers to have color
accuracy and fidelity instead of power saving.

Verify that the sysfs file behaves as expected in this situation.

Signed-off-by: Mario Limonciello 
---
  tests/amdgpu/amd_abm.c | 39 +++
  1 file changed, 39 insertions(+)

diff --git a/tests/amdgpu/amd_abm.c b/tests/amdgpu/amd_abm.c
index f74c3012c..3fa1366fa 100644
--- a/tests/amdgpu/amd_abm.c
+++ b/tests/amdgpu/amd_abm.c
@@ -365,6 +365,43 @@ static void abm_gradual(data_t *data)
}
  }
  
+

+static void abm_forbidden(data_t *data)
+{
+   igt_output_t *output;
+   enum pipe pipe;
+   int target, r;
+
+   for_each_pipe_with_valid_output(&data->display, pipe, output) {
+   if (output->config.connector->connector_type != 
DRM_MODE_CONNECTOR_eDP)
+   continue;
+
+   r = clear_power_saving_policy(data->drm_fd, output);
+   if (r == -ENODEV) {
+   igt_skip("No power saving policy prop\n");
+   return;
+   }
+   igt_assert_eq(r, 0);
+
+   target = 3;
+   r = set_abm_level(data, output, target);
+   igt_assert_eq(r, 0);
+
+   r = set_panel_power_saving_policy(data->drm_fd, output, 
DRM_MODE_REQUIRE_COLOR_ACCURACY);
+   igt_assert_eq(r, 0);
+
+   target = 0;


Is there a purpose of setting target abm to 0 (disabled) here?

I suppose it should fail given that we've set REQUIRE_COLOR_ACCURACY. Though I'm
not sure why we can't keep target = 3.

Thanks,
Leo


+   r = set_abm_level(data, output, target);
+   igt_assert_eq(r, -1);
+
+   r = clear_power_saving_policy(data->drm_fd, output);
+   igt_assert_eq(r, 0);
+
+   r = set_abm_level(data, output, target);
+   igt_assert_eq(r, 0);
+   }
+}
+
  igt_main
  {
data_t data = {};
@@ -393,6 +430,8 @@ igt_main
abm_enabled(&data);
igt_subtest("abm_gradual")
abm_gradual(&data);
+   igt_subtest("abm_forbidden")
+   abm_forbidden(&data);
  
  	igt_fixture {

igt_display_fini(&data.display);


Re: [PATCH] drm/amdkfd: Update mm interval notifier tree without acquiring mm's mmap lock

2024-06-18 Thread Felix Kuehling



On 2024-06-12 16:11, Xiaogang.Chen wrote:

From: Xiaogang Chen 

Current kfd/svm driver acquires mm's mmap write lock before update
mm->notifier_subscriptions->itree. This tree is already protected
by mm->notifier_subscriptions->lock at mmu notifier. Each process mm interval
tree update from different components in kernel go to mmu interval notifier
where they got serialized. This patch removes mmap write lock acquiring at
kfd/svm driver when need updates process mm interval tree. It reduces chance
of dead lock or warning from lockdev and simplifies the driver code.

In addition, the patch adjusts some locks granularity to reduce the lock number
that driver holds at same time which also reduces the chance of dead lock or
warning from lockdev.

Signed-off-by: Xiaogang Chen
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   3 +-
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |   6 +-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 181 +++
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h |   2 +-
  4 files changed, 122 insertions(+), 70 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index fdf171ad4a3c..b52588ded567 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1078,9 +1078,8 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file 
*filep,
/* Flush pending deferred work to avoid racing with deferred actions
 * from previous memory map changes (e.g. munmap).
 */
-   svm_range_list_lock_and_flush_work(&p->svms, current->mm);
+   svm_range_list_flush_work(&p->svms);
mutex_lock(&p->svms.lock);
-   mmap_write_unlock(current->mm);
if (interval_tree_iter_first(&p->svms.objects,
 args->va_addr >> PAGE_SHIFT,
 (args->va_addr + args->size - 1) >> 
PAGE_SHIFT)) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 8ee3d07ffbdf..eb46643d96b2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -969,10 +969,12 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
mutex_lock(&p->svms.lock);
  
  	prange = svm_range_from_addr(&p->svms, addr, NULL);

+
+   mutex_unlock(&p->svms.lock);


You must continue to hold the svms.lock here. As soon as you drop the 
lock, the prange can be freed or changed, so cannot keep using this 
pointer without holding the lock.




if (!prange) {
pr_debug("failed get range svms 0x%p addr 0x%lx\n", &p->svms, 
addr);
r = -EFAULT;
-   goto out_unlock_svms;
+   goto out_unref_process;
}
  
  	mutex_lock(&prange->migrate_mutex);

@@ -993,8 +995,6 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
  
  out_unlock_prange:

mutex_unlock(&prange->migrate_mutex);
-out_unlock_svms:
-   mutex_unlock(&p->svms.lock);
  out_unref_process:
pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr);
kfd_unref_process(p);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 407636a68814..46f81c1215d9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -106,12 +106,31 @@ static void svm_range_unlink(struct svm_range *prange)
  }
  
  static void

-svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
+svm_range_add_notifier(struct mm_struct *mm, struct svm_range *prange, bool 
locked)
  {
pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
 prange, prange->start, prange->last);
  
-	mmu_interval_notifier_insert_locked(&prange->notifier, mm,

+   /* mm->notifier_subscriptions should have been setup for this process
+* ex: during kfd process creation
+*/
+   WARN_ON_ONCE(!mm->notifier_subscriptions);
+
+   /* not necessary hold mmap lock to update mm interval notifier tree as
+* opeations on mm->notifier_subscriptions->itree are serialized by
+* mm->notifier_subscriptions->lock
+*/
+   if (locked) {
+   /* if mmap write lock has been hold use lock version to udpate
+* mm interval notifier tree
+*/
+   mmu_interval_notifier_insert_locked(&prange->notifier, mm,
+  prange->start << PAGE_SHIFT,
+  prange->npages << PAGE_SHIFT,
+  &svm_range_mn_ops);
+   } else
+   /* use no-mmap-lock version to update mm interval notifier tree 
*/
+   mmu_interval_notifier_insert(&prange->notifier, mm,
 prange->start << PAGE_SHIFT,
 prange->npages << PAGE_SHIFT,
 &svm_range_mn_ops);
@@ -

Re: [PATCH] drm/amd: Don't initialize ISP hardware without FW

2024-06-18 Thread Alex Deucher
On Tue, Jun 18, 2024 at 2:02 PM Mario Limonciello
 wrote:
>
> Although designs may contain an ISP IP block, the camera might be a USB
> camera. Because of this the ISP firmware is considered optional from
> amdgpu.  However if the firmware doesn't get loaded the hardware should
> not be initialized.
>
> Adjust the return code for early init to ensure the IP block doesn't go
> through the other init and fini sequences. Also decrease the message
> about firmware load failure to debug so it's not as alarming to users.
>
> Signed-off-by: Mario Limonciello 

Acked-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
> index 215bae809153..4766e99dd98f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
> @@ -142,8 +142,8 @@ static int isp_early_init(void *handle)
> isp->parent = adev->dev;
>
> if (isp_load_fw_by_psp(adev)) {
> -   DRM_WARN("%s: isp fw load failed\n", __func__);
> -   return 0;
> +   DRM_DEBUG_DRIVER("%s: isp fw load failed\n", __func__);
> +   return -ENOENT;
> }
>
> return 0;
> --
> 2.34.1
>


Re: [PATCH] drm/amdgpu/jpeg5: reprogram doorbell setting after power up for each playback

2024-06-18 Thread Alex Deucher
On Tue, Jun 18, 2024 at 1:17 PM Sonny Jiang  wrote:
>
> From: Sonny Jiang 
>
> Doorbell needs to be configured after power up during each playback
>
> Signed-off-by: Sonny Jiang 

Acked-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c 
> b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c
> index 68ef29bc70e2..e766b9463759 100644
> --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c
> @@ -137,10 +137,6 @@ static int jpeg_v5_0_0_hw_init(void *handle)
> adev->nbio.funcs->vcn_doorbell_range(adev, ring->use_doorbell,
> (adev->doorbell_index.vcn.vcn_ring0_1 << 1), 0);
>
> -   WREG32_SOC15(VCN, 0, regVCN_JPEG_DB_CTRL,
> -   ring->doorbell_index << 
> VCN_JPEG_DB_CTRL__OFFSET__SHIFT |
> -   VCN_JPEG_DB_CTRL__EN_MASK);
> -
> r = amdgpu_ring_test_helper(ring);
> if (r)
> return r;
> @@ -314,6 +310,10 @@ static int jpeg_v5_0_0_start(struct amdgpu_device *adev)
> JPEG_SYS_INT_EN__DJRBC0_MASK,
> ~JPEG_SYS_INT_EN__DJRBC0_MASK);
>
> +   WREG32_SOC15(VCN, 0, regVCN_JPEG_DB_CTRL,
> +   ring->doorbell_index << VCN_JPEG_DB_CTRL__OFFSET__SHIFT |
> +   VCN_JPEG_DB_CTRL__EN_MASK);
> +
> WREG32_SOC15(JPEG, 0, regUVD_LMI_JRBC_RB_VMID, 0);
> WREG32_SOC15(JPEG, 0, regUVD_JRBC_RB_CNTL, (0x0001L | 
> 0x0002L));
> WREG32_SOC15(JPEG, 0, regUVD_LMI_JRBC_RB_64BIT_BAR_LOW,
> --
> 2.45.1
>


RE: [PATCH] drm/amd/pm: check whether smu is idle in sriov case

2024-06-18 Thread Slivka, Danijel
[AMD Official Use Only - AMD Internal Distribution Only]

I tried to set the C2PMSG_90 register to 1 on the PF side ( after receiving 
command for request GPU init from VF) and from PF side this value is set to 0x1 
but from VF side the register still reads the old value.


BR,
Danijel

>-Original Message-
>From: Wang, Yang(Kevin) 
>Sent: Tuesday, June 18, 2024 5:20 PM
>To: Slivka, Danijel ; amd-gfx@lists.freedesktop.org
>Cc: Slivka, Danijel ; Chen, JingWen (Wayne)
>; Zhou, Peng Ju 
>Subject: RE: [PATCH] drm/amd/pm: check whether smu is idle in sriov case
>
>This looks more like a workaround.
>Can we write the C2PMSG_90 register to 1 on the PF side when host receive
>GPU_RESET/GPU_INIT request command?
>
>Best Regards,
>Kevin
>
>-Original Message-
>From: amd-gfx  On Behalf Of Danijel
>Slivka
>Sent: 2024年6月18日 23:00
>To: amd-gfx@lists.freedesktop.org
>Cc: Slivka, Danijel ; Chen, JingWen (Wayne)
>; Zhou, Peng Ju 
>Subject: [PATCH] drm/amd/pm: check whether smu is idle in sriov case
>
>Why:
>If the reg mmMP1_SMN_C2PMSG_90 is being written to before or during
>amdgpu driver load or driver unload in sriov case, subsequent amdgpu driver
>load will fail at smu hw_init.
>The default of mmMP1_SMN_C2PMSG_90 register at a clean environment is
>0x1, and if value differs from 0x1, amdgpu driver load will fail.
>
>How to fix:
>This patch is to check whether smu is idle by sending a test message to smu. If
>smu is idle, it will respond.
>This will avoid errors in case mmMP1_SMN_C2PMSG_90 is not 0x1
>eventhough smu is idle.
>
>Signed-off-by: Danijel Slivka 
>Signed-off-by: Jingwen Chen 
>Signed-off-by: pengzhou 
>---
> .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 17 ++--
> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c| 42
>+++
> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h|  3 ++
> 3 files changed, 58 insertions(+), 4 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>index e17466cc1952..dafd91b352ec 100644
>--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>@@ -231,6 +231,7 @@ int smu_v13_0_check_fw_status(struct smu_context
>*smu)  {
>   struct amdgpu_device *adev = smu->adev;
>   uint32_t mp1_fw_flags;
>+  int ret = 0;
>
>   switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
>   case IP_VERSION(13, 0, 4):
>@@ -244,11 +245,19 @@ int smu_v13_0_check_fw_status(struct
>smu_context *smu)
>   break;
>   }
>
>-  if ((mp1_fw_flags &
>MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) >>
>-  MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED__SHIFT)
>-  return 0;
>+  if (!((mp1_fw_flags &
>MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) >>
>+  MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED__SHIFT))
>+  return -EIO;
>+
>+  if (amdgpu_sriov_vf(adev)) {
>+  ret = smu_cmn_wait_smu_idle(smu);
>+  if (ret) {
>+  dev_err(adev->dev, "SMU is not idle\n");
>+  return ret;
>+  }
>+  }
>
>-  return -EIO;
>+  return 0;
> }
>
> int smu_v13_0_check_fw_version(struct smu_context *smu) diff --git
>a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>index 5592fd825aa3..de431c31ca7f 100644
>--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>@@ -359,6 +359,48 @@ int smu_cmn_wait_for_response(struct smu_context
>*smu)
>   return res;
> }
>
>+/**
>+ * smu_cmn_wait_smu_idle -- wait for smu to become idle
>+ * @smu: pointer to an SMU context
>+ *
>+ * Send SMU_MSG_TestMessage to check whether SMU is idle.
>+ * If SMU is idle, it will respond.
>+ * The returned parameter will be the param you pass + 1.
>+ *
>+ * Return 0 on success, -errno on error, indicating the execution
>+ * status and result of the message being waited for. See
>+ * __smu_cmn_reg2errno() for details of the -errno.
>+ */
>+int smu_cmn_wait_smu_idle(struct smu_context *smu) {
>+  u32 reg;
>+  u32 param = 0xff00011;
>+  uint32_t read_arg;
>+  int res, index;
>+
>+  index = smu_cmn_to_asic_specific_index(smu,
>+ CMN2ASIC_MAPPING_MSG,
>+ SMU_MSG_TestMessage);
>+
>+  if (index < 0)
>+  return index == -EACCES ? 0 : index;
>+
>+  __smu_cmn_send_msg(smu, index, param);
>+  reg = __smu_cmn_poll_stat(smu);
>+  res = __smu_cmn_reg2errno(smu, reg);
>+
>+  if (unlikely(smu->adev->pm.smu_debug_mask &
>SMU_DEBUG_HALT_ON_ERROR) &&
>+  res && (res != -ETIME)) {
>+  amdgpu_device_halt(smu->adev);
>+  WARN_ON(1);
>+  }
>+
>+  smu_cmn_read_arg(smu, &read_arg);
>+  if (read_arg == param + 1)
>+  return 0;
>+  return res;
>+}
>+
> /**
>  * smu_cmn_send_smc_msg_with_param -- send a message with parameter
>  * @smu: poi

RE: [PATCH] drm/amd: Don't initialize ISP hardware without FW

2024-06-18 Thread Nirujogi, Pratap
[AMD Official Use Only - AMD Internal Distribution Only]

LGTM, thank you.

-Original Message-
From: Limonciello, Mario 
Sent: Tuesday, June 18, 2024 12:43 PM
To: amd-gfx@lists.freedesktop.org
Cc: Nirujogi, Pratap ; Limonciello, Mario 

Subject: [PATCH] drm/amd: Don't initialize ISP hardware without FW

Although designs may contain an ISP IP block, the camera might be a USB camera. 
Because of this the ISP firmware is considered optional from amdgpu.  However 
if the firmware doesn't get loaded the hardware should not be initialized.

Adjust the return code for early init to ensure the IP block doesn't go through 
the other init and fini sequences. Also decrease the message about firmware 
load failure to debug so it's not as alarming to users.

Signed-off-by: Mario Limonciello 
Reviewed-by: Pratap Nirujogi 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
index 215bae809153..4766e99dd98f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
@@ -142,8 +142,8 @@ static int isp_early_init(void *handle)
isp->parent = adev->dev;

if (isp_load_fw_by_psp(adev)) {
-   DRM_WARN("%s: isp fw load failed\n", __func__);
-   return 0;
+   DRM_DEBUG_DRIVER("%s: isp fw load failed\n", __func__);
+   return -ENOENT;
}

return 0;
--
2.34.1



Re: [PATCH v7 2/9] drm: Support per-plane async flip configuration

2024-06-18 Thread Dmitry Baryshkov
On Tue, Jun 18, 2024 at 01:18:10PM GMT, André Almeida wrote:
> Em 18/06/2024 07:07, Dmitry Baryshkov escreveu:
> > On Tue, 18 Jun 2024 at 12:38, Jani Nikula  
> > wrote:
> > > 
> > > On Tue, 18 Jun 2024, André Almeida  wrote:
> > > > Drivers have different capabilities on what plane types they can or
> > > > cannot perform async flips. Create a plane::async_flip field so each
> > > > driver can choose which planes they allow doing async flips.
> > > > 
> > > > Signed-off-by: André Almeida 
> > > > ---
> > > >   include/drm/drm_plane.h | 5 +
> > > >   1 file changed, 5 insertions(+)
> > > > 
> > > > diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
> > > > index 9507542121fa..0bebc72af5c3 100644
> > > > --- a/include/drm/drm_plane.h
> > > > +++ b/include/drm/drm_plane.h
> > > > @@ -786,6 +786,11 @@ struct drm_plane {
> > > > * @kmsg_panic: Used to register a panic notifier for this plane
> > > > */
> > > >struct kmsg_dumper kmsg_panic;
> > > > +
> > > > + /**
> > > > +  * @async_flip: indicates if a plane can do async flips
> > > > +  */
> > > 
> > > When is it okay to set or change the value of this member?
> > > 
> > > If you don't document it, people will find creative uses for this.
> > 
> > Maybe it's better to have a callback instead of a static field? This
> > way it becomes clear that it's only relevant at the time of the
> > atomic_check().
> > 
> 
> So we would have something like bool (*async_flip) for struct
> drm_plane_funcs I suppose. Then each driver will implement this function and
> check on runtime if it should flip or not, right?
> 
> I agree that it makes more clear, but as far as I can see this is not
> something that is subject to being changed at runtime at all, so it seems a
> bit overkill to me to encapsulate a static information like that. I prefer
> to improve the documentation on the struct member to see if this solves the
> problem. What do you think of the following comment:

It looks like I keep on mixing async_flips as handled via the
DRM_MODE_PAGE_FLIP_ASYNC and the plane flips that are governed by
.atomic_async_check / .atomic_async_update / drm_atomic_helper_check()
and which end up being used just for legacy cursor updates.

So, yes, those are two different code paths, but with your changes I
think it becomes even easier to get confused between
atomic_async_check() and .async_flip member.


> /**
>  * @async_flip: indicates if a plane can perform async flips. The
>  * driver should set this true only for planes that the hardware
>  * supports flipping asynchronously. It may not be changed during
>  * runtime. This field is checked inside drm_mode_atomic_ioctl() to
>  * allow only the correct planes to go with DRM_MODE_PAGE_FLIP_ASYNC.
>  */

-- 
With best wishes
Dmitry


RE: [PATCH 1/2] drm/amdgpu: Use dev_ prints for virtualization as it supports multi adapter

2024-06-18 Thread Luo, Zhigang
[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Zhigang Luo 

-Original Message-
From: Chander, Vignesh 
Sent: Monday, June 17, 2024 10:55 AM
To: amd-gfx@lists.freedesktop.org
Cc: Chan, Hing Pong ; Luo, Zhigang ; 
Chander, Vignesh ; Chander, Vignesh 

Subject: [PATCH 1/2] drm/amdgpu: Use dev_ prints for virtualization as it 
supports multi adapter

Signed-off-by: Vignesh Chander 
Change-Id: Ifead637951c00e5b4e97c766d172323dcac4da08
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 19 +++  
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 23 +++
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 6b71ee85ee6556..65656afc6ed1c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -93,7 +93,7 @@ static int xgpu_ai_poll_ack(struct amdgpu_device *adev)
timeout -= 5;
} while (timeout > 1);

-   pr_err("Doesn't get TRN_MSG_ACK from pf in %d msec\n", 
AI_MAILBOX_POLL_ACK_TIMEDOUT);
+   dev_err(adev->dev, "Doesn't get TRN_MSG_ACK from pf in %d msec\n",
+AI_MAILBOX_POLL_ACK_TIMEDOUT);

return -ETIME;
 }
@@ -111,7 +111,7 @@ static int xgpu_ai_poll_msg(struct amdgpu_device *adev, 
enum idh_event event)
timeout -= 10;
} while (timeout > 1);

-   pr_err("Doesn't get msg:%d from pf, error=%d\n", event, r);
+   dev_err(adev->dev, "Doesn't get msg:%d from pf, error=%d\n", event,
+r);

return -ETIME;
 }
@@ -132,7 +132,7 @@ static void xgpu_ai_mailbox_trans_msg (struct amdgpu_device 
*adev,
xgpu_ai_mailbox_set_valid(adev, false);
trn = xgpu_ai_peek_ack(adev);
if (trn) {
-   pr_err("trn=%x ACK should not assert! wait again !\n", 
trn);
+   dev_err_ratelimited(adev->dev, "trn=%x ACK should not 
assert! wait
+again !\n", trn);
msleep(1);
}
} while(trn);
@@ -155,7 +155,7 @@ static void xgpu_ai_mailbox_trans_msg (struct amdgpu_device 
*adev,
/* start to poll ack */
r = xgpu_ai_poll_ack(adev);
if (r)
-   pr_err("Doesn't get ack from pf, continue\n");
+   dev_err(adev->dev, "Doesn't get ack from pf, continue\n");

xgpu_ai_mailbox_set_valid(adev, false);  } @@ -173,7 +173,7 @@ static 
int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
req == IDH_REQ_GPU_RESET_ACCESS) {
r = xgpu_ai_poll_msg(adev, IDH_READY_TO_ACCESS_GPU);
if (r) {
-   pr_err("Doesn't get READY_TO_ACCESS_GPU from pf, give 
up\n");
+   dev_err(adev->dev, "Doesn't get READY_TO_ACCESS_GPU 
from pf, give
+up\n");
return r;
}
/* Retrieve checksum from mailbox2 */ @@ -231,7 +231,7 @@ 
static int xgpu_ai_mailbox_ack_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry)
 {
-   DRM_DEBUG("get ack intr and do nothing.\n");
+   dev_dbg(adev->dev, "get ack intr and do nothing.\n");
return 0;
 }

@@ -258,12 +258,15 @@ static int xgpu_ai_wait_reset(struct amdgpu_device *adev) 
 {
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
do {
-   if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
+   if (xgpu_ai_mailbox_peek_msg(adev) == 
IDH_FLR_NOTIFICATION_CMPL) {
+   dev_dbg(adev->dev, "Got AI IDH_FLR_NOTIFICATION_CMPL 
after %d ms\n",
+AI_MAILBOX_POLL_FLR_TIMEDOUT - timeout);
return 0;
+   }
msleep(10);
timeout -= 10;
} while (timeout > 1);
-   dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
+
+   dev_dbg(adev->dev, "waiting AI IDH_FLR_NOTIFICATION_CMPL timeout\n");
return -ETIME;
 }

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 22af30a15a5fd7..17e1e8cc243752 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -91,7 +91,7 @@ static int xgpu_nv_poll_ack(struct amdgpu_device *adev)
timeout -= 5;
} while (timeout > 1);

-   pr_err("Doesn't get TRN_MSG_ACK from pf in %d msec\n", 
NV_MAILBOX_POLL_ACK_TIMEDOUT);
+   dev_err(adev->dev, "Doesn't get TRN_MSG_ACK from pf in %d msec \n",
+NV_MAILBOX_POLL_ACK_TIMEDOUT);

return -ETIME;
 }
@@ -106,13 +106,16 @@ static int xgpu_nv_poll_msg(struct amdgpu_device *adev, 
enum idh_event event)

do {
r = xgpu_nv_mailbox_rcv_msg(adev, event);
-   if (!r)
+   if (!r) {
+   dev_dbg(adev->dev, "rcv_msg 0x%x after %llu ms\n", 
event,
+

[PATCH] drm/amd: Don't initialize ISP hardware without FW

2024-06-18 Thread Mario Limonciello
Although designs may contain an ISP IP block, the camera might be a USB
camera. Because of this the ISP firmware is considered optional from
amdgpu.  However if the firmware doesn't get loaded the hardware should
not be initialized.

Adjust the return code for early init to ensure the IP block doesn't go
through the other init and fini sequences. Also decrease the message
about firmware load failure to debug so it's not as alarming to users.

Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
index 215bae809153..4766e99dd98f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
@@ -142,8 +142,8 @@ static int isp_early_init(void *handle)
isp->parent = adev->dev;
 
if (isp_load_fw_by_psp(adev)) {
-   DRM_WARN("%s: isp fw load failed\n", __func__);
-   return 0;
+   DRM_DEBUG_DRIVER("%s: isp fw load failed\n", __func__);
+   return -ENOENT;
}
 
return 0;
-- 
2.34.1



[PATCH] drm/amdgpu/jpeg5: reprogram doorbell setting after power up for each playback

2024-06-18 Thread Sonny Jiang
From: Sonny Jiang 

Doorbell needs to be configured after power up during each playback

Signed-off-by: Sonny Jiang 
---
 drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c
index 68ef29bc70e2..e766b9463759 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c
@@ -137,10 +137,6 @@ static int jpeg_v5_0_0_hw_init(void *handle)
adev->nbio.funcs->vcn_doorbell_range(adev, ring->use_doorbell,
(adev->doorbell_index.vcn.vcn_ring0_1 << 1), 0);
 
-   WREG32_SOC15(VCN, 0, regVCN_JPEG_DB_CTRL,
-   ring->doorbell_index << VCN_JPEG_DB_CTRL__OFFSET__SHIFT 
|
-   VCN_JPEG_DB_CTRL__EN_MASK);
-
r = amdgpu_ring_test_helper(ring);
if (r)
return r;
@@ -314,6 +310,10 @@ static int jpeg_v5_0_0_start(struct amdgpu_device *adev)
JPEG_SYS_INT_EN__DJRBC0_MASK,
~JPEG_SYS_INT_EN__DJRBC0_MASK);
 
+   WREG32_SOC15(VCN, 0, regVCN_JPEG_DB_CTRL,
+   ring->doorbell_index << VCN_JPEG_DB_CTRL__OFFSET__SHIFT |
+   VCN_JPEG_DB_CTRL__EN_MASK);
+
WREG32_SOC15(JPEG, 0, regUVD_LMI_JRBC_RB_VMID, 0);
WREG32_SOC15(JPEG, 0, regUVD_JRBC_RB_CNTL, (0x0001L | 0x0002L));
WREG32_SOC15(JPEG, 0, regUVD_LMI_JRBC_RB_64BIT_BAR_LOW,
-- 
2.45.1



Re: [PATCH v7 2/9] drm: Support per-plane async flip configuration

2024-06-18 Thread André Almeida

Em 18/06/2024 07:07, Dmitry Baryshkov escreveu:

On Tue, 18 Jun 2024 at 12:38, Jani Nikula  wrote:


On Tue, 18 Jun 2024, André Almeida  wrote:

Drivers have different capabilities on what plane types they can or
cannot perform async flips. Create a plane::async_flip field so each
driver can choose which planes they allow doing async flips.

Signed-off-by: André Almeida 
---
  include/drm/drm_plane.h | 5 +
  1 file changed, 5 insertions(+)

diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index 9507542121fa..0bebc72af5c3 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -786,6 +786,11 @@ struct drm_plane {
* @kmsg_panic: Used to register a panic notifier for this plane
*/
   struct kmsg_dumper kmsg_panic;
+
+ /**
+  * @async_flip: indicates if a plane can do async flips
+  */


When is it okay to set or change the value of this member?

If you don't document it, people will find creative uses for this.


Maybe it's better to have a callback instead of a static field? This
way it becomes clear that it's only relevant at the time of the
atomic_check().



So we would have something like bool (*async_flip) for struct 
drm_plane_funcs I suppose. Then each driver will implement this function 
and check on runtime if it should flip or not, right?


I agree that it makes more clear, but as far as I can see this is not 
something that is subject to being changed at runtime at all, so it 
seems a bit overkill to me to encapsulate a static information like 
that. I prefer to improve the documentation on the struct member to see 
if this solves the problem. What do you think of the following comment:


/**
 * @async_flip: indicates if a plane can perform async flips. The
 * driver should set this true only for planes that the hardware
 * supports flipping asynchronously. It may not be changed during
 * runtime. This field is checked inside drm_mode_atomic_ioctl() to
 * allow only the correct planes to go with DRM_MODE_PAGE_FLIP_ASYNC.
 */


[PATCH 5/6] drm/amdgpu: cache mclk/sclk min/max values

2024-06-18 Thread Pierre-Eric Pelloux-Prayer
This way these values can be returned directly when using
AMDGPU_INFO_DEV_INFO, without waking up the GPU.

Signed-off-by: Pierre-Eric Pelloux-Prayer 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   | 12 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c  |  8 
 .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c  |  8 
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   | 19 +--
 drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c|  8 
 drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c| 12 ++--
 7 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 083f353cff6e..75db8eba73d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -418,8 +418,16 @@ struct amdgpu_clock {
struct amdgpu_pll spll;
struct amdgpu_pll mpll;
/* 10 Khz units */
-   uint32_t default_mclk;
-   uint32_t default_sclk;
+   struct {
+   uint32_t min;
+   uint32_t max;
+   uint32_t def;
+   } mclk;
+   struct {
+   uint32_t min;
+   uint32_t max;
+   uint32_t def;
+   } sclk;
uint32_t default_dispclk;
uint32_t current_dispclk;
uint32_t dp_extclk;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
index 7dc102f0bc1d..f2c2b05233f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
@@ -658,9 +658,9 @@ int amdgpu_atombios_get_clock_info(struct amdgpu_device 
*adev)
mpll->pll_in_max =
le16_to_cpu(firmware_info->info.usMaxMemoryClockPLL_Input);
 
-   adev->clock.default_sclk =
+   adev->clock.sclk.def = adev->clock.sclk.min = 
adev->clock.sclk.max =
le32_to_cpu(firmware_info->info.ulDefaultEngineClock);
-   adev->clock.default_mclk =
+   adev->clock.mclk.def = adev->clock.mclk.min = 
adev->clock.mclk.max =
le32_to_cpu(firmware_info->info.ulDefaultMemoryClock);
 
mpll->min_post_div = 1;
@@ -699,8 +699,8 @@ int amdgpu_atombios_get_clock_info(struct amdgpu_device 
*adev)
ret = 0;
}
 
-   adev->pm.current_sclk = adev->clock.default_sclk;
-   adev->pm.current_mclk = adev->clock.default_mclk;
+   adev->pm.current_sclk = adev->clock.sclk.def;
+   adev->pm.current_mclk = adev->clock.mclk.def;
 
return ret;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
index f932bec6e534..6eb125b1bd08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
@@ -719,13 +719,13 @@ int amdgpu_atomfirmware_get_clock_info(struct 
amdgpu_device *adev)
(union firmware_info *)(mode_info->atom_context->bios +
data_offset);
 
-   adev->clock.default_sclk =
+   adev->clock.sclk.def = adev->clock.sclk.min = 
adev->clock.sclk.max =
le32_to_cpu(firmware_info->v31.bootup_sclk_in10khz);
-   adev->clock.default_mclk =
+   adev->clock.mclk.def = adev->clock.mclk.min = 
adev->clock.mclk.max =
le32_to_cpu(firmware_info->v31.bootup_mclk_in10khz);
 
-   adev->pm.current_sclk = adev->clock.default_sclk;
-   adev->pm.current_mclk = adev->clock.default_mclk;
+   adev->pm.current_sclk = adev->clock.sclk.def;
+   adev->pm.current_mclk = adev->clock.mclk.def;
 
ret = 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3fb02f5b91c9..03417e7e8961 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4422,6 +4422,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
amdgpu_device_check_iommu_direct_map(adev);
 
+   if (adev->pm.dpm_enabled) {
+   adev->clock.sclk.min = amdgpu_dpm_get_sclk(adev, true);
+   adev->clock.sclk.max = amdgpu_dpm_get_sclk(adev, false);
+   adev->clock.mclk.min = amdgpu_dpm_get_mclk(adev, true);
+   adev->clock.mclk.max = amdgpu_dpm_get_mclk(adev, false);
+   }
+
return 0;
 
 release_ras_con:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index dbb05d51682b..781851cf8dc9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -827,19 +827,10 @@ static int amdgpu_info(struct drm_device *dev, void 
*data, struct drm_file *filp
dev_info->num_shader_arrays_

[PATCH 6/6] drm/amdgpu: resume the device from amdgpu_gem_fault

2024-06-18 Thread Pierre-Eric Pelloux-Prayer
The fault handler may push some work to the GPU through amdgpu_bo_move
so use the pm_runtime functions before that.

Since we're in an interrupt context, we can't use the sync version,
so pm_runtime_get is called.

Signed-off-by: Pierre-Eric Pelloux-Prayer 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 1f22b4208729..ec120e33536d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -52,9 +53,13 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
vm_fault_t ret;
int idx;
 
+   ret = pm_runtime_get(ddev->dev);
+   if (ret < 0)
+   return ret;
+
ret = ttm_bo_vm_reserve(bo, vmf);
if (ret)
-   return ret;
+   goto put_pm;
 
if (drm_dev_enter(ddev, &idx)) {
ret = amdgpu_bo_fault_reserve_notify(bo);
@@ -71,10 +76,14 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
}
if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
-   return ret;
+   goto put_pm;
 
 unlock:
dma_resv_unlock(bo->base.resv);
+put_pm:
+   pm_runtime_mark_last_busy(ddev->dev);
+   pm_runtime_put_autosuspend(ddev->dev);
+
return ret;
 }
 
-- 
2.40.1



[PATCH 3/6] drm/amdgpu: refactor amdgpu_info_ioctl to allow finer pm

2024-06-18 Thread Pierre-Eric Pelloux-Prayer
AMDGPU_INFO_xxx has lots of different queries, and only a small
number of them actually reaches out to the GPU.

This commit extract the amdgpu_info_ioctl implementation to a
helper function, and then wrap it with the runtime pm logic
each query type needs.

Signed-off-by: Pierre-Eric Pelloux-Prayer 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 94 -
 2 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index d21d5af7f187..f51f121d804e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2871,6 +2871,8 @@ long amdgpu_drm_ioctl(struct file *filp,
 
if (is_driver_ioctl) {
switch (nr - DRM_COMMAND_BASE) {
+   /* amdgpu_info_ioctl will resume the device if it needs to. */
+   case DRM_AMDGPU_INFO:
/* These 4 only operate on kernel data structure. */
case DRM_AMDGPU_VM:
case DRM_AMDGPU_SCHED:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 260cd0ad286d..b32ff6e1baaf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -543,22 +543,7 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
return 0;
 }
 
-/*
- * Userspace get information ioctl
- */
-/**
- * amdgpu_info_ioctl - answer a device specific request.
- *
- * @dev: drm device pointer
- * @data: request object
- * @filp: drm filp
- *
- * This function is used to pass device specific parameters to the userspace
- * drivers.  Examples include: pci device id, pipeline parms, tiling params,
- * etc. (all asics).
- * Returns 0 on success, -EINVAL on failure.
- */
-int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file 
*filp)
+static int amdgpu_info(struct drm_device *dev, void *data, struct drm_file 
*filp)
 {
struct amdgpu_device *adev = drm_to_adev(dev);
struct drm_amdgpu_info *info = data;
@@ -1278,6 +1263,83 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file *filp)
return 0;
 }
 
+/*
+ * Userspace get information ioctl
+ */
+/**
+ * amdgpu_info_ioctl - answer a device specific request.
+ *
+ * @dev: drm device pointer
+ * @data: request object
+ * @filp: drm filp
+ *
+ * This function is used to pass device specific parameters to the userspace
+ * drivers.  Examples include: pci device id, pipeline parms, tiling params,
+ * etc. (all asics).
+ * Returns 0 on success, -EINVAL on failure.
+ */
+int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file 
*filp)
+{
+   struct drm_amdgpu_info *info = data;
+   bool need_runtime_pm;
+   int ret;
+
+   if (!info->return_size || !info->return_pointer)
+   return -EINVAL;
+
+   switch (info->query) {
+   case AMDGPU_INFO_ACCEL_WORKING:
+   case AMDGPU_INFO_CRTC_FROM_ID:
+   case AMDGPU_INFO_HW_IP_INFO:
+   case AMDGPU_INFO_HW_IP_COUNT:
+   case AMDGPU_INFO_FW_VERSION:
+   case AMDGPU_INFO_NUM_BYTES_MOVED:
+   case AMDGPU_INFO_NUM_EVICTIONS:
+   case AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS:
+   case AMDGPU_INFO_VRAM_USAGE:
+   case AMDGPU_INFO_VIS_VRAM_USAGE:
+   case AMDGPU_INFO_GTT_USAGE:
+   case AMDGPU_INFO_GDS_CONFIG:
+   case AMDGPU_INFO_VRAM_GTT:
+   case AMDGPU_INFO_MEMORY:
+   case AMDGPU_INFO_VCE_CLOCK_TABLE:
+   case AMDGPU_INFO_VBIOS:
+   case AMDGPU_INFO_NUM_HANDLES:
+   case AMDGPU_INFO_VRAM_LOST_COUNTER:
+   case AMDGPU_INFO_RAS_ENABLED_FEATURES:
+   case AMDGPU_INFO_VIDEO_CAPS:
+   case AMDGPU_INFO_MAX_IBS:
+   case AMDGPU_INFO_GPUVM_FAULT:
+   need_runtime_pm = false;
+   break;
+
+   case AMDGPU_INFO_TIMESTAMP:
+   case AMDGPU_INFO_READ_MMR_REG:
+   case AMDGPU_INFO_DEV_INFO:
+   case AMDGPU_INFO_SENSOR:
+   need_runtime_pm = true;
+   break;
+   default:
+   DRM_DEBUG_KMS("Invalid request %d\n", info->query);
+   return -EINVAL;
+   }
+
+   if (need_runtime_pm) {
+   ret = pm_runtime_get_sync(dev->dev);
+   if (ret < 0)
+   goto put_pm;
+   }
+
+   ret = amdgpu_info(dev, data, filp);
+
+put_pm:
+   if (need_runtime_pm) {
+   pm_runtime_mark_last_busy(dev->dev);
+   pm_runtime_put_autosuspend(dev->dev);
+   }
+
+   return ret;
+}
 
 /*
  * Outdated mess for old drm with Xorg being in charge (void function now).
-- 
2.40.1



[PATCH 4/6] drm/amdgpu: add AMDGPU_INFO_GB_ADDR_CONFIG query

2024-06-18 Thread Pierre-Eric Pelloux-Prayer
libdrm_amdgpu uses AMDGPU_INFO_READ_MMR_REG to fill the dev->info.gb_addr_cfg
value.
Since this value is already known by the kernel, this commit implements a new
query to return it.

The libdrm MR to use this query is:
   https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/368

Signed-off-by: Pierre-Eric Pelloux-Prayer 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 5 +
 include/uapi/drm/amdgpu_drm.h   | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index f51f121d804e..403add7f05af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -117,9 +117,10 @@
  * - 3.56.0 - Update IB start address and size alignment for decode and encode
  * - 3.57.0 - Compute tunneling on GFX10+
  * - 3.58.0 - Add AMDGPU_IDS_FLAGS_MODE_PF, AMDGPU_IDS_FLAGS_MODE_VF & 
AMDGPU_IDS_FLAGS_MODE_PT
+ * - 3.59.0 - Add AMDGPU_INFO_GB_ADDR_CONFIG support
  */
 #define KMS_DRIVER_MAJOR   3
-#define KMS_DRIVER_MINOR   58
+#define KMS_DRIVER_MINOR   59
 #define KMS_DRIVER_PATCHLEVEL  0
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index b32ff6e1baaf..dbb05d51682b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1256,6 +1256,10 @@ static int amdgpu_info(struct drm_device *dev, void 
*data, struct drm_file *filp
return copy_to_user(out, &gpuvm_fault,
min((size_t)size, sizeof(gpuvm_fault))) ? 
-EFAULT : 0;
}
+   case AMDGPU_INFO_GB_ADDR_CONFIG: {
+   ui32 = adev->gfx.config.gb_addr_config;
+   return copy_to_user(out, &ui32, min(size, 4u)) ? -EFAULT : 0;
+   }
default:
DRM_DEBUG_KMS("Invalid request %d\n", info->query);
return -EINVAL;
@@ -1310,6 +1314,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
case AMDGPU_INFO_VIDEO_CAPS:
case AMDGPU_INFO_MAX_IBS:
case AMDGPU_INFO_GPUVM_FAULT:
+   case AMDGPU_INFO_GB_ADDR_CONFIG:
need_runtime_pm = false;
break;
 
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 3e488b0119eb..680492cd73d8 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -933,6 +933,8 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow {
 #define AMDGPU_INFO_MAX_IBS0x22
 /* query last page fault info */
 #define AMDGPU_INFO_GPUVM_FAULT0x23
+/* Query GB_ADDR_CONFIG */
+#define AMDGPU_INFO_GB_ADDR_CONFIG 0x24
 
 #define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0
 #define AMDGPU_INFO_MMR_SE_INDEX_MASK  0xff
-- 
2.40.1



[PATCH 2/6] drm/amdgpu: skip runtime pm for selected ioctls

2024-06-18 Thread Pierre-Eric Pelloux-Prayer
These ioctls don't need to GPU, so don't resume it.

Signed-off-by: Pierre-Eric Pelloux-Prayer 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 45 +++--
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index a9831b243bfc..d21d5af7f187 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2855,7 +2855,9 @@ long amdgpu_drm_ioctl(struct file *filp,
 {
struct drm_file *file_priv = filp->private_data;
struct drm_device *dev;
+   bool is_driver_ioctl;
bool needs_device;
+   unsigned int nr;
long ret;
 
dev = file_priv->minor->dev;
@@ -2863,9 +2865,46 @@ long amdgpu_drm_ioctl(struct file *filp,
/* Some ioctl can opt-out of powermanagement handling
 * if they don't require the device to be resumed.
 */
-   switch (cmd) {
-   default:
-   needs_device = true;
+   nr = DRM_IOCTL_NR(cmd);
+
+   is_driver_ioctl = nr >= DRM_COMMAND_BASE && nr < DRM_COMMAND_END;
+
+   if (is_driver_ioctl) {
+   switch (nr - DRM_COMMAND_BASE) {
+   /* These 4 only operate on kernel data structure. */
+   case DRM_AMDGPU_VM:
+   case DRM_AMDGPU_SCHED:
+   case DRM_AMDGPU_BO_LIST:
+   case DRM_AMDGPU_FENCE_TO_HANDLE:
+   /* All the waits don't need to resume up the device. If there 
are
+* jobs in progress, the device can't be in suspended state. 
And if
+* there's nothing no in-flight jobs, then the waits are no-op.
+*/
+   case DRM_AMDGPU_GEM_WAIT_IDLE:
+   case DRM_AMDGPU_WAIT_CS:
+   case DRM_AMDGPU_WAIT_FENCES:
+   needs_device = false;
+   break;
+   default:
+   needs_device = true;
+   }
+   } else {
+   /* Most drm core ioctls don't need the device, but to avoid 
missing one
+* that requires it, implement the "can skip pm" logic as an 
allow list.
+*/
+   switch (nr) {
+   case DRM_IOCTL_NR(DRM_IOCTL_VERSION):
+   case DRM_IOCTL_NR(DRM_IOCTL_AUTH_MAGIC):
+   case DRM_IOCTL_NR(DRM_IOCTL_GET_CAP):
+   case DRM_IOCTL_NR(DRM_IOCTL_SYNCOBJ_CREATE):
+   /* Same logic as DRM_AMDGPU_WAIT_* */
+   case DRM_IOCTL_NR(DRM_IOCTL_SYNCOBJ_WAIT):
+   case DRM_IOCTL_NR(DRM_IOCTL_SYNCOBJ_DESTROY):
+   needs_device = false;
+   break;
+   default:
+   needs_device = true;
+   }
}
 
if (needs_device) {
-- 
2.40.1



[PATCH 1/6] drm/amdgpu: allow ioctls to opt-out of runtime pm

2024-06-18 Thread Pierre-Eric Pelloux-Prayer
Waking up a device can take multiple seconds, so if it's not
going to be used we might as well not resume it.

The safest default behavior for all ioctls is to resume the GPU,
so this change allows specific ioctls to opt-out of generic
runtime pm.

Signed-off-by: Pierre-Eric Pelloux-Prayer 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 25 -
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 60d5758939ae..a9831b243bfc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2855,18 +2855,33 @@ long amdgpu_drm_ioctl(struct file *filp,
 {
struct drm_file *file_priv = filp->private_data;
struct drm_device *dev;
+   bool needs_device;
long ret;
 
dev = file_priv->minor->dev;
-   ret = pm_runtime_get_sync(dev->dev);
-   if (ret < 0)
-   goto out;
+
+   /* Some ioctl can opt-out of powermanagement handling
+* if they don't require the device to be resumed.
+*/
+   switch (cmd) {
+   default:
+   needs_device = true;
+   }
+
+   if (needs_device) {
+   ret = pm_runtime_get_sync(dev->dev);
+   if (ret < 0)
+   goto out;
+   }
 
ret = drm_ioctl(filp, cmd, arg);
 
-   pm_runtime_mark_last_busy(dev->dev);
 out:
-   pm_runtime_put_autosuspend(dev->dev);
+   if (needs_device) {
+   pm_runtime_mark_last_busy(dev->dev);
+   pm_runtime_put_autosuspend(dev->dev);
+   }
+
return ret;
 }
 
-- 
2.40.1



[PATCH 0/6] Reduce the number of GPU resumes

2024-06-18 Thread Pierre-Eric Pelloux-Prayer
The goal I'm aiming for is to be able to open a dri node and use
amdgpu_device_initialize without waking up the GPU. One visible outcome
is that it would become possible to call vkEnumeratePhysicalDevices
without having to resume all the GPUs in the system.

This series implements some of the changes required to achieve this
goal with a focus on not waking up the GPU for ioctls that don't need
the GPU to be active.
The output of AMD_DEBUG=info doesn't change with these patches applied.

The other changes required are found in following patchset:
https://lists.freedesktop.org/archives/amd-gfx/2024-June/109796.html

Pierre-Eric Pelloux-Prayer (6):
  drm/amdgpu: allow ioctls to opt-out of runtime pm
  drm/amdgpu: skip runtime pm for selected ioctls
  drm/amdgpu: refactor amdgpu_info_ioctl to allow finer pm
  drm/amdgpu: add AMDGPU_INFO_GB_ADDR_CONFIG query
  drm/amdgpu: cache mclk/sclk min/max values
  drm/amdgpu: resume the device from amdgpu_gem_fault

 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  12 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c  |   8 +-
 .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c  |   8 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|   7 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  69 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c   |  13 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   | 116 +-
 drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c|   8 +-
 drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c|  12 +-
 include/uapi/drm/amdgpu_drm.h |   2 +
 10 files changed, 198 insertions(+), 57 deletions(-)

-- 
2.40.1



RE: [PATCH] drm/amd/pm: check whether smu is idle in sriov case

2024-06-18 Thread Wang, Yang(Kevin)
This looks more like a workaround. 
Can we write the C2PMSG_90 register to 1 on the PF side when host receive 
GPU_RESET/GPU_INIT request command?

Best Regards,
Kevin

-Original Message-
From: amd-gfx  On Behalf Of Danijel 
Slivka
Sent: 2024年6月18日 23:00
To: amd-gfx@lists.freedesktop.org
Cc: Slivka, Danijel ; Chen, JingWen (Wayne) 
; Zhou, Peng Ju 
Subject: [PATCH] drm/amd/pm: check whether smu is idle in sriov case

Why:
If the reg mmMP1_SMN_C2PMSG_90 is being written to before or during amdgpu 
driver load or driver unload in sriov case, subsequent amdgpu driver load will 
fail at smu hw_init.
The default of mmMP1_SMN_C2PMSG_90 register at a clean environment is 0x1, and 
if value differs from 0x1, amdgpu driver load will fail.

How to fix:
This patch is to check whether smu is idle by sending a test message to smu. If 
smu is idle, it will respond.
This will avoid errors in case mmMP1_SMN_C2PMSG_90 is not 0x1 eventhough smu is 
idle.

Signed-off-by: Danijel Slivka 
Signed-off-by: Jingwen Chen 
Signed-off-by: pengzhou 
---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 17 ++--
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c| 42 +++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h|  3 ++
 3 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index e17466cc1952..dafd91b352ec 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -231,6 +231,7 @@ int smu_v13_0_check_fw_status(struct smu_context *smu)  {
struct amdgpu_device *adev = smu->adev;
uint32_t mp1_fw_flags;
+   int ret = 0;
 
switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
case IP_VERSION(13, 0, 4):
@@ -244,11 +245,19 @@ int smu_v13_0_check_fw_status(struct smu_context *smu)
break;
}
 
-   if ((mp1_fw_flags & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) >>
-   MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED__SHIFT)
-   return 0;
+   if (!((mp1_fw_flags & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) >>
+   MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED__SHIFT))
+   return -EIO;
+
+   if (amdgpu_sriov_vf(adev)) {
+   ret = smu_cmn_wait_smu_idle(smu);
+   if (ret) {
+   dev_err(adev->dev, "SMU is not idle\n");
+   return ret;
+   }
+   }
 
-   return -EIO;
+   return 0;
 }
 
 int smu_v13_0_check_fw_version(struct smu_context *smu) diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 5592fd825aa3..de431c31ca7f 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -359,6 +359,48 @@ int smu_cmn_wait_for_response(struct smu_context *smu)
return res;
 }
 
+/**
+ * smu_cmn_wait_smu_idle -- wait for smu to become idle
+ * @smu: pointer to an SMU context
+ *
+ * Send SMU_MSG_TestMessage to check whether SMU is idle.
+ * If SMU is idle, it will respond.
+ * The returned parameter will be the param you pass + 1.
+ *
+ * Return 0 on success, -errno on error, indicating the execution
+ * status and result of the message being waited for. See
+ * __smu_cmn_reg2errno() for details of the -errno.
+ */
+int smu_cmn_wait_smu_idle(struct smu_context *smu) {
+   u32 reg;
+   u32 param = 0xff00011;
+   uint32_t read_arg;
+   int res, index;
+
+   index = smu_cmn_to_asic_specific_index(smu,
+  CMN2ASIC_MAPPING_MSG,
+  SMU_MSG_TestMessage);
+
+   if (index < 0)
+   return index == -EACCES ? 0 : index;
+
+   __smu_cmn_send_msg(smu, index, param);
+   reg = __smu_cmn_poll_stat(smu);
+   res = __smu_cmn_reg2errno(smu, reg);
+
+   if (unlikely(smu->adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) &&
+   res && (res != -ETIME)) {
+   amdgpu_device_halt(smu->adev);
+   WARN_ON(1);
+   }
+
+   smu_cmn_read_arg(smu, &read_arg);
+   if (read_arg == param + 1)
+   return 0;
+   return res;
+}
+
 /**
  * smu_cmn_send_smc_msg_with_param -- send a message with parameter
  * @smu: pointer to an SMU context
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
index 1de685defe85..486acfc956a5 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
@@ -51,6 +51,9 @@ static inline int pcie_gen_to_speed(uint32_t gen)  int 
smu_cmn_send_msg_without_waiting(struct smu_context *smu,
 uint16_t msg_index,
 uint32_t param);
+
+int smu_cmn_wait_smu_idle(struct smu_context *smu);
+
 int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,

[PATCH] drm/amd/pm: check whether smu is idle in sriov case

2024-06-18 Thread Danijel Slivka
Why:
If the reg mmMP1_SMN_C2PMSG_90 is being written to before or during
amdgpu driver load or driver unload in sriov case, subsequent amdgpu
driver load will fail at smu hw_init.
The default of mmMP1_SMN_C2PMSG_90 register at a clean environment is 0x1,
and if value differs from 0x1, amdgpu driver load will fail.

How to fix:
This patch is to check whether smu is idle by sending a test
message to smu. If smu is idle, it will respond.
This will avoid errors in case mmMP1_SMN_C2PMSG_90 is not 0x1 eventhough
smu is idle.

Signed-off-by: Danijel Slivka 
Signed-off-by: Jingwen Chen 
Signed-off-by: pengzhou 
---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 17 ++--
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c| 42 +++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h|  3 ++
 3 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index e17466cc1952..dafd91b352ec 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -231,6 +231,7 @@ int smu_v13_0_check_fw_status(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
uint32_t mp1_fw_flags;
+   int ret = 0;
 
switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
case IP_VERSION(13, 0, 4):
@@ -244,11 +245,19 @@ int smu_v13_0_check_fw_status(struct smu_context *smu)
break;
}
 
-   if ((mp1_fw_flags & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) >>
-   MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED__SHIFT)
-   return 0;
+   if (!((mp1_fw_flags & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) >>
+   MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED__SHIFT))
+   return -EIO;
+
+   if (amdgpu_sriov_vf(adev)) {
+   ret = smu_cmn_wait_smu_idle(smu);
+   if (ret) {
+   dev_err(adev->dev, "SMU is not idle\n");
+   return ret;
+   }
+   }
 
-   return -EIO;
+   return 0;
 }
 
 int smu_v13_0_check_fw_version(struct smu_context *smu)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 5592fd825aa3..de431c31ca7f 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -359,6 +359,48 @@ int smu_cmn_wait_for_response(struct smu_context *smu)
return res;
 }
 
+/**
+ * smu_cmn_wait_smu_idle -- wait for smu to become idle
+ * @smu: pointer to an SMU context
+ *
+ * Send SMU_MSG_TestMessage to check whether SMU is idle.
+ * If SMU is idle, it will respond.
+ * The returned parameter will be the param you pass + 1.
+ *
+ * Return 0 on success, -errno on error, indicating the execution
+ * status and result of the message being waited for. See
+ * __smu_cmn_reg2errno() for details of the -errno.
+ */
+int smu_cmn_wait_smu_idle(struct smu_context *smu)
+{
+   u32 reg;
+   u32 param = 0xff00011;
+   uint32_t read_arg;
+   int res, index;
+
+   index = smu_cmn_to_asic_specific_index(smu,
+  CMN2ASIC_MAPPING_MSG,
+  SMU_MSG_TestMessage);
+
+   if (index < 0)
+   return index == -EACCES ? 0 : index;
+
+   __smu_cmn_send_msg(smu, index, param);
+   reg = __smu_cmn_poll_stat(smu);
+   res = __smu_cmn_reg2errno(smu, reg);
+
+   if (unlikely(smu->adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) &&
+   res && (res != -ETIME)) {
+   amdgpu_device_halt(smu->adev);
+   WARN_ON(1);
+   }
+
+   smu_cmn_read_arg(smu, &read_arg);
+   if (read_arg == param + 1)
+   return 0;
+   return res;
+}
+
 /**
  * smu_cmn_send_smc_msg_with_param -- send a message with parameter
  * @smu: pointer to an SMU context
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
index 1de685defe85..486acfc956a5 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
@@ -51,6 +51,9 @@ static inline int pcie_gen_to_speed(uint32_t gen)
 int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
 uint16_t msg_index,
 uint32_t param);
+
+int smu_cmn_wait_smu_idle(struct smu_context *smu);
+
 int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
enum smu_message_type msg,
uint32_t param,
-- 
2.34.1



Re: [PATCH] drm/amd/display: Disable CONFIG_DRM_AMD_DC_FP for RISC-V with clang

2024-06-18 Thread Harry Wentland



On 2024-06-14 15:54, Nathan Chancellor wrote:
> Commit 77acc6b55ae4 ("riscv: add support for kernel-mode FPU") and
> commit a28e4b672f04 ("drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT")
> enabled support for CONFIG_DRM_AMD_DC_FP with RISC-V. Unfortunately,
> this exposed -Wframe-larger-than warnings (which become fatal with
> CONFIG_WERROR=y) when building ARCH=riscv allmodconfig with clang:
> 
>   
> drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_32.c:58:13:
>  error: stack frame size (2448) exceeds limit (2048) in 
> 'DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation'
>  [-Werror,-Wframe-larger-than]
>  58 | static void 
> DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation(
> | ^
>   1 error generated.
> 
> Many functions in this file use a large number of parameters, which must
> be passed on the stack at a certain pointer due to register exhaustion,
> which can cause high stack usage when inlining and issues with stack
> slot analysis get involved. While the compiler can and should do better
> (as GCC uses less than half the amount of stack space for the same
> function), it is not as simple as a fix as adjusting the functions not
> to take a large number of parameters.
> 
> Unfortunately, modifying these files to avoid the problem is a difficult
> to justify approach because any revisions to the files in the kernel
> tree never make it back to the original source (so copies of the code
> for newer hardware revisions just reintroduce the issue) and the files
> are hard to read/modify due to being "gcc-parsable HW gospel, coming
> straight from HW engineers".
> 
> Avoid building the problematic code for RISC-V by modifying the existing
> condition for arm64 that exists for the same reason. Factor out the
> logical not to make the condition a little more readable naturally.
> 
> Fixes: a28e4b672f04 ("drm/amd/display: use ARCH_HAS_KERNEL_FPU_SUPPORT")
> Reported-by: Palmer Dabbelt 
> Closes: https://lore.kernel.org/20240530145741.7506-2-pal...@rivosinc.com/
> Signed-off-by: Nathan Chancellor 
> ---
>  drivers/gpu/drm/amd/display/Kconfig | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/Kconfig 
> b/drivers/gpu/drm/amd/display/Kconfig
> index 5fcd4f778dc3..47b8b49da8a7 100644
> --- a/drivers/gpu/drm/amd/display/Kconfig
> +++ b/drivers/gpu/drm/amd/display/Kconfig
> @@ -8,7 +8,7 @@ config DRM_AMD_DC
>   depends on BROKEN || !CC_IS_CLANG || ARM64 || RISCV || SPARC64 || X86_64
>   select SND_HDA_COMPONENT if SND_HDA_CORE
>   # !CC_IS_CLANG: https://github.com/ClangBuiltLinux/linux/issues/1752
> - select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && (!ARM64 || 
> !CC_IS_CLANG)
> + select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && !(CC_IS_CLANG && 
> (ARM64 || RISCV))

Thanks for also making the logic easier to parse.

Reviewed-by: Harry Wentland 

Harry

>   help
> Choose this option if you want to use the new display engine
> support for AMDGPU. This adds required support for Vega and
> 
> ---
> base-commit: c6c4dd54012551cce5cde408b35468f2c62b0cce
> change-id: 20240614-amdgpu-disable-drm-amd-dc-fp-riscv-clang-31c84f6b990d
> 
> Best regards,



Re: [PATCH] drm/amdgpu: update MTYPE mapping for gfx12

2024-06-18 Thread Alex Deucher
On Tue, Jun 18, 2024 at 9:07 AM Min, Frank  wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> From: Frank Min 
>
> gfx12 only support MTYPE UC and NC, so update it accordingly.
>
> Signed-off-by: Frank Min 

Acked-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 4 
>  1 file changed, 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> index 26122c8cfcc3..61db331adcc2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> @@ -464,10 +464,6 @@ static uint64_t gmc_v12_0_map_mtype(struct amdgpu_device 
> *adev, uint32_t flags)
> return AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_NC);
> case AMDGPU_VM_MTYPE_NC:
> return AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_NC);
> -   case AMDGPU_VM_MTYPE_WC:
> -   return AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_WC);
> -   case AMDGPU_VM_MTYPE_CC:
> -   return AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_CC);
> case AMDGPU_VM_MTYPE_UC:
> return AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_UC);
> default:
> --
> 2.34.1
>


Re: [bug report] drm/amdgpu: amdgpu crash on playing videos, linux 6.10-rc

2024-06-18 Thread Alex Deucher
On Tue, Jun 18, 2024 at 3:42 AM Winston Ma  wrote:
>
> As reported in my previous email. In my device I don't see a more
> critical system freeze, which causes system reboot, during video play.
> But I still experience green screen problem during full screen
> toggling while watching video (Screenshot: https://ibb.co/8Dpdxc3). It
> didn't cause my system to reboot but I guess there are bug to be
> fixed.
>
> Should this issue be monitored by another new ticket?

Please file a ticket:
https://gitlab.freedesktop.org/drm/amd/-/issues/
And ideally bisect.  Thanks,

Alex

>
> Thanks and Regards,
> Winston
>
> On Mon, Jun 17, 2024 at 5:40 PM Wang Yunchen  wrote:
> >
> > On Mon, 2024-06-17 at 06:55 +0800, Winston Ma wrote:
> > > I only build the kernel once. I could try but I think you couldn't
> > > expect much from my side.
> > >
> > > BTW I installed 6.10-rc4 this morning from Ubuntu mainline
> > > (https://kernel.ubuntu.com/mainline/v6.10-rc4/amd64/) and I couldn't
> > > replicate the video crash problem. Yunchen could you try 6.10-rc4 and
> > > see if you still have the video crash problem?
> > >
> > > But I still get the green blocky object when I keep toggling full
> > > screen during youtube watch (Screenshot: https://ibb.co/8Dpdxc3). I
> > > didn't see the green block in 6.9 so it could be another issue.
> > >
> > > Thanks and Regards,
> > > Winston
> > >
> > >
> > > On Sun, Jun 16, 2024 at 12:10 AM Wang Yunchen  
> > > wrote:
> > > >
> > > > On Sat, 2024-06-15 at 17:50 +0200, Thorsten Leemhuis wrote:
> > > > > [reply made easier by moving something in the quote]
> > > > >
> > > > > On 12.06.24 18:55, Wang Yunchen wrote:
> > > > > > On Wed, 2024-06-12 at 15:14 +0200, Linux regression tracking 
> > > > > > (Thorsten
> > > > > > Leemhuis) wrote:
> > > > > > > On 06.06.24 05:06, Winston Ma wrote:
> > > > > > > > Hi I got the same problem on Linux Kernel 6.10-rc2. I got the
> > > > > > > > problem
> > > > > > > > by
> > > > > > > > following the procedure below:
> > > > > > > >
> > > > > > > >  1. Boot Linux Kernel 6.10-rc2
> > > > > > > >  2. Open Firefox (Any browser should work)
> > > > > > > >  3. Open a Youtube Video
> > > > > > > >  4. On the playing video, toggle fullscreen quickly Then after 
> > > > > > > > 10-
> > > > > > > > 20
> > > > > > > > times of fullscreen toggling, the screen would enter freeze
> > > > > > > > mode.
> > > > > > > > This is the log that I captured using the above method.
> > > > > > >
> > > > > > > Hmm, seems nothing happened here for a while. Could you maybe try 
> > > > > > > to
> > > > > > > bisect this
> > > > > > > (
> > > > > > > https://docs.kernel.org/admin-guide/verify-bugs-and-bisect-regressions.ht
> > > > > > > ml
> > > > > > > )?
> > > > > >
> > > > > > It seems that the issue persists on 6.10 rc3.
> > > > >
> > > > > That's good to know, but...
> > > > >
> > > > > > > @amd-gfx devs: Or is this unneeded, as the cause found or maybe 
> > > > > > > even
> > > > > > > fixed meanwhile?
> > > > >
> > > > > ...as there was no reply to that inquiry it seems we really need 
> > > > > either
> > > > > you or Winston Ma (or somebody else that is affected we don't yet know
> > > > > about) to perform a git bisection (see the link quoted above) to find
> > > > > the exact change that broke things. Without this it might not be 
> > > > > getting
> > > > > fixed.
> > > > >
> > > > > Ciao, Thorsten
> > > > >
> > > > > > > > This is the kernel log
> > > > > > > >
> > > > > > > > 2024-06-06T10:26:40.747576+08:00 kernel:
> > > > > > > > gmc_v10_0_process_interrupt:
> > > > > > > > 6 callbacks suppressed
> > > > > > > > 2024-06-06T10:26:40.747618+08:00 kernel: amdgpu :03:00.0:
> > > > > > > > amdgpu:
> > > > > > > > [mmhub] page fault (src_id:0 ring:8 vmid:2
> > > > > > > > pasid:32789)
> > > > > > > > 2024-06-06T10:26:40.747623+08:00 kernel: amdgpu :03:00.0:
> > > > > > > > amdgpu:
> > > > > > > > in process RDD Process pid 39524 thread
> > > > > > > > firefox-bi:cs0 pid 40342
> > > > > > > > 2024-06-06T10:26:40.747625+08:00 kernel: amdgpu :03:00.0:
> > > > > > > > amdgpu:   in page starting at address
> > > > > > > > 0x800106ffe000 from client 0x12 (VMC)
> > > > > > > > 2024-06-06T10:26:40.747628+08:00 kernel: amdgpu :03:00.0:
> > > > > > > > amdgpu:
> > > > > > > > MMVM_L2_PROTECTION_FAULT_STATUS:0x00203811
> > > > > > > > 2024-06-06T10:26:40.747629+08:00 kernel: amdgpu :03:00.0:
> > > > > > > > amdgpu:  Faulty UTCL2 client ID: VCN (0x1c)
> > > > > > > > 2024-06-06T10:26:40.747631+08:00 kernel: amdgpu :03:00.0:
> > > > > > > > amdgpu:  MORE_FAULTS: 0x1
> > > > > > > > 2024-06-06T10:26:40.747651+08:00 kernel: amdgpu :03:00.0:
> > > > > > > > amdgpu:  WALKER_ERROR: 0x0
> > > > > > > > 2024-06-06T10:26:40.747653+08:00 kernel: amdgpu :03:00.0:
> > > > > > > > amdgpu:  PERMISSION_FAULTS: 0x1
> > > > > > > > 2024-06-06T10:26:40.747655+08:00 kernel: amdgpu :03:00.0:
> > > > > > > > amdgpu:  MAPPING

[PATCH] drm/amdgpu: update MTYPE mapping for gfx12

2024-06-18 Thread Min, Frank
[AMD Official Use Only - AMD Internal Distribution Only]

From: Frank Min 

gfx12 only support MTYPE UC and NC, so update it accordingly.

Signed-off-by: Frank Min 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index 26122c8cfcc3..61db331adcc2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -464,10 +464,6 @@ static uint64_t gmc_v12_0_map_mtype(struct amdgpu_device 
*adev, uint32_t flags)
return AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_NC);
case AMDGPU_VM_MTYPE_NC:
return AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_NC);
-   case AMDGPU_VM_MTYPE_WC:
-   return AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_WC);
-   case AMDGPU_VM_MTYPE_CC:
-   return AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_CC);
case AMDGPU_VM_MTYPE_UC:
return AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_UC);
default:
--
2.34.1



[PATCH AUTOSEL 6.9 38/44] drm/amdgpu/pptable: Fix UBSAN array-index-out-of-bounds

2024-06-18 Thread Sasha Levin
From: Tasos Sahanidis 

[ Upstream commit c6c4dd54012551cce5cde408b35468f2c62b0cce ]

Flexible arrays used [1] instead of []. Replace the former with the latter
to resolve multiple UBSAN warnings observed on boot with a BONAIRE card.

In addition, use the __counted_by attribute where possible to hint the
length of the arrays to the compiler and any sanitizers.

Signed-off-by: Tasos Sahanidis 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/include/pptable.h | 91 ++-
 1 file changed, 49 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/include/pptable.h 
b/drivers/gpu/drm/amd/include/pptable.h
index 2e8e6c9875f6c..f83ace2d7ec30 100644
--- a/drivers/gpu/drm/amd/include/pptable.h
+++ b/drivers/gpu/drm/amd/include/pptable.h
@@ -477,31 +477,30 @@ typedef struct _ATOM_PPLIB_STATE_V2
 } ATOM_PPLIB_STATE_V2;
 
 typedef struct _StateArray{
-//how many states we have 
-UCHAR ucNumEntries;
-
-ATOM_PPLIB_STATE_V2 states[1];
+   //how many states we have
+   UCHAR ucNumEntries;
+
+   ATOM_PPLIB_STATE_V2 states[] /* __counted_by(ucNumEntries) */;
 }StateArray;
 
 
 typedef struct _ClockInfoArray{
-//how many clock levels we have
-UCHAR ucNumEntries;
-
-//sizeof(ATOM_PPLIB_CLOCK_INFO)
-UCHAR ucEntrySize;
-
-UCHAR clockInfo[];
+   //how many clock levels we have
+   UCHAR ucNumEntries;
+
+   //sizeof(ATOM_PPLIB_CLOCK_INFO)
+   UCHAR ucEntrySize;
+
+   UCHAR clockInfo[];
 }ClockInfoArray;
 
 typedef struct _NonClockInfoArray{
+   //how many non-clock levels we have. normally should be same as number 
of states
+   UCHAR ucNumEntries;
+   //sizeof(ATOM_PPLIB_NONCLOCK_INFO)
+   UCHAR ucEntrySize;
 
-//how many non-clock levels we have. normally should be same as number of 
states
-UCHAR ucNumEntries;
-//sizeof(ATOM_PPLIB_NONCLOCK_INFO)
-UCHAR ucEntrySize;
-
-ATOM_PPLIB_NONCLOCK_INFO nonClockInfo[];
+   ATOM_PPLIB_NONCLOCK_INFO nonClockInfo[] __counted_by(ucNumEntries);
 }NonClockInfoArray;
 
 typedef struct _ATOM_PPLIB_Clock_Voltage_Dependency_Record
@@ -513,8 +512,10 @@ typedef struct _ATOM_PPLIB_Clock_Voltage_Dependency_Record
 
 typedef struct _ATOM_PPLIB_Clock_Voltage_Dependency_Table
 {
-UCHAR ucNumEntries;// 
Number of entries.
-ATOM_PPLIB_Clock_Voltage_Dependency_Record entries[1]; // 
Dynamically allocate entries.
+   // Number of entries.
+   UCHAR ucNumEntries;
+   // Dynamically allocate entries.
+   ATOM_PPLIB_Clock_Voltage_Dependency_Record entries[] 
__counted_by(ucNumEntries);
 }ATOM_PPLIB_Clock_Voltage_Dependency_Table;
 
 typedef struct _ATOM_PPLIB_Clock_Voltage_Limit_Record
@@ -529,8 +530,10 @@ typedef struct _ATOM_PPLIB_Clock_Voltage_Limit_Record
 
 typedef struct _ATOM_PPLIB_Clock_Voltage_Limit_Table
 {
-UCHAR ucNumEntries;// 
Number of entries.
-ATOM_PPLIB_Clock_Voltage_Limit_Record entries[1];  // 
Dynamically allocate entries.
+   // Number of entries.
+   UCHAR ucNumEntries;
+   // Dynamically allocate entries.
+   ATOM_PPLIB_Clock_Voltage_Limit_Record entries[] 
__counted_by(ucNumEntries);
 }ATOM_PPLIB_Clock_Voltage_Limit_Table;
 
 union _ATOM_PPLIB_CAC_Leakage_Record
@@ -553,8 +556,10 @@ typedef union _ATOM_PPLIB_CAC_Leakage_Record 
ATOM_PPLIB_CAC_Leakage_Record;
 
 typedef struct _ATOM_PPLIB_CAC_Leakage_Table
 {
-UCHAR ucNumEntries; // 
Number of entries.
-ATOM_PPLIB_CAC_Leakage_Record entries[1];   // 
Dynamically allocate entries.
+   // Number of entries.
+   UCHAR ucNumEntries;
+   // Dynamically allocate entries.
+   ATOM_PPLIB_CAC_Leakage_Record entries[] __counted_by(ucNumEntries);
 }ATOM_PPLIB_CAC_Leakage_Table;
 
 typedef struct _ATOM_PPLIB_PhaseSheddingLimits_Record
@@ -568,8 +573,10 @@ typedef struct _ATOM_PPLIB_PhaseSheddingLimits_Record
 
 typedef struct _ATOM_PPLIB_PhaseSheddingLimits_Table
 {
-UCHAR ucNumEntries; // 
Number of entries.
-ATOM_PPLIB_PhaseSheddingLimits_Record entries[1];   // 
Dynamically allocate entries.
+   // Number of entries.
+   UCHAR ucNumEntries;
+   // Dynamically allocate entries.
+   ATOM_PPLIB_PhaseSheddingLimits_Record entries[] 
__counted_by(ucNumEntries);
 }ATOM_PPLIB_PhaseSheddingLimits_Table;
 
 typedef struct _VCEClockInfo{
@@ -580,8 +587,8 @@ typedef struct _VCEClockInfo{
 }VCEClockInfo;
 
 typedef struct _VCEClockInfoArray{
-UCHAR ucNumEntries;
-VCEClockInfo entries[1];
+   UCHAR ucNumEntries;
+   VCEClockInfo entries[] __counted_by(ucNumEntries);
 }VCEClockInfoArray;
 
 typedef struct _ATOM_PPLIB_VCE_Clock_Voltage_Limit_Record
@@ -592,8 +599,8 @@ typedef struct _ATOM_PPLIB_VCE_Clock_Voltage_Li

Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to complete

2024-06-18 Thread Lazar, Lijo



On 6/18/2024 4:51 PM, Chai, Thomas wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
> 
> -
> Best Regards,
> Thomas
> 
> -Original Message-
> From: Chai, Thomas
> Sent: Tuesday, June 18, 2024 7:09 PM
> To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Zhou1, Tao ; 
> Li, Candice ; Wang, Yang(Kevin) ; 
> Yang, Stanley 
> Subject: RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to 
> complete
> 
> 
> 
> 
> -
> Best Regards,
> Thomas
> 
> -Original Message-
> From: Lazar, Lijo 
> Sent: Tuesday, June 18, 2024 6:09 PM
> To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Zhou1, Tao ; 
> Li, Candice ; Wang, Yang(Kevin) ; 
> Yang, Stanley 
> Subject: Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to 
> complete
> 
> 
> 
> On 6/18/2024 12:03 PM, YiPeng Chai wrote:
>> Add completion to wait for ras reset to complete.
>>
>> Signed-off-by: YiPeng Chai 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>>  2 files changed, 12 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index 898889600771..7f8e6ca07957 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if
>> *ras_block)
>>
>>  #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
>>
>> +#define MAX_RAS_RECOVERY_COMPLETION_TIME  12 //ms
>> +
>>  enum amdgpu_ras_retire_page_reservation {
>>   AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>>   AMDGPU_RAS_RETIRE_PAGE_PENDING,
>> @@ -2518,6 +2520,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
>> *work)
>>   atomic_set(&hive->ras_recovery, 0);
>>   amdgpu_put_xgmi_hive(hive);
>>   }
>> +
>> + complete_all(&ras->ras_recovery_completion);
>>  }
>>
>>  /* alloc/realloc bps array */
>> @@ -2911,10 +2915,16 @@ static int
>> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>>
>>   flush_delayed_work(&con->page_retirement_dwork);
>>
>> + reinit_completion(&con->ras_recovery_completion);
>> +
>>   con->gpu_reset_flags |= reset;
>>   amdgpu_ras_reset_gpu(adev);
>>
>>   *gpu_reset = reset;
>> + if (!wait_for_completion_timeout(&con->ras_recovery_completion,
>> + 
>> msecs_to_jiffies(MAX_RAS_RECOVERY_COMPLETION_TIME)))
>> + dev_err(adev->dev, "Waiting for GPU to complete ras 
>> reset timeout! reset:0x%x\n",
>> + reset);
> 
>> If a mode-1 reset gets to execute first due to job timeout/hws detect cases 
>> in poison timeout, then the ras handler will never get executed.
>> Why this wait is required?
> 
>> Thanks,
>> Lijo
> 
> [Thomas]  "[PATCH 5/5] drm/amdgpu: add gpu reset check and exception 
> handling" add the check before ras gpu reset.
> Poison ras reset is different from reset triggered by other 
> fatal errors, and all poison RAS resets are triggered from here,
>  in order to distinguish other gpu resets and facilitate 
> subsequent  code processing, so add wait for gpu ras reset here.
> 

Reset mechanism resets the GPU state - whether it's triggered due to
poison or fatal errors. As soon as the device is reset successfully, GPU
operations can continue. So why there needs to be a special wait for
poison triggred reset alone? Why not wait on the RAS recovery work
object rather than another completion notification?

Thanks,
Lijo

>>   }
>>
>>   return 0;
>> @@ -3041,6 +3051,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device 
>> *adev)
>>   }
>>   }
>>
>> + init_completion(&con->ras_recovery_completion);
>>   mutex_init(&con->page_rsv_lock);
>>   INIT_KFIFO(con->poison_fifo);
>>   mutex_init(&con->page_retirement_lock);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 91daf48be03a..b47f03edac87 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -537,6 +537,7 @@ struct amdgpu_ras {
>>   DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
>>   struct ras_ecc_log_info  umc_ecc_log;
>>   struct delayed_work page_retirement_dwork;
>> + struct completion ras_recovery_completion;
>>
>>   /* Fatal error detected flag */
>>   atomic_t fed;


Re: [PATCH AUTOSEL 6.1 13/14] drm/amdgpu: fix dereference null return value for the function amdgpu_vm_pt_parent

2024-06-18 Thread Christian König

Am 18.06.24 um 11:11 schrieb Pavel Machek:

Hi!


[ Upstream commit a0cf36546cc24ae1c95d72253c7795d4d2fc77aa ]

The pointer parent may be NULLed by the function amdgpu_vm_pt_parent.
To make the code more robust, check the pointer parent.

If this can happen, it should not WARN().

If this can not happen, we don't need the patch in stable.


Right, that patch shouldn't be backported in any way.

Regards,
Christian.



Best regards,
Pavel





RE: [PATCH] drm/amdgpu: Fix pci state save during mode-1 reset

2024-06-18 Thread Zhang, Hawking
[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 18, 2024 16:44
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Kamal, Asad ; Xu, Feifei 

Subject: [PATCH] drm/amdgpu: Fix pci state save during mode-1 reset

Cache the PCI state before bus master is disabled. The saved state is later 
used for other cases like restoring config space after mode-2 reset.

Signed-off-by: Lijo Lazar 
Fixes: 5c03e5843e6b ("drm/amdgpu:add smu mode1/2 support for aldebaran")
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3fb02f5b91c9..6c2ab14ca102 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5224,11 +5224,14 @@ int amdgpu_device_mode1_reset(struct amdgpu_device 
*adev)

dev_info(adev->dev, "GPU mode1 reset\n");

+   /* Cache the state before bus master disable. The saved config space
+* values are used in other cases like restore after mode-2 reset.
+*/
+   amdgpu_device_cache_pci_state(adev->pdev);
+
/* disable BM */
pci_clear_master(adev->pdev);

-   amdgpu_device_cache_pci_state(adev->pdev);
-
if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
dev_info(adev->dev, "GPU smu mode1 reset\n");
ret = amdgpu_dpm_mode1_reset(adev);
--
2.25.1



RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to complete

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

-
Best Regards,
Thomas

-Original Message-
From: Chai, Thomas
Sent: Tuesday, June 18, 2024 7:09 PM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley 
Subject: RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to 
complete




-
Best Regards,
Thomas

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 18, 2024 6:09 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley 
Subject: Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to 
complete



On 6/18/2024 12:03 PM, YiPeng Chai wrote:
> Add completion to wait for ras reset to complete.
>
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>  2 files changed, 12 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 898889600771..7f8e6ca07957 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if
> *ras_block)
>
>  #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
>
> +#define MAX_RAS_RECOVERY_COMPLETION_TIME  12 //ms
> +
>  enum amdgpu_ras_retire_page_reservation {
>   AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>   AMDGPU_RAS_RETIRE_PAGE_PENDING,
> @@ -2518,6 +2520,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
> *work)
>   atomic_set(&hive->ras_recovery, 0);
>   amdgpu_put_xgmi_hive(hive);
>   }
> +
> + complete_all(&ras->ras_recovery_completion);
>  }
>
>  /* alloc/realloc bps array */
> @@ -2911,10 +2915,16 @@ static int
> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>
>   flush_delayed_work(&con->page_retirement_dwork);
>
> + reinit_completion(&con->ras_recovery_completion);
> +
>   con->gpu_reset_flags |= reset;
>   amdgpu_ras_reset_gpu(adev);
>
>   *gpu_reset = reset;
> + if (!wait_for_completion_timeout(&con->ras_recovery_completion,
> + 
> msecs_to_jiffies(MAX_RAS_RECOVERY_COMPLETION_TIME)))
> + dev_err(adev->dev, "Waiting for GPU to complete ras 
> reset timeout! reset:0x%x\n",
> + reset);

> If a mode-1 reset gets to execute first due to job timeout/hws detect cases 
> in poison timeout, then the ras handler will never get executed.
> Why this wait is required?

>Thanks,
>Lijo

[Thomas]  "[PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling" 
add the check before ras gpu reset.
Poison ras reset is different from reset triggered by other 
fatal errors, and all poison RAS resets are triggered from here,
 in order to distinguish other gpu resets and facilitate subsequent 
 code processing, so add wait for gpu ras reset here.

>   }
>
>   return 0;
> @@ -3041,6 +3051,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   }
>   }
>
> + init_completion(&con->ras_recovery_completion);
>   mutex_init(&con->page_rsv_lock);
>   INIT_KFIFO(con->poison_fifo);
>   mutex_init(&con->page_retirement_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 91daf48be03a..b47f03edac87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -537,6 +537,7 @@ struct amdgpu_ras {
>   DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
>   struct ras_ecc_log_info  umc_ecc_log;
>   struct delayed_work page_retirement_dwork;
> + struct completion ras_recovery_completion;
>
>   /* Fatal error detected flag */
>   atomic_t fed;


RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to complete

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

-
Best Regards,
Thomas

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 18, 2024 6:09 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley 
Subject: Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to 
complete



On 6/18/2024 12:03 PM, YiPeng Chai wrote:
> Add completion to wait for ras reset to complete.
>
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>  2 files changed, 12 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 898889600771..7f8e6ca07957 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if
> *ras_block)
>
>  #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
>
> +#define MAX_RAS_RECOVERY_COMPLETION_TIME  12 //ms
> +
>  enum amdgpu_ras_retire_page_reservation {
>   AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>   AMDGPU_RAS_RETIRE_PAGE_PENDING,
> @@ -2518,6 +2520,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
> *work)
>   atomic_set(&hive->ras_recovery, 0);
>   amdgpu_put_xgmi_hive(hive);
>   }
> +
> + complete_all(&ras->ras_recovery_completion);
>  }
>
>  /* alloc/realloc bps array */
> @@ -2911,10 +2915,16 @@ static int
> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>
>   flush_delayed_work(&con->page_retirement_dwork);
>
> + reinit_completion(&con->ras_recovery_completion);
> +
>   con->gpu_reset_flags |= reset;
>   amdgpu_ras_reset_gpu(adev);
>
>   *gpu_reset = reset;
> + if (!wait_for_completion_timeout(&con->ras_recovery_completion,
> + 
> msecs_to_jiffies(MAX_RAS_RECOVERY_COMPLETION_TIME)))
> + dev_err(adev->dev, "Waiting for GPU to complete ras 
> reset timeout! reset:0x%x\n",
> + reset);

> If a mode-1 reset gets to execute first due to job timeout/hws detect cases 
> in poison timeout, then the ras handler will never get executed.
> Why this wait is required?

[Thomas]  "[PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling" 
add the check before ras gpu reset.


Thanks,
Lijo

>   }
>
>   return 0;
> @@ -3041,6 +3051,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   }
>   }
>
> + init_completion(&con->ras_recovery_completion);
>   mutex_init(&con->page_rsv_lock);
>   INIT_KFIFO(con->poison_fifo);
>   mutex_init(&con->page_retirement_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 91daf48be03a..b47f03edac87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -537,6 +537,7 @@ struct amdgpu_ras {
>   DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
>   struct ras_ecc_log_info  umc_ecc_log;
>   struct delayed_work page_retirement_dwork;
> + struct completion ras_recovery_completion;
>
>   /* Fatal error detected flag */
>   atomic_t fed;


Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to complete

2024-06-18 Thread Lazar, Lijo



On 6/18/2024 12:03 PM, YiPeng Chai wrote:
> Add completion to wait for ras reset to complete.
> 
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>  2 files changed, 12 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 898889600771..7f8e6ca07957 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if 
> *ras_block)
>  
>  #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
>  
> +#define MAX_RAS_RECOVERY_COMPLETION_TIME  12 //ms
> +
>  enum amdgpu_ras_retire_page_reservation {
>   AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>   AMDGPU_RAS_RETIRE_PAGE_PENDING,
> @@ -2518,6 +2520,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
> *work)
>   atomic_set(&hive->ras_recovery, 0);
>   amdgpu_put_xgmi_hive(hive);
>   }
> +
> + complete_all(&ras->ras_recovery_completion);
>  }
>  
>  /* alloc/realloc bps array */
> @@ -2911,10 +2915,16 @@ static int 
> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>  
>   flush_delayed_work(&con->page_retirement_dwork);
>  
> + reinit_completion(&con->ras_recovery_completion);
> +
>   con->gpu_reset_flags |= reset;
>   amdgpu_ras_reset_gpu(adev);
>  
>   *gpu_reset = reset;
> + if (!wait_for_completion_timeout(&con->ras_recovery_completion,
> + 
> msecs_to_jiffies(MAX_RAS_RECOVERY_COMPLETION_TIME)))
> + dev_err(adev->dev, "Waiting for GPU to complete ras 
> reset timeout! reset:0x%x\n",
> + reset);

If a mode-1 reset gets to execute first due to job timeout/hws detect
cases in poison timeout, then the ras handler will never get executed.
Why this wait is required?

Thanks,
Lijo

>   }
>  
>   return 0;
> @@ -3041,6 +3051,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   }
>   }
>  
> + init_completion(&con->ras_recovery_completion);
>   mutex_init(&con->page_rsv_lock);
>   INIT_KFIFO(con->poison_fifo);
>   mutex_init(&con->page_retirement_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 91daf48be03a..b47f03edac87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -537,6 +537,7 @@ struct amdgpu_ras {
>   DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
>   struct ras_ecc_log_info  umc_ecc_log;
>   struct delayed_work page_retirement_dwork;
> + struct completion ras_recovery_completion;
>  
>   /* Fatal error detected flag */
>   atomic_t fed;


Re: [PATCH v7 2/9] drm: Support per-plane async flip configuration

2024-06-18 Thread Dmitry Baryshkov
On Tue, 18 Jun 2024 at 12:38, Jani Nikula  wrote:
>
> On Tue, 18 Jun 2024, André Almeida  wrote:
> > Drivers have different capabilities on what plane types they can or
> > cannot perform async flips. Create a plane::async_flip field so each
> > driver can choose which planes they allow doing async flips.
> >
> > Signed-off-by: André Almeida 
> > ---
> >  include/drm/drm_plane.h | 5 +
> >  1 file changed, 5 insertions(+)
> >
> > diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
> > index 9507542121fa..0bebc72af5c3 100644
> > --- a/include/drm/drm_plane.h
> > +++ b/include/drm/drm_plane.h
> > @@ -786,6 +786,11 @@ struct drm_plane {
> >* @kmsg_panic: Used to register a panic notifier for this plane
> >*/
> >   struct kmsg_dumper kmsg_panic;
> > +
> > + /**
> > +  * @async_flip: indicates if a plane can do async flips
> > +  */
>
> When is it okay to set or change the value of this member?
>
> If you don't document it, people will find creative uses for this.

Maybe it's better to have a callback instead of a static field? This
way it becomes clear that it's only relevant at the time of the
atomic_check().

> > + bool async_flip;
> >  };
> >
> >  #define obj_to_plane(x) container_of(x, struct drm_plane, base)
>
> --
> Jani Nikula, Intel



-- 
With best wishes
Dmitry


Re: [PATCH v7 2/9] drm: Support per-plane async flip configuration

2024-06-18 Thread Jani Nikula
On Tue, 18 Jun 2024, André Almeida  wrote:
> Drivers have different capabilities on what plane types they can or
> cannot perform async flips. Create a plane::async_flip field so each
> driver can choose which planes they allow doing async flips.
>
> Signed-off-by: André Almeida 
> ---
>  include/drm/drm_plane.h | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
> index 9507542121fa..0bebc72af5c3 100644
> --- a/include/drm/drm_plane.h
> +++ b/include/drm/drm_plane.h
> @@ -786,6 +786,11 @@ struct drm_plane {
>* @kmsg_panic: Used to register a panic notifier for this plane
>*/
>   struct kmsg_dumper kmsg_panic;
> +
> + /**
> +  * @async_flip: indicates if a plane can do async flips
> +  */

When is it okay to set or change the value of this member?

If you don't document it, people will find creative uses for this.

BR,
Jani.


> + bool async_flip;
>  };
>  
>  #define obj_to_plane(x) container_of(x, struct drm_plane, base)

-- 
Jani Nikula, Intel


Re: [PATCH] drm/amdgpu: normalize registers as local xcc to read/write under sriov

2024-06-18 Thread Lazar, Lijo



On 6/17/2024 3:41 PM, Jane Jian wrote:
> [WHY]
> sriov has the higher bit violation when flushing tlb
> 
> [HOW]
> normalize the registers to keep lower 16-bit(dword aligned) to aviod higher 
> bit violation
> RLCG will mask xcd out and always assume it's accessing its own xcd
> 
> also fix the typo in sriov_w/rreg:
> for KIQ case, use xcc with xcc_id to read and write
> 
> v2
> amend some typos
> 
> Signed-off-by: Jane Jian 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c  | 12 ++--
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  8 ++--
>  drivers/gpu/drm/amd/amdgpu/soc15_common.h |  2 ++
>  3 files changed, 18 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 63f2286858c4..d43652a38484 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -1075,6 +1075,10 @@ void amdgpu_sriov_wreg(struct amdgpu_device *adev,
>   if (amdgpu_device_skip_hw_access(adev))
>   return;
>  
> + /* Select lower 16 bits to write in local xcc */
> + if ((hwip == GC_HWIP) && !(acc_flags & AMDGPU_REGS_NO_KIQ))
> + offset = NORMALIZE_XCC_REG_OFFSET(offset);

This cannot be generalized. Instead use a similar approach of having an
soc specific function => adev->asic_funcs->encode_ext_smn_addressing

> +
>   if (!amdgpu_sriov_runtime(adev) &&
>   amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, 
> true, &rlcg_flag)) {
>   amdgpu_virt_rlcg_reg_rw(adev, offset, value, rlcg_flag, xcc_id);
> @@ -1084,7 +1088,7 @@ void amdgpu_sriov_wreg(struct amdgpu_device *adev,
>   if (acc_flags & AMDGPU_REGS_NO_KIQ)
>   WREG32_NO_KIQ(offset, value);
>   else
> - WREG32(offset, value);
> + WREG32_XCC(offset, value, xcc_id);

This doesn't look correct. AFAIU, this macro is specifically for XCC
registers. amdgpu_sriov_wreg can have registers other than hwip == GC_HWIP.

>  }
>  
>  u32 amdgpu_sriov_rreg(struct amdgpu_device *adev,
> @@ -1095,6 +1099,10 @@ u32 amdgpu_sriov_rreg(struct amdgpu_device *adev,
>   if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>  
> + /* Select lower 16 bits to read in local xcc */
> + if ((hwip == GC_HWIP) && !(acc_flags & AMDGPU_REGS_NO_KIQ))
> + offset = NORMALIZE_XCC_REG_OFFSET(offset);
> +
>   if (!amdgpu_sriov_runtime(adev) &&
>   amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, 
> false, &rlcg_flag))
>   return amdgpu_virt_rlcg_reg_rw(adev, offset, 0, rlcg_flag, 
> xcc_id);
> @@ -1102,7 +1110,7 @@ u32 amdgpu_sriov_rreg(struct amdgpu_device *adev,
>   if (acc_flags & AMDGPU_REGS_NO_KIQ)
>   return RREG32_NO_KIQ(offset);
>   else
> - return RREG32(offset);
> + return RREG32_XCC(offset, xcc_id);>  }
>  
>  bool amdgpu_sriov_xnack_support(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 88b4644f8e96..5bb275b96e6a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -853,8 +853,12 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device 
> *adev, uint32_t vmid,
>*/
>   if (adev->gfx.kiq[inst].ring.sched.ready &&
>   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) {
> - uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
> - uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
> +
> + /* Select lower 16 bits to write in local xcc */
> + if (AMDGPU_IS_GFXHUB(vmhub)) {
> + req = NORMALIZE_XCC_REG_OFFSET(req);
> + ack = NORMALIZE_XCC_REG_OFFSET(ack);
> + }
Not sure if there are other things to check like cross AID register
offsets for MMHUB.

Thanks,
Lijo
>  
>   amdgpu_gmc_fw_reg_write_reg_wait(adev, req, ack, inv_req,
>1 << vmid, inst);
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h 
> b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
> index 242b24f73c17..9ddf68e7d06e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
> @@ -210,4 +210,6 @@
>  #define WREG64_MCA(ext, mca_base, idx, val) \
>   WREG64_PCIE_EXT(adev->asic_funcs->encode_ext_smn_addressing(ext) + 
> mca_base + (idx * 8), val)
>  
> +#define NORMALIZE_XCC_REG_OFFSET(offset) (offset & 0x)
> +
>  #endif


RE: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

-
Best Regards,
Thomas

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Tuesday, June 18, 2024 3:19 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Yang, Stanley 
Subject: RE: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling



-Original Message-
From: Chai, Thomas 
Sent: 2024年6月18日 14:34
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley ; Chai, Thomas 
Subject: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling

Add gpu reset check and exception handling for page retirement.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +
 1 file changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7f8e6ca07957..635dc86dbfd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1386,10 +1386,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev, struct ras_query_i
memset(&qctx, 0, sizeof(qctx));
qctx.event_id = amdgpu_ras_acquire_event_id(adev, 
amdgpu_ras_intr_triggered() ?
   RAS_EVENT_TYPE_ISR : 
RAS_EVENT_TYPE_INVALID);
+
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return -EIO;
+
ret = amdgpu_ras_query_error_status_helper(adev, info,
   &err_data,
   &qctx,
   error_query_mode);
+   up_read(&adev->reset_domain->sem);
if (ret)
goto out_fini_err_data;

@@ -2884,6 +2889,14 @@ static int amdgpu_ras_poison_creation_handler(struct 
amdgpu_device *adev,
return 0;
 }

+static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev) {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_poison_msg msg;
+
+   while (kfifo_get(&con->poison_fifo, &msg)); }
+
 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
uint32_t msg_count, uint32_t *gpu_reset)  { @@ -2913,6 
+2926,11 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
else
reset = reset_flags;

+   /* Check if gpu is in reset state */
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return -EIO;
+   up_read(&adev->reset_domain->sem);

> [Kevin]:
> I'm confused that why not using 'amdgpu_in_reset()' helper function to check 
> reset state?

>Best Regards,
> Kevin

[Thomas] This function is called in page retirement thread.
 According to Christian König's previous email suggestions  
that "It's illegal to call amdgpu_in_reset() from outside of the hw specific 
backends."

+
flush_delayed_work(&con->page_retirement_dwork);

reinit_completion(&con->ras_recovery_completion);
@@ -2977,6 +2995,31 @@ static int amdgpu_ras_page_retirement_thread(void *param)
}
}

+   if ((ret == -EIO) || (gpu_reset == 
AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
+   /* gpu is in mode-1 reset state */
+   /* Clear poison creation request */
+   while (atomic_read(&con->poison_creation_count))
+   atomic_dec(&con->poison_creation_count);
[Kevin]:

Aha! It is better to use atomic_set() to instead of it.

Best Regards,
Kevin
+
+   /* Clear poison consumption fifo */
+   amdgpu_ras_clear_poison_fifo(adev);
+
+   while (atomic_read(&con->page_retirement_req_cnt))
+   atomic_dec(&con->page_retirement_req_cnt);
+
+   if (ret == -EIO) {
+   /* Wait for mode-1 reset to complete */
+   down_read(&adev->reset_domain->sem);
+   up_read(&adev->reset_domain->sem);
+   }
+
+   /* Wake up work queue to save bad pages to eeprom */
+   schedule_delayed_work(&con->page_retirement_dwork, 0);
+   } else if (gpu_reset) {
+   /* gpu is in mode-2 reset or other reset state */
+   /* Wake up work queue to save bad pages to eeprom */
+   schedule_delayed_work(&con->page_retirement_dwork, 0);
+   }
 #else
 dev_info(adev->dev, "Start processing page retirement. request:%d\n",
 atomic_read(&con->page_retirement_req_cnt));
--
2.34.1



[PATCH] drm/amdgpu: Fix pci state save during mode-1 reset

2024-06-18 Thread Lijo Lazar
Cache the PCI state before bus master is disabled. The saved state is
later used for other cases like restoring config space after mode-2
reset.

Signed-off-by: Lijo Lazar 
Fixes: 5c03e5843e6b ("drm/amdgpu:add smu mode1/2 support for aldebaran")
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3fb02f5b91c9..6c2ab14ca102 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5224,11 +5224,14 @@ int amdgpu_device_mode1_reset(struct amdgpu_device 
*adev)
 
dev_info(adev->dev, "GPU mode1 reset\n");
 
+   /* Cache the state before bus master disable. The saved config space
+* values are used in other cases like restore after mode-2 reset.
+*/
+   amdgpu_device_cache_pci_state(adev->pdev);
+
/* disable BM */
pci_clear_master(adev->pdev);
 
-   amdgpu_device_cache_pci_state(adev->pdev);
-
if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
dev_info(adev->dev, "GPU smu mode1 reset\n");
ret = amdgpu_dpm_mode1_reset(adev);
-- 
2.25.1



RE: [PATCH 2/2] Revert "drm/amdgpu: change aca bank error lock type to spinlock"

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

Series is
Reviewed-by: YiPeng Chai 


-
Best Regards,
Thomas

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Tuesday, June 18, 2024 3:49 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; 
Chai, Thomas 
Subject: [PATCH 2/2] Revert "drm/amdgpu: change aca bank error lock type to 
spinlock"

This reverts commit 354436e7905d166011f2aa26dccd9fa04b20940e.

Revert this patch to modify lock type back to 'mutex' to avoid kernel calltrace 
issue.

[  602.668806] Workqueue: amdgpu-reset-dev amdgpu_ras_do_recovery [amdgpu] [  
602.668939] Call Trace:
[  602.668940]  
[  602.668941]  dump_stack_lvl+0x4c/0x70 [  602.668945]  dump_stack+0x14/0x20 [ 
 602.668946]  __schedule_bug+0x5a/0x70 [  602.668950]  __schedule+0x940/0xb30 [ 
 602.668952]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.668955]  ? hrtimer_reprogram+0x77/0xb0 [  602.668957]  ? 
srso_alias_return_thunk+0x5/0xfbef5
[  602.668959]  ? hrtimer_start_range_ns+0x126/0x370
[  602.668961]  schedule+0x39/0xe0
[  602.668962]  schedule_hrtimeout_range_clock+0xb1/0x140
[  602.668964]  ? __pfx_hrtimer_wakeup+0x10/0x10 [  602.668966]  
schedule_hrtimeout_range+0x17/0x20
[  602.668967]  usleep_range_state+0x69/0x90 [  602.668970]  
psp_cmd_submit_buf+0x132/0x570 [amdgpu] [  602.669066]  
psp_ras_invoke+0x75/0x1a0 [amdgpu] [  602.669156]  
psp_ras_query_address+0x9c/0x120 [amdgpu] [  602.669245]  
umc_v12_0_update_ecc_status+0x16d/0x520 [amdgpu] [  602.669337]  ? 
srso_alias_return_thunk+0x5/0xfbef5
[  602.669339]  ? stack_depot_save+0x12/0x20 [  602.669342]  ? 
srso_alias_return_thunk+0x5/0xfbef5
[  602.669343]  ? set_track_prepare+0x52/0x70 [  602.669346]  ? 
kmemleak_alloc+0x4f/0x90 [  602.669348]  ? __kmalloc_node+0x34b/0x450 [  
602.669352]  amdgpu_umc_update_ecc_status+0x23/0x40 [amdgpu] [  602.669438]  
mca_umc_mca_get_err_count+0x85/0xc0 [amdgpu] [  602.669554]  
mca_smu_parse_mca_error_count+0x120/0x1d0 [amdgpu] [  602.669655]  
amdgpu_mca_dispatch_mca_set.part.0+0x141/0x250 [amdgpu] [  602.669743]  ? 
kmemleak_free+0x36/0x60 [  602.669745]  ? kvfree+0x32/0x40 [  602.669747]  ? 
srso_alias_return_thunk+0x5/0xfbef5
[  602.669749]  ? kfree+0x15d/0x2a0
[  602.669752]  amdgpu_mca_smu_log_ras_error+0x1f6/0x210 [amdgpu] [  
602.669839]  amdgpu_ras_query_error_status_helper+0x2ad/0x390 [amdgpu] [  
602.669924]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.669925]  ? __call_rcu_common.constprop.0+0xa6/0x2b0
[  602.669929]  amdgpu_ras_query_error_status+0xf3/0x620 [amdgpu] [  
602.670014]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.670017]  amdgpu_ras_log_on_err_counter+0xe1/0x170 [amdgpu] [  
602.670103]  amdgpu_ras_do_recovery+0xd2/0x2c0 [amdgpu] [  602.670187]  ? 
srso_alias_return_thunk+0x5/0

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 19 ++-  
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h |  3 +--
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 04515c1c7241..7945173321a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -222,9 +222,9 @@ static struct aca_bank_error *new_bank_error(struct 
aca_error *aerr, struct aca_
INIT_LIST_HEAD(&bank_error->node);
memcpy(&bank_error->info, info, sizeof(*info));

-   spin_lock(&aerr->lock);
+   mutex_lock(&aerr->lock);
list_add_tail(&bank_error->node, &aerr->list);
-   spin_unlock(&aerr->lock);
+   mutex_unlock(&aerr->lock);

return bank_error;
 }
@@ -235,7 +235,7 @@ static struct aca_bank_error *find_bank_error(struct 
aca_error *aerr, struct aca
struct aca_bank_info *tmp_info;
bool found = false;

-   spin_lock(&aerr->lock);
+   mutex_lock(&aerr->lock);
list_for_each_entry(bank_error, &aerr->list, node) {
tmp_info = &bank_error->info;
if (tmp_info->socket_id == info->socket_id && @@ -246,7 +246,7 
@@ static struct aca_bank_error *find_bank_error(struct aca_error *aerr, struct 
aca
}

 out_unlock:
-   spin_unlock(&aerr->lock);
+   mutex_unlock(&aerr->lock);

return found ? bank_error : NULL;
 }
@@ -474,7 +474,7 @@ static int aca_log_aca_error(struct aca_handle *handle, 
enum aca_error_type type
struct aca_error *aerr = &error_cache->errors[type];
struct aca_bank_error *bank_error, *tmp;

-   spin_lock(&aerr->lock);
+   mutex_lock(&aerr->lock);

if (list_empty(&aerr->list))
goto out_unlock;
@@ -485,7 +485,7 @@ static int aca_log_aca_error(struct aca_handle *handle, 
enum aca_error_type type
}

 out_unlock:
-   spin_unlock(&aerr->lock);
+   mutex_unlock(&aerr->lock);

return 0;
 }
@@ -542,7 +542,7 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, 
struct aca_handle *han

 static void aca_error_init(struct aca_error 

[PATCH 1/2] Revert "drm/amdgpu: change bank cache lock type to spinlock"

2024-06-18 Thread Yang Wang
This reverts commit 379e6cd11499f35d67bbf8f0114b0a054b9f73d7

revert this patch to modify lock type back to 'mutex' to avoid kernel
calltrace issue.

[  602.668806] Workqueue: amdgpu-reset-dev amdgpu_ras_do_recovery [amdgpu]
[  602.668939] Call Trace:
[  602.668940]  
[  602.668941]  dump_stack_lvl+0x4c/0x70
[  602.668945]  dump_stack+0x14/0x20
[  602.668946]  __schedule_bug+0x5a/0x70
[  602.668950]  __schedule+0x940/0xb30
[  602.668952]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.668955]  ? hrtimer_reprogram+0x77/0xb0
[  602.668957]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.668959]  ? hrtimer_start_range_ns+0x126/0x370
[  602.668961]  schedule+0x39/0xe0
[  602.668962]  schedule_hrtimeout_range_clock+0xb1/0x140
[  602.668964]  ? __pfx_hrtimer_wakeup+0x10/0x10
[  602.668966]  schedule_hrtimeout_range+0x17/0x20
[  602.668967]  usleep_range_state+0x69/0x90
[  602.668970]  psp_cmd_submit_buf+0x132/0x570 [amdgpu]
[  602.669066]  psp_ras_invoke+0x75/0x1a0 [amdgpu]
[  602.669156]  psp_ras_query_address+0x9c/0x120 [amdgpu]
[  602.669245]  umc_v12_0_update_ecc_status+0x16d/0x520 [amdgpu]
[  602.669337]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.669339]  ? stack_depot_save+0x12/0x20
[  602.669342]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.669343]  ? set_track_prepare+0x52/0x70
[  602.669346]  ? kmemleak_alloc+0x4f/0x90
[  602.669348]  ? __kmalloc_node+0x34b/0x450
[  602.669352]  amdgpu_umc_update_ecc_status+0x23/0x40 [amdgpu]
[  602.669438]  mca_umc_mca_get_err_count+0x85/0xc0 [amdgpu]
[  602.669554]  mca_smu_parse_mca_error_count+0x120/0x1d0 [amdgpu]
[  602.669655]  amdgpu_mca_dispatch_mca_set.part.0+0x141/0x250 [amdgpu]
[  602.669743]  ? kmemleak_free+0x36/0x60
[  602.669745]  ? kvfree+0x32/0x40
[  602.669747]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.669749]  ? kfree+0x15d/0x2a0
[  602.669752]  amdgpu_mca_smu_log_ras_error+0x1f6/0x210 [amdgpu]
[  602.669839]  amdgpu_ras_query_error_status_helper+0x2ad/0x390 [amdgpu]
[  602.669924]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.669925]  ? __call_rcu_common.constprop.0+0xa6/0x2b0
[  602.669929]  amdgpu_ras_query_error_status+0xf3/0x620 [amdgpu]
[  602.670014]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.670017]  amdgpu_ras_log_on_err_counter+0xe1/0x170 [amdgpu]
[  602.670103]  amdgpu_ras_do_recovery+0xd2/0x2c0 [amdgpu]
[  602.670187]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.670189]  ? __schedule+0x37d/0xb30
[  602.670191]  process_one_work+0x176/0x350
[  602.670194]  worker_thread+0x2f7/0x420
[  602.670197]  ?

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 11 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index c7e602d69f2c..9d3a3c778504 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -232,7 +232,7 @@ int amdgpu_mca_init(struct amdgpu_device *adev)
 
for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
mca_cache = &mca->mca_caches[i];
-   spin_lock_init(&mca_cache->lock);
+   mutex_init(&mca_cache->lock);
amdgpu_mca_bank_set_init(&mca_cache->mca_set);
}
 
@@ -250,6 +250,7 @@ void amdgpu_mca_fini(struct amdgpu_device *adev)
for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
mca_cache = &mca->mca_caches[i];
amdgpu_mca_bank_set_release(&mca_cache->mca_set);
+   mutex_destroy(&mca_cache->lock);
}
 }
 
@@ -454,9 +455,9 @@ static int amdgpu_mca_add_mca_set_to_cache(struct 
amdgpu_device *adev, enum amdg
struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type];
int ret;
 
-   spin_lock(&mca_cache->lock);
+   mutex_lock(&mca_cache->lock);
ret = amdgpu_mca_bank_set_merge(&mca_cache->mca_set, new);
-   spin_unlock(&mca_cache->lock);
+   mutex_unlock(&mca_cache->lock);
 
return ret;
 }
@@ -486,10 +487,10 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device 
*adev, enum amdgpu_ras_blo
}
 
/* dispatch mca set again if mca cache has valid data */
-   spin_lock(&mca_cache->lock);
+   mutex_lock(&mca_cache->lock);
if (mca_cache->mca_set.nr_entries)
ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, 
&mca_cache->mca_set, err_data);
-   spin_unlock(&mca_cache->lock);
+   mutex_unlock(&mca_cache->lock);
 
 out_mca_release:
amdgpu_mca_bank_set_release(&mca_set);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index c3c184c88dad..e80323ff90c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -84,7 +84,7 @@ struct mca_bank_set {
 
 struct mca_bank_cache {
struct mca_bank_set mca_set;
-   spinlock_t lock;
+   struct mutex lock;
 };
 
 struct amdgpu_mca {
-- 
2.34.1

[PATCH 2/2] Revert "drm/amdgpu: change aca bank error lock type to spinlock"

2024-06-18 Thread Yang Wang
This reverts commit 354436e7905d166011f2aa26dccd9fa04b20940e.

Revert this patch to modify lock type back to 'mutex' to avoid kernel
calltrace issue.

[  602.668806] Workqueue: amdgpu-reset-dev amdgpu_ras_do_recovery [amdgpu]
[  602.668939] Call Trace:
[  602.668940]  
[  602.668941]  dump_stack_lvl+0x4c/0x70
[  602.668945]  dump_stack+0x14/0x20
[  602.668946]  __schedule_bug+0x5a/0x70
[  602.668950]  __schedule+0x940/0xb30
[  602.668952]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.668955]  ? hrtimer_reprogram+0x77/0xb0
[  602.668957]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.668959]  ? hrtimer_start_range_ns+0x126/0x370
[  602.668961]  schedule+0x39/0xe0
[  602.668962]  schedule_hrtimeout_range_clock+0xb1/0x140
[  602.668964]  ? __pfx_hrtimer_wakeup+0x10/0x10
[  602.668966]  schedule_hrtimeout_range+0x17/0x20
[  602.668967]  usleep_range_state+0x69/0x90
[  602.668970]  psp_cmd_submit_buf+0x132/0x570 [amdgpu]
[  602.669066]  psp_ras_invoke+0x75/0x1a0 [amdgpu]
[  602.669156]  psp_ras_query_address+0x9c/0x120 [amdgpu]
[  602.669245]  umc_v12_0_update_ecc_status+0x16d/0x520 [amdgpu]
[  602.669337]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.669339]  ? stack_depot_save+0x12/0x20
[  602.669342]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.669343]  ? set_track_prepare+0x52/0x70
[  602.669346]  ? kmemleak_alloc+0x4f/0x90
[  602.669348]  ? __kmalloc_node+0x34b/0x450
[  602.669352]  amdgpu_umc_update_ecc_status+0x23/0x40 [amdgpu]
[  602.669438]  mca_umc_mca_get_err_count+0x85/0xc0 [amdgpu]
[  602.669554]  mca_smu_parse_mca_error_count+0x120/0x1d0 [amdgpu]
[  602.669655]  amdgpu_mca_dispatch_mca_set.part.0+0x141/0x250 [amdgpu]
[  602.669743]  ? kmemleak_free+0x36/0x60
[  602.669745]  ? kvfree+0x32/0x40
[  602.669747]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.669749]  ? kfree+0x15d/0x2a0
[  602.669752]  amdgpu_mca_smu_log_ras_error+0x1f6/0x210 [amdgpu]
[  602.669839]  amdgpu_ras_query_error_status_helper+0x2ad/0x390 [amdgpu]
[  602.669924]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.669925]  ? __call_rcu_common.constprop.0+0xa6/0x2b0
[  602.669929]  amdgpu_ras_query_error_status+0xf3/0x620 [amdgpu]
[  602.670014]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.670017]  amdgpu_ras_log_on_err_counter+0xe1/0x170 [amdgpu]
[  602.670103]  amdgpu_ras_do_recovery+0xd2/0x2c0 [amdgpu]
[  602.670187]  ? srso_alias_return_thunk+0x5/0

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 19 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h |  3 +--
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 04515c1c7241..7945173321a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -222,9 +222,9 @@ static struct aca_bank_error *new_bank_error(struct 
aca_error *aerr, struct aca_
INIT_LIST_HEAD(&bank_error->node);
memcpy(&bank_error->info, info, sizeof(*info));
 
-   spin_lock(&aerr->lock);
+   mutex_lock(&aerr->lock);
list_add_tail(&bank_error->node, &aerr->list);
-   spin_unlock(&aerr->lock);
+   mutex_unlock(&aerr->lock);
 
return bank_error;
 }
@@ -235,7 +235,7 @@ static struct aca_bank_error *find_bank_error(struct 
aca_error *aerr, struct aca
struct aca_bank_info *tmp_info;
bool found = false;
 
-   spin_lock(&aerr->lock);
+   mutex_lock(&aerr->lock);
list_for_each_entry(bank_error, &aerr->list, node) {
tmp_info = &bank_error->info;
if (tmp_info->socket_id == info->socket_id &&
@@ -246,7 +246,7 @@ static struct aca_bank_error *find_bank_error(struct 
aca_error *aerr, struct aca
}
 
 out_unlock:
-   spin_unlock(&aerr->lock);
+   mutex_unlock(&aerr->lock);
 
return found ? bank_error : NULL;
 }
@@ -474,7 +474,7 @@ static int aca_log_aca_error(struct aca_handle *handle, 
enum aca_error_type type
struct aca_error *aerr = &error_cache->errors[type];
struct aca_bank_error *bank_error, *tmp;
 
-   spin_lock(&aerr->lock);
+   mutex_lock(&aerr->lock);
 
if (list_empty(&aerr->list))
goto out_unlock;
@@ -485,7 +485,7 @@ static int aca_log_aca_error(struct aca_handle *handle, 
enum aca_error_type type
}
 
 out_unlock:
-   spin_unlock(&aerr->lock);
+   mutex_unlock(&aerr->lock);
 
return 0;
 }
@@ -542,7 +542,7 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, 
struct aca_handle *han
 
 static void aca_error_init(struct aca_error *aerr, enum aca_error_type type)
 {
-   spin_lock_init(&aerr->lock);
+   mutex_init(&aerr->lock);
INIT_LIST_HEAD(&aerr->list);
aerr->type = type;
aerr->nr_errors = 0;
@@ -561,10 +561,11 @@ static void aca_error_fini(struct aca_error *aerr)
 {
struct aca_bank_error *bank_error, *tmp;
 
-   spin_lock(&aerr->lock);
+   mutex_lock(&aerr->lock);
list_for_each

Re: [BUG] 6.10-rc3 [drm:amdgpu_fill_buffer [amdgpu]] *ERROR* Trying to clear memory with ring turned off [DUPLICATE]

2024-06-18 Thread Mirsad Todorovac
On 6/12/24 21:15, Mirsad Todorovac wrote:
> Hi, all!
> 
> Running the vanilla torvalds tree kernel 6.10-rc3, there occurred an error in 
> boot with
> amdgpu.
> 
> Here is the complete output:
> 
> kernel: [8.704024] WARNING: CPU: 24 PID: 689 at 
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:1379 amdgpu_bo_release_notify 
> (drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:1382 (discriminator 1)) amdgpu
> kernel: [8.704146] Modules linked in: binfmt_misc amd_atl intel_rapl_msr 
> intel_rapl_common edac_mce_amd kvm_amd kvm snd_hda_codec_realtek 
> snd_hda_codec_generic amdgpu(+) crct10dif_pclmul nls_iso8859_1 
> snd_hda_scodec_component polyval_clmulni snd_hda_codec_hdmi polyval_generic 
> ghash_clmulni_intel snd_hda_intel sha256_ssse3 sha1_ssse3 snd_intel_dspcfg 
> snd_intel_sdw_acpi aesni_intel snd_hda_codec crypto_simd cryptd snd_seq_midi 
> amdxcp snd_seq_midi_event snd_hda_core drm_exec gpu_sched joydev snd_rawmidi 
> snd_hwdep rapl drm_buddy input_leds drm_suballoc_helper snd_seq 
> drm_ttm_helper wmi_bmof snd_pcm snd_seq_device ttm k10temp ccp snd_timer 
> drm_display_helper snd drm_kms_helper i2c_algo_bit soundcore mac_hid tcp_bbr 
> sch_fq msr parport_pc ppdev lp parport efi_pstore drm ip_tables x_tables 
> autofs4 btrfs blake2b_generic xor raid6_pq libcrc32c hid_generic usbhid hid 
> nvme crc32_pclmul ahci i2c_piix4 nvme_core r8169 xhci_pci libahci 
> xhci_pci_renesas realtek video wmi gpio_amdpt
> kernel: [8.704200] CPU: 24 PID: 689 Comm: systemd-udevd Not tainted 
> 6.10.0-rc1-next-20240528 #1
> kernel: [8.704202] Hardware name: ASRock X670E PG Lightning/X670E PG 
> Lightning, BIOS 1.21 04/26/2023
> kernel: [8.704203] RIP: 0010:amdgpu_bo_release_notify 
> (drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:1382 (discriminator 1)) amdgpu
> kernel: [ 8.704324] Code: 0b e9 a3 fe ff ff 48 ba ff ff ff ff ff ff ff 7f 31 
> f6 4c 89 ef e8 c2 c4 dc ee eb 99 e8 eb bc dc ee eb b2 0f 0b e9 3c fe ff ff 
> <0f> 0b eb a7 be 03 00 00 00 e8 b4 4b a1 ee eb 9b e8 4d 5c 36 ef 66
> All code
> 
>0: 0b e9   or %ecx,%ebp
>2: a3 fe ff ff 48 ba ffmovabs %eax,0xffba48fe
>9: ff ff 
>b: ff  (bad)  
>c: ff  (bad)  
>d: ff  (bad)  
>e: ff  (bad)  
>f: 7f 31   jg 0x42
>   11: f6 4c 89 ef e8  testb  $0xe8,-0x11(%rcx,%rcx,4)
>   16: c2 c4 dcret$0xdcc4
>   19: ee  out%al,(%dx)
>   1a: eb 99   jmp0xffb5
>   1c: e8 eb bc dc ee  call   0xeedcbd0c
>   21: eb b2   jmp0xffd5
>   23: 0f 0b   ud2
>   25: e9 3c fe ff ff  jmp0xfe66
>   2a:*0f 0b   ud2 <-- trapping instruction
>   2c: eb a7   jmp0xffd5
>   2e: be 03 00 00 00  mov$0x3,%esi
>   33: e8 b4 4b a1 ee  call   0xeea14bec
>   38: eb 9b   jmp0xffd5
>   3a: e8 4d 5c 36 ef  call   0xef365c8c
>   3f: 66  data16
> 
> Code starting with the faulting instruction
> ===
>0: 0f 0b   ud2
>2: eb a7   jmp0xffab
>4: be 03 00 00 00  mov$0x3,%esi
>9: e8 b4 4b a1 ee  call   0xeea14bc2
>e: eb 9b   jmp0xffab
>   10: e8 4d 5c 36 ef  call   0xef365c62
>   15: 66  data16
> kernel: [8.704325] RSP: 0018:b74b014d3380 EFLAGS: 00010282
> kernel: [8.704327] RAX: ffea RBX: 940781ec5c48 RCX: 
> 
> kernel: [8.704328] RDX:  RSI:  RDI: 
> 
> kernel: [8.704329] RBP: b74b014d33b8 R08:  R09: 
> 
> kernel: [8.704330] R10:  R11:  R12: 
> 9407dc80ef58
> kernel: [8.704330] R13: 940781ec5c00 R14:  R15: 
> 
> kernel: [8.704331] FS:  783805ca28c0() GS:94169860() 
> knlGS:
> kernel: [8.704333] CS:  0010 DS:  ES:  CR0: 80050033
> kernel: [8.704334] CR2: 7ec7be572000 CR3: 00010f7e4000 CR4: 
> 00750ef0
> kernel: [8.704335] PKRU: 5554
> kernel: [8.704335] Call Trace:
> kernel: [8.704337]  
> kernel: [8.704339] ? show_regs+0x71/0x90 
> kernel: [8.704344] ? __warn+0x88/0x140 
> kernel: [8.704347] ? amdgpu_bo_release_notify 
> (drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:1382 (discriminator 1)) amdgpu
> kernel: [8.704464] ? report_bug+0x1ab/0x1c0 
> kernel: [8.704468] ? handle_bug+0x46/0x90 
> kernel: [8.704471] ? exc_invalid_op+0x19/0x80 
> kernel: [8.704473] ? asm_exc_invalid_op+0x1b/0x

Re: [bug report] drm/amdgpu: amdgpu crash on playing videos, linux 6.10-rc

2024-06-18 Thread Winston Ma
As reported in my previous email. In my device I don't see a more
critical system freeze, which causes system reboot, during video play.
But I still experience green screen problem during full screen
toggling while watching video (Screenshot: https://ibb.co/8Dpdxc3). It
didn't cause my system to reboot but I guess there are bug to be
fixed.

Should this issue be monitored by another new ticket?

Thanks and Regards,
Winston

On Mon, Jun 17, 2024 at 5:40 PM Wang Yunchen  wrote:
>
> On Mon, 2024-06-17 at 06:55 +0800, Winston Ma wrote:
> > I only build the kernel once. I could try but I think you couldn't
> > expect much from my side.
> >
> > BTW I installed 6.10-rc4 this morning from Ubuntu mainline
> > (https://kernel.ubuntu.com/mainline/v6.10-rc4/amd64/) and I couldn't
> > replicate the video crash problem. Yunchen could you try 6.10-rc4 and
> > see if you still have the video crash problem?
> >
> > But I still get the green blocky object when I keep toggling full
> > screen during youtube watch (Screenshot: https://ibb.co/8Dpdxc3). I
> > didn't see the green block in 6.9 so it could be another issue.
> >
> > Thanks and Regards,
> > Winston
> >
> >
> > On Sun, Jun 16, 2024 at 12:10 AM Wang Yunchen  wrote:
> > >
> > > On Sat, 2024-06-15 at 17:50 +0200, Thorsten Leemhuis wrote:
> > > > [reply made easier by moving something in the quote]
> > > >
> > > > On 12.06.24 18:55, Wang Yunchen wrote:
> > > > > On Wed, 2024-06-12 at 15:14 +0200, Linux regression tracking (Thorsten
> > > > > Leemhuis) wrote:
> > > > > > On 06.06.24 05:06, Winston Ma wrote:
> > > > > > > Hi I got the same problem on Linux Kernel 6.10-rc2. I got the
> > > > > > > problem
> > > > > > > by
> > > > > > > following the procedure below:
> > > > > > >
> > > > > > >  1. Boot Linux Kernel 6.10-rc2
> > > > > > >  2. Open Firefox (Any browser should work)
> > > > > > >  3. Open a Youtube Video
> > > > > > >  4. On the playing video, toggle fullscreen quickly Then after 10-
> > > > > > > 20
> > > > > > > times of fullscreen toggling, the screen would enter freeze
> > > > > > > mode.
> > > > > > > This is the log that I captured using the above method.
> > > > > >
> > > > > > Hmm, seems nothing happened here for a while. Could you maybe try to
> > > > > > bisect this
> > > > > > (
> > > > > > https://docs.kernel.org/admin-guide/verify-bugs-and-bisect-regressions.ht
> > > > > > ml
> > > > > > )?
> > > > >
> > > > > It seems that the issue persists on 6.10 rc3.
> > > >
> > > > That's good to know, but...
> > > >
> > > > > > @amd-gfx devs: Or is this unneeded, as the cause found or maybe even
> > > > > > fixed meanwhile?
> > > >
> > > > ...as there was no reply to that inquiry it seems we really need either
> > > > you or Winston Ma (or somebody else that is affected we don't yet know
> > > > about) to perform a git bisection (see the link quoted above) to find
> > > > the exact change that broke things. Without this it might not be getting
> > > > fixed.
> > > >
> > > > Ciao, Thorsten
> > > >
> > > > > > > This is the kernel log
> > > > > > >
> > > > > > > 2024-06-06T10:26:40.747576+08:00 kernel:
> > > > > > > gmc_v10_0_process_interrupt:
> > > > > > > 6 callbacks suppressed
> > > > > > > 2024-06-06T10:26:40.747618+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:
> > > > > > > [mmhub] page fault (src_id:0 ring:8 vmid:2
> > > > > > > pasid:32789)
> > > > > > > 2024-06-06T10:26:40.747623+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:
> > > > > > > in process RDD Process pid 39524 thread
> > > > > > > firefox-bi:cs0 pid 40342
> > > > > > > 2024-06-06T10:26:40.747625+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:   in page starting at address
> > > > > > > 0x800106ffe000 from client 0x12 (VMC)
> > > > > > > 2024-06-06T10:26:40.747628+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:
> > > > > > > MMVM_L2_PROTECTION_FAULT_STATUS:0x00203811
> > > > > > > 2024-06-06T10:26:40.747629+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:  Faulty UTCL2 client ID: VCN (0x1c)
> > > > > > > 2024-06-06T10:26:40.747631+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:  MORE_FAULTS: 0x1
> > > > > > > 2024-06-06T10:26:40.747651+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:  WALKER_ERROR: 0x0
> > > > > > > 2024-06-06T10:26:40.747653+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:  PERMISSION_FAULTS: 0x1
> > > > > > > 2024-06-06T10:26:40.747655+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:  MAPPING_ERROR: 0x0
> > > > > > > 2024-06-06T10:26:40.747656+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:  RW: 0x0
> > > > > > > 2024-06-06T10:26:40.747658+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:
> > > > > > > [mmhub] page fault (src_id:0 ring:8 vmid:2
> > > > > > > pasid:32789)
> > > > > > > 2024-06-06T10:26:40.747660+08:00 kernel: amdgpu :03:00.0:
> > > > > > > amdgpu:
> > > > > > > in process RDD Process pid 39524 thr

Re: [PATCH 1/2] drm/amdgpu: make duplicated EOP packet for GFX7/8 have real content

2024-06-18 Thread Xi Ruoyao
On Mon, 2024-06-17 at 22:30 +0800, Icenowy Zheng wrote:
> > Two consecutive writes to the same bus address are perfectly legal
> > from 
> > the PCIe specification and can happen all the time, even without this
> > specific hw workaround.
> 
> Yes I know it, and I am not from Loongson, just some user trying to
> mess around it.

There are some purposed "workarounds" like reducing the link speed (from
x16 to x8), tweaking the power management setting, etc.  Someone even
claims improving the heat sink of the LS7A chip can help to work around
this issue but I'm really skeptical...

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


RE: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling

2024-06-18 Thread Wang, Yang(Kevin)


-Original Message-
From: Chai, Thomas  
Sent: 2024年6月18日 14:34
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley ; Chai, Thomas 
Subject: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling

Add gpu reset check and exception handling for page retirement.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +
 1 file changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7f8e6ca07957..635dc86dbfd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1386,10 +1386,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev, struct ras_query_i
memset(&qctx, 0, sizeof(qctx));
qctx.event_id = amdgpu_ras_acquire_event_id(adev, 
amdgpu_ras_intr_triggered() ?
   RAS_EVENT_TYPE_ISR : 
RAS_EVENT_TYPE_INVALID);
+
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return -EIO;
+
ret = amdgpu_ras_query_error_status_helper(adev, info,
   &err_data,
   &qctx,
   error_query_mode);
+   up_read(&adev->reset_domain->sem);
if (ret)
goto out_fini_err_data;
 
@@ -2884,6 +2889,14 @@ static int amdgpu_ras_poison_creation_handler(struct 
amdgpu_device *adev,
return 0;
 }
 
+static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev) {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_poison_msg msg;
+
+   while (kfifo_get(&con->poison_fifo, &msg)); }
+
 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
uint32_t msg_count, uint32_t *gpu_reset)  { @@ -2913,6 
+2926,11 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
else
reset = reset_flags;
 
+   /* Check if gpu is in reset state */
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return -EIO;
+   up_read(&adev->reset_domain->sem);

[Kevin]:
I'm confused that why not using 'amdgpu_in_reset()' helper function to check 
reset state?

Best Regards,
Kevin
+
flush_delayed_work(&con->page_retirement_dwork);
 
reinit_completion(&con->ras_recovery_completion);
@@ -2977,6 +2995,31 @@ static int amdgpu_ras_page_retirement_thread(void *param)
}
}
 
+   if ((ret == -EIO) || (gpu_reset == 
AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
+   /* gpu is in mode-1 reset state */
+   /* Clear poison creation request */
+   while (atomic_read(&con->poison_creation_count))
+   atomic_dec(&con->poison_creation_count);
[Kevin]:

Aha! It is better to use atomic_set() to instead of it.

Best Regards,
Kevin
+
+   /* Clear poison consumption fifo */
+   amdgpu_ras_clear_poison_fifo(adev);
+
+   while (atomic_read(&con->page_retirement_req_cnt))
+   atomic_dec(&con->page_retirement_req_cnt);
+
+   if (ret == -EIO) {
+   /* Wait for mode-1 reset to complete */
+   down_read(&adev->reset_domain->sem);
+   up_read(&adev->reset_domain->sem);
+   }
+
+   /* Wake up work queue to save bad pages to eeprom */
+   schedule_delayed_work(&con->page_retirement_dwork, 0);
+   } else if (gpu_reset) {
+   /* gpu is in mode-2 reset or other reset state */
+   /* Wake up work queue to save bad pages to eeprom */
+   schedule_delayed_work(&con->page_retirement_dwork, 0);
+   }
 #else
 dev_info(adev->dev, "Start processing page retirement. request:%d\n",
 atomic_read(&con->page_retirement_req_cnt));
--
2.34.1