Re: [PATCH] drm/amdgpu: move buffer funcs setting up a level (v2)

2023-10-24 Thread Christian König

Am 25.10.23 um 06:24 schrieb Luben Tuikov:

From: Alex Deucher 

Rather than doing this in the IP code for the SDMA paging
engine, move it up to the core device level init level.
This should fix the scheduler init ordering.

v2: Fix checkpatch parens complaint; long lines. (Luben)

Signed-off-by: Alex Deucher 
Tested-by: Luben Tuikov 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 
  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   | 21 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h   |  1 +
  drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  5 -
  drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  5 -
  drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  5 -
  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 16 +---
  drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 10 +-
  drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 10 +-
  drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 10 +-
  drivers/gpu/drm/amd/amdgpu/si_dma.c|  5 -
  11 files changed, 38 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7ec32b44df052f..47c1e60109c14c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2662,6 +2662,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (r)
goto init_failed;
  
+	amdgpu_sdma_set_buffer_funcs_helper(adev);

+
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset) {
kgd2kfd_init_zone_device(adev);
@@ -3260,6 +3262,8 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
amdgpu_virt_request_full_gpu(adev, false);
}
  
+	amdgpu_sdma_unset_buffer_funcs_helper(adev);

+
r = amdgpu_device_ip_suspend_phase1(adev);
if (r)
return r;
@@ -3449,6 +3453,8 @@ static int amdgpu_device_ip_resume(struct amdgpu_device 
*adev)
  
  	r = amdgpu_device_ip_resume_phase2(adev);
  
+	amdgpu_sdma_set_buffer_funcs_helper(adev);

+
return r;
  }
  
@@ -4236,6 +4242,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)

/* disable ras feature must before hw fini */
amdgpu_ras_pre_fini(adev);
  
+	amdgpu_sdma_unset_buffer_funcs_helper(adev);

+
amdgpu_device_ip_fini_early(adev);
  
  	amdgpu_irq_fini_hw(adev);

@@ -4407,6 +4415,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
  
  	amdgpu_ras_suspend(adev);
  
+	amdgpu_sdma_unset_buffer_funcs_helper(adev);

+
amdgpu_device_ip_suspend_phase1(adev);
  
  	if (!adev->in_s0ix)

@@ -5178,6 +5188,8 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
if (r)
goto out;
  
+amdgpu_sdma_set_buffer_funcs_helper(tmp_adev);

+
if (vram_lost)

amdgpu_device_fill_reset_magic(tmp_adev);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c

index e8cbc4142d8021..c4d642b06f3c5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -292,6 +292,27 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev,
return err;
  }
  
+void amdgpu_sdma_set_buffer_funcs_helper(struct amdgpu_device *adev)


From the functionality and general idea that looks good to me.

But I think both amdgpu_sdma_set_buffer_funcs_helper() and the 
existing amdgpu_sdma_unset_buffer_funcs_helper() are just an unnecessary 
extra check when they are not used by the SDMA code.


I think we should just call amdgpu_ttm_set_buffer_funcs_status() 
directly instead.


Regards,
Christian.


+{
+   struct amdgpu_ring *sdma;
+   int i;
+
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   if (adev->sdma.has_page_queue) {
+   sdma = &adev->sdma.instance[i].page;
+   if (adev->mman.buffer_funcs_ring == sdma && 
sdma->sched.ready) {
+   amdgpu_ttm_set_buffer_funcs_status(adev, true);
+   break;
+   }
+   }
+   sdma = &adev->sdma.instance[i].ring;
+   if (adev->mman.buffer_funcs_ring == sdma && sdma->sched.ready) {
+   amdgpu_ttm_set_buffer_funcs_status(adev, true);
+   break;
+   }
+   }
+}
+
  void amdgpu_sdma_unset_buffer_funcs_helper(struct amdgpu_device *adev)
  {
struct amdgpu_ring *sdma;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 513ac22120c1fa..33209593e97461 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -169,6 +169,7 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device 

[PATCH] drm/amdgpu: move buffer funcs setting up a level (v2)

2023-10-24 Thread Luben Tuikov
From: Alex Deucher 

Rather than doing this in the IP code for the SDMA paging
engine, move it up to the core device level init level.
This should fix the scheduler init ordering.

v2: Fix checkpatch parens complaint; long lines. (Luben)

Signed-off-by: Alex Deucher 
Tested-by: Luben Tuikov 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   | 21 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  5 -
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  5 -
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  5 -
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 16 +---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/si_dma.c|  5 -
 11 files changed, 38 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7ec32b44df052f..47c1e60109c14c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2662,6 +2662,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (r)
goto init_failed;
 
+   amdgpu_sdma_set_buffer_funcs_helper(adev);
+
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset) {
kgd2kfd_init_zone_device(adev);
@@ -3260,6 +3262,8 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
amdgpu_virt_request_full_gpu(adev, false);
}
 
+   amdgpu_sdma_unset_buffer_funcs_helper(adev);
+
r = amdgpu_device_ip_suspend_phase1(adev);
if (r)
return r;
@@ -3449,6 +3453,8 @@ static int amdgpu_device_ip_resume(struct amdgpu_device 
*adev)
 
r = amdgpu_device_ip_resume_phase2(adev);
 
+   amdgpu_sdma_set_buffer_funcs_helper(adev);
+
return r;
 }
 
@@ -4236,6 +4242,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
/* disable ras feature must before hw fini */
amdgpu_ras_pre_fini(adev);
 
+   amdgpu_sdma_unset_buffer_funcs_helper(adev);
+
amdgpu_device_ip_fini_early(adev);
 
amdgpu_irq_fini_hw(adev);
@@ -4407,6 +4415,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
 
amdgpu_ras_suspend(adev);
 
+   amdgpu_sdma_unset_buffer_funcs_helper(adev);
+
amdgpu_device_ip_suspend_phase1(adev);
 
if (!adev->in_s0ix)
@@ -5178,6 +5188,8 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
if (r)
goto out;
 
+   amdgpu_sdma_set_buffer_funcs_helper(tmp_adev);
+
if (vram_lost)

amdgpu_device_fill_reset_magic(tmp_adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index e8cbc4142d8021..c4d642b06f3c5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -292,6 +292,27 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev,
return err;
 }
 
+void amdgpu_sdma_set_buffer_funcs_helper(struct amdgpu_device *adev)
+{
+   struct amdgpu_ring *sdma;
+   int i;
+
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   if (adev->sdma.has_page_queue) {
+   sdma = &adev->sdma.instance[i].page;
+   if (adev->mman.buffer_funcs_ring == sdma && 
sdma->sched.ready) {
+   amdgpu_ttm_set_buffer_funcs_status(adev, true);
+   break;
+   }
+   }
+   sdma = &adev->sdma.instance[i].ring;
+   if (adev->mman.buffer_funcs_ring == sdma && sdma->sched.ready) {
+   amdgpu_ttm_set_buffer_funcs_status(adev, true);
+   break;
+   }
+   }
+}
+
 void amdgpu_sdma_unset_buffer_funcs_helper(struct amdgpu_device *adev)
 {
struct amdgpu_ring *sdma;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 513ac22120c1fa..33209593e97461 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -169,6 +169,7 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, 
u32 instance,
   bool duplicate);
 void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev,
 bool duplicate);
+void amdgpu_sdma_set_buffer_funcs_helper(struct amdgpu_device *adev);
 void amdgpu_sdma_unset_buffer_funcs_helper(struct amdgpu_device *adev);
 int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
 
diff --git 

RE: [PATCH] drm/amdgpu: check RAS supported first in ras_reset_error_count

2023-10-24 Thread Wang, Yang(Kevin)
[AMD Official Use Only - General]

Reviewed-by: Yang Wang 

Best Regards,
Kevin

-Original Message-
From: amd-gfx  On Behalf Of Tao Zhou
Sent: Wednesday, October 25, 2023 11:59 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao 
Subject: [PATCH] drm/amdgpu: check RAS supported first in ras_reset_error_count

Not all platforms support RAS.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c71321edf50b..a6cff4a31c54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1233,15 +1233,15 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
return -EOPNOTSUPP;
}

+   if (!amdgpu_ras_is_supported(adev, block) ||
+   !amdgpu_ras_get_mca_debug_mode(adev))
+   return -EOPNOTSUPP;
+
/* skip ras error reset in gpu reset */
if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) &&
mca_funcs && mca_funcs->mca_set_debug_mode)
return -EOPNOTSUPP;

-   if (!amdgpu_ras_is_supported(adev, block) ||
-   !amdgpu_ras_get_mca_debug_mode(adev))
-   return -EOPNOTSUPP;
-
if (block_obj->hw_ops->reset_ras_error_count)
block_obj->hw_ops->reset_ras_error_count(adev);

--
2.35.1



[PATCH] drm/amdgpu: check RAS supported first in ras_reset_error_count

2023-10-24 Thread Tao Zhou
Not all platforms support RAS.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c71321edf50b..a6cff4a31c54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1233,15 +1233,15 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device 
*adev,
return -EOPNOTSUPP;
}
 
+   if (!amdgpu_ras_is_supported(adev, block) ||
+   !amdgpu_ras_get_mca_debug_mode(adev))
+   return -EOPNOTSUPP;
+
/* skip ras error reset in gpu reset */
if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) &&
mca_funcs && mca_funcs->mca_set_debug_mode)
return -EOPNOTSUPP;
 
-   if (!amdgpu_ras_is_supported(adev, block) ||
-   !amdgpu_ras_get_mca_debug_mode(adev))
-   return -EOPNOTSUPP;
-
if (block_obj->hw_ops->reset_ras_error_count)
block_obj->hw_ops->reset_ras_error_count(adev);
 
-- 
2.35.1



RE: [PATCH v2] drm/amd/pm: fix the high voltage and temperature issue

2023-10-24 Thread Feng, Kenneth
[AMD Official Use Only - General]

Sorry that I forgot another change in this V2.
Please ignore this one.
Thanks.


-Original Message-
From: Kenneth Feng 
Sent: Wednesday, October 25, 2023 11:51 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Feng, Kenneth 

Subject: [PATCH v2] drm/amd/pm: fix the high voltage and temperature issue

fix the high voltage and temperature issue after the driver is unloaded on smu 
13.0.0, smu 13.0.7 and smu 13.0.10
v2 - fix the code format and make sure it is used on the unload case only.

Signed-off-by: Kenneth Feng 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 36 +++
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 33 +++--
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  1 +  
drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h  |  2 ++
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 13 +++
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  |  8 -  
.../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c  |  8 -
 7 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 31f8c3ead161..c5c892a8b3f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3986,13 +3986,23 @@ int amdgpu_device_init(struct amdgpu_device *adev,
}
}
} else {
-   tmp = amdgpu_reset_method;
-   /* It should do a default reset when loading or 
reloading the driver,
-* regardless of the module parameter reset_method.
-*/
-   amdgpu_reset_method = AMD_RESET_METHOD_NONE;
-   r = amdgpu_asic_reset(adev);
-   amdgpu_reset_method = tmp;
+   switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+   case IP_VERSION(13, 0, 0):
+   case IP_VERSION(13, 0, 7):
+   case IP_VERSION(13, 0, 10):
+   r = psp_gpu_reset(adev);
+   break;
+   default:
+   tmp = amdgpu_reset_method;
+   /* It should do a default reset when loading or 
reloading the driver,
+* regardless of the module parameter 
reset_method.
+*/
+   amdgpu_reset_method = AMD_RESET_METHOD_NONE;
+   r = amdgpu_asic_reset(adev);
+   amdgpu_reset_method = tmp;
+   break;
+   }
+
if (r) {
dev_err(adev->dev, "asic reset on init 
failed\n");
goto failed;
@@ -5945,6 +5955,18 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
return -ENOTSUPP;

ret = amdgpu_dpm_baco_exit(adev);
+
+   if (!ret)
+   switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+   case IP_VERSION(13, 0, 0):
+   case IP_VERSION(13, 0, 7):
+   case IP_VERSION(13, 0, 10):
+   adev->gfx.is_poweron = false;
+   break;
+   default:
+   break;
+   }
+
if (ret)
return ret;

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 7c3356d6da5e..2e82172ba250 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -733,7 +733,7 @@ static int smu_early_init(void *handle)
smu->adev = adev;
smu->pm_enabled = !!amdgpu_dpm;
smu->is_apu = false;
-   smu->smu_baco.state = SMU_BACO_STATE_EXIT;
+   smu->smu_baco.state = SMU_BACO_STATE_NONE;
smu->smu_baco.platform_support = false;
smu->user_dpm_profile.fan_mode = -1;

@@ -1740,10 +1740,31 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
return 0;
 }

+static int smu_reset_mp1_state(struct smu_context *smu) {
+   struct amdgpu_device *adev = smu->adev;
+   int ret = 0;
+
+   if ((!adev->in_runpm) && (!adev->in_suspend) &&
+   (!amdgpu_in_reset(adev)))
+   switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+ case IP_VERSION(13, 0, 0):
+ case IP_VERSION(13, 0, 7):
+ case IP_VERSION(13, 0, 10):
+   ret = smu_set_mp1_state(smu, PP_MP1_STATE_UNLOAD);
+   break;
+ default:
+   break;
+   }
+
+   return ret;
+}
+
 static int smu_hw_fini(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
struct smu_context *smu = 

[PATCH v2] drm/amd/pm: fix the high voltage and temperature issue

2023-10-24 Thread Kenneth Feng
fix the high voltage and temperature issue after the driver is unloaded on smu 
13.0.0,
smu 13.0.7 and smu 13.0.10
v2 - fix the code format and make sure it is used on the unload case only.

Signed-off-by: Kenneth Feng 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 36 +++
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 33 +++--
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  1 +
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h  |  2 ++
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 13 +++
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  |  8 -
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c  |  8 -
 7 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 31f8c3ead161..c5c892a8b3f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3986,13 +3986,23 @@ int amdgpu_device_init(struct amdgpu_device *adev,
}
}
} else {
-   tmp = amdgpu_reset_method;
-   /* It should do a default reset when loading or 
reloading the driver,
-* regardless of the module parameter reset_method.
-*/
-   amdgpu_reset_method = AMD_RESET_METHOD_NONE;
-   r = amdgpu_asic_reset(adev);
-   amdgpu_reset_method = tmp;
+   switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+   case IP_VERSION(13, 0, 0):
+   case IP_VERSION(13, 0, 7):
+   case IP_VERSION(13, 0, 10):
+   r = psp_gpu_reset(adev);
+   break;
+   default:
+   tmp = amdgpu_reset_method;
+   /* It should do a default reset when loading or 
reloading the driver,
+* regardless of the module parameter 
reset_method.
+*/
+   amdgpu_reset_method = AMD_RESET_METHOD_NONE;
+   r = amdgpu_asic_reset(adev);
+   amdgpu_reset_method = tmp;
+   break;
+   }
+
if (r) {
dev_err(adev->dev, "asic reset on init 
failed\n");
goto failed;
@@ -5945,6 +5955,18 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
return -ENOTSUPP;
 
ret = amdgpu_dpm_baco_exit(adev);
+
+   if (!ret)
+   switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+   case IP_VERSION(13, 0, 0):
+   case IP_VERSION(13, 0, 7):
+   case IP_VERSION(13, 0, 10):
+   adev->gfx.is_poweron = false;
+   break;
+   default:
+   break;
+   }
+
if (ret)
return ret;
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 7c3356d6da5e..2e82172ba250 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -733,7 +733,7 @@ static int smu_early_init(void *handle)
smu->adev = adev;
smu->pm_enabled = !!amdgpu_dpm;
smu->is_apu = false;
-   smu->smu_baco.state = SMU_BACO_STATE_EXIT;
+   smu->smu_baco.state = SMU_BACO_STATE_NONE;
smu->smu_baco.platform_support = false;
smu->user_dpm_profile.fan_mode = -1;
 
@@ -1740,10 +1740,31 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
return 0;
 }
 
+static int smu_reset_mp1_state(struct smu_context *smu)
+{
+   struct amdgpu_device *adev = smu->adev;
+   int ret = 0;
+
+   if ((!adev->in_runpm) && (!adev->in_suspend) &&
+   (!amdgpu_in_reset(adev)))
+   switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+ case IP_VERSION(13, 0, 0):
+ case IP_VERSION(13, 0, 7):
+ case IP_VERSION(13, 0, 10):
+   ret = smu_set_mp1_state(smu, PP_MP1_STATE_UNLOAD);
+   break;
+ default:
+   break;
+   }
+
+   return ret;
+}
+
 static int smu_hw_fini(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
struct smu_context *smu = adev->powerplay.pp_handle;
+   int ret;
 
if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
return 0;
@@ -1761,7 +1782,15 @@ static int smu_hw_fini(void *handle)
 
adev->pm.dpm_enabled = false;
 
-   return smu_smc_hw_cleanup(smu);
+   ret = smu_smc_hw_cleanup(smu);
+   if (ret)
+   

Re: [PATCH] drm/amd check num of link levels when update pcie param

2023-10-24 Thread Chen, JingWen (Wayne)

Acked-by: Jingwen Chen 

Best Regards,
JingWen Chen

On 2023/10/19 17:46, Lin.Cao wrote:

In an SR-IOV environment, the value of pcie_table->num_of_link_levels will
be 0, and num_of_levels - 1 will cause an out-of-bounds array index.

Signed-off-by: Lin.Cao 
---
  drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index bcb7ab9d2221..6906b0a7d1d1 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -2437,6 +2437,9 @@ int smu_v13_0_update_pcie_parameters(struct smu_context 
*smu,
uint32_t smu_pcie_arg;
int ret, i;
  
+	if (!num_of_levels)

+   return 0;
+
if (!amdgpu_device_pcie_dynamic_switching_supported()) {
if (pcie_table->pcie_gen[num_of_levels - 1] < pcie_gen_cap)
pcie_gen_cap = pcie_table->pcie_gen[num_of_levels - 1];


RE: [PATCH v2] drm/amd/amdgpu: fix the GPU power print error in pm info

2023-10-24 Thread Zhang, Yifan
[Public]

This patch is:

Reviewed-by: Yifan Zhang 

-Original Message-
From: Ma, Li 
Sent: Wednesday, October 25, 2023 10:31 AM
To: amd-gfx@lists.freedesktop.org; Zhang, Yifan 
Cc: Deucher, Alexander ; Feng, Kenneth 
; StDenis, Tom ; Ma, Li 

Subject: [PATCH v2] drm/amd/amdgpu: fix the GPU power print error in pm info

Modify the print format of the fractional part to avoid display error.

Signed-off-by: Li Ma 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 358bb5e485f2..517b9fb4624c 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -4290,10 +4290,10 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file 
*m, struct amdgpu_device *a
seq_printf(m, "\t%u mV (VDDNB)\n", value);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*)&query, &size))
-   seq_printf(m, "\t%u.%u W (average GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%02u W (average GPU)\n", query >> 8, query 
& 0xff);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *)&query, &size))
-   seq_printf(m, "\t%u.%u W (current GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%02u W (current GPU)\n", query >> 8, query 
& 0xff);
size = sizeof(value);
seq_printf(m, "\n");

--
2.25.1



[PATCH v2] drm/amd/amdgpu: fix the GPU power print error in pm info

2023-10-24 Thread Li Ma
Modify the print format of the fractional part to avoid display error.

Signed-off-by: Li Ma 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 358bb5e485f2..517b9fb4624c 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -4290,10 +4290,10 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file 
*m, struct amdgpu_device *a
seq_printf(m, "\t%u mV (VDDNB)\n", value);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*)&query, &size))
-   seq_printf(m, "\t%u.%u W (average GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%02u W (average GPU)\n", query >> 8, query 
& 0xff);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *)&query, &size))
-   seq_printf(m, "\t%u.%u W (current GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%02u W (current GPU)\n", query >> 8, query 
& 0xff);
size = sizeof(value);
seq_printf(m, "\n");
 
-- 
2.25.1



RE: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

2023-10-24 Thread Ma, Li
[AMD Official Use Only - General]

Hi Yifan,

Got it. This is the better modification.
I will resend patch.

Best Regards,
Ma,Li
-Original Message-
From: Zhang, Yifan 
Sent: Wednesday, October 25, 2023 9:26 AM
To: Ma, Li ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Feng, Kenneth 
; StDenis, Tom 
Subject: RE: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

[AMD Official Use Only - General]

I see your point. How about this one ?

--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -4290,7 +4290,7 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file *m, 
struct amdgpu_device *a
seq_printf(m, "\t%u mV (VDDNB)\n", value);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*)&query, &size))
-   seq_printf(m, "\t%u.%u W (average GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%02u W (average GPU)\n", query >> 8, query 
& 0xff);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *)&query, &size))
seq_printf(m, "\t%u.%u W (current GPU)\n", query >> 8, query & 
0xff);

Best Regards,
Yifan

-Original Message-
From: Ma, Li 
Sent: Tuesday, October 24, 2023 11:43 PM
To: Zhang, Yifan ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Feng, Kenneth 
; StDenis, Tom 
Subject: RE: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

[AMD Official Use Only - General]

-Original Message-
From: Zhang, Yifan 
Sent: Tuesday, October 24, 2023 10:29 PM
To: Ma, Li ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Feng, Kenneth 
; StDenis, Tom 
Subject: RE: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

[AMD Official Use Only - General]

-Original Message-
From: Ma, Li 
Sent: Tuesday, October 24, 2023 7:09 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Zhang, Yifan 
; Feng, Kenneth ; StDenis, Tom 
; Ma, Li 
Subject: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

Print the digit of the fractional part individually to avoid carrying during 
display.

Signed-off-by: Li Ma 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 358bb5e485f2..cc853559cf0f 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -4290,10 +4290,10 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file 
*m, struct amdgpu_device *a
seq_printf(m, "\t%u mV (VDDNB)\n", value);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*)&query, &size))
-   seq_printf(m, "\t%u.%u W (average GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%u%u W (average GPU)\n", query >> 8, (query 
&
+0xff) / 10, (query & 0xff) % 10);

Would you pls elaborate on this with an example ? it looks to me it makes no 
difference here.
Li: If the range of (query&0xff) is [0x01,0x09], the original output is x.1~x.9. 
However, it should be x.01~x.09, which is the same as the smu fw.

size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *)&query, &size))
-   seq_printf(m, "\t%u.%u W (current GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%u%u W (current GPU)\n", query >> 8, (query 
&
+0xff) / 10, (query & 0xff) % 10);

Would you pls elaborate on this with an example? it looks to me it makes no 
difference here.
Li: If the range of (query&0xff) is [0x01,0x09], the original output is x.1~x.9. 
However, it should be x.01~x.09, which is the same as the smu fw.
size = sizeof(value);
seq_printf(m, "\n");

--
2.25.1






RE: [PATCH v2] drm/amd/pm: call smu_cmn_get_smc_version in is_mode1_reset_supported.

2023-10-24 Thread Li, Candice
[AMD Official Use Only - General]

Reviewed-by: Candice Li 



Thanks,
Candice

-Original Message-
From: Zhang, Yifan 
Sent: Wednesday, October 25, 2023 9:40 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Koenig, Christian 
; Feng, Kenneth ; Li, Candice 
; Zhang, Yifan 
Subject: [PATCH v2] drm/amd/pm: call smu_cmn_get_smc_version in 
is_mode1_reset_supported.

is_mode1_reset_supported may be called before smu init, when smu_context
is uninitialized in driver load/unload test. Call smu_cmn_get_smc_version
explicitly in is_mode1_reset_supported.

v2: apply to aldebaran in case is_mode1_reset_supported will be
uncommented (Candice Li)

Fixes: 5fe5098c64d9 ("drm/amd/pm: drop most smu_cmn_get_smc_version in smu")
Signed-off-by: Yifan Zhang 
---
 .../gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c|  8 +++-
 drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 10 +-
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c   |  8 +++-
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 090249b6422a..77c3d76c76a2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -2461,12 +2461,18 @@ static bool 
sienna_cichlid_is_mode1_reset_supported(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
uint32_t val;
+   uint32_t smu_version;
+   int ret;

/**
 * SRIOV env will not support SMU mode1 reset
 * PM FW support mode1 reset from 58.26
 */
-   if (amdgpu_sriov_vf(adev) || (smu->smc_fw_version < 0x003a1a00))
+   ret = smu_cmn_get_smc_version(smu, NULL, &smu_version);
+   if (ret)
+   return false;
+
+   if (amdgpu_sriov_vf(adev) || (smu_version < 0x003a1a00))
return false;

/**
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index f082cd4b40c1..1a6675d70a4b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1931,11 +1931,19 @@ static bool aldebaran_is_mode1_reset_supported(struct 
smu_context *smu)
 #if 0
struct amdgpu_device *adev = smu->adev;
uint32_t val;
+   uint32_t smu_version;
+   int ret;
+
/**
 * PM FW version support mode1 reset from 68.07
 */
-   if ((smu->smc_fw_version < 0x00440700))
+   ret = smu_cmn_get_smc_version(smu, NULL, &smu_version);
+   if (ret)
return false;
+
+   if ((smu_version < 0x00440700))
+   return false;
+
/**
 * mode1 reset relies on PSP, so we should check if
 * PSP is alive.
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index b1433973380b..648d5eafb27b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2615,13 +2615,19 @@ static int smu_v13_0_0_baco_exit(struct smu_context 
*smu)
 static bool smu_v13_0_0_is_mode1_reset_supported(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
+   u32 smu_version;
+   int ret;

/* SRIOV does not support SMU mode1 reset */
if (amdgpu_sriov_vf(adev))
return false;

/* PMFW support is available since 78.41 */
-   if (smu->smc_fw_version < 0x004e2900)
+   ret = smu_cmn_get_smc_version(smu, NULL, &smu_version);
+   if (ret)
+   return false;
+
+   if (smu_version < 0x004e2900)
return false;

return true;
--
2.37.3



[PATCH v2] drm/amd/pm: call smu_cmn_get_smc_version in is_mode1_reset_supported.

2023-10-24 Thread Yifan Zhang
is_mode1_reset_supported may be called before smu init, when smu_context
is uninitialized in driver load/unload test. Call smu_cmn_get_smc_version
explicitly in is_mode1_reset_supported.

v2: apply to aldebaran in case is_mode1_reset_supported will be
uncommented (Candice Li)

Fixes: 5fe5098c64d9 ("drm/amd/pm: drop most smu_cmn_get_smc_version in smu")
Signed-off-by: Yifan Zhang 
---
 .../gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c|  8 +++-
 drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 10 +-
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c   |  8 +++-
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 090249b6422a..77c3d76c76a2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -2461,12 +2461,18 @@ static bool 
sienna_cichlid_is_mode1_reset_supported(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
uint32_t val;
+   uint32_t smu_version;
+   int ret;
 
/**
 * SRIOV env will not support SMU mode1 reset
 * PM FW support mode1 reset from 58.26
 */
-   if (amdgpu_sriov_vf(adev) || (smu->smc_fw_version < 0x003a1a00))
+   ret = smu_cmn_get_smc_version(smu, NULL, &smu_version);
+   if (ret)
+   return false;
+
+   if (amdgpu_sriov_vf(adev) || (smu_version < 0x003a1a00))
return false;
 
/**
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index f082cd4b40c1..1a6675d70a4b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1931,11 +1931,19 @@ static bool aldebaran_is_mode1_reset_supported(struct 
smu_context *smu)
 #if 0
struct amdgpu_device *adev = smu->adev;
uint32_t val;
+   uint32_t smu_version;
+   int ret;
+
/**
 * PM FW version support mode1 reset from 68.07
 */
-   if ((smu->smc_fw_version < 0x00440700))
+   ret = smu_cmn_get_smc_version(smu, NULL, _version);
+   if (ret)
return false;
+
+   if ((smu_version < 0x00440700))
+   return false;
+
/**
 * mode1 reset relies on PSP, so we should check if
 * PSP is alive.
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index b1433973380b..648d5eafb27b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2615,13 +2615,19 @@ static int smu_v13_0_0_baco_exit(struct smu_context 
*smu)
 static bool smu_v13_0_0_is_mode1_reset_supported(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
+   u32 smu_version;
+   int ret;
 
/* SRIOV does not support SMU mode1 reset */
if (amdgpu_sriov_vf(adev))
return false;
 
/* PMFW support is available since 78.41 */
-   if (smu->smc_fw_version < 0x004e2900)
+   ret = smu_cmn_get_smc_version(smu, NULL, _version);
+   if (ret)
+   return false;
+
+   if (smu_version < 0x004e2900)
return false;
 
return true;
-- 
2.37.3



RE: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

2023-10-24 Thread Zhang, Yifan
[AMD Official Use Only - General]

I see your point. How about this one ?

--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -4290,7 +4290,7 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file *m, 
struct amdgpu_device *a
seq_printf(m, "\t%u mV (VDDNB)\n", value);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*), ))
-   seq_printf(m, "\t%u.%u W (average GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%02u W (average GPU)\n", query >> 8, query 
& 0xff);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *), ))
seq_printf(m, "\t%u.%u W (current GPU)\n", query >> 8, query & 
0xff);

Best Regards,
Yifan

-Original Message-
From: Ma, Li 
Sent: Tuesday, October 24, 2023 11:43 PM
To: Zhang, Yifan ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Feng, Kenneth 
; StDenis, Tom 
Subject: RE: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

[AMD Official Use Only - General]

-Original Message-
From: Zhang, Yifan 
Sent: Tuesday, October 24, 2023 10:29 PM
To: Ma, Li ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Feng, Kenneth 
; StDenis, Tom 
Subject: RE: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

[AMD Official Use Only - General]

-Original Message-
From: Ma, Li 
Sent: Tuesday, October 24, 2023 7:09 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Zhang, Yifan 
; Feng, Kenneth ; StDenis, Tom 
; Ma, Li 
Subject: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

Print the digit of the fractional part individually to avoid carrying during 
display.

Signed-off-by: Li Ma 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 358bb5e485f2..cc853559cf0f 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -4290,10 +4290,10 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file 
*m, struct amdgpu_device *a
seq_printf(m, "\t%u mV (VDDNB)\n", value);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*), ))
-   seq_printf(m, "\t%u.%u W (average GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%u%u W (average GPU)\n", query >> 8, (query 
&
+0xff) / 10, (query & 0xff) % 10);

Would you please elaborate on this with an example? It looks to me like it
makes no difference here.
Li: If (query & 0xff) is in the range [0x01, 0x09], the original output is
x.1~x.9. However, it should be x.01~x.09, which is the same as the SMU FW.

size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *), ))
-   seq_printf(m, "\t%u.%u W (current GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%u%u W (current GPU)\n", query >> 8, (query 
&
+0xff) / 10, (query & 0xff) % 10);

Would you please elaborate on this with an example? It looks to me like it
makes no difference here.
Li: If (query & 0xff) is in the range [0x01, 0x09], the original output is
x.1~x.9. However, it should be x.01~x.09, which is the same as the SMU FW.
size = sizeof(value);
seq_printf(m, "\n");

--
2.25.1





Re: [PATCH] drm/amdgpu: Initialize schedulers before using them

2023-10-24 Thread Luben Tuikov
On 2023-10-24 10:46, Alex Deucher wrote:
> On Tue, Oct 24, 2023 at 6:14 AM Christian König
>  wrote:
>>
>> [SNIP]
>>> Let me take a closer look first
>>
>> I think I've figured out why this isn't working as expected. It started
>> with this patch here:
>>
>> commit 5fd8518d187ed03403a4d4f7f56f52c00b11c148
>> Author: Andrey Grodzovsky 
>> Date:   Mon Dec 6 14:59:35 2021 -0500
>>
>>  drm/amdgpu: Move scheduler init to after XGMI is ready
>>
>>  Before we initialize schedulers we must know which reset
>>  domain are we in - for single device there is a single
>>  domain per device and so single wq per device. For XGMI
>>  the reset domain spans the entire XGMI hive and so the
>>  reset wq is per hive.
>>
>>  Signed-off-by: Andrey Grodzovsky 
>>  Reviewed-by: Christian König 
>>  Link: https://www.spinics.net/lists/amd-gfx/msg74112.html
>>
>> Andrey separated the scheduler initialization from the ring init because
>> we need some of the rings for XGMI initialization which in turn is
>> necessary to figure out the XGMI hive and so the reset domain for the
>> scheduler.
>>
>> The code inside amdgpu_ttm_set_buffer_funcs_status() is actually
>> correct, the problem is that this is called as part of the hw init which
>> comes earlier than the scheduler init.
>>
>> @Alex, Ideas how to fix this? My best guess is that we should move the
>> call to amdgpu_ttm_set_buffer_funcs_status() from the DMA specific code
>> into the higher level handling in amdgpu_device.c
> 
> Yes, I think so, but there could be some tricky ordering issues with
> respect to suspend and resume.  I think something like the attached
> patch should do the trick.

This patch works. I've tested suspend and resume too.

Tested-by: Luben Tuikov 

scripts/checkpatch.pl complains about extra parentheses.

-- 
Regards,
Luben



[PATCH v8 6/6] amd/display: indicate support for atomic async page-flips on DC

2023-10-24 Thread André Almeida
From: Simon Ser 

amdgpu_dm_commit_planes() already sets the flip_immediate flag for
async page-flips. This flag is used to set the UNP_FLIP_CONTROL
register. Thus, no additional change is required to handle async
page-flips with the atomic uAPI.

Signed-off-by: Simon Ser 
Reviewed-by: André Almeida 
Reviewed-by: Alex Deucher 
Signed-off-by: André Almeida 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index dec6e43e7198..45b8fd61a044 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4003,7 +4003,6 @@ static int amdgpu_dm_mode_config_init(struct 
amdgpu_device *adev)
adev_to_drm(adev)->mode_config.prefer_shadow = 1;
/* indicates support for immediate flip */
adev_to_drm(adev)->mode_config.async_page_flip = true;
-   adev_to_drm(adev)->mode_config.atomic_async_page_flip_not_supported = 
true;
 
state = kzalloc(sizeof(*state), GFP_KERNEL);
if (!state)
-- 
2.42.0



[PATCH v8 5/6] drm/doc: Define KMS atomic state set

2023-10-24 Thread André Almeida
From: Pekka Paalanen 

Specify how the atomic state is maintained between userspace and
kernel, plus the special case for async flips.

Signed-off-by: Pekka Paalanen 
Signed-off-by: André Almeida 
---
v8:
- no changes
v7:
- add a note that drivers can make exceptions for ad-hoc prop changes
- add a note about flipping the same FB_ID as a no-op
---
---
 Documentation/gpu/drm-uapi.rst | 47 ++
 1 file changed, 47 insertions(+)

diff --git a/Documentation/gpu/drm-uapi.rst b/Documentation/gpu/drm-uapi.rst
index 632989df3727..34bd02270ee7 100644
--- a/Documentation/gpu/drm-uapi.rst
+++ b/Documentation/gpu/drm-uapi.rst
@@ -570,3 +570,50 @@ dma-buf interoperability
 
 Please see Documentation/userspace-api/dma-buf-alloc-exchange.rst for
 information on how dma-buf is integrated and exposed within DRM.
+
+KMS atomic state
+================
+
+An atomic commit can change multiple KMS properties in an atomic fashion,
+without ever applying intermediate or partial state changes.  Either the whole
+commit succeeds or fails, and it will never be applied partially. This is the
+fundamental improvement of the atomic API over the older non-atomic API which 
is
+referred to as the "legacy API".  Applying intermediate state could 
unexpectedly
+fail, cause visible glitches, or delay reaching the final state.
+
+An atomic commit can be flagged with DRM_MODE_ATOMIC_TEST_ONLY, which means the
+complete state change is validated but not applied.  Userspace should use this
+flag to validate any state change before asking to apply it. If validation 
fails
+for any reason, userspace should attempt to fall back to another, perhaps
+simpler, final state.  This allows userspace to probe for various 
configurations
+without causing visible glitches on screen and without the need to undo a
+probing change.
+
+The changes recorded in an atomic commit apply on top of the current KMS state in
+the kernel. Hence, the complete new KMS state is the complete old KMS state 
with
+the committed property settings done on top. The kernel will try to avoid
+no-operation changes, so it is safe for userspace to send redundant property
+settings.  However, not every situation allows for no-op changes, due to the
+need to acquire locks for some attributes. Userspace needs to be aware that 
some
+redundant information might result in oversynchronization issues.  No-operation
+changes do not count towards actually needed changes, e.g.  setting MODE_ID to 
a
+different blob with identical contents as the current KMS state shall not be a
+modeset on its own. As a special exception for VRR needs, explicitly setting
+FB_ID to its current value is not a no-op.
+
+A "modeset" is a change in KMS state that might enable, disable, or temporarily
+disrupt the emitted video signal, possibly causing visible glitches on screen. 
A
+modeset may also take considerably more time to complete than other kinds of
+changes, and the video sink might also need time to adapt to the new signal
+properties. Therefore a modeset must be explicitly allowed with the flag
+DRM_MODE_ATOMIC_ALLOW_MODESET.  This in combination with
+DRM_MODE_ATOMIC_TEST_ONLY allows userspace to determine if a state change is
+likely to cause visible disruption on screen and avoid such changes when end
+users do not expect them.
+
+An atomic commit with the flag DRM_MODE_PAGE_FLIP_ASYNC is allowed to
+effectively change only the FB_ID property on any planes. No-operation changes
+are ignored as always. Changing any other property will cause the commit to be
+rejected. Each driver may relax this restriction if they have guarantees that
+such property change doesn't cause modesets. Userspace can use TEST_ONLY 
commits
+to query the driver about this.
-- 
2.42.0



[PATCH v8 4/6] drm: Refuse to async flip with atomic prop changes

2023-10-24 Thread André Almeida
Given that prop changes may lead to modesetting, which would defeat the
fast path of the async flip, refuse any atomic prop change for async
flips in atomic API. The only exception is the framebuffer ID to flip
to. Currently the only plane type supported is the primary one.

Reviewed-by: Simon Ser 
Signed-off-by: André Almeida 
---
v8: add a check for plane type, we can only flip primary planes
v7: drop the mode_id exception for prop changes
---
---
 drivers/gpu/drm/drm_atomic_uapi.c   | 54 +++--
 drivers/gpu/drm/drm_crtc_internal.h |  2 +-
 drivers/gpu/drm/drm_mode_object.c   |  2 +-
 3 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/drm_atomic_uapi.c 
b/drivers/gpu/drm/drm_atomic_uapi.c
index a15121e75a0a..ebaa6413d5a0 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -1006,13 +1006,28 @@ int drm_atomic_connector_commit_dpms(struct 
drm_atomic_state *state,
return ret;
 }
 
+static int drm_atomic_check_prop_changes(int ret, uint64_t old_val, uint64_t 
prop_value,
+struct drm_property *prop)
+{
+   if (ret != 0 || old_val != prop_value) {
+   drm_dbg_atomic(prop->dev,
+  "[PROP:%d:%s] No prop can be changed during 
async flip\n",
+  prop->base.id, prop->name);
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 int drm_atomic_set_property(struct drm_atomic_state *state,
struct drm_file *file_priv,
struct drm_mode_object *obj,
struct drm_property *prop,
-   uint64_t prop_value)
+   uint64_t prop_value,
+   bool async_flip)
 {
struct drm_mode_object *ref;
+   uint64_t old_val;
int ret;
 
if (!drm_property_change_valid_get(prop, prop_value, ))
@@ -1029,6 +1044,13 @@ int drm_atomic_set_property(struct drm_atomic_state 
*state,
break;
}
 
+   if (async_flip) {
+   ret = drm_atomic_connector_get_property(connector, 
connector_state,
+   prop, _val);
+   ret = drm_atomic_check_prop_changes(ret, old_val, 
prop_value, prop);
+   break;
+   }
+
ret = drm_atomic_connector_set_property(connector,
connector_state, file_priv,
prop, prop_value);
@@ -1044,6 +1066,13 @@ int drm_atomic_set_property(struct drm_atomic_state 
*state,
break;
}
 
+   if (async_flip) {
+   ret = drm_atomic_crtc_get_property(crtc, crtc_state,
+  prop, _val);
+   ret = drm_atomic_check_prop_changes(ret, old_val, 
prop_value, prop);
+   break;
+   }
+
ret = drm_atomic_crtc_set_property(crtc,
crtc_state, prop, prop_value);
break;
@@ -1051,6 +1080,7 @@ int drm_atomic_set_property(struct drm_atomic_state 
*state,
case DRM_MODE_OBJECT_PLANE: {
struct drm_plane *plane = obj_to_plane(obj);
struct drm_plane_state *plane_state;
+   struct drm_mode_config *config = >dev->mode_config;
 
plane_state = drm_atomic_get_plane_state(state, plane);
if (IS_ERR(plane_state)) {
@@ -1058,6 +1088,21 @@ int drm_atomic_set_property(struct drm_atomic_state 
*state,
break;
}
 
+   if (async_flip && prop != config->prop_fb_id) {
+   ret = drm_atomic_plane_get_property(plane, plane_state,
+   prop, _val);
+   ret = drm_atomic_check_prop_changes(ret, old_val, 
prop_value, prop);
+   break;
+   }
+
+   if (async_flip && plane_state->plane->type != 
DRM_PLANE_TYPE_PRIMARY) {
+   drm_dbg_atomic(prop->dev,
+   "[OBJECT:%d] Only primary planes can be changed 
during async flip\n",
+   obj->id);
+   ret = -EINVAL;
+   break;
+   }
+
ret = drm_atomic_plane_set_property(plane,
plane_state, file_priv,
prop, prop_value);
@@ -1349,6 +1394,7 @@ int drm_mode_atomic_ioctl(struct drm_device *dev,
struct drm_out_fence_state *fence_state;
int ret = 0;
unsigned int i, j, num_fences;
+   bool async_flip = false;
 
/* disallow for drivers not supporting atomic: */
if 

[PATCH v8 3/6] drm: introduce drm_mode_config.atomic_async_page_flip_not_supported

2023-10-24 Thread André Almeida
From: Simon Ser 

This new field indicates whether the driver has the necessary logic
to support async page-flips via the atomic uAPI. This is leveraged by
the next commit to allow user-space to use this functionality.

All atomic drivers setting drm_mode_config.async_page_flip are updated
to also set drm_mode_config.atomic_async_page_flip_not_supported. We
will gradually check and update these drivers to properly handle
drm_crtc_state.async_flip in their atomic logic.

The goal of this negative flag is the same as
fb_modifiers_not_supported: we want to eventually get rid of all
drivers missing atomic support for async flips. New drivers should not
set this flag, instead they should support atomic async flips (if
they support async flips at all). IOW, we don't want more drivers
with async flip support for legacy but not atomic.

Signed-off-by: Simon Ser 
Reviewed-by: André Almeida 
Reviewed-by: Alex Deucher 
Signed-off-by: André Almeida 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c   |  1 +
 drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_dc.c|  1 +
 drivers/gpu/drm/i915/display/intel_display_driver.c |  1 +
 drivers/gpu/drm/nouveau/nouveau_display.c   |  1 +
 include/drm/drm_mode_config.h   | 11 +++
 5 files changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 45b8fd61a044..dec6e43e7198 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4003,6 +4003,7 @@ static int amdgpu_dm_mode_config_init(struct 
amdgpu_device *adev)
adev_to_drm(adev)->mode_config.prefer_shadow = 1;
/* indicates support for immediate flip */
adev_to_drm(adev)->mode_config.async_page_flip = true;
+   adev_to_drm(adev)->mode_config.atomic_async_page_flip_not_supported = 
true;
 
state = kzalloc(sizeof(*state), GFP_KERNEL);
if (!state)
diff --git a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_dc.c 
b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_dc.c
index 84c54e8622d1..f1d9bb1d7c34 100644
--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_dc.c
+++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_dc.c
@@ -639,6 +639,7 @@ static int atmel_hlcdc_dc_modeset_init(struct drm_device 
*dev)
dev->mode_config.max_height = dc->desc->max_height;
dev->mode_config.funcs = _config_funcs;
dev->mode_config.async_page_flip = true;
+   dev->mode_config.atomic_async_page_flip_not_supported = true;
 
return 0;
 }
diff --git a/drivers/gpu/drm/i915/display/intel_display_driver.c 
b/drivers/gpu/drm/i915/display/intel_display_driver.c
index 44b59ac301e6..6142c83fba06 100644
--- a/drivers/gpu/drm/i915/display/intel_display_driver.c
+++ b/drivers/gpu/drm/i915/display/intel_display_driver.c
@@ -126,6 +126,7 @@ static void intel_mode_config_init(struct drm_i915_private 
*i915)
mode_config->helper_private = _mode_config_funcs;
 
mode_config->async_page_flip = HAS_ASYNC_FLIPS(i915);
+   mode_config->atomic_async_page_flip_not_supported = true;
 
/*
 * Maximum framebuffer dimensions, chosen to match
diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c 
b/drivers/gpu/drm/nouveau/nouveau_display.c
index d8c92521226d..586aa51794e8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_display.c
+++ b/drivers/gpu/drm/nouveau/nouveau_display.c
@@ -720,6 +720,7 @@ nouveau_display_create(struct drm_device *dev)
dev->mode_config.async_page_flip = false;
else
dev->mode_config.async_page_flip = true;
+   dev->mode_config.atomic_async_page_flip_not_supported = true;
 
drm_kms_helper_poll_init(dev);
drm_kms_helper_poll_disable(dev);
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 973119a9176b..47b005671e6a 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -918,6 +918,17 @@ struct drm_mode_config {
 */
bool async_page_flip;
 
+   /**
+* @atomic_async_page_flip_not_supported:
+*
+* If true, the driver does not support async page-flips with the
+* atomic uAPI. This is only used by old drivers which haven't yet
+* accommodated for &drm_crtc_state.async_flip in their atomic logic,
+* even if they have &drm_mode_config.async_page_flip set to true.
+* New drivers shall not set this flag.
+*/
+   bool atomic_async_page_flip_not_supported;
+
/**
 * @fb_modifiers_not_supported:
 *
-- 
2.42.0



[PATCH v8 2/6] drm: introduce DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP

2023-10-24 Thread André Almeida
From: Simon Ser 

This new kernel capability indicates whether async page-flips are
supported via the atomic uAPI. DRM clients can use it to check
for support before feeding DRM_MODE_PAGE_FLIP_ASYNC to the kernel.

Make it clear that DRM_CAP_ASYNC_PAGE_FLIP is for legacy uAPI only.

Signed-off-by: Simon Ser 
Reviewed-by: André Almeida 
Reviewed-by: Alex Deucher 
Signed-off-by: André Almeida 
---
 drivers/gpu/drm/drm_ioctl.c |  5 +
 include/uapi/drm/drm.h  | 10 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
index 77590b0f38fa..a96e7acb9071 100644
--- a/drivers/gpu/drm/drm_ioctl.c
+++ b/drivers/gpu/drm/drm_ioctl.c
@@ -301,6 +301,11 @@ static int drm_getcap(struct drm_device *dev, void *data, 
struct drm_file *file_
case DRM_CAP_CRTC_IN_VBLANK_EVENT:
req->value = 1;
break;
+   case DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP:
+   req->value = drm_core_check_feature(dev, DRIVER_ATOMIC) &&
+dev->mode_config.async_page_flip &&
+
!dev->mode_config.atomic_async_page_flip_not_supported;
+   break;
default:
return -EINVAL;
}
diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
index 794c1d857677..58baefe32c23 100644
--- a/include/uapi/drm/drm.h
+++ b/include/uapi/drm/drm.h
@@ -713,7 +713,8 @@ struct drm_gem_open {
 /**
  * DRM_CAP_ASYNC_PAGE_FLIP
  *
- * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC.
+ * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy
+ * page-flips.
  */
 #define DRM_CAP_ASYNC_PAGE_FLIP0x7
 /**
@@ -773,6 +774,13 @@ struct drm_gem_open {
  * :ref:`drm_sync_objects`.
  */
 #define DRM_CAP_SYNCOBJ_TIMELINE   0x14
+/**
+ * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP
+ *
+ * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic
+ * commits.
+ */
+#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15
 
 /* DRM_IOCTL_GET_CAP ioctl argument type */
 struct drm_get_cap {
-- 
2.42.0



[PATCH v8 1/6] drm: allow DRM_MODE_PAGE_FLIP_ASYNC for atomic commits

2023-10-24 Thread André Almeida
From: Simon Ser 

If the driver supports it, allow user-space to supply the
DRM_MODE_PAGE_FLIP_ASYNC flag to request an async page-flip.
Set drm_crtc_state.async_flip accordingly.

Document that drivers will reject atomic commits if an async
flip isn't possible. This allows user-space to fall back to
something else. For instance, Xorg falls back to a blit.
Another option is to wait as close to the next vblank as
possible before performing the page-flip to reduce latency.

Signed-off-by: Simon Ser 
Reviewed-by: Alex Deucher 
Co-developed-by: André Almeida 
Signed-off-by: André Almeida 
---
 drivers/gpu/drm/drm_atomic_uapi.c | 28 +---
 include/uapi/drm/drm_mode.h   |  9 +
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_atomic_uapi.c 
b/drivers/gpu/drm/drm_atomic_uapi.c
index 98d3b10c08ae..a15121e75a0a 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -1323,6 +1323,18 @@ static void complete_signaling(struct drm_device *dev,
kfree(fence_state);
 }
 
+static void
+set_async_flip(struct drm_atomic_state *state)
+{
+   struct drm_crtc *crtc;
+   struct drm_crtc_state *crtc_state;
+   int i;
+
+   for_each_new_crtc_in_state(state, crtc, crtc_state, i) {
+   crtc_state->async_flip = true;
+   }
+}
+
 int drm_mode_atomic_ioctl(struct drm_device *dev,
  void *data, struct drm_file *file_priv)
 {
@@ -1363,9 +1375,16 @@ int drm_mode_atomic_ioctl(struct drm_device *dev,
}
 
if (arg->flags & DRM_MODE_PAGE_FLIP_ASYNC) {
-   drm_dbg_atomic(dev,
-  "commit failed: invalid flag 
DRM_MODE_PAGE_FLIP_ASYNC\n");
-   return -EINVAL;
+   if (!dev->mode_config.async_page_flip) {
+   drm_dbg_atomic(dev,
+  "commit failed: DRM_MODE_PAGE_FLIP_ASYNC 
not supported\n");
+   return -EINVAL;
+   }
+   if (dev->mode_config.atomic_async_page_flip_not_supported) {
+   drm_dbg_atomic(dev,
+  "commit failed: DRM_MODE_PAGE_FLIP_ASYNC 
not supported with atomic\n");
+   return -EINVAL;
+   }
}
 
/* can't test and expect an event at the same time. */
@@ -1468,6 +1487,9 @@ int drm_mode_atomic_ioctl(struct drm_device *dev,
if (ret)
goto out;
 
+   if (arg->flags & DRM_MODE_PAGE_FLIP_ASYNC)
+   set_async_flip(state);
+
if (arg->flags & DRM_MODE_ATOMIC_TEST_ONLY) {
ret = drm_atomic_check_only(state);
} else if (arg->flags & DRM_MODE_ATOMIC_NONBLOCK) {
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index ea1b639bcb28..04e6a3caa675 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -957,6 +957,15 @@ struct hdr_output_metadata {
  * Request that the page-flip is performed as soon as possible, ie. with no
  * delay due to waiting for vblank. This may cause tearing to be visible on
  * the screen.
+ *
+ * When used with atomic uAPI, the driver will return an error if the hardware
+ * doesn't support performing an asynchronous page-flip for this update.
+ * User-space should handle this, e.g. by falling back to a regular page-flip.
+ *
+ * Note, some hardware might need to perform one last synchronous page-flip
+ * before being able to switch to asynchronous page-flips. As an exception,
+ * the driver will return success even though that first page-flip is not
+ * asynchronous.
  */
 #define DRM_MODE_PAGE_FLIP_ASYNC 0x02
 #define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4
-- 
2.42.0



[PATCH v8 0/6] drm: Add support for atomic async page-flip

2023-10-24 Thread André Almeida
Hi,

This work from me and Simon adds support for DRM_MODE_PAGE_FLIP_ASYNC through
the atomic API. This feature is already available via the legacy API. The use
case is to be able to present a new frame immediately (or as soon as
possible), even after missing a vblank. This might result in tearing, but
it's useful when a high framerate is desired, such as for gaming.

Unlike earlier versions, this one refuses an async flip if the commit changes
any property other than the framebuffer. The idea is that the fast path of
immediate page flips doesn't play well with modeset changes, so only the fb_id
can be changed.

Thanks,
André

- User-space patch: https://github.com/Plagman/gamescope/pull/595
- IGT tests: 
https://gitlab.freedesktop.org/andrealmeid/igt-gpu-tools/-/tree/atomic_async_page_flip

Changes from v7:
- Only accept flips to primary planes. If a driver supports flips on other
planes, support will be added later.
v7: 
https://lore.kernel.org/dri-devel/20231017092837.32428-1-andrealm...@igalia.com/

Changes from v6:
- Dropped the exception to allow MODE_ID changes (Simon)
- Clarify what happens when flipping with the same FB_ID (Pekka)

v6: 
https://lore.kernel.org/dri-devel/20230815185710.159779-1-andrealm...@igalia.com/

Changes from v5:
- Add note in the docs that not every redundant attribute will result in no-op,
  some might cause oversynchronization issues.

v5: 
https://lore.kernel.org/dri-devel/20230707224059.305474-1-andrealm...@igalia.com/

Changes from v4:
 - Documentation rewritten by Pekka Paalanen

v4: 
https://lore.kernel.org/dri-devel/20230701020917.143394-1-andrealm...@igalia.com/

Changes from v3:
 - Add new patch to reject prop changes
 - Add documentation clarifying the KMS atomic state set

v3: 
https://lore.kernel.org/dri-devel/20220929184307.258331-1-cont...@emersion.fr/

André Almeida (1):
  drm: Refuse to async flip with atomic prop changes

Pekka Paalanen (1):
  drm/doc: Define KMS atomic state set

Simon Ser (4):
  drm: allow DRM_MODE_PAGE_FLIP_ASYNC for atomic commits
  drm: introduce DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP
  drm: introduce drm_mode_config.atomic_async_page_flip_not_supported
  amd/display: indicate support for atomic async page-flips on DC

 Documentation/gpu/drm-uapi.rst| 47 +++
 drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_dc.c  |  1 +
 drivers/gpu/drm/drm_atomic_uapi.c | 82 +--
 drivers/gpu/drm/drm_crtc_internal.h   |  2 +-
 drivers/gpu/drm/drm_ioctl.c   |  5 ++
 drivers/gpu/drm/drm_mode_object.c |  2 +-
 .../drm/i915/display/intel_display_driver.c   |  1 +
 drivers/gpu/drm/nouveau/nouveau_display.c |  1 +
 include/drm/drm_mode_config.h | 11 +++
 include/uapi/drm/drm.h| 10 ++-
 include/uapi/drm/drm_mode.h   |  9 ++
 11 files changed, 162 insertions(+), 9 deletions(-)

-- 
2.42.0



Re: [PATCH v3] drm/amdgpu: Add EXT_COHERENT support for APU and NUMA systems

2023-10-24 Thread Felix Kuehling



On 2023-10-24 15:08, David Francis wrote:

On gfx943 APU, EXT_COHERENT should give MTYPE_CC for local and
MTYPE_UC for nonlocal memory.

On NUMA systems, local memory gets the local mtype, set by an
override callback. If EXT_COHERENT is set, memory will be set as
MTYPE_UC by default, with local memory MTYPE_CC.

Add an option in the override function for this case, and
add a check to ensure it is not used on UNCACHED memory.

V2: Combined APU and NUMA code into one patch
V3: Fixed a potential nullptr in amdgpu_vm_bo_update

Signed-off-by: David Francis 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 17 +++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  8 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c |  2 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 33 +++
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  8 +++---
  5 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index d72daf15662f..155c04589753 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -761,6 +761,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
   * @immediate: immediate submission in a page fault
   * @unlocked: unlocked invalidation during MM callback
   * @flush_tlb: trigger tlb invalidation after update completed
+ * @allow_override: change MTYPE for local NUMA nodes
   * @resv: fences we need to sync to
   * @start: start of mapped range
   * @last: last mapped entry
@@ -777,7 +778,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
   * 0 for success, negative erro code for failure.
   */
  int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
-  bool immediate, bool unlocked, bool flush_tlb,
+  bool immediate, bool unlocked, bool flush_tlb, bool 
allow_override,
   struct dma_resv *resv, uint64_t start, uint64_t last,
   uint64_t flags, uint64_t offset, uint64_t vram_base,
   struct ttm_resource *res, dma_addr_t *pages_addr,
@@ -815,6 +816,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
params.immediate = immediate;
params.pages_addr = pages_addr;
params.unlocked = unlocked;
+   params.allow_override = allow_override;
  
  	/* Implicitly sync to command submissions in the same VM before

 * unmapping. Sync to moving fences before mapping.
@@ -990,6 +992,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct 
amdgpu_bo_va *bo_va,
struct ttm_resource *mem;
struct dma_fence **last_update;
bool flush_tlb = clear;
+   bool uncached;
struct dma_resv *resv;
uint64_t vram_base;
uint64_t flags;
@@ -1027,9 +1030,11 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, 
struct amdgpu_bo_va *bo_va,
  
  		bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);

vram_base = bo_adev->vm_manager.vram_base_offset;
+   uncached = (bo->flags & AMDGPU_GEM_CREATE_UNCACHED) != 0;
} else {
flags = 0x0;
vram_base = 0;
+   uncached = false;
}
  
  	if (clear || (bo && bo->tbo.base.resv ==

@@ -1063,7 +1068,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, 
struct amdgpu_bo_va *bo_va,
trace_amdgpu_vm_bo_update(mapping);
  
  		r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb,

-  resv, mapping->start, mapping->last,
+  !uncached, resv, mapping->start, 
mapping->last,
   update_flags, mapping->offset,
   vram_base, mem, pages_addr,
   last_update);
@@ -1258,8 +1263,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
mapping->start < AMDGPU_GMC_HOLE_START)
init_pte_value = AMDGPU_PTE_DEFAULT_ATC;
  
-		r = amdgpu_vm_update_range(adev, vm, false, false, true, resv,

-  mapping->start, mapping->last,
+   r = amdgpu_vm_update_range(adev, vm, false, false, true, false,
+  resv, mapping->start, mapping->last,
   init_pte_value, 0, 0, NULL, NULL,
   );
amdgpu_vm_free_mapping(adev, vm, mapping, f);
@@ -2547,8 +2552,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,
goto error_unlock;
}
  
-	r = amdgpu_vm_update_range(adev, vm, true, false, false, NULL, addr,

-  addr, flags, value, 0, NULL, NULL, NULL);
+   r = amdgpu_vm_update_range(adev, 

Re: [PATCH v3 2/2] drm/amdgpu: Permit PCIe transfer over links with XGMI

2023-10-24 Thread Felix Kuehling



On 2023-10-24 15:20, David Francis wrote:

When the CPU is XGMI connected, the PCIe links should
not be enumerated for topology purposes. However, PCIe
transfer should still be a valid option for remote
doorbells and MMIO mappings.

Move the XGMI connection check out of the shared helper
function amdgpu_device_is_peer_accessible and into the
topology path.

Signed-off-by: David Francis 


This patch is

Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +---
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c  | 6 --
  2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7ec32b44df05..a5c054f8a427 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5783,9 +5783,7 @@ bool amdgpu_device_is_peer_accessible(struct 
amdgpu_device *adev,
~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
resource_size_t aper_limit =
adev->gmc.aper_base + adev->gmc.aper_size - 1;
-   bool p2p_access =
-   !adev->gmc.xgmi.connected_to_cpu &&
-   !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
+   bool p2p_access = !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) 
< 0);
  
  	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&

adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 4e530791507e..cb64c19482f3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1514,11 +1514,13 @@ static int kfd_dev_create_p2p_links(void)
goto next;
  
  		/* check if node(s) is/are peer accessible in one direction or bi-direction */

-   ret = kfd_add_peer_prop(new_dev, dev, i, k);
+   if (!new_dev->gpu->adev->gmc.xgmi.connected_to_cpu)
+   ret = kfd_add_peer_prop(new_dev, dev, i, k);
if (ret < 0)
goto out;
  
-		ret = kfd_add_peer_prop(dev, new_dev, k, i);

+   if (!dev->gpu->adev->gmc.xgmi.connected_to_cpu)
+   ret = kfd_add_peer_prop(dev, new_dev, k, i);
if (ret < 0)
goto out;
  next:


Re: [PATCH v3 1/2] drm/amdgpu: Acquire ttm locks for dmaunmap

2023-10-24 Thread Felix Kuehling

On 2023-10-24 15:20, David Francis wrote:

dmaunmap can call ttm_bo_validate, which expects the
ttm dma_resv to be held.

Acquire the locks in amdgpu_amdkfd_gpuvm_dmaunmap_mem.

Because the dmaunmap step can now fail, two new numbers
need to be tracked. n_dmaunmap_success tracks the number
of devices that have completed dmaunmap. If a device fails
to dmaunmap due to a signal interrupt, n_dmaunmap_bos tracks
the number of bos on that device that were successfully
dmaunmapped.


I think what you mean here is "tracks the number of attachments on that 
device".





Track those values in struct kgd_mem.

This failure can also cause the sync_memory step of the ioctl
to be repeated; it is idempotent, so this should not cause any issues.

Signed-off-by: David Francis 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  6 -
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 23 +++
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 19 ++-
  3 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3ad8dc523b42..c60564ec4312 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -86,6 +86,10 @@ struct kgd_mem {
  
  	bool aql_queue;

bool is_imported;
+
+   /* Used to track successful dmaunmap across retries in unmap ioctl */
+   uint32_t n_dmaunmap_success;
+   uint32_t n_dmaunmap_bos;
  };
  
  /* KFD Memory Eviction */

@@ -302,7 +306,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct 
amdgpu_device *adev,
  struct kgd_mem *mem, void *drm_priv);
  int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
struct amdgpu_device *adev, struct kgd_mem *mem, void 
*drm_priv);
-void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
+int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
  int amdgpu_amdkfd_gpuvm_sync_memory(
struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
  int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 54f31a420229..c431132d7cc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2102,21 +2102,36 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
return ret;
  }
  
-void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)

+int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
  {
struct kfd_mem_attachment *entry;
struct amdgpu_vm *vm;
+   int ret;
+   int i = 0;
  
  	vm = drm_priv_to_vm(drm_priv);
  
  	mutex_lock(&mem->lock);
  
  	list_for_each_entry(entry, &mem->attachments, list) {

-   if (entry->bo_va->base.vm == vm)
-   kfd_mem_dmaunmap_attachment(mem, entry);
-   }
+   if (i >= mem->n_dmaunmap_bos) {
+   ret = amdgpu_bo_reserve(entry->bo_va->base.bo, false);
This will lock and unlock things that aren't needed. This should be 
inside the "if (entry->bo_va->base.vm == vm)" where it's actually 
calling the dmaunmap_attachment.



+   if (ret) {
+   mem->n_dmaunmap_bos = i;


This counting approach feels a bit fragile. Also, what you're counting 
with "i" is not the number of attachments per device, but the number of 
attachments overall. So this double counting with two counters in two 
places is probably redundant and could be simplified to using just one 
counter and in one place.


But this may still have issues in corner cases where multiple unmap 
ioctls are happening concurrently. I'm not sure if this happens in 
practice, but it's something that a robust implementation needs to 
handle. The consequences of getting this wrong would be resource leaks 
of DMA mappings or potentially double-frees of DMA mappings.


What would be simpler and more robust is, to have a flag in struct 
kfd_mem_attachment that indicates whether it is currently dma-mapped or 
not. If it's not mapped, you assume it was already unmapped and you 
don't unmap it again. That way you wouldn't need to count at all and it 
handles concurrent calls without problems. There is already an is_mapped 
flag there. You could add an is_dmamapped flag.


Regards,
  Felix



+   goto out;
+   }
+
+   if (entry->bo_va->base.vm == vm)
+   kfd_mem_dmaunmap_attachment(mem, entry);
  
+			amdgpu_bo_unreserve(entry->bo_va->base.bo);

+   }
+   i++;
+   }
+   mem->n_dmaunmap_bos = 0;
+out:
mutex_unlock(&mem->lock);
+   return ret;
  }
  
  int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(

diff --git 

Re: [pull] amdgpu, amdkfd drm-next-6.7

2023-10-24 Thread Alex Deucher
On Fri, Oct 20, 2023 at 3:51 PM Alex Deucher  wrote:
>
> Hi Dave, Sima,
>
> More updates for 6.7.  Mostly bug fixes.
>
> The following changes since commit 27442758e9b4e083bef3f164a1739475c01f3202:
>
>   Merge tag 'amd-drm-next-6.7-2023-10-13' of 
> https://gitlab.freedesktop.org/agd5f/linux into drm-next (2023-10-18 16:08:07 
> +1000)
>
> are available in the Git repository at:
>
>   https://gitlab.freedesktop.org/agd5f/linux.git 
> tags/amd-drm-next-6.7-2023-10-20
>
> for you to fetch changes up to 5b2c54e0d0ea09f7a3b500510731878326e1117e:
>
>   drm/amd/display: Fix stack size issue on DML2 (2023-10-20 15:11:29 -0400)

This was lost during the gitlab migration, but I've pushed everything
up again so it should be valid again now.

Alex

>
> 
> amd-drm-next-6.7-2023-10-20:
>
> amdgpu:
> - SMU 13 updates
> - UMSCH updates
> - DC MPO fixes
> - RAS updates
> - MES 11 fixes
> - Fix possible memory leaks in error paths
> - GC 11.5 fixes
> - Kernel doc updates
> - PSP updates
> - APU IMU fixes
> - Misc code cleanups
> - SMU 11 fixes
> - OD fix
> - Frame size warning fixes
> - SR-IOV fixes
> - NBIO 7.11 updates
> - NBIO 7.7 updates
> - XGMI fixes
> - devcoredump updates
>
> amdkfd:
> - Misc code cleanups
> - SVM fixes
>
> 
> Alex Deucher (3):
>   drm/amdgpu/pm: update SMU 13.0.0 PMFW version check
>   drm/amdgpu/mes11: remove aggregated doorbell code
>   drm/amdgpu: update to the latest GC 11.5 headers
>
> Alex Sierra (1):
>   drm/amdkfd: remap unaligned svm ranges that have split
>
> André Almeida (3):
>   drm/amdgpu: Encapsulate all device reset info
>   drm/amdgpu: Move coredump code to amdgpu_reset file
>   drm/amdgpu: Create version number for coredumps
>
> Asad Kamal (2):
>   drm/amdgpu : Add hive ras recovery check
>   drm/amdgpu: update retry times for psp BL wait
>
> Bas Nieuwenhuizen (1):
>   drm/amd/pm: Handle non-terminated overdrive commands.
>
> Bokun Zhang (4):
>   drm/amd/amdgpu/vcn: Add RB decouple feature under SRIOV - P1
>   drm/amd/amdgpu/vcn: Add RB decouple feature under SRIOV - P2
>   drm/amd/amdgpu/vcn: Add RB decouple feature under SRIOV - P3
>   drm/amd/amdgpu/vcn: Add RB decouple feature under SRIOV - P4
>
> Candice Li (1):
>   drm/amdgpu: Log UE corrected by replay as correctable error
>
> Colin Ian King (1):
>   drm/amd/display: Fix a handful of spelling mistakes in dml_print output
>
> Felix Kuehling (2):
>   drm/amdgpu: Fix possible null pointer dereference
>   drm/amdgpu: Reserve fences for VM update
>
> Hawking Zhang (2):
>   drm/amdgpu: Enable software RAS in vcn v4_0_3
>   drm/amdgpu: Add UVD_VCPU_INT_EN2 to dpg sram
>
> Jesse Zhang (1):
>   drm/amdkfd:remove unused code
>
> Jiapeng Chong (2):
>   drm/amdkfd: clean up some inconsistent indenting
>   drm/amd/display: clean up some inconsistent indenting
>
> Kunwu.Chan (1):
>   drm/amd/pm: Fix a memory leak on an error path
>
> Lang Yu (1):
>   drm/amdgpu/umsch: add suspend and resume callback
>
> Li Ma (2):
>   drm/amdgpu: fix missing stuff in NBIO v7.11
>   drm/amdgpu: add clockgating support for NBIO v7.7.1
>
> Ma Jun (1):
>   drm/amd/pm: Support for getting power1_cap_min value
>
> Mangesh Gadre (1):
>   Revert "drm/amdgpu: Program xcp_ctl registers as needed"
>
> Mario Limonciello (4):
>   drm/amd: Add missing kernel doc for prepare_suspend()
>   drm/amd: Move microcode init step to early_init()
>   drm/amd: Don't parse IMU ucode version if it won't be loaded
>   drm/amd: Read IMU FW version from scratch register during hw_init
>
> Nathan Chancellor (1):
>   drm/amd/display: Respect CONFIG_FRAME_WARN=0 in DML2
>
> Rodrigo Siqueira (2):
>   drm/amd/display: Reduce stack size by splitting function
>   drm/amd/display: Fix stack size issue on DML2
>
> Shiwu Zhang (3):
>   drm/amdgpu: update the xgmi ta interface header
>   drm/amdgpu: prepare the output buffer for GET_PEER_LINKS command
>   drm/amdgpu: support the port num info based on the capability flag
>
> Stanley.Yang (4):
>   drm/amdgpu: Workaround to skip kiq ring test during ras gpu recovery
>   drm/amdgpu: Enable mca debug mode mode when ras enabled
>   drm/amdgpu: Fix delete nodes that have been relesed
>   drm/amdgpu: Enable RAS feature by default for APU
>
> Stylon Wang (2):
>   drm/amd/display: Add missing lines of code in dc.c
>   drm/amd/display: Remove brackets in macro to conform to coding style
>
> Tao Zhou (4):
>   drm/amdgpu: define ras_reset_error_count function
>   drm/amdgpu: replace reset_error_count with amdgpu_ras_reset_error_count
>   drm/amdgpu: add set/get mca debug mode operations
>   drm/amdgpu: drop status query/reset for GCEA 9.4.3 and MMEA 1.8
>
> Yang Li (4):
>   drm/amd/display: clean up some inconsistent indentings
>   

[PATCH v3 2/2] drm/amdgpu: Permit PCIe transfer over links with XGMI

2023-10-24 Thread David Francis
When the CPU is XGMI connected, the PCIe links should
not be enumerated for topology purposes. However, PCIe
transfer should still be a valid option for remote
doorbells and MMIO mappings.

Move the XGMI connection check out of the shared helper
function amdgpu_device_is_peer_accessible and into the
topology path.

Signed-off-by: David Francis 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c  | 6 --
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7ec32b44df05..a5c054f8a427 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5783,9 +5783,7 @@ bool amdgpu_device_is_peer_accessible(struct 
amdgpu_device *adev,
~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
resource_size_t aper_limit =
adev->gmc.aper_base + adev->gmc.aper_size - 1;
-   bool p2p_access =
-   !adev->gmc.xgmi.connected_to_cpu &&
-   !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
+   bool p2p_access = !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, 
false) < 0);
 
return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 4e530791507e..cb64c19482f3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1514,11 +1514,13 @@ static int kfd_dev_create_p2p_links(void)
goto next;
 
/* check if node(s) is/are peer accessible in one direction or 
bi-direction */
-   ret = kfd_add_peer_prop(new_dev, dev, i, k);
+   if (!new_dev->gpu->adev->gmc.xgmi.connected_to_cpu)
+   ret = kfd_add_peer_prop(new_dev, dev, i, k);
if (ret < 0)
goto out;
 
-   ret = kfd_add_peer_prop(dev, new_dev, k, i);
+   if (!dev->gpu->adev->gmc.xgmi.connected_to_cpu)
+   ret = kfd_add_peer_prop(dev, new_dev, k, i);
if (ret < 0)
goto out;
 next:
-- 
2.34.1



[PATCH v3 1/2] drm/amdgpu: Acquire ttm locks for dmaunmap

2023-10-24 Thread David Francis
dmaunmap can call ttm_bo_validate, which expects the
ttm dma_resv to be held.

Acquire the locks in amdgpu_amdkfd_gpuvm_dmaunmap_mem.

Because the dmaunmap step can now fail, two new numbers
need to be tracked. n_dmaunmap_success tracks the number
of devices that have completed dmaunmap. If a device fails
to dmaunmap due to a signal interrupt, n_dmaunmap_bos tracks
the number of bos on that device that were successfully
dmaunmapped.

Track those values in struct kgd_mem.

This failure can also cause the sync_memory step of the ioctl
to be repeated; it is idempotent, so this should not cause any issues.

Signed-off-by: David Francis 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  6 -
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 23 +++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 19 ++-
 3 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3ad8dc523b42..c60564ec4312 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -86,6 +86,10 @@ struct kgd_mem {
 
bool aql_queue;
bool is_imported;
+
+   /* Used to track successful dmaunmap across retries in unmap ioctl */
+   uint32_t n_dmaunmap_success;
+   uint32_t n_dmaunmap_bos;
 };
 
 /* KFD Memory Eviction */
@@ -302,7 +306,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct 
amdgpu_device *adev,
  struct kgd_mem *mem, void *drm_priv);
 int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
struct amdgpu_device *adev, struct kgd_mem *mem, void 
*drm_priv);
-void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
+int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
 int amdgpu_amdkfd_gpuvm_sync_memory(
struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
 int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 54f31a420229..c431132d7cc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2102,21 +2102,36 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
return ret;
 }
 
-void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
+int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
 {
struct kfd_mem_attachment *entry;
struct amdgpu_vm *vm;
+   int ret;
+   int i = 0;
 
vm = drm_priv_to_vm(drm_priv);
 
mutex_lock(&mem->lock);
 
list_for_each_entry(entry, &mem->attachments, list) {
-   if (entry->bo_va->base.vm == vm)
-   kfd_mem_dmaunmap_attachment(mem, entry);
-   }
+   if (i >= mem->n_dmaunmap_bos) {
+   ret = amdgpu_bo_reserve(entry->bo_va->base.bo, false);
+   if (ret) {
+   mem->n_dmaunmap_bos = i;
+   goto out;
+   }
+
+   if (entry->bo_va->base.vm == vm)
+   kfd_mem_dmaunmap_attachment(mem, entry);
 
+   amdgpu_bo_unreserve(entry->bo_va->base.bo);
+   }
+   i++;
+   }
+   mem->n_dmaunmap_bos = 0;
+out:
mutex_unlock(&mem->lock);
+   return ret;
 }
 
 int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 06988cf1db51..66dee67ad859 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1366,7 +1366,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
 {
struct kfd_ioctl_unmap_memory_from_gpu_args *args = data;
struct kfd_process_device *pdd, *peer_pdd;
-   void *mem;
+   struct kgd_mem *mem;
long err = 0;
uint32_t *devices_arr = NULL, i;
bool flush_tlb;
@@ -1400,7 +1400,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
goto bind_process_to_device_failed;
}
 
-   mem = kfd_process_device_translate_handle(pdd,
+   mem = (struct kgd_mem *)kfd_process_device_translate_handle(pdd,
GET_IDR_HANDLE(args->handle));
if (!mem) {
err = -ENOMEM;
@@ -1414,7 +1414,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
goto get_mem_obj_from_handle_failed;
}
err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
-   peer_pdd->dev->adev, (struct kgd_mem *)mem, 
peer_pdd->drm_priv);
+   peer_pdd->dev->adev, mem, peer_pdd->drm_priv);
if (err) {
  

[PATCH v3] drm/amdgpu: Add EXT_COHERENT support for APU and NUMA systems

2023-10-24 Thread David Francis
On gfx943 APU, EXT_COHERENT should give MTYPE_CC for local and
MTYPE_UC for nonlocal memory.

On NUMA systems, local memory gets the local mtype, set by an
override callback. If EXT_COHERENT is set, memory will be set as
MTYPE_UC by default, with local memory MTYPE_CC.

Add an option in the override function for this case, and
add a check to ensure it is not used on UNCACHED memory.

V2: Combined APU and NUMA code into one patch
V3: Fixed a potential nullptr in amdgpu_vm_bo_update

Signed-off-by: David Francis 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 17 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  8 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 33 +++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  8 +++---
 5 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index d72daf15662f..155c04589753 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -761,6 +761,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
  * @immediate: immediate submission in a page fault
  * @unlocked: unlocked invalidation during MM callback
  * @flush_tlb: trigger tlb invalidation after update completed
+ * @allow_override: change MTYPE for local NUMA nodes
  * @resv: fences we need to sync to
  * @start: start of mapped range
  * @last: last mapped entry
@@ -777,7 +778,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
  * 0 for success, negative erro code for failure.
  */
 int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
-  bool immediate, bool unlocked, bool flush_tlb,
+  bool immediate, bool unlocked, bool flush_tlb, bool 
allow_override,
   struct dma_resv *resv, uint64_t start, uint64_t last,
   uint64_t flags, uint64_t offset, uint64_t vram_base,
   struct ttm_resource *res, dma_addr_t *pages_addr,
@@ -815,6 +816,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
params.immediate = immediate;
params.pages_addr = pages_addr;
params.unlocked = unlocked;
+   params.allow_override = allow_override;
 
/* Implicitly sync to command submissions in the same VM before
 * unmapping. Sync to moving fences before mapping.
@@ -990,6 +992,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct 
amdgpu_bo_va *bo_va,
struct ttm_resource *mem;
struct dma_fence **last_update;
bool flush_tlb = clear;
+   bool uncached;
struct dma_resv *resv;
uint64_t vram_base;
uint64_t flags;
@@ -1027,9 +1030,11 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, 
struct amdgpu_bo_va *bo_va,
 
bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
vram_base = bo_adev->vm_manager.vram_base_offset;
+   uncached = (bo->flags & AMDGPU_GEM_CREATE_UNCACHED) != 0;
} else {
flags = 0x0;
vram_base = 0;
+   uncached = false;
}
 
if (clear || (bo && bo->tbo.base.resv ==
@@ -1063,7 +1068,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, 
struct amdgpu_bo_va *bo_va,
trace_amdgpu_vm_bo_update(mapping);
 
r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb,
-  resv, mapping->start, mapping->last,
+  !uncached, resv, mapping->start, 
mapping->last,
   update_flags, mapping->offset,
   vram_base, mem, pages_addr,
   last_update);
@@ -1258,8 +1263,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
mapping->start < AMDGPU_GMC_HOLE_START)
init_pte_value = AMDGPU_PTE_DEFAULT_ATC;
 
-   r = amdgpu_vm_update_range(adev, vm, false, false, true, resv,
-  mapping->start, mapping->last,
+   r = amdgpu_vm_update_range(adev, vm, false, false, true, false,
+  resv, mapping->start, mapping->last,
   init_pte_value, 0, 0, NULL, NULL,
   &f);
amdgpu_vm_free_mapping(adev, vm, mapping, f);
@@ -2547,8 +2552,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,
goto error_unlock;
}
 
-   r = amdgpu_vm_update_range(adev, vm, true, false, false, NULL, addr,
-  addr, flags, value, 0, NULL, NULL, NULL);
+   r = amdgpu_vm_update_range(adev, vm, true, false, false, false,
+ 

[PATCH] drm/amd/display: avoid variable reinitialization

2023-10-24 Thread Bragatheswaran Manickavel
The member variable enable_hpo_pg_support is already initialized 
and hence the reinitialization instruction can be removed. Issue 
identified using the doubleinit.cocci Coccinelle semantic patch script.

Signed-off-by: Bragatheswaran Manickavel 
---
 drivers/gpu/drm/amd/display/dc/dcn35/dcn35_resource.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_resource.c 
b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_resource.c
index 99d55b958977..1fd9df8da09c 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_resource.c
@@ -739,7 +739,6 @@ static const struct dc_debug_options debug_defaults_drv = {
.disable_boot_optimizations = false,
.disable_unbounded_requesting = false,
.disable_mem_low_power = false,
-   .enable_hpo_pg_support = false,
//must match enable_single_display_2to1_odm_policy to support dynamic 
ODM transitions
.enable_double_buffered_dsc_pg_support = true,
.enable_dp_dig_pixel_rate_div_policy = 1,
-- 
2.34.1



Re: [PATCH] drm/amd/amdgpu: avoid to disable gfxhub interrupt when driver is unloaded

2023-10-24 Thread Deucher, Alexander
[Public]

Reviewed-by: Alex Deucher 

From: Kenneth Feng 
Sent: Monday, October 23, 2023 11:32 PM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Feng, Kenneth 

Subject: [PATCH] drm/amd/amdgpu: avoid to disable gfxhub interrupt when driver 
is unloaded

Avoid disabling the gfxhub interrupt when the driver is unloaded on GMC 11.

Signed-off-by: Kenneth Feng 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 80ca2c05b0b8..8e36a8395464 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -73,7 +73,8 @@ gmc_v11_0_vm_fault_interrupt_state(struct amdgpu_device *adev,
  * fini/suspend, so the overall state doesn't
  * change over the course of suspend/resume.
  */
-   if (!adev->in_s0ix)
+   if (!adev->in_s0ix && (adev->in_runpm || adev->in_suspend ||
+  
amdgpu_in_reset(adev)))
 amdgpu_gmc_set_vm_fault_masks(adev, AMDGPU_GFXHUB(0), 
false);
 break;
 case AMDGPU_IRQ_STATE_ENABLE:
--
2.34.1



Re: [PATCH] drm/amd/pm: call smu_cmn_get_smc_version in is_mode1_reset_supported.

2023-10-24 Thread Deucher, Alexander
[Public]

Acked-by: Alex Deucher 

From: Zhang, Yifan 
Sent: Tuesday, October 24, 2023 9:41 AM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Koenig, Christian 
; Li, Candice ; Feng, Kenneth 
; Zhang, Yifan 
Subject: [PATCH] drm/amd/pm: call smu_cmn_get_smc_version in 
is_mode1_reset_supported.

is_mode1_reset_supported may be called before SMU init, when smu_context
is uninitialized, in the driver load/unload test. Call smu_cmn_get_smc_version
explicitly in is_mode1_reset_supported.

Fixes: 5fe5098c64d9 ("drm/amd/pm: drop most smu_cmn_get_smc_version in smu")
Signed-off-by: Yifan Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 8 +++-
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c| 8 +++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 090249b6422a..77c3d76c76a2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -2461,12 +2461,18 @@ static bool 
sienna_cichlid_is_mode1_reset_supported(struct smu_context *smu)
 {
 struct amdgpu_device *adev = smu->adev;
 uint32_t val;
+   uint32_t smu_version;
+   int ret;

 /**
  * SRIOV env will not support SMU mode1 reset
  * PM FW support mode1 reset from 58.26
  */
-   if (amdgpu_sriov_vf(adev) || (smu->smc_fw_version < 0x003a1a00))
+   ret = smu_cmn_get_smc_version(smu, NULL, &smu_version);
+   if (ret)
+   return false;
+
+   if (amdgpu_sriov_vf(adev) || (smu_version < 0x003a1a00))
 return false;

 /**
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index b1433973380b..648d5eafb27b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2615,13 +2615,19 @@ static int smu_v13_0_0_baco_exit(struct smu_context 
*smu)
 static bool smu_v13_0_0_is_mode1_reset_supported(struct smu_context *smu)
 {
 struct amdgpu_device *adev = smu->adev;
+   u32 smu_version;
+   int ret;

 /* SRIOV does not support SMU mode1 reset */
 if (amdgpu_sriov_vf(adev))
 return false;

 /* PMFW support is available since 78.41 */
-   if (smu->smc_fw_version < 0x004e2900)
+   ret = smu_cmn_get_smc_version(smu, NULL, &smu_version);
+   if (ret)
+   return false;
+
+   if (smu_version < 0x004e2900)
 return false;

 return true;
--
2.37.3



RE: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

2023-10-24 Thread Ma, Li
[AMD Official Use Only - General]

-Original Message-
From: Zhang, Yifan 
Sent: Tuesday, October 24, 2023 10:29 PM
To: Ma, Li ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Feng, Kenneth 
; StDenis, Tom 
Subject: RE: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

[AMD Official Use Only - General]

-Original Message-
From: Ma, Li 
Sent: Tuesday, October 24, 2023 7:09 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Zhang, Yifan 
; Feng, Kenneth ; StDenis, Tom 
; Ma, Li 
Subject: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

Print the digit of the fractional part individually to avoid carrying during 
display.

Signed-off-by: Li Ma 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 358bb5e485f2..cc853559cf0f 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -4290,10 +4290,10 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file 
*m, struct amdgpu_device *a
seq_printf(m, "\t%u mV (VDDNB)\n", value);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*)&query, &size))
-   seq_printf(m, "\t%u.%u W (average GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%u%u W (average GPU)\n", query >> 8, (query 
&
+0xff) / 10, (query & 0xff) % 10);

Would you pls elaborate on this with an example ? it looks to me it makes no 
difference here.
Li: If (query & 0xff) is in the range [0x01, 0x09], the original output is 
x.1~x.9. However, it should be x.01~x.09, which is the same as the SMU FW.

size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *)&query, &size))
-   seq_printf(m, "\t%u.%u W (current GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%u%u W (current GPU)\n", query >> 8, (query 
&
+0xff) / 10, (query & 0xff) % 10);

Would you pls elaborate on this with an example? it looks to me it makes no 
difference here.
Li: If (query & 0xff) is in the range [0x01, 0x09], the original output is 
x.1~x.9. However, it should be x.01~x.09, which is the same as the SMU FW.
size = sizeof(value);
seq_printf(m, "\n");

--
2.25.1




[PATCH v2 3/3] drm/amdgpu: optimize RLC powerdown notification on Vangogh

2023-10-24 Thread Perry Yuan
The SMU needs to receive the RLC power-down message in order to keep its
view of the RLC state in sync. The RLC state update message needs to be
sent while the SMU begins the suspend sequence; otherwise the SMU will
crash when the RLC state has not been notified by the driver. The RLC
state has probably changed after that notification, so the driver needs
to notify the SMU of the RLC state at the end of the suspend sequence in
amdgpu_device_suspend(), which makes sure the RLC state is correctly set
in the SMU.

[  101.000590] amdgpu :03:00.0: amdgpu: SMU: I'm not done with your 
previous command: SMN_C2PMSG_66:0x001E SMN_C2PMSG_82:0x
[  101.000598] amdgpu :03:00.0: amdgpu: Failed to disable gfxoff!
[  110.838026] amdgpu :03:00.0: amdgpu: SMU: I'm not done with your 
previous command: SMN_C2PMSG_66:0x001E SMN_C2PMSG_82:0x
[  110.838035] amdgpu :03:00.0: amdgpu: Failed to disable smu features.
[  110.838039] amdgpu :03:00.0: amdgpu: Fail to disable dpm features!
[  110.838040] [drm:amdgpu_device_ip_suspend_phase2 [amdgpu]] *ERROR* suspend 
of IP block  failed -62
[  110.884394] PM: suspend of devices aborted after 21213.620 msecs
[  110.884402] PM: start suspend of devices aborted after 21213.882 msecs
[  110.884405] PM: Some devices failed to suspend, or early wake event detected

Signed-off-by: Perry Yuan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 
 drivers/gpu/drm/amd/include/kgd_pp_interface.h |  1 +
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c| 18 ++
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h|  2 ++
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c  | 10 ++
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h  |  5 +
 .../gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c   |  5 ++---
 drivers/gpu/drm/amd/pm/swsmu/smu_internal.h|  1 +
 8 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index cc047fe0b7ee..be08ffc69231 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4428,6 +4428,10 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (amdgpu_sriov_vf(adev))
amdgpu_virt_release_full_gpu(adev, false);
 
+   r = amdgpu_dpm_notify_rlc_state(adev, false);
+   if (r)
+   return r;
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h 
b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
index 3201808c2dd8..4eacfdfcfd4b 100644
--- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
@@ -444,6 +444,7 @@ struct amd_pm_funcs {
   struct dpm_clocks *clock_table);
int (*get_smu_prv_buf_details)(void *handle, void **addr, size_t *size);
void (*pm_compute_clocks)(void *handle);
+   int (*notify_rlc_state)(void *handle, bool en);
 };
 
 struct metrics_table_header {
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index acf3527fff2d..ed7237bb64c8 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -181,6 +181,24 @@ int amdgpu_dpm_set_mp1_state(struct amdgpu_device *adev,
return ret;
 }
 
+int amdgpu_dpm_notify_rlc_state(struct amdgpu_device *adev, bool en)
+{
+   int ret = 0;
+   const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
+
+   if (pp_funcs && pp_funcs->notify_rlc_state) {
+   mutex_lock(>pm.mutex);
+
+   ret = pp_funcs->notify_rlc_state(
+   adev->powerplay.pp_handle,
+   en);
+
+   mutex_unlock(>pm.mutex);
+   }
+
+   return ret;
+}
+
 bool amdgpu_dpm_is_baco_supported(struct amdgpu_device *adev)
 {
const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index feccd2a7120d..482ea30147ab 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -415,6 +415,8 @@ int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev);
 int amdgpu_dpm_set_mp1_state(struct amdgpu_device *adev,
 enum pp_mp1_state mp1_state);
 
+int amdgpu_dpm_notify_rlc_state(struct amdgpu_device *adev, bool en);
+
 int amdgpu_dpm_set_gfx_power_up_by_imu(struct amdgpu_device *adev);
 
 int amdgpu_dpm_baco_exit(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index a0b8d5d78beb..a8fb914f746b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -1710,6 +1710,16 @@ static int smu_disable_dpms(struct smu_context *smu)
}
}
 
+   /* Notify SMU RLC is going to be off, stop RLC and SMU interaction.
+* otherwise SMU will hang while 

[PATCH v2 2/3] drm/amdgpu: avoid sending csib command when system resumes from S3

2023-10-24 Thread Perry Yuan
Previously the CSIB command packet was always sent to the GFX block,
both at amdgpu driver load time and at S3 resume time.
As required by the CP protocol, the CSIB does not need to be sent again
if the GC was not powered down when resuming from an aborted S3 suspend
sequence.

If a PREAMBLE_CNTL packet arrives in the ring after a PG event where the
RLC has already sent its copy of the CSIB, sending another CSIB packet
will cause a Gfx IB test timeout when the system resumes from S3.

Add a `csib_initialized` flag to make sure that a normal S3
suspend/resume still initializes the CSIB as usual. When the system
aborts an S3 suspend and resumes immediately because some suspend
callback failed, the GPU is not powered down at that point, so the CSIB
command does not need to be sent again.

Error dmesg log:
amdgpu :04:00.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed 
on gfx_0.0.0 (-110).
[drm:amdgpu_device_delayed_init_work_handler [amdgpu]] *ERROR* ib ring test 
failed (-110).
PM: resume of devices complete after 2373.995 msecs
PM: Finishing wakeup.

v1-v2:
 * align csib_initialized with the other fields using spaces [Kevin]
 * check the return value of gfx_v10_0_wait_for_idle() [Kevin]

Signed-off-by: Perry Yuan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  5 
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 32 ++---
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 44df1a5bce7f..7d570f53e66b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1114,6 +1114,7 @@ struct amdgpu_device {
booldebug_vm;
booldebug_largebar;
booldebug_disable_soft_recovery;
+   boolcsib_initialized;
 };
 
 static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 420196a17e22..a47c9f840754 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2468,6 +2468,11 @@ static int amdgpu_pmops_suspend_noirq(struct device *dev)
if (amdgpu_acpi_should_gpu_reset(adev))
return amdgpu_asic_reset(adev);
 
+   /* update flag to make sure csib will be sent when system
+* resume from normal S3
+*/
+   adev->csib_initialized = false;
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 6399bc71c56d..138340e1fa95 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3481,6 +3481,7 @@ static uint64_t gfx_v10_0_get_gpu_clock_counter(struct 
amdgpu_device *adev);
 static void gfx_v10_0_select_se_sh(struct amdgpu_device *adev, u32 se_num,
   u32 sh_num, u32 instance, int xcc_id);
 static u32 gfx_v10_0_get_wgp_active_bitmap_per_sh(struct amdgpu_device *adev);
+static int gfx_v10_0_wait_for_idle(void *handle);
 
 static int gfx_v10_0_rlc_backdoor_autoload_buffer_init(struct amdgpu_device 
*adev);
 static void gfx_v10_0_rlc_backdoor_autoload_buffer_fini(struct amdgpu_device 
*adev);
@@ -5958,7 +5959,7 @@ static int gfx_v10_0_cp_gfx_load_microcode(struct 
amdgpu_device *adev)
return 0;
 }
 
-static int gfx_v10_0_cp_gfx_start(struct amdgpu_device *adev)
+static int gfx_v10_csib_submit(struct amdgpu_device *adev)
 {
struct amdgpu_ring *ring;
const struct cs_section_def *sect = NULL;
@@ -5966,13 +5967,6 @@ static int gfx_v10_0_cp_gfx_start(struct amdgpu_device 
*adev)
int r, i;
int ctx_reg_offset;
 
-   /* init the CP */
-   WREG32_SOC15(GC, 0, mmCP_MAX_CONTEXT,
-adev->gfx.config.max_hw_contexts - 1);
-   WREG32_SOC15(GC, 0, mmCP_DEVICE_ID, 1);
-
-   gfx_v10_0_cp_gfx_enable(adev, true);
-
ring = >gfx.gfx_ring[0];
r = amdgpu_ring_alloc(ring, gfx_v10_0_get_csb_size(adev) + 4);
if (r) {
@@ -6035,6 +6029,28 @@ static int gfx_v10_0_cp_gfx_start(struct amdgpu_device 
*adev)
 
amdgpu_ring_commit(ring);
}
+
+   r = gfx_v10_0_wait_for_idle(adev);
+   if (r)
+   return r;
+
+   adev->csib_initialized = true;
+
+   return 0;
+};
+
+static int gfx_v10_0_cp_gfx_start(struct amdgpu_device *adev)
+{
+   /* init the CP */
+   WREG32_SOC15(GC, 0, mmCP_MAX_CONTEXT,
+adev->gfx.config.max_hw_contexts - 1);
+   WREG32_SOC15(GC, 0, mmCP_DEVICE_ID, 1);
+
+   gfx_v10_0_cp_gfx_enable(adev, true);
+
+   if (!adev->csib_initialized)
+   gfx_v10_csib_submit(adev);
+
return 0;
 }
 
-- 
2.34.1



[PATCH v2 1/3] drm/amdgpu: ungate power gating when system suspend

2023-10-24 Thread Perry Yuan
[Why] During suspend, if GFX DPM is enabled and GFXOFF feature is
enabled the system may get hung. So, it is suggested to disable
GFXOFF feature during suspend and enable it after resume.

[How] Update the code to disable GFXOFF feature during suspend and enable
it after resume.

[  311.396526] amdgpu :03:00.0: amdgpu: SMU: I'm not done with your 
previous command: SMN_C2PMSG_66:0x001E SMN_C2PMSG_82:0x
[  311.396530] amdgpu :03:00.0: amdgpu: Fail to disable dpm features!
[  311.396531] [drm:amdgpu_device_ip_suspend_phase2 [amdgpu]] *ERROR* suspend 
of IP block  failed -62

Acked-by: Yang Wang 
Reviewed-by: Kenneth Feng 
Signed-off-by: Perry Yuan 
Signed-off-by: Kun Liu 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index d9ccacd06fba..6399bc71c56d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3498,6 +3498,8 @@ static void gfx_v10_0_ring_invalidate_tlbs(struct 
amdgpu_ring *ring,
 static void gfx_v10_0_update_spm_vmid_internal(struct amdgpu_device *adev,
   unsigned int vmid);
 
+static int gfx_v10_0_set_powergating_state(void *handle,
+ enum amd_powergating_state state);
 static void gfx10_kiq_set_resources(struct amdgpu_ring *kiq_ring, uint64_t 
queue_mask)
 {
amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_RESOURCES, 6));
@@ -7172,6 +7174,13 @@ static int gfx_v10_0_hw_fini(void *handle)
amdgpu_irq_put(adev, >gfx.priv_reg_irq, 0);
amdgpu_irq_put(adev, >gfx.priv_inst_irq, 0);
 
+   /* WA added for Vangogh asic fixing the SMU suspend failure
+* It needs to set power gating again during gfxoff control
+* otherwise the gfxoff disallowing will be failed to set.
+*/
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(10, 3, 1))
+   gfx_v10_0_set_powergating_state(handle, AMD_PG_STATE_UNGATE);
+
if (!adev->no_hw_access) {
if (amdgpu_async_gfx_ring) {
if (amdgpu_gfx_disable_kgq(adev, 0))
-- 
2.34.1



RE: [PATCH 1/3] drm/amdgpu: ungate power gating when system suspend

2023-10-24 Thread Yuan, Perry
[AMD Official Use Only - General]

Hi Kevin, Kenneth,

Thanks for the review and ack, will pick it up in v2.

Regards.
Perry

> -Original Message-
> From: Wang, Yang(Kevin) 
> Sent: Tuesday, October 24, 2023 4:15 PM
> To: Feng, Kenneth ; Yuan, Perry
> ; Zhang, Yifan ;
> Limonciello, Mario 
> Cc: Deucher, Alexander ; amd-
> g...@lists.freedesktop.org
> Subject: RE: [PATCH 1/3] drm/amdgpu: ungate power gating when system
> suspend
>
> [AMD Official Use Only - General]
>
> Acked-by: Yang Wang 
>
> Best Regards,
> Kevin
>
> -Original Message-
> From: Feng, Kenneth 
> Sent: Tuesday, October 24, 2023 2:33 PM
> To: Yuan, Perry ; Zhang, Yifan
> ; Limonciello, Mario
> 
> Cc: Deucher, Alexander ; Wang, Yang(Kevin)
> ; amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH 1/3] drm/amdgpu: ungate power gating when system
> suspend
>
> [AMD Official Use Only - General]
>
> Reviewed-by: Kenneth Feng 
>
>
> -Original Message-
> From: Yuan, Perry 
> Sent: Tuesday, October 24, 2023 10:33 AM
> To: Zhang, Yifan ; Feng, Kenneth
> ; Limonciello, Mario
> 
> Cc: Deucher, Alexander ; Wang, Yang(Kevin)
> ; amd-gfx@lists.freedesktop.org
> Subject: [PATCH 1/3] drm/amdgpu: ungate power gating when system
> suspend
>
> [Why] During suspend, if GFX DPM is enabled and GFXOFF feature is enabled
> the system may get hung. So, it is suggested to disable GFXOFF feature
> during suspend and enable it after resume.
>
> [How] Update the code to disable GFXOFF feature during suspend and
> enable it after resume.
>
> [  311.396526] amdgpu :03:00.0: amdgpu: SMU: I'm not done with your
> previous command: SMN_C2PMSG_66:0x001E
> SMN_C2PMSG_82:0x [  311.396530] amdgpu :03:00.0: amdgpu:
> Fail to disable dpm features!
> [  311.396531] [drm:amdgpu_device_ip_suspend_phase2 [amdgpu]]
> *ERROR* suspend of IP block  failed -62
>
> Signed-off-by: Perry Yuan 
> Signed-off-by: Kun Liu 
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +
>  1 file changed, 9 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index d9ccacd06fba..6399bc71c56d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -3498,6 +3498,8 @@ static void gfx_v10_0_ring_invalidate_tlbs(struct
> amdgpu_ring *ring,  static void
> gfx_v10_0_update_spm_vmid_internal(struct amdgpu_device *adev,
>unsigned int vmid);
>
> +static int gfx_v10_0_set_powergating_state(void *handle,
> + enum amd_powergating_state state);
>  static void gfx10_kiq_set_resources(struct amdgpu_ring *kiq_ring, uint64_t
> queue_mask)  {
> amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_RESOURCES, 6));
> @@ -7172,6 +7174,13 @@ static int gfx_v10_0_hw_fini(void *handle)
> amdgpu_irq_put(adev, >gfx.priv_reg_irq, 0);
> amdgpu_irq_put(adev, >gfx.priv_inst_irq, 0);
>
> +   /* WA added for Vangogh asic fixing the SMU suspend failure
> +* It needs to set power gating again during gfxoff control
> +* otherwise the gfxoff disallowing will be failed to set.
> +*/
> +   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(10, 3, 1))
> +   gfx_v10_0_set_powergating_state(handle,
> AMD_PG_STATE_UNGATE);
> +
> if (!adev->no_hw_access) {
> if (amdgpu_async_gfx_ring) {
> if (amdgpu_gfx_disable_kgq(adev, 0))
> --
> 2.34.1
>
>



Re: [PATCH] drm/amdgpu: Initialize schedulers before using them

2023-10-24 Thread Alex Deucher
On Tue, Oct 24, 2023 at 6:14 AM Christian König
 wrote:
>
> [SNIP]
> > Let me take a closer look first
>
> I think I've figured out why this isn't working as expected. It started
> with this patch here:
>
> commit 5fd8518d187ed03403a4d4f7f56f52c00b11c148
> Author: Andrey Grodzovsky 
> Date:   Mon Dec 6 14:59:35 2021 -0500
>
>  drm/amdgpu: Move scheduler init to after XGMI is ready
>
>  Before we initialize schedulers we must know which reset
>  domain are we in - for single device there is a single
>  domain per device and so single wq per device. For XGMI
>  the reset domain spans the entire XGMI hive and so the
>  reset wq is per hive.
>
>  Signed-off-by: Andrey Grodzovsky 
>  Reviewed-by: Christian König 
>  Link: https://www.spinics.net/lists/amd-gfx/msg74112.html
>
> Andrey separated the scheduler initialization from the ring init because
> we need some of the rings for XGMI initialization which in turn is
> necessary to figure out the XGMI hive and so the reset domain for the
> scheduler.
>
> The code inside amdgpu_ttm_set_buffer_funcs_status() is actually
> correct, the problem is that this is called as part of the hw init which
> comes earlier than the scheduler init.
>
> @Alex, Ideas how to fix this? My best guess is that we should move the
> call to amdgpu_ttm_set_buffer_funcs_status() from the DMA specific code
> into the higher level handling in amdgpu_device.c

Yes, I think so, but there could be some tricky ordering issues with
respect to suspend and resume.  I think something like the attached
patch should do the trick.

Alex
From e06ec86af03c8f730e8ba2ae8b668bb3727f455d Mon Sep 17 00:00:00 2001
From: Alex Deucher 
Date: Tue, 24 Oct 2023 10:42:08 -0400
Subject: [PATCH] drm/amdgpu: move buffer funcs setting up a level

Rather than doing this in the IP code for the SDMA paging
engine, move it up to the core device level init level.
This should fix the scheduler init ordering.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   | 23 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  5 -
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  5 -
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  5 -
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 16 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/si_dma.c|  5 -
 11 files changed, 40 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index cc047fe0b7ee..7b4120383f89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2667,6 +2667,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	if (r)
 		goto init_failed;
 
+	amdgpu_sdma_set_buffer_funcs_helper(adev);
+
 	/* Don't init kfd if whole hive need to be reset during init */
 	if (!adev->gmc.xgmi.pending_reset) {
 		kgd2kfd_init_zone_device(adev);
@@ -3265,6 +3267,8 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
 		amdgpu_virt_request_full_gpu(adev, false);
 	}
 
+	amdgpu_sdma_unset_buffer_funcs_helper(adev);
+
 	r = amdgpu_device_ip_suspend_phase1(adev);
 	if (r)
 		return r;
@@ -3454,6 +3458,8 @@ static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
 
 	r = amdgpu_device_ip_resume_phase2(adev);
 
+	amdgpu_sdma_set_buffer_funcs_helper(adev);
+
 	return r;
 }
 
@@ -4241,6 +4247,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 	/* disable ras feature must before hw fini */
 	amdgpu_ras_pre_fini(adev);
 
+	amdgpu_sdma_unset_buffer_funcs_helper(adev);
+
 	amdgpu_device_ip_fini_early(adev);
 
 	amdgpu_irq_fini_hw(adev);
@@ -4412,6 +4420,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 
 	amdgpu_ras_suspend(adev);
 
+	amdgpu_sdma_unset_buffer_funcs_helper(adev);
+
 	amdgpu_device_ip_suspend_phase1(adev);
 
 	if (!adev->in_s0ix)
@@ -5183,6 +5193,8 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 if (r)
 	goto out;
 
+amdgpu_sdma_set_buffer_funcs_helper(tmp_adev);
+
 if (vram_lost)
 	amdgpu_device_fill_reset_magic(tmp_adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index e8cbc4142d80..33f88fc9d92f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -292,6 +292,29 @@ int amdgpu_sdma_init_microcode(struct amdgpu_device *adev,
 	return err;
 }
 
+void amdgpu_sdma_set_buffer_funcs_helper(struct amdgpu_device *adev)
+{
+	struct amdgpu_ring *sdma;
+	int i;
+
+	for (i = 0; i < adev->sdma.num_instances; i++) {
+		if (adev->sdma.has_page_queue) {
+			sdma = >sdma.instance[i].page;
+			if 

RE: [PATCH 2/3] drm/amdgpu: avoid sending csib command when system resumes from S3

2023-10-24 Thread Yuan, Perry
[AMD Official Use Only - General]

Hi Kevin,


> -Original Message-
> From: Wang, Yang(Kevin) 
> Sent: Tuesday, October 24, 2023 1:24 PM
> To: Yuan, Perry ; Zhang, Yifan
> ; Feng, Kenneth ;
> Limonciello, Mario 
> Cc: Deucher, Alexander ; amd-
> g...@lists.freedesktop.org
> Subject: RE: [PATCH 2/3] drm/amdgpu: avoid sending csib command when
> system resumes from S3
>
> [AMD Official Use Only - General]
>
> -Original Message-
> From: Yuan, Perry 
> Sent: Tuesday, October 24, 2023 10:33 AM
> To: Zhang, Yifan ; Feng, Kenneth
> ; Limonciello, Mario
> 
> Cc: Deucher, Alexander ; Wang, Yang(Kevin)
> ; amd-gfx@lists.freedesktop.org
> Subject: [PATCH 2/3] drm/amdgpu: avoid sending csib command when
> system resumes from S3
>
> Previously the CSIB command pocket was sent to GFX block while amdgpu
> driver loading or S3 resuming time all the time.
> As the CP protocol required, the CSIB is not needed to send again while GC is
> not powered down while resuming from aborted S3 suspend sequence.
>
> PREAMBLE_CNTL packet coming in the ring after PG event where the RLC
> already sent its copy of CSIB, send another CSIB pocket will cause Gfx IB
> testing timeout when system resume from S3.
>
> Add flag `csib_initialized` to make sure normal S3 suspend/resume will
> initialize csib normally, when system abort to S3 suspend and resume
> immediately because of some failed suspend callback, GPU is not power
> down at that time, so csib command is not needed to send again.
>
> Error dmesg log:
> amdgpu :04:00.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB
> test failed on gfx_0.0.0 (-110).
> [drm:amdgpu_device_delayed_init_work_handler [amdgpu]] *ERROR* ib
> ring test failed (-110).
> PM: resume of devices complete after 2373.995 msecs
> PM: Finishing wakeup.
>
> Signed-off-by: Perry Yuan 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  5 +
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 29 ++
> ---
>  3 files changed, 27 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 44df1a5bce7f..e5d85ea26a5e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1114,6 +1114,7 @@ struct amdgpu_device {
> booldebug_vm;
> booldebug_largebar;
> booldebug_disable_soft_recovery;
> +   boolcsib_initialized;
> [Kevin]:
> you'd better use space to instead of "tab" , to align with other field.

Cool, I didn`t notice that, changed in v2.
Thanks !

>
>  };
>
>  static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 420196a17e22..a47c9f840754 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2468,6 +2468,11 @@ static int amdgpu_pmops_suspend_noirq(struct
> device *dev)
> if (amdgpu_acpi_should_gpu_reset(adev))
> return amdgpu_asic_reset(adev);
>
> +   /* update flag to make sure csib will be sent when system
> +* resume from normal S3
> +*/
> +   adev->csib_initialized = false;
> +
> return 0;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 6399bc71c56d..ab2e3e592dfc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -3481,6 +3481,7 @@ static uint64_t
> gfx_v10_0_get_gpu_clock_counter(struct amdgpu_device *adev);  static
> void gfx_v10_0_select_se_sh(struct amdgpu_device *adev, u32 se_num,
>u32 sh_num, u32 instance, int xcc_id);  
> static u32
> gfx_v10_0_get_wgp_active_bitmap_per_sh(struct amdgpu_device *adev);
> +static int gfx_v10_0_wait_for_idle(void *handle);
>
>  static int gfx_v10_0_rlc_backdoor_autoload_buffer_init(struct
> amdgpu_device *adev);  static void
> gfx_v10_0_rlc_backdoor_autoload_buffer_fini(struct amdgpu_device
> *adev); @@ -5958,7 +5959,7 @@ static int
> gfx_v10_0_cp_gfx_load_microcode(struct amdgpu_device *adev)
> return 0;
>  }
>
> -static int gfx_v10_0_cp_gfx_start(struct amdgpu_device *adev)
> +static int gfx_v10_csib_submit(struct amdgpu_device *adev)
>  {
> struct amdgpu_ring *ring;
> const struct cs_section_def *sect = NULL; @@ -5966,13 +5967,6 @@
> static int gfx_v10_0_cp_gfx_start(struct amdgpu_device *adev)
> int r, i;
> int ctx_reg_offset;
>
> -   /* init the CP */
> -   WREG32_SOC15(GC, 0, mmCP_MAX_CONTEXT,
> -adev->gfx.config.max_hw_contexts - 1);
> -   WREG32_SOC15(GC, 0, mmCP_DEVICE_ID, 1);
> -
> -   gfx_v10_0_cp_gfx_enable(adev, true);
> -
> ring = >gfx.gfx_ring[0];
> r = 

Re: [PATCH] drm/amdgpu: Use pcie domain of xcc acpi objects

2023-10-24 Thread Bhardwaj, Rajneesh
[AMD Official Use Only - General]

Looks good to me.
Reviewed-by: Rajneesh Bhardwaj 

Regards,
Rajneesh

From: Lazar, Lijo 
Sent: Monday, October 23, 2023 2:43:01 PM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Kasiviswanathan, Harish 
; Zhang, Hawking ; 
Bhardwaj, Rajneesh 
Subject: Re: [PATCH] drm/amdgpu: Use pcie domain of xcc acpi objects


[AMD Official Use Only - General]



Thanks,
Lijo

From: amd-gfx  on behalf of Lijo Lazar 

Sent: Friday, October 20, 2023 8:44:22 PM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Kasiviswanathan, Harish 
; Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: Use pcie domain of xcc acpi objects

PCI domain/segment information of xccs is available through ACPI DSM
methods. Consider that also while looking for devices.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 40 +---
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
index 2bca37044ad0..d62e49758635 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
@@ -68,7 +68,7 @@ struct amdgpu_acpi_xcc_info {
 struct amdgpu_acpi_dev_info {
 struct list_head list;
 struct list_head xcc_list;
-   uint16_t bdf;
+   uint32_t sbdf;
 uint16_t supp_xcp_mode;
 uint16_t xcp_mode;
 uint16_t mem_mode;
@@ -927,7 +927,7 @@ static acpi_status amdgpu_acpi_get_node_id(acpi_handle 
handle,
 #endif
 }

-static struct amdgpu_acpi_dev_info *amdgpu_acpi_get_dev(u16 bdf)
+static struct amdgpu_acpi_dev_info *amdgpu_acpi_get_dev(u32 sbdf)
 {
 struct amdgpu_acpi_dev_info *acpi_dev;

@@ -935,14 +935,14 @@ static struct amdgpu_acpi_dev_info 
*amdgpu_acpi_get_dev(u16 bdf)
 return NULL;

 list_for_each_entry(acpi_dev, _acpi_dev_list, list)
-   if (acpi_dev->bdf == bdf)
+   if (acpi_dev->sbdf == sbdf)
 return acpi_dev;

 return NULL;
 }

 static int amdgpu_acpi_dev_init(struct amdgpu_acpi_dev_info **dev_info,
-   struct amdgpu_acpi_xcc_info *xcc_info, u16 bdf)
+   struct amdgpu_acpi_xcc_info *xcc_info, u32 sbdf)
 {
 struct amdgpu_acpi_dev_info *tmp;
 union acpi_object *obj;
@@ -955,7 +955,7 @@ static int amdgpu_acpi_dev_init(struct amdgpu_acpi_dev_info 
**dev_info,

 INIT_LIST_HEAD(>xcc_list);
 INIT_LIST_HEAD(>list);
-   tmp->bdf = bdf;
+   tmp->sbdf = sbdf;

 obj = acpi_evaluate_dsm_typed(xcc_info->handle, _xcc_dsm_guid, 0,
   AMD_XCC_DSM_GET_SUPP_MODE, NULL,
@@ -1007,7 +1007,7 @@ static int amdgpu_acpi_dev_init(struct 
amdgpu_acpi_dev_info **dev_info,

 DRM_DEBUG_DRIVER(
 "New dev(%x): Supported xcp mode: %x curr xcp_mode : %x mem 
mode : %x, tmr base: %llx tmr size: %llx  ",
-   tmp->bdf, tmp->supp_xcp_mode, tmp->xcp_mode, tmp->mem_mode,
+   tmp->sbdf, tmp->supp_xcp_mode, tmp->xcp_mode, tmp->mem_mode,
 tmp->tmr_base, tmp->tmr_size);
 list_add_tail(>list, _acpi_dev_list);
 *dev_info = tmp;
@@ -1023,7 +1023,7 @@ static int amdgpu_acpi_dev_init(struct 
amdgpu_acpi_dev_info **dev_info,
 }

 static int amdgpu_acpi_get_xcc_info(struct amdgpu_acpi_xcc_info *xcc_info,
-   u16 *bdf)
+   u32 *sbdf)
 {
 union acpi_object *obj;
 acpi_status status;
@@ -1054,8 +1054,10 @@ static int amdgpu_acpi_get_xcc_info(struct 
amdgpu_acpi_xcc_info *xcc_info,
 xcc_info->phy_id = (obj->integer.value >> 32) & 0xFF;
 /* xcp node of this xcc [47:40] */
 xcc_info->xcp_node = (obj->integer.value >> 40) & 0xFF;
+   /* PF domain of this xcc [31:16] */
+   *sbdf = (obj->integer.value) & 0x;
 /* PF bus/dev/fn of this xcc [63:48] */
-   *bdf = (obj->integer.value >> 48) & 0x;
+   *sbdf |= (obj->integer.value >> 48) & 0x;
 ACPI_FREE(obj);
 obj = NULL;

@@ -1079,7 +1081,7 @@ static int amdgpu_acpi_enumerate_xcc(void)
 struct acpi_device *acpi_dev;
 char hid[ACPI_ID_LEN];
 int ret, id;
-   u16 bdf;
+   u32 sbdf;

 INIT_LIST_HEAD(_acpi_dev_list);
 xa_init(_info_xa);
@@ -1107,16 +1109,16 @@ static int amdgpu_acpi_enumerate_xcc(void)
 xcc_info->handle = acpi_device_handle(acpi_dev);
 acpi_dev_put(acpi_dev);

-   ret = amdgpu_acpi_get_xcc_info(xcc_info, );
+   ret = amdgpu_acpi_get_xcc_info(xcc_info, );
 if (ret) {
 kfree(xcc_info);
 continue;
 }

-   dev_info = amdgpu_acpi_get_dev(bdf);
+   dev_info = 

RE: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

2023-10-24 Thread Zhang, Yifan
[AMD Official Use Only - General]

-Original Message-
From: Ma, Li 
Sent: Tuesday, October 24, 2023 7:09 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Zhang, Yifan 
; Feng, Kenneth ; StDenis, Tom 
; Ma, Li 
Subject: [PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

Print the digit of the fractional part individually to avoid carrying during 
display.

Signed-off-by: Li Ma 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 358bb5e485f2..cc853559cf0f 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -4290,10 +4290,10 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file 
*m, struct amdgpu_device *a
seq_printf(m, "\t%u mV (VDDNB)\n", value);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*), ))
-   seq_printf(m, "\t%u.%u W (average GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%u%u W (average GPU)\n", query >> 8, (query 
&
+0xff) / 10, (query & 0xff) % 10);

Would you please elaborate on this with an example? It looks to me like it
makes no difference here.

size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *), ))
-   seq_printf(m, "\t%u.%u W (current GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%u%u W (current GPU)\n", query >> 8, (query 
&
+0xff) / 10, (query & 0xff) % 10);

Would you please elaborate on this with an example? It looks to me like it
makes no difference here.

size = sizeof(value);
seq_printf(m, "\n");

--
2.25.1



[PATCH] drm/amd/pm: call smu_cmn_get_smc_version in is_mode1_reset_supported.

2023-10-24 Thread Yifan Zhang
is_mode1_reset_supported may be called before smu init, when smu_context
is uninitialized, in the driver load/unload test. Call
smu_cmn_get_smc_version explicitly in is_mode1_reset_supported.

Fixes: 5fe5098c64d9 ("drm/amd/pm: drop most smu_cmn_get_smc_version in smu")
Signed-off-by: Yifan Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 8 +++-
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c| 8 +++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 090249b6422a..77c3d76c76a2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -2461,12 +2461,18 @@ static bool 
sienna_cichlid_is_mode1_reset_supported(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
uint32_t val;
+   uint32_t smu_version;
+   int ret;
 
/**
 * SRIOV env will not support SMU mode1 reset
 * PM FW support mode1 reset from 58.26
 */
-   if (amdgpu_sriov_vf(adev) || (smu->smc_fw_version < 0x003a1a00))
+   ret = smu_cmn_get_smc_version(smu, NULL, _version);
+   if (ret)
+   return false;
+
+   if (amdgpu_sriov_vf(adev) || (smu_version < 0x003a1a00))
return false;
 
/**
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index b1433973380b..648d5eafb27b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2615,13 +2615,19 @@ static int smu_v13_0_0_baco_exit(struct smu_context 
*smu)
 static bool smu_v13_0_0_is_mode1_reset_supported(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
+   u32 smu_version;
+   int ret;
 
/* SRIOV does not support SMU mode1 reset */
if (amdgpu_sriov_vf(adev))
return false;
 
/* PMFW support is available since 78.41 */
-   if (smu->smc_fw_version < 0x004e2900)
+   ret = smu_cmn_get_smc_version(smu, NULL, _version);
+   if (ret)
+   return false;
+
+   if (smu_version < 0x004e2900)
return false;
 
return true;
-- 
2.37.3



[PATCH] drm/amd/amdgpu: fix the GPU power print error in pm info

2023-10-24 Thread Li Ma
Print the digit of the fractional part individually to avoid carrying
during display.

Signed-off-by: Li Ma 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 358bb5e485f2..cc853559cf0f 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -4290,10 +4290,10 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file 
*m, struct amdgpu_device *a
seq_printf(m, "\t%u mV (VDDNB)\n", value);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*)&query, &size))
-   seq_printf(m, "\t%u.%u W (average GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%u%u W (average GPU)\n", query >> 8, (query 
& 0xff) / 10, (query & 0xff) % 10);
size = sizeof(uint32_t);
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *)&query, &size))
-   seq_printf(m, "\t%u.%u W (current GPU)\n", query >> 8, query & 
0xff);
+   seq_printf(m, "\t%u.%u%u W (current GPU)\n", query >> 8, (query 
& 0xff) / 10, (query & 0xff) % 10);
size = sizeof(value);
seq_printf(m, "\n");
 
-- 
2.25.1



RE: [PATCH] drm/amdgpu: get RAS poison status from DF v4_6_2

2023-10-24 Thread Zhang, Hawking
[AMD Official Use Only - General]

Acked-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Tao Zhou
Sent: Tuesday, October 24, 2023 14:42
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Yang, Stanley 
Subject: [PATCH] drm/amdgpu: get RAS poison status from DF v4_6_2

Add DF block and RAS poison mode query for DF v4_6_2.

Signed-off-by: Tao Zhou 
Reviewed-by: Stanley.Yang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c |  4 +++
 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c| 34 +++
 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h| 31 +
 4 files changed, 71 insertions(+), 1 deletion(-)  create mode 100644 
drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index ec1daf7112a9..260e32ef7bae 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -104,7 +104,8 @@ amdgpu-y += \
 amdgpu-y += \
df_v1_7.o \
df_v3_6.o \
-   df_v4_3.o
+   df_v4_3.o \
+   df_v4_6_2.o

 # add GMC block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 17d4311e22d5..8d3681172cea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -35,6 +35,7 @@
 #include "df_v1_7.h"
 #include "df_v3_6.h"
 #include "df_v4_3.h"
+#include "df_v4_6_2.h"
 #include "nbio_v6_1.h"
 #include "nbio_v7_0.h"
 #include "nbio_v7_4.h"
@@ -2557,6 +2558,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device 
*adev)
case IP_VERSION(4, 3, 0):
adev->df.funcs = &df_v4_3_funcs;
break;
+   case IP_VERSION(4, 6, 2):
+   adev->df.funcs = &df_v4_6_2_funcs;
+   break;
default:
break;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c 
b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c
new file mode 100644
index 000000000000..a47960a0babd
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+obtaining a
+ * copy of this software and associated documentation files (the
+"Software"),
+ * to deal in the Software without restriction, including without
+limitation
+ * the rights to use, copy, modify, merge, publish, distribute,
+sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom
+the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
+SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
+DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "df_v4_6_2.h"
+
+static bool df_v4_6_2_query_ras_poison_mode(struct amdgpu_device *adev)
+{
+   /* return true since related regs are inaccessible */
+   return true;
+}
+
+const struct amdgpu_df_funcs df_v4_6_2_funcs = {
+   .query_ras_poison_mode = df_v4_6_2_query_ras_poison_mode, };
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h 
b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h
new file mode 100644
index 000000000000..3bc3e6d216e2
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+obtaining a
+ * copy of this software and associated documentation files (the
+"Software"),
+ * to deal in the Software without restriction, including without
+limitation
+ * the rights to use, copy, modify, merge, publish, distribute,
+sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom
+the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
+SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
+DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 

Re: [PATCH] drm/amdgpu: Initialize schedulers before using them

2023-10-24 Thread Christian König

[SNIP]

Let me take a closer look first


I think I've figured out why this isn't working as expected. It started 
with this patch here:


commit 5fd8518d187ed03403a4d4f7f56f52c00b11c148
Author: Andrey Grodzovsky 
Date:   Mon Dec 6 14:59:35 2021 -0500

    drm/amdgpu: Move scheduler init to after XGMI is ready

    Before we initialize schedulers we must know which reset
    domain are we in - for single device there is a single
    domain per device and so single wq per device. For XGMI
    the reset domain spans the entire XGMI hive and so the
    reset wq is per hive.

    Signed-off-by: Andrey Grodzovsky 
    Reviewed-by: Christian König 
    Link: https://www.spinics.net/lists/amd-gfx/msg74112.html

Andrey separated the scheduler initialization from the ring init because 
we need some of the rings for XGMI initialization which in turn is 
necessary to figure out the XGMI hive and so the reset domain for the 
scheduler.


The code inside amdgpu_ttm_set_buffer_funcs_status() is actually 
correct, the problem is that this is called as part of the hw init which 
comes earlier than the scheduler init.


@Alex, Ideas how to fix this? My best guess is that we should move the 
call to amdgpu_ttm_set_buffer_funcs_status() from the DMA specific code 
into the higher level handling in amdgpu_device.c


Regards,
Christian.




[bug report] drm/amdgpu: Workaround to skip kiq ring test during ras gpu recovery

2023-10-24 Thread Dan Carpenter
Hello Stanley.Yang,

The patch b1338a8e71ac: "drm/amdgpu: Workaround to skip kiq ring test
during ras gpu recovery" from Oct 17, 2023 (linux-next), leads to the
following Smatch static checker warning:

drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c:513 amdgpu_get_xgmi_hive()
warn: sleeping in atomic context

drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
500 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device 
*adev)
501 {
502 struct amdgpu_hive_info *hive = NULL;
503 int ret;
504 
505 if (!adev->gmc.xgmi.hive_id)
506 return NULL;
507 
508 if (adev->hive) {
509 kobject_get(&adev->hive->kobj);
510 return adev->hive;
511 }
512 
--> 513 mutex_lock(&xgmi_mutex);

The patch adds a new caller amdgpu_gfx_disable_kcq() which is holding
spin_lock(&kiq->ring_lock).  And we can't take a mutex if we're already
holding a spin_lock.  Turn on CONFIG_DEBUG_ATOMIC_SLEEP to see the
warning.


regards,
dan carpenter


RE: [PATCH 1/3] drm/amdgpu: ungate power gating when system suspend

2023-10-24 Thread Wang, Yang(Kevin)
[AMD Official Use Only - General]

Acked-by: Yang Wang 

Best Regards,
Kevin

-Original Message-
From: Feng, Kenneth 
Sent: Tuesday, October 24, 2023 2:33 PM
To: Yuan, Perry ; Zhang, Yifan ; 
Limonciello, Mario 
Cc: Deucher, Alexander ; Wang, Yang(Kevin) 
; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 1/3] drm/amdgpu: ungate power gating when system suspend

[AMD Official Use Only - General]

Reviewed-by: Kenneth Feng 


-Original Message-
From: Yuan, Perry 
Sent: Tuesday, October 24, 2023 10:33 AM
To: Zhang, Yifan ; Feng, Kenneth ; 
Limonciello, Mario 
Cc: Deucher, Alexander ; Wang, Yang(Kevin) 
; amd-gfx@lists.freedesktop.org
Subject: [PATCH 1/3] drm/amdgpu: ungate power gating when system suspend

[Why] During suspend, if GFX DPM is enabled and GFXOFF feature is enabled the 
system may get hung. So, it is suggested to disable GFXOFF feature during 
suspend and enable it after resume.

[How] Update the code to disable GFXOFF feature during suspend and enable it 
after resume.

[  311.396526] amdgpu :03:00.0: amdgpu: SMU: I'm not done with your 
previous command: SMN_C2PMSG_66:0x001E SMN_C2PMSG_82:0x [  
311.396530] amdgpu :03:00.0: amdgpu: Fail to disable dpm features!
[  311.396531] [drm:amdgpu_device_ip_suspend_phase2 [amdgpu]] *ERROR* suspend 
of IP block  failed -62

Signed-off-by: Perry Yuan 
Signed-off-by: Kun Liu 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index d9ccacd06fba..6399bc71c56d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3498,6 +3498,8 @@ static void gfx_v10_0_ring_invalidate_tlbs(struct 
amdgpu_ring *ring,  static void gfx_v10_0_update_spm_vmid_internal(struct 
amdgpu_device *adev,
   unsigned int vmid);

+static int gfx_v10_0_set_powergating_state(void *handle,
+ enum amd_powergating_state state);
 static void gfx10_kiq_set_resources(struct amdgpu_ring *kiq_ring, uint64_t 
queue_mask)  {
amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_RESOURCES, 6)); @@ 
-7172,6 +7174,13 @@ static int gfx_v10_0_hw_fini(void *handle)
amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);

+   /* WA added for Vangogh asic fixing the SMU suspend failure
+* It needs to set power gating again during gfxoff control
+* otherwise the gfxoff disallowing will be failed to set.
+*/
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(10, 3, 1))
+   gfx_v10_0_set_powergating_state(handle, AMD_PG_STATE_UNGATE);
+
if (!adev->no_hw_access) {
if (amdgpu_async_gfx_ring) {
if (amdgpu_gfx_disable_kgq(adev, 0))
--
2.34.1




Re: [PATCH] Revert "drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute"

2023-10-24 Thread Christian König

Am 24.10.23 um 01:41 schrieb Felix Kuehling:


[sorry, I hit send too early]


On 2023-10-23 11:15, Christian König wrote:

Am 23.10.23 um 15:06 schrieb Daniel Tang:

That commit causes the screen to freeze a few moments after running
clinfo on v6.6-rc7 and ROCm 5.6. Sometimes the rest of the computer
including ssh also freezes. On v6.5-rc1, it only results in a NULL 
pointer

dereference message in dmesg and the process to become a zombie whose
unkillableness prevents shutdown without REISUB. Although llama.cpp and
hashcat were working in v6.2 and ROCm 5.6, broke, and are not fixed by
this revert, pytorch-rocm is now working with stability and without
whole-computer freezes caused by any accidental running of clinfo.

This reverts commit 1d7776cc148b9f2f3ebaf1181662ba695a29f639.


That result doesn't make much sense. Felix please correct me, but 
AFAIK the ATS stuff was completely removed by now.


Are you sure that this is pure v6.6-rc7 and not some other patches 
applied? If yes then we must have missed something.


This revert doesn't really affect systems with ATS. It moves the 
sanity check back out of the ATS-specific code.


Ah! I've read the code wrong, that makes much more sense now.



The Null pointer dereference in the bug report comes from the CPU page 
table update code:

[10089.267556] BUG: kernel NULL pointer dereference, address: 
[10089.267563] #PF: supervisor write access in kernel mode
[10089.267566] #PF: error_code(0x0002) - not-present page
[10089.267569] PGD 0 P4D 0
[10089.267574] Oops: 0002 [#1] PREEMPT SMP NOPTI
[10089.267578] CPU: 23 PID: 18191 Comm: clinfo Tainted: G   OE  
6.5.0-9-generic #9-Ubuntu
[10089.267582] Hardware name: Micro-Star International Co., Ltd. MS-7C37/X570-A 
PRO (MS-7C37), BIOS H.I0 08/10/2022
[10089.267585] RIP: 0010:amdgpu_gmc_set_pte_pde+0x23/0x40 [amdgpu]
[10089.267820] Code: 90 90 90 90 90 90 90 0f 1f 44 00 00 48 b8 00 f0 ff ff ff ff 00 
00 55 48 21 c1 8d 04 d5 00 00 00 00 4c 09 c1 48 01 c6 48 89 e5 <48> 89 0e 31 c0 
5d 31 d2 31 c9 31 f6 45 31 c0 e9 89 7e 27 fb 66 0f
[10089.267823] RSP: 0018:b49805eeb8b0 EFLAGS: 00010246
[10089.267827] RAX:  RBX: 0020 RCX: 00400480
[10089.267830] RDX:  RSI:  RDI: 9890d438
[10089.267832] RBP: b49805eeb8b0 R08: 00400480 R09: 0020
[10089.267835] R10: 000800100200 R11: 000800100200 R12: b49805eeba98
[10089.267837] R13: 0001 R14: 0020 R15: 0001
[10089.267840] FS:  7f8ca9f09740() GS:9897befc() 
knlGS:
[10089.267843] CS:  0010 DS:  ES:  CR0: 80050033
[10089.267846] CR2:  CR3: 0002e0746000 CR4: 00750ee0
[10089.267849] PKRU: 5554
[10089.267851] Call Trace:
[10089.267853]  
[10089.267858]  ? show_regs+0x6d/0x80
[10089.267865]  ? __die+0x24/0x80
[10089.267870]  ? page_fault_oops+0x99/0x1b0
[10089.267876]  ? do_user_addr_fault+0x316/0x6b0
[10089.267879]  ? srso_alias_return_thunk+0x5/0x7f
[10089.267884]  ? scsi_dispatch_cmd+0x91/0x240
[10089.267891]  ? exc_page_fault+0x83/0x1b0
[10089.267896]  ? asm_exc_page_fault+0x27/0x30
[10089.267904]  ? amdgpu_gmc_set_pte_pde+0x23/0x40 [amdgpu]
[10089.268140]  amdgpu_vm_cpu_update+0xa9/0x130 [amdgpu]
...
This revert is just a roundabout way of disabling CPU page table 
updates for compute VMs. But I don't think it really addresses the 
root cause.


Yeah, completely agree. Looks like some page tables aren't CPU accessible 
for some reason.


Going to take a look when I have time.

Regards,
Christian.



Regards,
  Felix




Regards,
Christian.



Closes: https://github.com/RadeonOpenCompute/ROCm/issues/2596
Signed-off-by: Daniel Tang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index 82f25996ff5e..602f311ab766 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2243,16 +2243,16 @@ int amdgpu_vm_make_compute(struct 
amdgpu_device *adev, struct amdgpu_vm *vm)

  if (r)
  return r;
  +    /* Sanity checks */
+    if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
+    r = -EINVAL;
+    goto unreserve_bo;
+    }
+
  /* Check if PD needs to be reinitialized and do it before
   * changing any other state, in case it fails.
   */
  if (pte_support_ats != vm->pte_support_ats) {
-    /* Sanity checks */
-    if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
-    r = -EINVAL;
-    goto unreserve_bo;
-    }
-
  vm->pte_support_ats = pte_support_ats;
  r = amdgpu_vm_pt_clear(adev, vm, to_amdgpu_bo_vm(vm->root.bo),
 false);
-- 2.40.1







[PATCH] drm/amdgpu: get RAS poison status from DF v4_6_2

2023-10-24 Thread Tao Zhou
Add DF block and RAS poison mode query for DF v4_6_2.

Signed-off-by: Tao Zhou 
Reviewed-by: Stanley.Yang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c |  4 +++
 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c| 34 +++
 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h| 31 +
 4 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index ec1daf7112a9..260e32ef7bae 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -104,7 +104,8 @@ amdgpu-y += \
 amdgpu-y += \
df_v1_7.o \
df_v3_6.o \
-   df_v4_3.o
+   df_v4_3.o \
+   df_v4_6_2.o
 
 # add GMC block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 17d4311e22d5..8d3681172cea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -35,6 +35,7 @@
 #include "df_v1_7.h"
 #include "df_v3_6.h"
 #include "df_v4_3.h"
+#include "df_v4_6_2.h"
 #include "nbio_v6_1.h"
 #include "nbio_v7_0.h"
 #include "nbio_v7_4.h"
@@ -2557,6 +2558,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device 
*adev)
case IP_VERSION(4, 3, 0):
adev->df.funcs = &df_v4_3_funcs;
break;
+   case IP_VERSION(4, 6, 2):
+   adev->df.funcs = &df_v4_6_2_funcs;
+   break;
default:
break;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c 
b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c
new file mode 100644
index 000000000000..a47960a0babd
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "df_v4_6_2.h"
+
+static bool df_v4_6_2_query_ras_poison_mode(struct amdgpu_device *adev)
+{
+   /* return true since related regs are inaccessible */
+   return true;
+}
+
+const struct amdgpu_df_funcs df_v4_6_2_funcs = {
+   .query_ras_poison_mode = df_v4_6_2_query_ras_poison_mode,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h 
b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h
new file mode 100644
index 000000000000..3bc3e6d216e2
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/df_v4_6_2.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __DF_V4_6_2_H__
+#define __DF_V4_6_2_H__
+
+#include "soc15_common.h"
+
+extern const struct amdgpu_df_funcs df_v4_6_2_funcs;
+
+#endif
-- 
2.35.1



RE: [PATCH 1/3] drm/amdgpu: ungate power gating when system suspend

2023-10-24 Thread Feng, Kenneth
[AMD Official Use Only - General]

Reviewed-by: Kenneth Feng 


-Original Message-
From: Yuan, Perry 
Sent: Tuesday, October 24, 2023 10:33 AM
To: Zhang, Yifan ; Feng, Kenneth ; 
Limonciello, Mario 
Cc: Deucher, Alexander ; Wang, Yang(Kevin) 
; amd-gfx@lists.freedesktop.org
Subject: [PATCH 1/3] drm/amdgpu: ungate power gating when system suspend

[Why] During suspend, if GFX DPM is enabled and GFXOFF feature is enabled the 
system may get hung. So, it is suggested to disable GFXOFF feature during 
suspend and enable it after resume.

[How] Update the code to disable GFXOFF feature during suspend and enable it 
after resume.

[  311.396526] amdgpu :03:00.0: amdgpu: SMU: I'm not done with your 
previous command: SMN_C2PMSG_66:0x001E SMN_C2PMSG_82:0x [  
311.396530] amdgpu :03:00.0: amdgpu: Fail to disable dpm features!
[  311.396531] [drm:amdgpu_device_ip_suspend_phase2 [amdgpu]] *ERROR* suspend 
of IP block  failed -62

Signed-off-by: Perry Yuan 
Signed-off-by: Kun Liu 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index d9ccacd06fba..6399bc71c56d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3498,6 +3498,8 @@ static void gfx_v10_0_ring_invalidate_tlbs(struct 
amdgpu_ring *ring,  static void gfx_v10_0_update_spm_vmid_internal(struct 
amdgpu_device *adev,
   unsigned int vmid);

+static int gfx_v10_0_set_powergating_state(void *handle,
+ enum amd_powergating_state state);
 static void gfx10_kiq_set_resources(struct amdgpu_ring *kiq_ring, uint64_t 
queue_mask)  {
amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_RESOURCES, 6)); @@ 
-7172,6 +7174,13 @@ static int gfx_v10_0_hw_fini(void *handle)
amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);

+   /* WA added for Vangogh asic fixing the SMU suspend failure
+* It needs to set power gating again during gfxoff control
+* otherwise the gfxoff disallowing will be failed to set.
+*/
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(10, 3, 1))
+   gfx_v10_0_set_powergating_state(handle, AMD_PG_STATE_UNGATE);
+
if (!adev->no_hw_access) {
if (amdgpu_async_gfx_ring) {
if (amdgpu_gfx_disable_kgq(adev, 0))
--
2.34.1



Re: [PATCH 2/2] drm/amdgpu: Add timeout for sync wait

2023-10-24 Thread Christian König

Am 20.10.23 um 11:59 schrieb Emily Deng:

Issue: Deadlock happens during gpu recover, the call sequence as below:

amdgpu_device_gpu_recover->amdgpu_amdkfd_pre_reset->flush_delayed_work->
amdgpu_amdkfd_gpuvm_restore_process_bos->amdgpu_sync_wait


Resolving a deadlock with a timeout is illegal in general. So this patch 
here is an obvious no-go.


In addition to this problem, Xinhu already investigated that the delayed 
work is causing issues during suspend because flushing doesn't 
guarantee that a new one isn't started right after doing that.


After talking with Felix about this the correct solution is to stop 
flushing the delayed work and instead submitting it to the freezable 
work queue.


Regards,
Christian.



It is because the amdgpu_sync_wait is waiting for the bad job's fence, and
never return, so the recover couldn't continue.

Signed-off-by: Emily Deng 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 11 +--
  1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index dcd8c066bc1f..9d4f122a7bf0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -406,8 +406,15 @@ int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr)
int i, r;
  
  	hash_for_each_safe(sync->fences, i, tmp, e, node) {

-   r = dma_fence_wait(e->fence, intr);
-   if (r)
+   struct drm_sched_fence *s_fence = to_drm_sched_fence(e->fence);
+   long timeout = msecs_to_jiffies(1);
+
+   if (s_fence)
+   timeout = s_fence->sched->timeout;
+   r = dma_fence_wait_timeout(e->fence, intr, timeout);
+   if (r == 0)
+   r = -ETIMEDOUT;
+   if (r < 0)
return r;
  
  		amdgpu_sync_entry_free(e);




Re: [PATCH] drm/amdgpu: Initialize schedulers before using them

2023-10-24 Thread Christian König

Am 24.10.23 um 04:55 schrieb Luben Tuikov:

On 2023-10-23 01:49, Christian König wrote:


Am 23.10.23 um 05:23 schrieb Luben Tuikov:

Initialize ring schedulers before using them, very early in the amdgpu boot,
at PCI probe time, specifically at frame-buffer dumb-create at fill-buffer.

This was discovered by using dynamic scheduler run-queues, which showed that
amdgpu was using a scheduler before calling drm_sched_init(), and the only
reason it was working was because sched_rq[] was statically allocated in the
scheduler structure. However, the scheduler structure had _not_ been
initialized.

When switching to dynamically allocated run-queues, this lack of
initialization was causing an oops and a blank screen at boot up. This patch
fixes this amdgpu bug.

This patch depends on the "drm/sched: Convert the GPU scheduler to variable
number of run-queues" patch, as that patch prevents subsequent scheduler
initialization if a scheduler has already been initialized.

Cc: Christian König 
Cc: Alex Deucher 
Cc: Felix Kuehling 
Cc: AMD Graphics 
Signed-off-by: Luben Tuikov 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 14 ++
   1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 4e51dce3aab5d6..575ef7e1e30fd4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -60,6 +60,7 @@
   #include "amdgpu_atomfirmware.h"
   #include "amdgpu_res_cursor.h"
   #include "bif/bif_4_1_d.h"
+#include "amdgpu_reset.h"
   
   MODULE_IMPORT_NS(DMA_BUF);
   
@@ -2059,6 +2060,19 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
   
   		ring = adev->mman.buffer_funcs_ring;

sched = &ring->sched;
+
+   r = drm_sched_init(sched, &amdgpu_sched_ops,
+  DRM_SCHED_PRIORITY_COUNT,
+  ring->num_hw_submission, 0,
+  adev->sdma_timeout, adev->reset_domain->wq,
+  ring->sched_score, ring->name,
+  adev->dev);
+   if (r) {
+   drm_err(adev, "%s: couldn't initialize ring:%s 
error:%d\n",
+   __func__, ring->name, r);
+   return;
+   }

That doesn't look correct either.

amdgpu_ttm_set_buffer_funcs_status() should only be called with
enable=true as argument *after* the copy ring is initialized and valid
to use. One part of this ring initialization is to setup the scheduler.

It's the only way to keep the functionality of amdgpu_fill_buffer()
from amdgpu_mode_dumb_create(), from drm_client_framebuffer_create(),
from ... without an oops and a blank screen at boot up.

Here is a stack of the oops:

Oct 20 22:12:34 fedora kernel: RIP: 0010:drm_sched_job_arm+0x1f/0x60 [gpu_sched]
Oct 20 22:12:34 fedora kernel: Code: 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 
55 53 48 8b 6f 58 48 85 ed 74 3f 48 89 fb 48 89 ef e8 95 34 00 00 48 8b 45 10 
<48> 8b 50 08 48 89 53 18 8b 45 24 89 43 54 b8 01 00 00 00 f0 48 0f
Oct 20 22:12:34 fedora kernel: RSP: 0018:c90001613838 EFLAGS: 00010246
Oct 20 22:12:34 fedora kernel: RAX:  RBX: 88812f33b400 RCX: 
0004
Oct 20 22:12:34 fedora kernel: RDX:  RSI: c9000395145c RDI: 
88812eacf850
Oct 20 22:12:34 fedora kernel: RBP: 88812eacf850 R08: 0004 R09: 
0003
Oct 20 22:12:34 fedora kernel: R10: c066b850 R11: bc848ef1 R12: 

Oct 20 22:12:34 fedora kernel: R13: 0004 R14: 00800300 R15: 
0100
Oct 20 22:12:34 fedora kernel: FS:  7f7be4866940() 
GS:0ed0() knlGS:
Oct 20 22:12:34 fedora kernel: CS:  0010 DS:  ES:  CR0: 80050033
Oct 20 22:12:34 fedora kernel: CR2: 0008 CR3: 00012cf22000 CR4: 
003506e0
Oct 20 22:12:34 fedora kernel: Call Trace:
Oct 20 22:12:34 fedora kernel:  
Oct 20 22:12:34 fedora kernel:  ? __die+0x1f/0x70
Oct 20 22:12:34 fedora kernel:  ? page_fault_oops+0x149/0x440
Oct 20 22:12:34 fedora kernel:  ? drm_sched_fence_alloc+0x1a/0x40 [gpu_sched]
Oct 20 22:12:34 fedora kernel:  ? amdgpu_job_alloc_with_ib+0x34/0xb0 [amdgpu]
Oct 20 22:12:34 fedora kernel:  ? srso_return_thunk+0x5/0x10
Oct 20 22:12:34 fedora kernel:  ? do_user_addr_fault+0x65/0x650
Oct 20 22:12:34 fedora kernel:  ? drm_client_framebuffer_create+0xa3/0x280 [drm]
Oct 20 22:12:34 fedora kernel:  ? exc_page_fault+0x7b/0x180
Oct 20 22:12:34 fedora kernel:  ? asm_exc_page_fault+0x22/0x30
Oct 20 22:12:34 fedora kernel:  ? local_pci_probe+0x41/0x90
Oct 20 22:12:34 fedora kernel:  ? __pfx_sdma_v5_0_emit_fill_buffer+0x10/0x10 
[amdgpu]
Oct 20 22:12:34 fedora kernel:  ? drm_sched_job_arm+0x1f/0x60 [gpu_sched]
Oct 20 22:12:34 fedora kernel:  ? drm_sched_job_arm+0x1b/0x60 [gpu_sched]
Oct 20 22:12:34