[PATCH] drm/amdgpu: skip call ras_late_init if ras feature is not enabled

2024-01-17 Thread Yang Wang
skip call ras_late_init callback if ras feature is not enabled.

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5c817c155d72..5c73d0871220 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3312,6 +3312,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
}
 
obj = node->ras_obj;
+   if (!amdgpu_ras_is_feature_enabled(adev, >ras_comm))
+   continue;
+
if (obj->ras_late_init) {
r = obj->ras_late_init(adev, >ras_comm);
if (r) {
-- 
2.34.1



Re: [PATCH v2] drm/amd/pm: enable amdgpu smu send message log

2024-01-17 Thread Lazar, Lijo

On 1/18/2024 11:07 AM, Yang Wang wrote:

From: Yang Wang 

v1:
enable amdgpu smu driver message log.

v2:
add smu/pmfw response value into debug log.

Signed-off-by: Yang Wang 


Reviewed-by: Lijo Lazar 

Thanks,
Lijo


---
  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 9 -
  1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 00cd615bbcdc..b8dbd4e25348 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -378,8 +378,15 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
*smu,
res = __smu_cmn_reg2errno(smu, reg);
if (res != 0)
__smu_cmn_reg_print_error(smu, reg, index, param, msg);
-   if (read_arg)
+   if (read_arg) {
smu_cmn_read_arg(smu, read_arg);
+   dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, 
resp: 0x%08x,\
+   readval: 0x%08x\n",
+   smu_get_message_name(smu, msg), index, param, reg, 
*read_arg);
+   } else {
+   dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, resp: 
0x%08x\n",
+   smu_get_message_name(smu, msg), index, param, reg);
+   }
  Out:
if (unlikely(adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && res) 
{
amdgpu_device_halt(adev);




[PATCH 2/2] update check condition of query for ras page retire

2024-01-17 Thread Tao Zhou
Support page retirement handling in debug mode.

v2: revert smu_v13_0_6_get_ecc_info directly.

Signed-off-by: Tao Zhou 
Change-Id: I0aaa807d7fe87b3da0f023c380e57ab6dd446fcf
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9d1cf41cf483..d8d263956e85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -93,11 +93,14 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
+   unsigned int error_query_mode;
unsigned long err_count;
 
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   amdgpu_ras_get_error_query_mode(adev, _query_mode);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
-   if (ret == -EOPNOTSUPP) {
+   if (ret == -EOPNOTSUPP &&
+   error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
adev->umc.ras->ras_block.hw_ops->query_ras_error_count)

adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
@@ -121,7 +124,8 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
 */

adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, 
ras_error_status);
}
-   } else if (!ret) {
+   } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
+   (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
if (adev->umc.ras &&
adev->umc.ras->ecc_info_query_ras_error_count)
adev->umc.ras->ecc_info_query_ras_error_count(adev, 
ras_error_status);
-- 
2.34.1



[PATCH 1/2] Revert "drm/amd/pm: smu v13_0_6 supports ecc info by default"

2024-01-17 Thread Tao Zhou
This reverts commit affdce050ab4119a3cdf74d7faa8f1eb30f6f6aa.
We use debug mode flag instead of this interface.

Signed-off-by: Tao Zhou 
Change-Id: I49eae821ce352d542143d68c05802634b4bf469d
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..6d8fdf8c538c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -3034,13 +3034,6 @@ static int smu_v13_0_6_select_xgmi_plpd_policy(struct 
smu_context *smu,
return ret;
 }
 
-static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context *smu,
-   void *table)
-{
-   /* Support ecc info by default */
-   return 0;
-}
-
 static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
/* init dpm */
.get_allowed_feature_mask = smu_v13_0_6_get_allowed_feature_mask,
@@ -3095,7 +3088,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = 
{
.i2c_init = smu_v13_0_6_i2c_control_init,
.i2c_fini = smu_v13_0_6_i2c_control_fini,
.send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,
-   .get_ecc_info = smu_v13_0_6_get_ecc_info,
 };
 
 void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)
-- 
2.34.1



[PATCH v2] drm/amdgpu/pm: Fix the power source flag error

2024-01-17 Thread Ma Jun
The power source flag should be updated when
[1] System receives an interrupt indicating that the power source
has changed.
[2] System resumes from suspend or runtime suspend

Signed-off-by: Ma Jun 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c  | 13 +++--
 drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c |  2 ++
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c |  2 ++
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index c16703868e5c..a54663f2e2ab 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -24,6 +24,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -817,16 +818,8 @@ static int smu_late_init(void *handle)
 * handle the switch automatically. Driver involvement
 * is unnecessary.
 */
-   if (!smu->dc_controlled_by_gpio) {
-   ret = smu_set_power_source(smu,
-  adev->pm.ac_power ? 
SMU_POWER_SOURCE_AC :
-  SMU_POWER_SOURCE_DC);
-   if (ret) {
-   dev_err(adev->dev, "Failed to switch to %s mode!\n",
-   adev->pm.ac_power ? "AC" : "DC");
-   return ret;
-   }
-   }
+   adev->pm.ac_power = power_supply_is_system_supplied() > 0;
+   smu_set_ac_dc(smu);
 
if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 1)) ||
(amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 3)))
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index 2e7f8d5cfc28..8047150fddd4 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -1442,10 +1442,12 @@ static int smu_v11_0_irq_process(struct amdgpu_device 
*adev,
case 0x3:
dev_dbg(adev->dev, "Switched to AC mode!\n");
schedule_work(>interrupt_work);
+   adev->pm.ac_power = true;
break;
case 0x4:
dev_dbg(adev->dev, "Switched to DC mode!\n");
schedule_work(>interrupt_work);
+   adev->pm.ac_power = false;
break;
case 0x7:
/*
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 771a3d457c33..c486182ff275 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -1379,10 +1379,12 @@ static int smu_v13_0_irq_process(struct amdgpu_device 
*adev,
case 0x3:
dev_dbg(adev->dev, "Switched to AC mode!\n");
smu_v13_0_ack_ac_dc_interrupt(smu);
+   adev->pm.ac_power = true;
break;
case 0x4:
dev_dbg(adev->dev, "Switched to DC mode!\n");
smu_v13_0_ack_ac_dc_interrupt(smu);
+   adev->pm.ac_power = false;
break;
case 0x7:
/*
-- 
2.34.1



Re: [PATCH] drm/amdgpu/pm: Fix the power source flag error

2024-01-17 Thread Ma, Jun



On 1/18/2024 1:05 PM, Lazar, Lijo wrote:
> On 1/18/2024 7:54 AM, Ma, Jun wrote:
>> Hi Lijo,
>>
>> On 1/17/2024 5:41 PM, Lazar, Lijo wrote:
>>> On 1/17/2024 2:22 PM, Ma Jun wrote:
 The power source flag should be updated when
 [1] System receives an interrupt indicating that the power source
 has changed.
 [2] System resumes from suspend or runtime suspend

 Signed-off-by: Ma Jun 
 ---
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 24 +++
.../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  2 ++
.../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  2 ++
3 files changed, 18 insertions(+), 10 deletions(-)

 diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
 b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
 index c16703868e5c..e16d22e30a8a 100644
 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
 +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
 @@ -24,6 +24,7 @@

#include 
#include 
 +#include 
#include 

#include "amdgpu.h"
 @@ -793,6 +794,17 @@ static int 
 smu_apply_default_config_table_settings(struct smu_context *smu)
return smu_set_config_table(smu, >pm.config_table);
}

 +static void smu_update_power_source(struct amdgpu_device *adev)
 +{
 +  if (power_supply_is_system_supplied() > 0)
 +  adev->pm.ac_power = true;
 +  else
 +  adev->pm.ac_power = false;
 +
 +  if (is_support_sw_smu(adev))
 +  smu_set_ac_dc(adev->powerplay.pp_handle);
 +}
 +
static int smu_late_init(void *handle)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 @@ -817,16 +829,8 @@ static int smu_late_init(void *handle)
 * handle the switch automatically. Driver involvement
 * is unnecessary.
 */
 -  if (!smu->dc_controlled_by_gpio) {
 -  ret = smu_set_power_source(smu,
 - adev->pm.ac_power ? 
 SMU_POWER_SOURCE_AC :
 - SMU_POWER_SOURCE_DC);
 -  if (ret) {
 -  dev_err(adev->dev, "Failed to switch to %s mode!\n",
 -  adev->pm.ac_power ? "AC" : "DC");
 -  return ret;
 -  }
 -  }
>>>
>>> For this part of the change - driver already updates FW with the initial
>>> detected power state or the last detected power state before going for
>>> suspend. Isn't that good enough?
>>>
>>
>> The power source may change during system suspend, so it's necessary to 
>> detect the
>> power source again before system reads the power related data, such as 
>> max_power_limit.
>>
> 
> Ok. For dGPUs, then refresh of power state after resume will be required 
> in GPIO controlled case also. Since FW is reloaded, FW may not detect it 
> as a power source change.
> 
> Rather than creating another wrapper function, it is simpler to do 
> something like
>   adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>   ret = smu_set_ac_dc(smu);

Ok, will fix this in v2.

Regards,
Ma Jun
> 
> Thanks,
> Lijo
> 
>> Regards,
>> Ma Jun
>>
>>
>>> Thanks,
>>> Lijo
>>>
 +  if (!smu->dc_controlled_by_gpio)
 +  smu_update_power_source(adev);

if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 
 1)) ||
(amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 
 3)))
 diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
 b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
 index 2e7f8d5cfc28..8047150fddd4 100644
 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
 +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
 @@ -1442,10 +1442,12 @@ static int smu_v11_0_irq_process(struct 
 amdgpu_device *adev,
case 0x3:
dev_dbg(adev->dev, "Switched to AC 
 mode!\n");
schedule_work(>interrupt_work);
 +  adev->pm.ac_power = true;
break;
case 0x4:
dev_dbg(adev->dev, "Switched to DC 
 mode!\n");
schedule_work(>interrupt_work);
 +  adev->pm.ac_power = false;
break;
case 0x7:
/*
 diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
 b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
 index 771a3d457c33..c486182ff275 100644
 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
 +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
 @@ -1379,10 +1379,12 @@ static int 

RE: [PATCH v2] drm/amd/pm: enable amdgpu smu send message log

2024-01-17 Thread Feng, Kenneth
[AMD Official Use Only - General]

Reviewed-by: Kenneth Feng 


-Original Message-
From: amd-gfx  On Behalf Of Yang Wang
Sent: Thursday, January 18, 2024 1:37 PM
To: amd-gfx@lists.freedesktop.org
Cc: Lazar, Lijo ; Feng, Kenneth ; 
Wang, Yang(Kevin) 
Subject: [PATCH v2] drm/amd/pm: enable amdgpu smu send message log

Caution: This message originated from an External Source. Use proper caution 
when opening attachments, clicking links, or responding.


From: Yang Wang 

v1:
enable amdgpu smu driver message log.

v2:
add smu/pmfw response value into debug log.

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 00cd615bbcdc..b8dbd4e25348 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -378,8 +378,15 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
*smu,
res = __smu_cmn_reg2errno(smu, reg);
if (res != 0)
__smu_cmn_reg_print_error(smu, reg, index, param, msg);
-   if (read_arg)
+   if (read_arg) {
smu_cmn_read_arg(smu, read_arg);
+   dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, 
resp: 0x%08x,\
+   readval: 0x%08x\n",
+   smu_get_message_name(smu, msg), index, param, reg, 
*read_arg);
+   } else {
+   dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, 
resp: 0x%08x\n",
+   smu_get_message_name(smu, msg), index, param, reg);
+   }
 Out:
if (unlikely(adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && res) 
{
amdgpu_device_halt(adev);
--
2.34.1



[PATCH V2 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread YiPeng Chai
Use asynchronous polling to handle umc_v12_0 poisoning.

v2:
  1. Change function name.
  2. Change the debugging information content.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 139 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
 3 files changed, 116 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 856206e95842..61a02dbac087 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)
 
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
+
 enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_read(>page_retirement_req_cnt));
 
atomic_dec(>page_retirement_req_cnt);
+
+   amdgpu_umc_bad_page_polling_timeout(adev,
+   false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9d1cf41cf483..83983f0e8eb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -23,6 +23,7 @@
 
 #include "amdgpu.h"
 #include "umc_v6_7.h"
+#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms
 
 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t 
err_addr,
@@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device 
*adev,
return ret;
 }
 
-static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
-   void *ras_error_status,
-   struct amdgpu_iv_entry *entry,
-   bool reset)
+static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+   void *ras_error_status)
 {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
unsigned long err_count;
-
-   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   mutex_lock(>page_retirement_lock);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
@@ -163,19 +161,85 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
con->update_channel_flag = false;
}
}
-
-   if (reset) {
-   /* use mode-2 reset for poison consumption */
-   if (!entry)
-   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
-   amdgpu_ras_reset_gpu(adev);
-   }
}
 
kfree(err_data->err_addr);
+
+   mutex_unlock(>page_retirement_lock);
+}
+
+static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+   void *ras_error_status,
+   struct amdgpu_iv_entry *entry,
+   bool reset)
+{
+   struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   amdgpu_umc_handle_bad_pages(adev, ras_error_status);
+
+   if (err_data->ue_count && reset) {
+   /* use mode-2 reset for poison consumption */
+   if (!entry)
+   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   amdgpu_ras_reset_gpu(adev);
+   }
+
return AMDGPU_RAS_SUCCESS;
 }
 
+int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
+   bool reset, uint32_t timeout_ms)
+{
+   struct ras_err_data err_data;
+   struct ras_common_if head = {
+   .block = AMDGPU_RAS_BLOCK__UMC,
+   };
+   struct ras_manager *obj = amdgpu_ras_find_obj(adev, );
+   uint32_t timeout = timeout_ms;
+
+   memset(_data, 0, sizeof(err_data));
+   amdgpu_ras_error_data_init(_data);
+
+   do {
+
+   amdgpu_umc_handle_bad_pages(adev, _data);
+
+   if (timeout && !err_data.de_count) {
+   msleep(1);
+   timeout--;
+   }
+
+   } while (timeout && !err_data.de_count);
+
+   if (!timeout)
+   dev_warn(adev->dev, "Can't find bad pages\n");
+
+   if (err_data.de_count)
+   

[PATCH V2 4/5] drm/amdgpu: add interface to check mca umc status

2024-01-17 Thread YiPeng Chai
Add interface to check mca umc status.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c   | 12 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  4 +++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c| 20 +++
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  |  6 +++---
 5 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index fde20857b3dd..65ed8bb5c120 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -27,6 +27,16 @@
 #include "umc/umc_6_7_0_offset.h"
 #include "umc/umc_6_7_0_sh_mask.h"
 
+static bool amdgpu_mca_is_deferred_error(struct amdgpu_device *adev,
+   uint64_t mc_status)
+{
+   if (adev->umc.ras->check_ecc_err_status)
+   return adev->umc.ras->check_ecc_err_status(adev,
+   AMDGPU_MCA_ERROR_TYPE_DE, _status);
+
+   return false;
+}
+
 void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
  uint64_t mc_status_addr,
  unsigned long *error_count)
@@ -257,7 +267,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device 
*adev, enum amdgpu_ras_blo
amdgpu_ras_error_statistic_ue_count(err_data,
_info, _addr, (uint64_t)count);
else {
-   if 
(!!(MCA_REG__STATUS__DEFERRED(entry->regs[MCA_REG_IDX_STATUS])))
+   if (amdgpu_mca_is_deferred_error(adev, 
entry->regs[MCA_REG_IDX_STATUS]))
amdgpu_ras_error_statistic_de_count(err_data,
_info, _addr, (uint64_t)count);
else
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index b399f1b62887..b964110ed1e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -65,6 +65,7 @@ enum amdgpu_mca_ip {
 enum amdgpu_mca_error_type {
AMDGPU_MCA_ERROR_TYPE_UE = 0,
AMDGPU_MCA_ERROR_TYPE_CE,
+   AMDGPU_MCA_ERROR_TYPE_DE,
 };
 
 struct amdgpu_mca_ras_block {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index de2dc1853636..83199296ed10 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -21,7 +21,7 @@
 #ifndef __AMDGPU_UMC_H__
 #define __AMDGPU_UMC_H__
 #include "amdgpu_ras.h"
-
+#include "amdgpu_mca.h"
 /*
  * (addr / 256) * 4096, the higher 26 bits in ErrorAddr
  * is the index of 4KB block
@@ -64,6 +64,8 @@ struct amdgpu_umc_ras {
  void *ras_error_status);
void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev,
void *ras_error_status);
+   bool (*check_ecc_err_status)(struct amdgpu_device *adev,
+   enum amdgpu_mca_error_type type, void 
*ras_error_status);
/* support different eeprom table version for different asic */
void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header 
*hdr);
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index fa2168f1d3bf..1e8e97d72f1e 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -425,6 +425,25 @@ static void 
umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
}
 }
 
+static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
+   enum amdgpu_mca_error_type type, void *ras_error_status)
+{
+   uint64_t mc_umc_status = *(uint64_t *)ras_error_status;
+
+   switch (type) {
+   case AMDGPU_MCA_ERROR_TYPE_UE:
+   return umc_v12_0_is_uncorrectable_error(adev, mc_umc_status);
+   case AMDGPU_MCA_ERROR_TYPE_CE:
+   return umc_v12_0_is_correctable_error(adev, mc_umc_status);
+   case AMDGPU_MCA_ERROR_TYPE_DE:
+   return umc_v12_0_is_deferred_error(adev, mc_umc_status);
+   default:
+   return false;
+   }
+
+   return false;
+}
+
 static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
 {
amdgpu_umc_loop_channels(adev,
@@ -510,5 +529,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
.ecc_info_query_ras_error_count = 
umc_v12_0_ecc_info_query_ras_error_count,
.ecc_info_query_ras_error_address = 
umc_v12_0_ecc_info_query_ras_error_address,
+   .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
 };
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c

[PATCH V2 5/5] drm/amdgpu:Support retiring multiple MCA error address pages

2024-01-17 Thread YiPeng Chai
Support retiring multiple MCA error address pages in
one in-band query for umc v12_0.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  8 ++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 66 +
 3 files changed, 77 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 61a02dbac087..879e1e59ac76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3909,8 +3909,7 @@ static int ras_err_info_cmp(void *priv, struct list_head 
*a, struct list_head *b
 }
 
 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data 
*err_data,
-   struct amdgpu_smuio_mcm_config_info *mcm_info,
-   struct ras_err_addr *err_addr)
+   struct amdgpu_smuio_mcm_config_info *mcm_info)
 {
struct ras_err_node *err_node;
 
@@ -3922,10 +3921,9 @@ static struct ras_err_info 
*amdgpu_ras_error_get_info(struct ras_err_data *err_d
if (!err_node)
return NULL;
 
-   memcpy(_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
+   INIT_LIST_HEAD(_node->err_info.err_addr_list);
 
-   if (err_addr)
-   memcpy(_node->err_info.err_addr, err_addr, 
sizeof(*err_addr));
+   memcpy(_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
 
err_data->err_list_count++;
list_add_tail(_node->node, _data->err_node_list);
@@ -3934,6 +3932,29 @@ static struct ras_err_info 
*amdgpu_ras_error_get_info(struct ras_err_data *err_d
return _node->err_info;
 }
 
+void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct 
ras_err_addr *err_addr)
+{
+   struct ras_err_addr *mca_err_addr;
+
+   mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
+   if (!mca_err_addr)
+   return;
+
+   INIT_LIST_HEAD(_err_addr->node);
+
+   mca_err_addr->err_status = err_addr->err_status;
+   mca_err_addr->err_ipid = err_addr->err_ipid;
+   mca_err_addr->err_addr = err_addr->err_addr;
+
+   list_add_tail(_err_addr->node, _info->err_addr_list);
+}
+
+void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct 
ras_err_addr *mca_err_addr)
+{
+   list_del(_err_addr->node);
+   kfree(mca_err_addr);
+}
+
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr, u64 count)
@@ -3946,10 +3967,13 @@ int amdgpu_ras_error_statistic_ue_count(struct 
ras_err_data *err_data,
if (!count)
return 0;
 
-   err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+   err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
if (!err_info)
return -EINVAL;
 
+   if (err_addr && err_addr->err_status)
+   amdgpu_ras_add_mca_err_addr(err_info, err_addr);
+
err_info->ue_count += count;
err_data->ue_count += count;
 
@@ -3968,7 +3992,7 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
if (!count)
return 0;
 
-   err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+   err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
if (!err_info)
return -EINVAL;
 
@@ -3990,10 +4014,13 @@ int amdgpu_ras_error_statistic_de_count(struct 
ras_err_data *err_data,
if (!count)
return 0;
 
-   err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+   err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
if (!err_info)
return -EINVAL;
 
+   if (err_addr && err_addr->err_status)
+   amdgpu_ras_add_mca_err_addr(err_info, err_addr);
+
err_info->de_count += count;
err_data->de_count += count;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 9c3df9985fad..a25aea6ae230 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -474,6 +474,7 @@ struct ras_fs_data {
 };
 
 struct ras_err_addr {
+   struct list_head node;
uint64_t err_status;
uint64_t err_ipid;
uint64_t err_addr;
@@ -484,7 +485,7 @@ struct ras_err_info {
u64 ce_count;
u64 ue_count;
u64 de_count;
-   struct ras_err_addr err_addr;
+   struct list_head err_addr_list;
 };
 
 struct ras_err_node {
@@ -856,4 +857,9 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum 
amdgpu_ras_block blk)
 ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute 
*attr,
  struct aca_handle *handle, char *buf, void 
*data);
 
+void 

[PATCH V2 2/5] drm/amdgpu: Prepare for asynchronous processing of umc page retirement

2024-01-17 Thread YiPeng Chai
Preparing for asynchronous processing of umc page retirement.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 
 2 files changed, 39 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a988360ce3e2..856206e95842 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2656,6 +2656,25 @@ static void amdgpu_ras_validate_threshold(struct 
amdgpu_device *adev,
}
 }
 
+static int amdgpu_ras_page_retirement_thread(void *param)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)param;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   while (!kthread_should_stop()) {
+
+   wait_event_interruptible(con->page_retirement_wq,
+   atomic_read(>page_retirement_req_cnt));
+
+   dev_info(adev->dev, "Start processing page retirement. 
request:%d\n",
+   atomic_read(>page_retirement_req_cnt));
+
+   atomic_dec(>page_retirement_req_cnt);
+   }
+
+   return 0;
+}
+
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -2719,6 +2738,16 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
}
}
 
+   mutex_init(>page_retirement_lock);
+   init_waitqueue_head(>page_retirement_wq);
+   atomic_set(>page_retirement_req_cnt, 0);
+   con->page_retirement_thread =
+   kthread_run(amdgpu_ras_page_retirement_thread, adev, 
"umc_page_retirement");
+   if (IS_ERR(con->page_retirement_thread)) {
+   con->page_retirement_thread = NULL;
+   dev_warn(adev->dev, "Failed to create umc_page_retirement 
thread!!!\n");
+   }
+
 #ifdef CONFIG_X86_MCE_AMD
 #ifdef HAVE_SMCA_UMC_V2
if ((adev->asic_type == CHIP_ALDEBARAN) &&
@@ -2757,6 +2786,11 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device 
*adev)
if (!data)
return 0;
 
+   if (con->page_retirement_thread)
+   kthread_stop(con->page_retirement_thread);
+
+   atomic_set(>page_retirement_req_cnt, 0);
+
cancel_work_sync(>recovery_work);
 
mutex_lock(>recovery_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 2cd89328dc73..9c3df9985fad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -461,6 +461,11 @@ struct amdgpu_ras {
 
/* Record special requirements of gpu reset caller */
uint32_t  gpu_reset_flags;
+
+   struct task_struct *page_retirement_thread;
+   wait_queue_head_t page_retirement_wq;
+   struct mutex page_retirement_lock;
+   atomic_t page_retirement_req_cnt;
 };
 
 struct ras_fs_data {
-- 
2.34.1



[PATCH V2 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread YiPeng Chai
Add log info for umc_v12_0 and smu_v13_0_6.

v2:
 Delete redundant logs.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)
 
 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)
 {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;
 
-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }
 
user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) {
@@ -1334,6 +1336,8 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
}
}
 
+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();
 
/* user application will handle SIGBUS signal */
-- 
2.34.1



[PATCH v2] drm/amd/pm: enable amdgpu smu send message log

2024-01-17 Thread Yang Wang
From: Yang Wang 

v1:
enable amdgpu smu driver message log.

v2:
add smu/pmfw response value into debug log.

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 00cd615bbcdc..b8dbd4e25348 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -378,8 +378,15 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
*smu,
res = __smu_cmn_reg2errno(smu, reg);
if (res != 0)
__smu_cmn_reg_print_error(smu, reg, index, param, msg);
-   if (read_arg)
+   if (read_arg) {
smu_cmn_read_arg(smu, read_arg);
+   dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, 
resp: 0x%08x,\
+   readval: 0x%08x\n",
+   smu_get_message_name(smu, msg), index, param, reg, 
*read_arg);
+   } else {
+   dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, 
resp: 0x%08x\n",
+   smu_get_message_name(smu, msg), index, param, reg);
+   }
 Out:
if (unlikely(adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && res) 
{
amdgpu_device_halt(adev);
-- 
2.34.1



RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]

OK


-
Best Regards,
Thomas

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Thursday, January 18, 2024 11:00 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Yang, Stanley 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 4:21 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct 
smu_context *smu, enum amdgpu_

ret = smu_cmn_send_smc_msg(smu, msg, count);
if (ret) {
+   dev_err(smu->adev->dev, "%s(%d) failed to query %s MCA count, 
ret:%d\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   ret);
*count = 0;
return ret;
}

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);


[Kevin]:
Please make following function public then use this helper function to get msg 
name string.
- smu_get_message_name()

Best Regards,
Kevin
+
return 0;
 }

--
2.34.1




RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]


-
Best Regards,
Thomas


_
From: Zhou1, Tao 
Sent: Thursday, January 18, 2024 11:24 AM
To: Chai, Thomas ; Zhang, Hawking ; 
amd-gfx@lists.freedesktop.org
Cc: Li, Candice ; Wang, Yang(Kevin) 
; Yang, Stanley 
Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


[AMD Official Use Only - General]





  _
  From: Chai, Thomas mailto:yipeng.c...@amd.com>>
  Sent: Thursday, January 18, 2024 11:06 AM
  To: Zhang, Hawking mailto:hawking.zh...@amd.com>>; 
amd-gfx@lists.freedesktop.org
  Cc: Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>
  Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


  [AMD Official Use Only - General]






  -
  Best Regards,
  Thomas


  _
  From: Zhang, Hawking mailto:hawking.zh...@amd.com>>
  Sent: Wednesday, January 17, 2024 7:54 PM
  To: Chai, Thomas mailto:yipeng.c...@amd.com>>; 
amd-gfx@lists.freedesktop.org
  Cc: Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>
  Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


  [AMD Official Use Only - General]



  Please check my comments inline

  Regards,
  Hawking

  -Original Message-
  From: Chai, Thomas mailto:yipeng.c...@amd.com>>
  Sent: Tuesday, January 16, 2024 16:21
  To: amd-gfx@lists.freedesktop.org
  Cc: Chai, Thomas mailto:yipeng.c...@amd.com>>; 
Zhang, Hawking mailto:hawking.zh...@amd.com>>; Zhou1, 
Tao mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>; Chai, Thomas 
mailto:yipeng.c...@amd.com>>
  Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning

  Use asynchronous polling to handle umc_v12_0 poisoning.

  Signed-off-by: YiPeng Chai 
mailto:yipeng.c...@amd.com>>
  ---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++-
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
   3 files changed, 120 insertions(+), 31 deletions(-)

  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  index 856206e95842..44929281840e 100644
  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  @@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
   /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
   #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

  +#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
  +
   enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
  @@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void 
*param)
atomic_read(>page_retirement_req_cnt));

atomic_dec(>page_retirement_req_cnt);
  +
  + amdgpu_umc_poison_retire_page_polling_timeout(adev,
  + false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}

return 0;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  index 9d1cf41cf483..2dde29cb807d 100644
  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  @@ -23,6 +23,7 @@

   #include "amdgpu.h"
   #include "umc_v6_7.h"
  +#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms

   static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t 
err_addr, @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct 
amdgpu_device *adev,
return ret;
   }

  -static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
  - void *ras_error_status,
  - struct amdgpu_iv_entry *entry,
  - bool reset)
  +static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
  + void *ras_error_status)
   {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = 

RE: [PATCH] drm/amd/pm: enable amdgpu smu send message log

2024-01-17 Thread Wang, Yang(Kevin)
[AMD Official Use Only - General]

-Original Message-
From: Lazar, Lijo 
Sent: Thursday, January 18, 2024 1:13 PM
To: Wang, Yang(Kevin) ; amd-gfx@lists.freedesktop.org
Cc: Feng, Kenneth 
Subject: Re: [PATCH] drm/amd/pm: enable amdgpu smu send message log

On 1/18/2024 8:56 AM, Yang Wang wrote:
> From: Yang Wang 
>
> enable amdgpu smu driver message log.
>
> Signed-off-by: Yang Wang 
> ---
>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 +++-
>   1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index 00cd615bbcdc..b9a24ff02a12 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -378,8 +378,14 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
> *smu,
>   res = __smu_cmn_reg2errno(smu, reg);
>   if (res != 0)
>   __smu_cmn_reg_print_error(smu, reg, index, param, msg);
> - if (read_arg)
> + if (read_arg) {
>   smu_cmn_read_arg(smu, read_arg);
> + dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, 
> readval: 0x%08x\n",
> + smu_get_message_name(smu, msg), index, param, 
> *read_arg);
> + } else {
> + dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x\n",
> + smu_get_message_name(smu, msg), index, param);
> + }

Better to add the exact response reg value (reg =
__smu_cmn_poll_stat(smu)) also to this.

Thanks,
Lijo


[kevin]:
OK,  will be added in next version.

Best Regards,
Kevin
>   Out:
>   if (unlikely(adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && res) 
> {
>   amdgpu_device_halt(adev);



Re: [PATCH] drm/amd/pm: enable amdgpu smu send message log

2024-01-17 Thread Lazar, Lijo

On 1/18/2024 8:56 AM, Yang Wang wrote:

From: Yang Wang 

enable amdgpu smu driver message log.

Signed-off-by: Yang Wang 
---
  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 +++-
  1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 00cd615bbcdc..b9a24ff02a12 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -378,8 +378,14 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
*smu,
res = __smu_cmn_reg2errno(smu, reg);
if (res != 0)
__smu_cmn_reg_print_error(smu, reg, index, param, msg);
-   if (read_arg)
+   if (read_arg) {
smu_cmn_read_arg(smu, read_arg);
+   dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, readval: 
0x%08x\n",
+   smu_get_message_name(smu, msg), index, param, 
*read_arg);
+   } else {
+   dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x\n",
+   smu_get_message_name(smu, msg), index, param);
+   }


Better to add the exact response reg value (reg = 
__smu_cmn_poll_stat(smu)) also to this.


Thanks,
Lijo


  Out:
if (unlikely(adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && res) 
{
amdgpu_device_halt(adev);




Re: [PATCH] drm/amdgpu/pm: Fix the power source flag error

2024-01-17 Thread Lazar, Lijo

On 1/18/2024 7:54 AM, Ma, Jun wrote:

Hi Lijo,

On 1/17/2024 5:41 PM, Lazar, Lijo wrote:

On 1/17/2024 2:22 PM, Ma Jun wrote:

The power source flag should be updated when
[1] System receives an interrupt indicating that the power source
has changed.
[2] System resumes from suspend or runtime suspend

Signed-off-by: Ma Jun 
---
   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 24 +++
   .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  2 ++
   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  2 ++
   3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index c16703868e5c..e16d22e30a8a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -24,6 +24,7 @@
   
   #include 

   #include 
+#include 
   #include 
   
   #include "amdgpu.h"

@@ -793,6 +794,17 @@ static int smu_apply_default_config_table_settings(struct 
smu_context *smu)
return smu_set_config_table(smu, >pm.config_table);
   }
   
+static void smu_update_power_source(struct amdgpu_device *adev)

+{
+   if (power_supply_is_system_supplied() > 0)
+   adev->pm.ac_power = true;
+   else
+   adev->pm.ac_power = false;
+
+   if (is_support_sw_smu(adev))
+   smu_set_ac_dc(adev->powerplay.pp_handle);
+}
+
   static int smu_late_init(void *handle)
   {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -817,16 +829,8 @@ static int smu_late_init(void *handle)
 * handle the switch automatically. Driver involvement
 * is unnecessary.
 */
-   if (!smu->dc_controlled_by_gpio) {
-   ret = smu_set_power_source(smu,
-  adev->pm.ac_power ? 
SMU_POWER_SOURCE_AC :
-  SMU_POWER_SOURCE_DC);
-   if (ret) {
-   dev_err(adev->dev, "Failed to switch to %s mode!\n",
-   adev->pm.ac_power ? "AC" : "DC");
-   return ret;
-   }
-   }


For this part of the change - driver already updates FW with the initial
detected power state or the last detected power state before going for
suspend. Isn't that good enough?



The power source may change during system suspend, so it's necessary to detect 
the
power source again before system reads the power related data, such as 
max_power_limit.



Ok. For dGPUs, then refresh of power state after resume will be required 
in GPIO controlled case also. Since FW is reloaded, FW may not detect it 
as a power source change.


Rather than creating another wrapper function, it is simpler to do 
something like

adev->pm.ac_power = power_supply_is_system_supplied() > 0;
ret = smu_set_ac_dc(smu);

Thanks,
Lijo


Regards,
Ma Jun



Thanks,
Lijo


+   if (!smu->dc_controlled_by_gpio)
+   smu_update_power_source(adev);
   
   	if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 1)) ||

(amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 3)))
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index 2e7f8d5cfc28..8047150fddd4 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -1442,10 +1442,12 @@ static int smu_v11_0_irq_process(struct amdgpu_device 
*adev,
case 0x3:
dev_dbg(adev->dev, "Switched to AC mode!\n");
schedule_work(>interrupt_work);
+   adev->pm.ac_power = true;
break;
case 0x4:
dev_dbg(adev->dev, "Switched to DC mode!\n");
schedule_work(>interrupt_work);
+   adev->pm.ac_power = false;
break;
case 0x7:
/*
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 771a3d457c33..c486182ff275 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -1379,10 +1379,12 @@ static int smu_v13_0_irq_process(struct amdgpu_device 
*adev,
case 0x3:
dev_dbg(adev->dev, "Switched to AC mode!\n");
smu_v13_0_ack_ac_dc_interrupt(smu);
+   adev->pm.ac_power = true;
break;
case 0x4:
dev_dbg(adev->dev, "Switched to DC mode!\n");
smu_v13_0_ack_ac_dc_interrupt(smu);
+   adev->pm.ac_power = false;
   

[linux-next:master] BUILD REGRESSION 943b9f0ab2cfbaea148dd6ac279957eb08b96904

2024-01-17 Thread kernel test robot
tree/branch: 
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
branch HEAD: 943b9f0ab2cfbaea148dd6ac279957eb08b96904  Add linux-next specific 
files for 20240117

Error/Warning reports:

https://lore.kernel.org/oe-kbuild-all/202401172119.fc2ic3l9-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202401172139.xsfsnhrj-...@intel.com

Error/Warning: (recently discovered and may have been fixed)

drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm_crtc.c:100: warning: 
This comment starts with '/**', but isn't a kernel-doc comment. Refer 
Documentation/doc-guide/kernel-doc.rst
fs/bcachefs/btree_write_buffer.c:128:24: warning: unused variable 'c' 
[-Wunused-variable]

Error/Warning ids grouped by kconfigs:

gcc_recent_errors
|-- alpha-allyesconfig
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-dst_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-filled_descs_num-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-size-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-src_addr-not-described-in-xdma_fill_descs
|   `-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-sw_desc-not-described-in-xdma_fill_descs
|-- arc-allmodconfig
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-dst_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-filled_descs_num-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-size-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-src_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-sw_desc-not-described-in-xdma_fill_descs
|   `-- 
drivers-gpu-drm-msm-disp-dpu1-dpu_encoder.c:warning:Excess-struct-member-debugfs_root-description-in-dpu_encoder_virt
|-- arc-allyesconfig
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-dst_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-filled_descs_num-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-size-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-src_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-sw_desc-not-described-in-xdma_fill_descs
|   `-- 
drivers-gpu-drm-msm-disp-dpu1-dpu_encoder.c:warning:Excess-struct-member-debugfs_root-description-in-dpu_encoder_virt
|-- arm-allmodconfig
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-dst_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-filled_descs_num-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-size-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-src_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-sw_desc-not-described-in-xdma_fill_descs
|   `-- 
drivers-gpu-drm-msm-disp-dpu1-dpu_encoder.c:warning:Excess-struct-member-debugfs_root-description-in-dpu_encoder_virt
|-- arm-allyesconfig
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-dst_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-filled_descs_num-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-size-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-src_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-sw_desc-not-described-in-xdma_fill_descs
|   `-- 
drivers-gpu-drm-msm-disp-dpu1-dpu_encoder.c:warning:Excess-struct-member-debugfs_root-description-in-dpu_encoder_virt
|-- arm-randconfig-002-20240117
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-dst_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-filled_descs_num-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-size-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function-parameter-or-struct-member-src_addr-not-described-in-xdma_fill_descs
|   |-- 
drivers-dma-xilinx-xdma.c:warning:Function

[PATCH] drm/amd/pm: enable amdgpu smu send message log

2024-01-17 Thread Yang Wang
From: Yang Wang 

enable amdgpu smu driver message log.

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 00cd615bbcdc..b9a24ff02a12 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -378,8 +378,14 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
*smu,
res = __smu_cmn_reg2errno(smu, reg);
if (res != 0)
__smu_cmn_reg_print_error(smu, reg, index, param, msg);
-   if (read_arg)
+   if (read_arg) {
smu_cmn_read_arg(smu, read_arg);
+   dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, 
readval: 0x%08x\n",
+   smu_get_message_name(smu, msg), index, param, 
*read_arg);
+   } else {
+   dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x\n",
+   smu_get_message_name(smu, msg), index, param);
+   }
 Out:
if (unlikely(adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) && res) 
{
amdgpu_device_halt(adev);
-- 
2.34.1



[PATCH Review 1/1] drm/amdgpu: Fix ras features value calltrace

2024-01-17 Thread Stanley . Yang
The high three bits of ras features mask indicate socket
id, it should skip to check high three bits of ras features
mask before disable all ras features.

Signed-off-by: Stanley.Yang 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 ++
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f35a74bf5265..c91d7d89a1e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2987,7 +2987,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
/* Packed socket_id to ras feature mask bits[31:29] */
if (adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id)
-   con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << 
29);
+   con->features |= ((adev->smuio.funcs->get_socket_id(adev)) <<
+  AMDGPU_RAS_FEATURES_SOCKETID_SHIFT);
 
/* Get RAS schema for particular SOC */
con->schema = amdgpu_get_ras_schema(adev);
@@ -3193,7 +3194,7 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)
 
amdgpu_ras_disable_all_features(adev, 0);
/* Make sure all ras objects are disabled. */
-   if (con->features)
+   if (AMDGPU_RAS_GET_FEATURES(con->features))
amdgpu_ras_disable_all_features(adev, 1);
 }
 
@@ -3240,7 +3241,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 
 
/* Need disable ras on all IPs here before ip [hw/sw]fini */
-   if (con->features)
+   if (AMDGPU_RAS_GET_FEATURES(con->features))
amdgpu_ras_disable_all_features(adev, 0);
amdgpu_ras_recovery_fini(adev);
return 0;
@@ -3273,9 +3274,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
amdgpu_ras_fs_fini(adev);
amdgpu_ras_interrupt_remove_all(adev);
 
-   WARN(con->features, "Feature mask is not cleared");
+   WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not 
cleared");
 
-   if (con->features)
+   if (AMDGPU_RAS_GET_FEATURES(con->features))
amdgpu_ras_disable_all_features(adev, 1);
 
cancel_delayed_work_sync(>ras_counte_delay_work);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 99d7da125c8a..33f7e5a972b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -52,6 +52,12 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_INST_MASK 0xf000
 #define AMDGPU_RAS_INST_SHIFT 0xc
 
+#define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29
+#define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe000
+
+/* The high three bits indicates socketid */
+#define AMDGPU_RAS_GET_FEATURES(val)  (val) & 
~AMDGPU_RAS_FEATURES_SOCKETID_MASK
+
 enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
AMDGPU_RAS_BLOCK__SDMA,
-- 
2.25.1



[PATCH Review 1/1] drm/amdgpu: Skip do PCI error slot reset during RAS recovery

2024-01-17 Thread Stanley . Yang
Why:
The PCI error slot reset maybe triggered after inject ue to UMC multi 
times, this
caused system hang.
[  557.371857] amdgpu :af:00.0: amdgpu: GPU reset succeeded, trying to 
resume
[  557.373718] [drm] PCIE GART of 512M enabled.
[  557.373722] [drm] PTB located at 0x031FED70
[  557.373788] [drm] VRAM is lost due to GPU reset!
[  557.373789] [drm] PSP is resuming...
[  557.547012] mlx5_core :55:00.0: mlx5_pci_err_detected Device state = 
1 pci_status: 0. Exit, result = 3, need reset
[  557.547067] [drm] PCI error: detected callback, state(1)!!
[  557.547069] [drm] No support for XGMI hive yet...
[  557.548125] mlx5_core :55:00.0: mlx5_pci_slot_reset Device state = 1 
pci_status: 0. Enter
[  557.607763] mlx5_core :55:00.0: wait vital counter value 0x16b5b 
after 1 iterations
[  557.60] mlx5_core :55:00.0: mlx5_pci_slot_reset Device state = 1 
pci_status: 1. Exit, err = 0, result = 5, recovered
[  557.610492] [drm] PCI error: slot reset callback!!
...
[  560.689382] amdgpu :3f:00.0: amdgpu: GPU reset(2) succeeded!
[  560.689546] amdgpu :5a:00.0: amdgpu: GPU reset(2) succeeded!
[  560.689562] general protection fault, probably for non-canonical address 
0x5f080b54534f611f:  [#1] SMP NOPTI
[  560.701008] CPU: 16 PID: 2361 Comm: kworker/u448:9 Tainted: G   
OE 5.15.0-91-generic #101-Ubuntu
[  560.712057] Hardware name: Microsoft C278A/C278A, BIOS 
C2789.5.BS.1C11.AG.1 11/08/2023
[  560.720959] Workqueue: amdgpu-reset-hive amdgpu_ras_do_recovery [amdgpu]
[  560.728887] RIP: 0010:amdgpu_device_gpu_recover.cold+0xbf1/0xcf5 [amdgpu]
[  560.736891] Code: ff 41 89 c6 e9 1b ff ff ff 44 0f b6 45 b0 e9 4f ff ff 
ff be 01 00 00 00 4c 89 e7 e8 76 c9 8b ff 44 0f b6 45 b0 e9 3c fd ff ff <48> 83 
ba 18 02 00 00 00 0f 84 6a f8 ff ff 48 8d 7a 78 be 01 00 00
[  560.757967] RSP: 0018:ffa032e53d80 EFLAGS: 00010202
[  560.763848] RAX: ffa0001dfd10 RBX: ffa000197090 RCX: 
ffa032e53db0
[  560.771856] RDX: 5f080b54534f5f07 RSI:  RDI: 
ff11000128100010
[  560.779867] RBP: ffa032e53df0 R08:  R09: 
ffe77f08
[  560.787879] R10: 000a R11: 0001 R12: 

[  560.795889] R13: ffa032e53e00 R14:  R15: 

[  560.803889] FS:  () GS:ff11007e7e80() 
knlGS:
[  560.812973] CS:  0010 DS:  ES:  CR0: 80050033
[  560.819422] CR2: 55a04c118e68 CR3: 07410005 CR4: 
00771ee0
[  560.827433] DR0:  DR1:  DR2: 

[  560.835433] DR3:  DR6: fffe07f0 DR7: 
0400
[  560.843444] PKRU: 5554
[  560.846480] Call Trace:
[  560.849225]  
[  560.851580]  ? show_trace_log_lvl+0x1d6/0x2ea
[  560.856488]  ? show_trace_log_lvl+0x1d6/0x2ea
[  560.861379]  ? amdgpu_ras_do_recovery+0x1b2/0x210 [amdgpu]
[  560.867778]  ? show_regs.part.0+0x23/0x29
[  560.872293]  ? __die_body.cold+0x8/0xd
[  560.876502]  ? die_addr+0x3e/0x60
[  560.880238]  ? exc_general_protection+0x1c5/0x410
[  560.885532]  ? asm_exc_general_protection+0x27/0x30
[  560.891025]  ? amdgpu_device_gpu_recover.cold+0xbf1/0xcf5 [amdgpu]
[  560.898323]  amdgpu_ras_do_recovery+0x1b2/0x210 [amdgpu]
[  560.904520]  process_one_work+0x228/0x3d0
How:
In RAS recovery, mode-1 reset is issued from RAS fatal error handling and 
expected
all the nodes in a hive to be reset. no need to issue another mode-1 during 
this procedure.

Signed-off-by: Stanley.Yang 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6b9e0dcf9d7d..0bb02291a5a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6175,6 +6175,20 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev 
*pdev)
struct amdgpu_reset_context reset_context;
u32 memsize;
struct list_head device_list;
+   struct amdgpu_hive_info *hive;
+   int hive_ras_recovery = 0;
+   struct amdgpu_ras *ras;
+
+   /* PCI error slot reset should be skipped During RAS recovery */
+   hive = amdgpu_get_xgmi_hive(adev);
+   if (hive) {
+   hive_ras_recovery = atomic_read(>ras_recovery);
+   amdgpu_put_xgmi_hive(hive);
+   }
+   ras = amdgpu_ras_get_context(adev);
+   if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
+ras && (atomic_read(>in_recovery) || hive_ras_recovery))
+   return PCI_ERS_RESULT_RECOVERED;
 
DRM_INFO("PCI error: slot reset callback!!\n");
 
-- 
2.25.1



[PATCH Review 1/1] drm/amdgpu: Show deferred error count for UMC

2024-01-17 Thread Stanley . Yang
Show deferred error count for UMC syfs node

Signed-off-by: Stanley.Yang 
Reviewed-by: Tao Zhou 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 35b4fff54ded..f35a74bf5265 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -632,8 +632,12 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
dev_warn(obj->adev->dev, "Failed to reset error counter 
and error status");
}
 
-   return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
- "ce", info.ce_count);
+   if (info.head.block == AMDGPU_RAS_BLOCK__UMC)
+   return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", 
info.ue_count,
+   "ce", info.ce_count, "de", info.de_count);
+   else
+   return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", 
info.ue_count,
+   "ce", info.ce_count);
 }
 
 /* obj begin */
-- 
2.25.1



RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread Zhou1, Tao
[AMD Official Use Only - General]


  _
  From: Chai, Thomas 
  Sent: Thursday, January 18, 2024 11:06 AM
  To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org
  Cc: Zhou1, Tao ; Li, Candice ; 
Wang, Yang(Kevin) ; Yang, Stanley 
  Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


  [AMD Official Use Only - General]






  -
  Best Regards,
  Thomas


  _
  From: Zhang, Hawking mailto:hawking.zh...@amd.com>>
  Sent: Wednesday, January 17, 2024 7:54 PM
  To: Chai, Thomas mailto:yipeng.c...@amd.com>>; 
amd-gfx@lists.freedesktop.org
  Cc: Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>
  Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


  [AMD Official Use Only - General]



  Please check my comments inline

  Regards,
  Hawking

  -Original Message-
  From: Chai, Thomas mailto:yipeng.c...@amd.com>>
  Sent: Tuesday, January 16, 2024 16:21
  To: amd-gfx@lists.freedesktop.org
  Cc: Chai, Thomas mailto:yipeng.c...@amd.com>>; 
Zhang, Hawking mailto:hawking.zh...@amd.com>>; Zhou1, 
Tao mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>; Chai, Thomas 
mailto:yipeng.c...@amd.com>>
  Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning

  Use asynchronous polling to handle umc_v12_0 poisoning.

  Signed-off-by: YiPeng Chai 
mailto:yipeng.c...@amd.com>>
  ---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++-
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
   3 files changed, 120 insertions(+), 31 deletions(-)

  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  index 856206e95842..44929281840e 100644
  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  @@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
   /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
   #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

  +#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
  +
   enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
  @@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void 
*param)
atomic_read(>page_retirement_req_cnt));

atomic_dec(>page_retirement_req_cnt);
  +
  + amdgpu_umc_poison_retire_page_polling_timeout(adev,
  + false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}

return 0;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  index 9d1cf41cf483..2dde29cb807d 100644
  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  @@ -23,6 +23,7 @@

   #include "amdgpu.h"
   #include "umc_v6_7.h"
  +#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms

   static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t 
err_addr, @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct 
amdgpu_device *adev,
return ret;
   }

  -static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
  - void *ras_error_status,
  - struct amdgpu_iv_entry *entry,
  - bool reset)
  +static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
  + void *ras_error_status)
   {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
unsigned long err_count;
  -
  - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
  + mutex_lock(>page_retirement_lock);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && @@ 
-163,19 +161,86 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
con->update_channel_flag = false;
}
}
  -
  - if (reset) {
  - /* use 

[PATCH] drm/amdgpu: Cleanup inconsistent indenting in 'amdgpu_gfx_enable_kcq()'

2024-01-17 Thread Srinivasan Shanmugam
Fixes the below:
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c:645 amdgpu_gfx_enable_kcq() warn: 
inconsistent indenting

Cc: Le Ma 
Cc: Hawking Zhang 
Cc: Christian König 
Cc: Alex Deucher 
Signed-off-by: Srinivasan Shanmugam 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index b9674c57c436..eb03f2d7b607 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -642,8 +642,8 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int 
xcc_id)
kiq->pmf->kiq_set_resources(kiq_ring, queue_mask);
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
j = i + xcc_id * adev->gfx.num_compute_rings;
-   kiq->pmf->kiq_map_queues(kiq_ring,
->gfx.compute_ring[j]);
+   kiq->pmf->kiq_map_queues(kiq_ring,
+>gfx.compute_ring[j]);
}
 
r = amdgpu_ring_test_helper(kiq_ring);
-- 
2.34.1



RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]


-
Best Regards,
Thomas


_
From: Zhang, Hawking 
Sent: Wednesday, January 17, 2024 7:54 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


[AMD Official Use Only - General]



Please check my comments inline

Regards,
Hawking

-Original Message-
From: Chai, Thomas mailto:yipeng.c...@amd.com>>
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas mailto:yipeng.c...@amd.com>>; Zhang, 
Hawking mailto:hawking.zh...@amd.com>>; Zhou1, Tao 
mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>; Chai, Thomas 
mailto:yipeng.c...@amd.com>>
Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 
poisoning

Use asynchronous polling to handle umc_v12_0 poisoning.

Signed-off-by: YiPeng Chai mailto:yipeng.c...@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
 3 files changed, 120 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 856206e95842..44929281840e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
+
 enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_read(>page_retirement_req_cnt));

atomic_dec(>page_retirement_req_cnt);
+
+   amdgpu_umc_poison_retire_page_polling_timeout(adev,
+   false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}

return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9d1cf41cf483..2dde29cb807d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -23,6 +23,7 @@

 #include "amdgpu.h"
 #include "umc_v6_7.h"
+#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms

 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t 
err_addr, @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct 
amdgpu_device *adev,
return ret;
 }

-static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
-   void *ras_error_status,
-   struct amdgpu_iv_entry *entry,
-   bool reset)
+static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+   void *ras_error_status)
 {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
unsigned long err_count;
-
-   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   mutex_lock(>page_retirement_lock);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && @@ 
-163,19 +161,86 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
con->update_channel_flag = false;
}
}
-
-   if (reset) {
-   /* use mode-2 reset for poison consumption */
-   if (!entry)
-   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
-   amdgpu_ras_reset_gpu(adev);
-   }
}

kfree(err_data->err_addr);
+
+   mutex_unlock(>page_retirement_lock);
+}
+
+static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+   void *ras_error_status,
+   struct amdgpu_iv_entry *entry,
+   bool reset)
+{
+   struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   amdgpu_umc_handle_bad_pages(adev, ras_error_status);
+
+   if (err_data->ue_count && reset) {
+   /* use mode-2 reset for poison consumption */
+   if (!entry)
+

RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Wang, Yang(Kevin)
[AMD Official Use Only - General]

-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 4:21 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct 
smu_context *smu, enum amdgpu_

ret = smu_cmn_send_smc_msg(smu, msg, count);
if (ret) {
+   dev_err(smu->adev->dev, "%s(%d) failed to query %s MCA count, 
ret:%d\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   ret);
*count = 0;
return ret;
}

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);


[Kevin]:
Please make following function public then use this helper function to get msg 
name string.
- smu_get_message_name()

Best Regards,
Kevin
+
return 0;
 }

--
2.34.1



RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]

OK


-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Wednesday, January 17, 2024 7:40 PM
To: Zhang, Hawking ; Chai, Thomas ; 
amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Li, Candice 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

Please ignore my first comment. It doesn't necessarily associated with socket  
id in UMC MCA status log at this stage.

Regards,
Hawking

-Original Message-
From: amd-gfx  On Behalf Of Zhang, 
Hawking
Sent: Wednesday, January 17, 2024 19:12
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Li, Candice 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

[AMD Official Use Only - General]

+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,

Please also print out socket id for UMC MCA status.

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);
+

This seems redundant or was added for debugging purpose. We can drop this print 
since there is log to cover failures.

Regards,
Hawking


-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct 

Re: [PATCH] drm/amdgpu/pm: Fix the power source flag error

2024-01-17 Thread Ma, Jun
Hi Alex,

On 1/17/2024 10:13 PM, Alex Deucher wrote:
> On Wed, Jan 17, 2024 at 4:01 AM Ma Jun  wrote:
>>
>> The power source flag should be updated when
>> [1] System receives an interrupt indicating that the power source
>> has changed.
>> [2] System resumes from suspend or runtime suspend
>>
>> Signed-off-by: Ma Jun 
>> ---
>>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 24 +++
>>  .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  2 ++
>>  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  2 ++
>>  3 files changed, 18 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
>> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> index c16703868e5c..e16d22e30a8a 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> @@ -24,6 +24,7 @@
>>
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>
>>  #include "amdgpu.h"
>> @@ -793,6 +794,17 @@ static int 
>> smu_apply_default_config_table_settings(struct smu_context *smu)
>> return smu_set_config_table(smu, >pm.config_table);
>>  }
>>
>> +static void smu_update_power_source(struct amdgpu_device *adev)
>> +{
>> +   if (power_supply_is_system_supplied() > 0)
>> +   adev->pm.ac_power = true;
>> +   else
>> +   adev->pm.ac_power = false;
>> +
>> +   if (is_support_sw_smu(adev))
> 
> Do we need this check here?  This is the sw_smu code.

Thanks,will drop this check in v2.

Regards,
Ma Jun
> 
>> +   smu_set_ac_dc(adev->powerplay.pp_handle);
>> +}
>> +
>>  static int smu_late_init(void *handle)
>>  {
>> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> @@ -817,16 +829,8 @@ static int smu_late_init(void *handle)
>>  * handle the switch automatically. Driver involvement
>>  * is unnecessary.
>>  */
>> -   if (!smu->dc_controlled_by_gpio) {
>> -   ret = smu_set_power_source(smu,
>> -  adev->pm.ac_power ? 
>> SMU_POWER_SOURCE_AC :
>> -  SMU_POWER_SOURCE_DC);
>> -   if (ret) {
>> -   dev_err(adev->dev, "Failed to switch to %s mode!\n",
>> -   adev->pm.ac_power ? "AC" : "DC");
>> -   return ret;
>> -   }
>> -   }
>> +   if (!smu->dc_controlled_by_gpio)
>> +   smu_update_power_source(adev);
>>
>> if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 1)) ||
>> (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 3)))
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
>> b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
>> index 2e7f8d5cfc28..8047150fddd4 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
>> @@ -1442,10 +1442,12 @@ static int smu_v11_0_irq_process(struct 
>> amdgpu_device *adev,
>> case 0x3:
>> dev_dbg(adev->dev, "Switched to AC mode!\n");
>> schedule_work(>interrupt_work);
>> +   adev->pm.ac_power = true;
>> break;
>> case 0x4:
>> dev_dbg(adev->dev, "Switched to DC mode!\n");
>> schedule_work(>interrupt_work);
>> +   adev->pm.ac_power = false;
>> break;
>> case 0x7:
>> /*
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>> index 771a3d457c33..c486182ff275 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>> @@ -1379,10 +1379,12 @@ static int smu_v13_0_irq_process(struct 
>> amdgpu_device *adev,
>> case 0x3:
>> dev_dbg(adev->dev, "Switched to AC mode!\n");
>> smu_v13_0_ack_ac_dc_interrupt(smu);
>> +   adev->pm.ac_power = true;
>> break;
>> case 0x4:
>> dev_dbg(adev->dev, "Switched to DC mode!\n");
>> smu_v13_0_ack_ac_dc_interrupt(smu);
>> +   adev->pm.ac_power = false;
>> break;
>> case 0x7:
>> /*
>> --
>> 2.34.1
>>


Re: [PATCH] drm/amdgpu/pm: Fix the power source flag error

2024-01-17 Thread Ma, Jun
Hi Lijo,

On 1/17/2024 5:41 PM, Lazar, Lijo wrote:
> On 1/17/2024 2:22 PM, Ma Jun wrote:
>> The power source flag should be updated when
>> [1] System receives an interrupt indicating that the power source
>> has changed.
>> [2] System resumes from suspend or runtime suspend
>>
>> Signed-off-by: Ma Jun 
>> ---
>>   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 24 +++
>>   .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  2 ++
>>   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  2 ++
>>   3 files changed, 18 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
>> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> index c16703868e5c..e16d22e30a8a 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> @@ -24,6 +24,7 @@
>>   
>>   #include 
>>   #include 
>> +#include 
>>   #include 
>>   
>>   #include "amdgpu.h"
>> @@ -793,6 +794,17 @@ static int 
>> smu_apply_default_config_table_settings(struct smu_context *smu)
>>  return smu_set_config_table(smu, >pm.config_table);
>>   }
>>   
>> +static void smu_update_power_source(struct amdgpu_device *adev)
>> +{
>> +if (power_supply_is_system_supplied() > 0)
>> +adev->pm.ac_power = true;
>> +else
>> +adev->pm.ac_power = false;
>> +
>> +if (is_support_sw_smu(adev))
>> +smu_set_ac_dc(adev->powerplay.pp_handle);
>> +}
>> +
>>   static int smu_late_init(void *handle)
>>   {
>>  struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> @@ -817,16 +829,8 @@ static int smu_late_init(void *handle)
>>   * handle the switch automatically. Driver involvement
>>   * is unnecessary.
>>   */
>> -if (!smu->dc_controlled_by_gpio) {
>> -ret = smu_set_power_source(smu,
>> -   adev->pm.ac_power ? 
>> SMU_POWER_SOURCE_AC :
>> -   SMU_POWER_SOURCE_DC);
>> -if (ret) {
>> -dev_err(adev->dev, "Failed to switch to %s mode!\n",
>> -adev->pm.ac_power ? "AC" : "DC");
>> -return ret;
>> -}
>> -}
> 
> For this part of the change - driver already updates FW with the initial 
> detected power state or the last detected power state before going for 
> suspend. Isn't that good enough?
> 

The power source may change during system suspend, so it's necessary to detect 
the
power source again before system reads the power related data, such as 
max_power_limit.

Regards,
Ma Jun


> Thanks,
> Lijo
> 
>> +if (!smu->dc_controlled_by_gpio)
>> +smu_update_power_source(adev);
>>   
>>  if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 1)) ||
>>  (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 3)))
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
>> b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
>> index 2e7f8d5cfc28..8047150fddd4 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
>> @@ -1442,10 +1442,12 @@ static int smu_v11_0_irq_process(struct 
>> amdgpu_device *adev,
>>  case 0x3:
>>  dev_dbg(adev->dev, "Switched to AC mode!\n");
>>  schedule_work(>interrupt_work);
>> +adev->pm.ac_power = true;
>>  break;
>>  case 0x4:
>>  dev_dbg(adev->dev, "Switched to DC mode!\n");
>>  schedule_work(>interrupt_work);
>> +adev->pm.ac_power = false;
>>  break;
>>  case 0x7:
>>  /*
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>> index 771a3d457c33..c486182ff275 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
>> @@ -1379,10 +1379,12 @@ static int smu_v13_0_irq_process(struct 
>> amdgpu_device *adev,
>>  case 0x3:
>>  dev_dbg(adev->dev, "Switched to AC mode!\n");
>>  smu_v13_0_ack_ac_dc_interrupt(smu);
>> +adev->pm.ac_power = true;
>>  break;
>>  case 0x4:
>>  dev_dbg(adev->dev, "Switched to DC mode!\n");
>>  smu_v13_0_ack_ac_dc_interrupt(smu);
>> +adev->pm.ac_power = false;
>>  break;
>>  case 0x7:
>>  /*
> 


RE: [PATCH 2/2] update check condition of query for ras page retire

2024-01-17 Thread Zhou1, Tao
[AMD Official Use Only - General]

Sure, will revert related patch in the next version.

Regards,
Tao

> -Original Message-
> From: Zhang, Hawking 
> Sent: Wednesday, January 17, 2024 8:09 PM
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org
> Cc: Zhou1, Tao 
> Subject: RE: [PATCH 2/2] update check condition of query for ras page retire
>
> [AMD Official Use Only - General]
>
> static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context *smu,
> void *table)
>  {
> -   /* Support ecc info by default */
> -   return 0;
> +   /* we use debug mode flag instead of this interface */
> +   return -EOPNOTSUPP;
>  }
>
> Shall we just drop the callback implementation? smu_get_ecc_info will return -
> EOPNOTSUPP if the callback is not supported.
>
> Regards,
> Hawking
>
> -Original Message-
> From: amd-gfx  On Behalf Of Tao Zhou
> Sent: Wednesday, January 17, 2024 17:15
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhou1, Tao 
> Subject: [PATCH 2/2] update check condition of query for ras page retire
>
> Support page retirement handling in debug mode.
>
> Signed-off-by: Tao Zhou 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  | 9 +++--
>  drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 4 ++--
>  2 files changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index 41139bac7643..6df32f0afd89 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -90,12 +90,16 @@ static void amdgpu_umc_handle_bad_pages(struct
> amdgpu_device *adev,  {
> struct ras_err_data *err_data = (struct ras_err_data 
> *)ras_error_status;
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +   unsigned int error_query_mode;
> int ret = 0;
>
> +   amdgpu_ras_get_error_query_mode(adev, _query_mode);
> +
> mutex_lock(>page_retirement_lock);
>
> ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
> -   if (ret == -EOPNOTSUPP) {
> +   if (ret == -EOPNOTSUPP &&
> +   error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
> if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
> adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
> 
> adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev,
> ras_error_status); @@ -119,7 +123,8 @@ static void
> amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
>  */
> adev->umc.ras->ras_block.hw_ops-
> >query_ras_error_address(adev, ras_error_status);
> }
> -   } else if (!ret) {
> +   } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY
> ||
> +   (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
> if (adev->umc.ras &&
> adev->umc.ras->ecc_info_query_ras_error_count)
> adev->umc.ras->ecc_info_query_ras_error_count(adev,
> ras_error_status); diff --git
> a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> index c560f4af214d..d86c9e7fc64b 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> @@ -2909,8 +2909,8 @@ static int
> smu_v13_0_6_select_xgmi_plpd_policy(struct smu_context *smu,  static ssize_t
> smu_v13_0_6_get_ecc_info(struct smu_context *smu,
> void *table)
>  {
> -   /* Support ecc info by default */
> -   return 0;
> +   /* we use debug mode flag instead of this interface */
> +   return -EOPNOTSUPP;
>  }
>
>  static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
> --
> 2.35.1
>



Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr

2024-01-17 Thread Friedrich Vock

On 18.01.24 00:00, Alex Deucher wrote:

On Wed, Jan 17, 2024 at 7:36 AM Christian König
 wrote:

Am 16.01.24 um 11:31 schrieb Friedrich Vock:

On 16.01.24 08:03, Christian König wrote:

Am 15.01.24 um 12:18 schrieb Friedrich Vock:

[SNIP]

+if (ih->overflow) {
+tmp = RREG32(mmIH_RB_CNTL);
+tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
+WREG32(mmIH_RB_CNTL, tmp);
+ih->overflow = false;
+}

Well that is an extremely bad idea. We already reset the overflow
after reading the WPTR.

This is not resetting the overflow bit. This is resetting a "clear
overflow" bit. I don't have the hardware docs, but the name (and my
observations) strongly suggest that setting this bit actually prevents
the hardware from setting the overflow bit ever again.

Well that doesn't make any sense at all. The hardware documentation
clearly states that this bit is write only and should always read as
zero.

Setting this bit will clear the overflow flag in the WPTR register and
clearing it has no effect at all.

I could only ping the hw engineer responsible for this block to double
check if the documentation is somehow outdated, but I really doubt so.


I see. I wish I had access to the documentation,

Well, doesn't Valve has an NDA in place?


but I don't, so all I
can do is tell you what I observe the hardware doing. I've tested this
on both a Steam Deck (OSSYS 5.2.0) and an RX 6700 XT (OSSYS 5.0.3). On
both systems, launching a bunch of shaders that cause page faults leads
to lots of "[gfxhub] page fault" messages in dmesg, followed by an
"amdgpu: IH ring buffer overflow".

Well that is certainly a bug, maybe even the same thing we have seen on
Vega and MI.

What we could do is to try to apply the same workaround to re-route the
page faults to a different IH ring.

See those patches here as well:

commit 516bc3d8dd7965f1a8a3ea453857f14d95971e62
Author: Christian König 
Date:   Fri Nov 2 15:00:16 2018 +0100

  drm/amdgpu: reroute VMC and UMD to IH ring 1

  Page faults can easily overwhelm the interrupt handler.

  So to make sure that we never lose valuable interrupts on the
primary ring
  we re-route page faults to IH ring 1.

commit b849aaa41c914a0fd88003f88cb04420a873c624
Author: Christian König 
Date:   Mon Mar 4 19:34:34 2019 +0100

  drm/amdgpu: also reroute VMC and UMD to IH ring 1 on Vega 20

  Same patch we alredy did for Vega10. Just re-route page faults to a
separate
  ring to avoid drowning in interrupts.


If I re-launch the same set of shaders after the GPU has soft-recovered,
the "amdgpu: IH ring buffer overflow" message is missing, even though
the same amount of page faults should've been triggered at roughly the
same rate. Running with this patch applied makes more "amdgpu: IH ring
buffer overflow" messages appear after relaunching the faulting shaders
(but not when processing any non-faulting work).

That is actually the expected behavior. There should be a limit on the
number of faults written to the ring so that the ring never overflows.


The only possible conclusion I can draw from this is that clearing that
bit *does* have an effect, and I don't think it's far-fetched to assume
the IH ring buffer overflows still happen after re-launching the
faulting shaders but go undetected so far.

Well that can only mean that the hw documentation is incorrect.

Either the value is not write only trigger bit as documented or we need
an additional read of the register for it to take effect or something
like this.


Right now, IH overflows, even if they occur repeatedly, only get
registered once. If not registering IH overflows can trivially lead to
system crashes, it's amdgpu's current handling that is broken.

It's years that we last tested this but according to the HW
documentation this should work fine.

What could potentially happen is that the IH has silenced the source
of the overflow. We never implemented resetting those, but in this
case that here won't help either.


If the IH silenced the page faults (which quite clearly cause the
overflow here), then how are the page faults still logged in dmesg?

There should be a hardware rate limit for the page faults, e.g. there
can only be X faults reported in N clock cycles and then a delay is
inserted.

@Christian Koenig  Is that tied to xnack (i.e., noretry)?  The default
is noretry=1 on gfx10.3 and newer.  But it can be overridden.  It was
not set on some older kernels, maybe that is the problem?  @Friedrich
Vock does setting amdgpu.noretry=1 fix the issue?



No, amdgpu.noretry=1 does not change anything.

Regards,
Friedrich


Alex


The possibility of a repeated IH overflow in between reading the wptr
and updating the rptr is a good point, but how can we detect that at
all? It seems to me like we can't set the OVERFLOW_CLEAR bit at all
then, because we're guaranteed to miss any overflows that happen while
the bit is set.

When an IH overflow is signaled we clear that flag by writing 1 into
the 

Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr

2024-01-17 Thread Alex Deucher
On Wed, Jan 17, 2024 at 7:36 AM Christian König
 wrote:
>
> Am 16.01.24 um 11:31 schrieb Friedrich Vock:
> > On 16.01.24 08:03, Christian König wrote:
> >> Am 15.01.24 um 12:18 schrieb Friedrich Vock:
> >>> [SNIP]
> > +if (ih->overflow) {
> > +tmp = RREG32(mmIH_RB_CNTL);
> > +tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
> > +WREG32(mmIH_RB_CNTL, tmp);
> > +ih->overflow = false;
> > +}
> 
>  Well that is an extremely bad idea. We already reset the overflow
>  after reading the WPTR.
> >>>
> >>> This is not resetting the overflow bit. This is resetting a "clear
> >>> overflow" bit. I don't have the hardware docs, but the name (and my
> >>> observations) strongly suggest that setting this bit actually prevents
> >>> the hardware from setting the overflow bit ever again.
> >>
> >> Well that doesn't make any sense at all. The hardware documentation
> >> clearly states that this bit is write only and should always read as
> >> zero.
> >>
> >> Setting this bit will clear the overflow flag in the WPTR register and
> >> clearing it has no effect at all.
> >>
> >> I could only ping the hw engineer responsible for this block to double
> >> check if the documentation is somehow outdated, but I really doubt so.
> >>
> > I see. I wish I had access to the documentation,
>
> Well, doesn't Valve has an NDA in place?
>
> > but I don't, so all I
> > can do is tell you what I observe the hardware doing. I've tested this
> > on both a Steam Deck (OSSYS 5.2.0) and an RX 6700 XT (OSSYS 5.0.3). On
> > both systems, launching a bunch of shaders that cause page faults leads
> > to lots of "[gfxhub] page fault" messages in dmesg, followed by an
> > "amdgpu: IH ring buffer overflow".
>
> Well that is certainly a bug, maybe even the same thing we have seen on
> Vega and MI.
>
> What we could do is to try to apply the same workaround to re-route the
> page faults to a different IH ring.
>
> See those patches here as well:
>
> commit 516bc3d8dd7965f1a8a3ea453857f14d95971e62
> Author: Christian König 
> Date:   Fri Nov 2 15:00:16 2018 +0100
>
>  drm/amdgpu: reroute VMC and UMD to IH ring 1
>
>  Page faults can easily overwhelm the interrupt handler.
>
>  So to make sure that we never lose valuable interrupts on the
> primary ring
>  we re-route page faults to IH ring 1.
>
> commit b849aaa41c914a0fd88003f88cb04420a873c624
> Author: Christian König 
> Date:   Mon Mar 4 19:34:34 2019 +0100
>
>  drm/amdgpu: also reroute VMC and UMD to IH ring 1 on Vega 20
>
>  Same patch we alredy did for Vega10. Just re-route page faults to a
> separate
>  ring to avoid drowning in interrupts.
>
> >
> > If I re-launch the same set of shaders after the GPU has soft-recovered,
> > the "amdgpu: IH ring buffer overflow" message is missing, even though
> > the same amount of page faults should've been triggered at roughly the
> > same rate. Running with this patch applied makes more "amdgpu: IH ring
> > buffer overflow" messages appear after relaunching the faulting shaders
> > (but not when processing any non-faulting work).
>
> That is actually the expected behavior. There should be a limit on the
> number of faults written to the ring so that the ring never overflows.
>
> >
> > The only possible conclusion I can draw from this is that clearing that
> > bit *does* have an effect, and I don't think it's far-fetched to assume
> > the IH ring buffer overflows still happen after re-launching the
> > faulting shaders but go undetected so far.
>
> Well that can only mean that the hw documentation is incorrect.
>
> Either the value is not write only trigger bit as documented or we need
> an additional read of the register for it to take effect or something
> like this.
>
> >>> Right now, IH overflows, even if they occur repeatedly, only get
> >>> registered once. If not registering IH overflows can trivially lead to
> >>> system crashes, it's amdgpu's current handling that is broken.
> >>
> >> It's years that we last tested this but according to the HW
> >> documentation this should work fine.
> >>
> >> What could potentially happen is that the IH has silenced the source
> >> of the overflow. We never implemented resetting those, but in this
> >> case that here won't help either.
> >>
> > If the IH silenced the page faults (which quite clearly cause the
> > overflow here), then how are the page faults still logged in dmesg?
>
> There should be a hardware rate limit for the page faults, e.g. there
> can only be X faults reported in N clock cycles and then a delay is
> inserted.

@Christian Koenig  Is that tied to xnack (i.e., noretry)?  The default
is noretry=1 on gfx10.3 and newer.  But it can be overridden.  It was
not set on some older kernels, maybe that is the problem?  @Friedrich
Vock does setting amdgpu.noretry=1 fix the issue?

Alex

>
> >>>
> >>> The possibility of a repeated IH overflow in between reading the wptr
> >>> and updating the rptr is a good 

Re: [PATCH v2] drm/amd/display: Fix uninitialized variable usage in core_link_ 'read_dpcd() & write_dpcd()' functions

2024-01-17 Thread Rodrigo Siqueira Jordao

Hi Srinivasan

On 1/17/24 08:23, Srinivasan Shanmugam wrote:

The 'status' variable in 'core_link_read_dpcd()' &
'core_link_write_dpcd()' was uninitialized.

Thus, initializing 'status' variable to 'DC_ERROR_UNEXPECTED' by default.

Fixes the below:
drivers/gpu/drm/amd/amdgpu/../display/dc/link/protocols/link_dpcd.c:226 
core_link_read_dpcd() error: uninitialized symbol 'status'.
drivers/gpu/drm/amd/amdgpu/../display/dc/link/protocols/link_dpcd.c:248 
core_link_write_dpcd() error: uninitialized symbol 'status'.

Cc: sta...@vger.kernel.org
Cc: Jerry Zuo 
Cc: Jun Lei 
Cc: Wayne Lin 
Cc: Aurabindo Pillai 
Cc: Rodrigo Siqueira 
Cc: Hamza Mahfooz 
Signed-off-by: Srinivasan Shanmugam 



This change lgtm.

Btw, avoid to send new patches as a reply of the previous one.

Reviewed-by: Rodrigo Siqueira 

Thanks
Siqueira


---
v2:
   - Initialized status variable to 'DC_ERROR_UNEXPECTED' default.
   - Added Jerry to Cc

  drivers/gpu/drm/amd/display/dc/link/protocols/link_dpcd.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dpcd.c 
b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dpcd.c
index 5c9a30211c10..fc50931c2aec 100644
--- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dpcd.c
+++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dpcd.c
@@ -205,7 +205,7 @@ enum dc_status core_link_read_dpcd(
uint32_t extended_size;
/* size of the remaining partitioned address space */
uint32_t size_left_to_read;
-   enum dc_status status;
+   enum dc_status status = DC_ERROR_UNEXPECTED;
/* size of the next partition to be read from */
uint32_t partition_size;
uint32_t data_index = 0;
@@ -234,7 +234,7 @@ enum dc_status core_link_write_dpcd(
  {
uint32_t partition_size;
uint32_t data_index = 0;
-   enum dc_status status;
+   enum dc_status status = DC_ERROR_UNEXPECTED;
  
  	while (size) {

partition_size = dpcd_get_next_partition_size(address, size);




[PATCH 2/2] amdgpu: use trace_dma_fence_sync_to in amdgpu_fence_sync

2024-01-17 Thread Pierre-Eric Pelloux-Prayer
This makes it possible to understand the dependencies between jobs.
Possible usage of this trace:
* stuttering issues like Mesa !9189
* incorrect synchronization: I don't have a link for this one, but having
  these events was very useful to debug a virtio-gpu / native-context /
  radeonsi sync issue

I have prototype code using this in UMR, as can be see here:
   https://gitlab.freedesktop.org/tomstdenis/umr/-/merge_requests/37

The 'reason' param currently uses __func__ but I didn't add a macro for
this because it'd be interesting to use more descriptive names for each
use of amdgpu_fence_sync.

Signed-off-by: Pierre-Eric Pelloux-Prayer 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  8 
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   | 14 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c  |  8 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 11 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c |  4 ++--
 7 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index d17b2452cb1f..fde98e48c84b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -491,7 +491,7 @@ static int vm_update_pds(struct amdgpu_vm *vm, struct 
amdgpu_sync *sync)
if (ret)
return ret;
 
-   return amdgpu_sync_fence(sync, vm->last_update);
+   return amdgpu_sync_fence(sync, vm->last_update, __func__);
 }
 
 static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
@@ -1251,7 +1251,7 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
 
amdgpu_vm_clear_freed(adev, vm, _va->last_pt_update);
 
-   amdgpu_sync_fence(sync, bo_va->last_pt_update);
+   amdgpu_sync_fence(sync, bo_va->last_pt_update, __func__);
 }
 
 static int update_gpuvm_pte(struct kgd_mem *mem,
@@ -1273,7 +1273,7 @@ static int update_gpuvm_pte(struct kgd_mem *mem,
return ret;
}
 
-   return amdgpu_sync_fence(sync, bo_va->last_pt_update);
+   return amdgpu_sync_fence(sync, bo_va->last_pt_update, __func__);
 }
 
 static int map_bo_to_gpuvm(struct kgd_mem *mem,
@@ -2910,7 +2910,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, 
struct dma_fence **ef)
}
dma_resv_for_each_fence(, bo->tbo.base.resv,
DMA_RESV_USAGE_KERNEL, fence) {
-   ret = amdgpu_sync_fence(_obj, fence);
+   ret = amdgpu_sync_fence(_obj, fence, __func__);
if (ret) {
pr_debug("Memory eviction: Sync BO fence 
failed. Try again\n");
goto validate_map_fail;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 6adeddfb3d56..6830892383c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -423,7 +423,7 @@ static int amdgpu_cs_p2_dependencies(struct 
amdgpu_cs_parser *p,
dma_fence_put(old);
}
 
-   r = amdgpu_sync_fence(>sync, fence);
+   r = amdgpu_sync_fence(>sync, fence, __func__);
dma_fence_put(fence);
if (r)
return r;
@@ -445,7 +445,7 @@ static int amdgpu_syncobj_lookup_and_add(struct 
amdgpu_cs_parser *p,
return r;
}
 
-   r = amdgpu_sync_fence(>sync, fence);
+   r = amdgpu_sync_fence(>sync, fence, __func__);
dma_fence_put(fence);
return r;
 }
@@ -1101,7 +1101,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser 
*p)
if (r)
return r;
 
-   r = amdgpu_sync_fence(>sync, fpriv->prt_va->last_pt_update);
+   r = amdgpu_sync_fence(>sync, fpriv->prt_va->last_pt_update, 
__func__);
if (r)
return r;
 
@@ -1112,7 +1112,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser 
*p)
if (r)
return r;
 
-   r = amdgpu_sync_fence(>sync, bo_va->last_pt_update);
+   r = amdgpu_sync_fence(>sync, bo_va->last_pt_update, 
__func__);
if (r)
return r;
}
@@ -1131,7 +1131,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser 
*p)
if (r)
return r;
 
-   r = amdgpu_sync_fence(>sync, bo_va->last_pt_update);
+   r = amdgpu_sync_fence(>sync, bo_va->last_pt_update, 
__func__);
if (r)
return r;
}
@@ -1144,7 +1144,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser 
*p)
if (r)
return r;
 
-   r = 

[PATCH 1/2] tracing, dma-buf: add a trace_dma_fence_sync_to event

2024-01-17 Thread Pierre-Eric Pelloux-Prayer
This new event can be used to trace where a given dma_fence is added
as a dependency of some other work.

I plan to use it in amdgpu.

Signed-off-by: Pierre-Eric Pelloux-Prayer 
---
 drivers/dma-buf/dma-fence.c  |  1 +
 include/trace/events/dma_fence.h | 34 
 2 files changed, 35 insertions(+)

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index e0fd99e61a2d..671a499a5ccd 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -23,6 +23,7 @@
 EXPORT_TRACEPOINT_SYMBOL(dma_fence_emit);
 EXPORT_TRACEPOINT_SYMBOL(dma_fence_enable_signal);
 EXPORT_TRACEPOINT_SYMBOL(dma_fence_signaled);
+EXPORT_TRACEPOINT_SYMBOL(dma_fence_sync_to);
 
 static DEFINE_SPINLOCK(dma_fence_stub_lock);
 static struct dma_fence dma_fence_stub;
diff --git a/include/trace/events/dma_fence.h b/include/trace/events/dma_fence.h
index 3963e79ca7b4..9b3875f7aa79 100644
--- a/include/trace/events/dma_fence.h
+++ b/include/trace/events/dma_fence.h
@@ -83,6 +83,40 @@ DEFINE_EVENT(dma_fence, dma_fence_wait_end,
TP_ARGS(fence)
 );
 
+DECLARE_EVENT_CLASS(dma_fence_from,
+
+   TP_PROTO(struct dma_fence *fence, const char *reason),
+
+   TP_ARGS(fence, reason),
+
+   TP_STRUCT__entry(
+   __string(driver, fence->ops->get_driver_name(fence))
+   __string(timeline, fence->ops->get_timeline_name(fence))
+   __field(unsigned int, context)
+   __field(unsigned int, seqno)
+   __string(reason, reason)
+   ),
+
+   TP_fast_assign(
+   __assign_str(driver, fence->ops->get_driver_name(fence));
+   __assign_str(timeline, fence->ops->get_timeline_name(fence));
+   __entry->context = fence->context;
+   __entry->seqno = fence->seqno;
+   __assign_str(reason, reason);
+   ),
+
+   TP_printk("driver=%s timeline=%s context=%u seqno=%u reason=%s",
+ __get_str(driver), __get_str(timeline), __entry->context,
+ __entry->seqno, __get_str(reason))
+);
+
+DEFINE_EVENT(dma_fence_from, dma_fence_sync_to,
+
+   TP_PROTO(struct dma_fence *fence, const char *reason),
+
+   TP_ARGS(fence, reason)
+);
+
 #endif /*  _TRACE_DMA_FENCE_H */
 
 /* This part must be outside protection */
-- 
2.40.1



[PATCH] drm/amd/amdgpu: Assign GART pages to AMD device mapping

2024-01-17 Thread Tom St Denis
This allows kernel mapped pages like the PDB and PTB to be
read via the iomem debugfs when there is no vram in the system.

Signed-off-by: Tom St Denis 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index 73b8cca35bab..f0bdbcc7b1ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -121,6 +121,7 @@ int amdgpu_gart_table_ram_alloc(struct amdgpu_device *adev)
struct amdgpu_bo_param bp;
dma_addr_t dma_addr;
struct page *p;
+   unsigned long x;
int ret;
 
if (adev->gart.bo != NULL)
@@ -130,6 +131,11 @@ int amdgpu_gart_table_ram_alloc(struct amdgpu_device *adev)
if (!p)
return -ENOMEM;
 
+   /* assign pages to this device */
+   for (x = 0; x < (1UL << order); x++) {
+   p[x].mapping = adev->mman.bdev.dev_mapping;
+   }
+
/* If the hardware does not support UTCL2 snooping of the CPU caches
 * then set_memory_wc() could be used as a workaround to mark the pages
 * as write combine memory.
@@ -223,6 +229,7 @@ void amdgpu_gart_table_ram_free(struct amdgpu_device *adev)
unsigned int order = get_order(adev->gart.table_size);
struct sg_table *sg = adev->gart.bo->tbo.sg;
struct page *p;
+   unsigned long x;
int ret;
 
ret = amdgpu_bo_reserve(adev->gart.bo, false);
@@ -234,6 +241,9 @@ void amdgpu_gart_table_ram_free(struct amdgpu_device *adev)
sg_free_table(sg);
kfree(sg);
p = virt_to_page(adev->gart.ptr);
+   for (x = 0; x < (1UL << order); x++) {
+   p[x].mapping = NULL;
+   }
__free_pages(p, order);
 
adev->gart.ptr = NULL;
-- 
2.40.1



[PATCH v2] drm/amd/display: Fix uninitialized variable usage in core_link_ 'read_dpcd() & write_dpcd()' functions

2024-01-17 Thread Srinivasan Shanmugam
The 'status' variable in 'core_link_read_dpcd()' &
'core_link_write_dpcd()' was uninitialized.

Thus, initializing 'status' variable to 'DC_ERROR_UNEXPECTED' by default.

Fixes the below:
drivers/gpu/drm/amd/amdgpu/../display/dc/link/protocols/link_dpcd.c:226 
core_link_read_dpcd() error: uninitialized symbol 'status'.
drivers/gpu/drm/amd/amdgpu/../display/dc/link/protocols/link_dpcd.c:248 
core_link_write_dpcd() error: uninitialized symbol 'status'.

Cc: sta...@vger.kernel.org
Cc: Jerry Zuo 
Cc: Jun Lei 
Cc: Wayne Lin 
Cc: Aurabindo Pillai 
Cc: Rodrigo Siqueira 
Cc: Hamza Mahfooz 
Signed-off-by: Srinivasan Shanmugam 
---
v2:
  - Initialized status variable to 'DC_ERROR_UNEXPECTED' default.
  - Added Jerry to Cc

 drivers/gpu/drm/amd/display/dc/link/protocols/link_dpcd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dpcd.c 
b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dpcd.c
index 5c9a30211c10..fc50931c2aec 100644
--- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dpcd.c
+++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dpcd.c
@@ -205,7 +205,7 @@ enum dc_status core_link_read_dpcd(
uint32_t extended_size;
/* size of the remaining partitioned address space */
uint32_t size_left_to_read;
-   enum dc_status status;
+   enum dc_status status = DC_ERROR_UNEXPECTED;
/* size of the next partition to be read from */
uint32_t partition_size;
uint32_t data_index = 0;
@@ -234,7 +234,7 @@ enum dc_status core_link_write_dpcd(
 {
uint32_t partition_size;
uint32_t data_index = 0;
-   enum dc_status status;
+   enum dc_status status = DC_ERROR_UNEXPECTED;
 
while (size) {
partition_size = dpcd_get_next_partition_size(address, size);
-- 
2.34.1



Re: [PATCH] drm/amdgpu: only remove existing FBs for devices with displays

2024-01-17 Thread Alex Deucher
On Wed, Jan 17, 2024 at 2:42 AM Christian König
 wrote:
>
> Am 16.01.24 um 15:39 schrieb Alex Deucher:
> > Seems calling drm_aperture_remove_conflicting_pci_framebuffers()
> > will take away the apertures for unrelated devices on some kernel
> > versions.  E.g., calling this on a PCIe accelerator with no display
> > IP may take the apertures away from the actual PCIe display adapter
> > on the system, breaking efifb, depending on the kernel version.
> >
> > Just do this if there is display IP present.
>
> I would have checked the PCI device type instead, because a system BIOS
> most likely has no idea that a VGA device doesn't has a connector.

True, but we have GPUs of PCI class PCI_CLASS_DISPLAY_OTHER that can
have efifb bound to it.  Unfortunately, the combinations of PCI
classes and displays is complicated:

PCI_CLASS_DISPLAY_VGA - boards both with and without display hardware
PCI_CLASS_DISPLAY_OTHER - boards both with and without display hardware
PCI_CLASS_ACCELERATOR_PROCESSING - no display hardware

>
> > Signed-off-by: Alex Deucher 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++
> >   1 file changed, 6 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 62772b58ef3d..353c38f008e8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -4056,10 +4056,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> >
> >   amdgpu_device_set_mcbp(adev);
> >
> > - /* Get rid of things like offb */
> > - r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, 
> > _kms_driver);
> > - if (r)
> > - return r;
> > + if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_DCE)) {
>
> This certainly worth a comment why we do this.
>
> Apart from that I'm not sure we should upstream this, the customer
> kernel is most likely just missing this fix here:
>
> commit 5ae3716cfdcd286268133867f67d0803847acefc
> Author: Daniel Vetter 
> Date:   Thu Apr 6 15:21:07 2023 +0200
>
>  video/aperture: Only remove sysfb on the default vga pci device
>
>  Instead of calling aperture_remove_conflicting_devices() to remove the
>  conflicting devices, just call to aperture_detach_devices() to detach
>  the device that matches the same PCI BAR / aperture range. Since the
>  former is just a wrapper of the latter plus a sysfb_disable() call,
>  and now that's done in this function but only for the primary devices.
>
>  This fixes a regression introduced by commit ee7a69aa38d8 ("fbdev:
>  Disable sysfb device registration when removing conflicting FBs"),
>  where we remove the sysfb when loading a driver for an unrelated pci
>  device, resulting in the user losing their efifb console or similar.
>
>  Note that in practice this only is a problem with the nvidia blob,
>  because that's the only gpu driver people might install which does not
>  come with an fbdev driver of it's own. For everyone else the real gpu
>  driver will restore a working console.
>
>  Also note that in the referenced bug there's confusion that this same
>  bug also happens on amdgpu. But that was just another amdgpu specific
>  regression, which just happened to happen at roughly the same time and
>  with the same user-observable symptoms. That bug is fixed now, see
>  https://bugzilla.kernel.org/show_bug.cgi?id=216331#c15
>

yeah, that looks like the right fix.

Alex


> Regards,
> Christian.
>
> > + /* Get rid of things like offb */
> > + r = 
> > drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, 
> > _kms_driver);
> > + if (r)
> > + return r;
> > + }
> >
> >   /* Enable TMZ based on IP_VERSION */
> >   amdgpu_gmc_tmz_set(adev);
>


Re: [PATCH 0/2] drm/atomic: Allow drivers to write their own plane check for async

2024-01-17 Thread Michel Dänzer
On 2024-01-17 13:57, Xaver Hugl wrote:
> Am Mi., 17. Jan. 2024 um 09:55 Uhr schrieb Pekka Paalanen 
> :
>> Is it important enough to be special-cased, e.g. to be always allowed
>> with async commits?
> 
> I thought so, and sent a patch to dri-devel to make it happen, but
> there are some
> concerns about untested driver paths.
> https://lists.freedesktop.org/archives/dri-devel/2024-January/437511.html
> 
>> Now that I think of it, if userspace needs to wait for the in-fence
>> itself before kicking KMS async, that would defeat much of the async's
>> point, right? And cases where in-fence is not necessary are so rare
>> they might not even exist?
>>
>> So if driver/hardware cannot do IN_FENCE_FD with async, is there any
>> use of supporting async to begin with?
> 
> KWin never commits a buffer where IN_FENCE_FD would actually delay the
> pageflip; it's really only used to disable implicit sync, as there's some edge
> cases where it can wrongly delay the pageflip. The waiting for buffers to 
> become
> readable on the compositor side isn't really significant in terms of latency.
> 
> If hardware doesn't support IN_FENCE_FD with async commits, checking if the
> fence is already signaled at commit time would thus still make things work, at
> least for KWin.

That's how IN_FENCE_FD (and implicit sync) is handled anyway, in common code: 
It waits for all fences to signal before calling into the driver to commit the 
atomic commit.

I can't see why this wouldn't work with async commits, the same as with 
synchronous ones, with any driver.


-- 
Earthling Michel Dänzer|  https://redhat.com
Libre software enthusiast  | Mesa and Xwayland developer



Re: [PATCH] drm/amdgpu/pm: Fix the power source flag error

2024-01-17 Thread Alex Deucher
On Wed, Jan 17, 2024 at 4:01 AM Ma Jun  wrote:
>
> The power source flag should be updated when
> [1] System receives an interrupt indicating that the power source
> has changed.
> [2] System resumes from suspend or runtime suspend
>
> Signed-off-by: Ma Jun 
> ---
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 24 +++
>  .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  2 ++
>  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  2 ++
>  3 files changed, 18 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index c16703868e5c..e16d22e30a8a 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -24,6 +24,7 @@
>
>  #include 
>  #include 
> +#include 
>  #include 
>
>  #include "amdgpu.h"
> @@ -793,6 +794,17 @@ static int 
> smu_apply_default_config_table_settings(struct smu_context *smu)
> return smu_set_config_table(smu, >pm.config_table);
>  }
>
> +static void smu_update_power_source(struct amdgpu_device *adev)
> +{
> +   if (power_supply_is_system_supplied() > 0)
> +   adev->pm.ac_power = true;
> +   else
> +   adev->pm.ac_power = false;
> +
> +   if (is_support_sw_smu(adev))

Do we need this check here?  This is the sw_smu code.

> +   smu_set_ac_dc(adev->powerplay.pp_handle);
> +}
> +
>  static int smu_late_init(void *handle)
>  {
> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -817,16 +829,8 @@ static int smu_late_init(void *handle)
>  * handle the switch automatically. Driver involvement
>  * is unnecessary.
>  */
> -   if (!smu->dc_controlled_by_gpio) {
> -   ret = smu_set_power_source(smu,
> -  adev->pm.ac_power ? 
> SMU_POWER_SOURCE_AC :
> -  SMU_POWER_SOURCE_DC);
> -   if (ret) {
> -   dev_err(adev->dev, "Failed to switch to %s mode!\n",
> -   adev->pm.ac_power ? "AC" : "DC");
> -   return ret;
> -   }
> -   }
> +   if (!smu->dc_controlled_by_gpio)
> +   smu_update_power_source(adev);
>
> if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 1)) ||
> (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 3)))
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> index 2e7f8d5cfc28..8047150fddd4 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> @@ -1442,10 +1442,12 @@ static int smu_v11_0_irq_process(struct amdgpu_device 
> *adev,
> case 0x3:
> dev_dbg(adev->dev, "Switched to AC mode!\n");
> schedule_work(>interrupt_work);
> +   adev->pm.ac_power = true;
> break;
> case 0x4:
> dev_dbg(adev->dev, "Switched to DC mode!\n");
> schedule_work(>interrupt_work);
> +   adev->pm.ac_power = false;
> break;
> case 0x7:
> /*
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> index 771a3d457c33..c486182ff275 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> @@ -1379,10 +1379,12 @@ static int smu_v13_0_irq_process(struct amdgpu_device 
> *adev,
> case 0x3:
> dev_dbg(adev->dev, "Switched to AC mode!\n");
> smu_v13_0_ack_ac_dc_interrupt(smu);
> +   adev->pm.ac_power = true;
> break;
> case 0x4:
> dev_dbg(adev->dev, "Switched to DC mode!\n");
> smu_v13_0_ack_ac_dc_interrupt(smu);
> +   adev->pm.ac_power = false;
> break;
> case 0x7:
> /*
> --
> 2.34.1
>


Re: [PATCH 0/2] drm/atomic: Allow drivers to write their own plane check for async

2024-01-17 Thread Xaver Hugl
Am Mi., 17. Jan. 2024 um 09:55 Uhr schrieb Pekka Paalanen :
> Is it important enough to be special-cased, e.g. to be always allowed
> with async commits?

I thought so, and sent a patch to dri-devel to make it happen, but
there are some
concerns about untested driver paths.
https://lists.freedesktop.org/archives/dri-devel/2024-January/437511.html

> Now that I think of it, if userspace needs to wait for the in-fence
> itself before kicking KMS async, that would defeat much of the async's
> point, right? And cases where in-fence is not necessary are so rare
> they might not even exist?
>
> So if driver/hardware cannot do IN_FENCE_FD with async, is there any
> use of supporting async to begin with?

KWin never commits a buffer where IN_FENCE_FD would actually delay the
pageflip; it's really only used to disable implicit sync, as there's some edge
cases where it can wrongly delay the pageflip. The waiting for buffers to become
readable on the compositor side isn't really significant in terms of latency.

If hardware doesn't support IN_FENCE_FD with async commits, checking if the
fence is already signaled at commit time would thus still make things work, at
least for KWin.

> > If the compositor prioritizes tearing and would like to do overlay planes
> > if possible,
> > it would have to know that switching to synchronous commits for a single
> > frame,
> > setting up the overlay planes and then switching back to async commits
> > works, and
> > that can't be figured out with TEST_ONLY commits.
>
> I had to ponder a bit why. So I guess the synchronous commit in between
> is because driver/hardware may not be able to enable/disable extra
> planes in async, so you need a synchronous commit to set them up, but
> afterwards updates can tear.

The hardware could be a factor, yes, but I've been thinking more about the API.
With this patchset, the compositor is still only allowed to change a
limited set of
plane properties - but it needs to set at least SRC_X, SRC_Y, CRTC_X etc on
the overlay plane(s) to the correct values before it can only change the allowed
properties again.

> The comment about Intel needing one more synchronous commit when
> switching from sync to async updates comes to mind as well, would that
> be a problem?

With only one synchronous update, the compositor could theoretically mask the
issue by committing it right before vblank; with that one
implicitly-sync "async"
commit though, you'd definitely get one frame without async commits. Having a
flag for a sync-but-then-async-again commit could solve that issue.

In practice I don't think anyone will notice one or two frames without
async commits.
It should be a pretty rare occurance, usually when the game or match
starts or an
overlay gets opened, so I doubt it's worth putting effort in to fix that.

> > So I think having a CAP or immutable plane property to signal that async
> > commits
> > with overlay and/or cursor planes is supported would be useful.
>
> Async cursor planes a good point, particularly moving them around. I'm
> not too informed about the prior/on-going efforts to allow cursor
> movement more often than refresh rate, I recall something about
> amending atomic commits? How would these interact?
>
> I suppose the kernel still prevents any new async commit while a
> previous commit is not finished, so amending commits would still be
> necessary for cursor plane motion? Or would it, if you time "big
> commits" to finish quickly and then spam async "cursor commits" in the
> mean time?

With async commits for cursor planes I'm really only talking about
getting to use
the cursor plane while doing async commits on the primary plane.

FWIW I personally consider the amend stuff mostly solved - KWin does that
internally since a few months ago now, with a separate thread to amend and
even reorder commits in a queue, and only actually commit immediately
before vblank.

>
> Thanks,
> pq
>
> > Am Di., 16. Jan. 2024 um 14:35 Uhr schrieb André Almeida <
> > andrealm...@igalia.com>:
> >
> > > + Joshua
> > >
> > > Em 16/01/2024 10:14, Pekka Paalanen escreveu:
> > > > On Tue, 16 Jan 2024 08:50:59 -0300
> > > > André Almeida  wrote:
> > > >
> > > >> Hi Pekka,
> > > >>
> > > >> Em 16/01/2024 06:45, Pekka Paalanen escreveu:
> > > >>> On Tue, 16 Jan 2024 01:51:57 -0300
> > > >>> André Almeida  wrote:
> > > >>>
> > >  Hi,
> > > 
> > >  AMD hardware can do more on the async flip path than just the primary
> > > plane, so
> > >  to lift up the current restrictions, this patchset allows drivers to
> > > write their
> > >  own check for planes for async flips.
> > > >>>
> > > >>> Hi,
> > > >>>
> > > >>> what's the userspace story for this, how could userspace know it could
> > > do more?
> > > >>> What kind of userspace would take advantage of this and in what
> > > situations?
> > > >>>
> > > >>> Or is this not meant for generic userspace?
> > > >>
> > > >> Sorry, I forgot to document this. So the idea is that userspace will
> > > >> 

Re: [PATCH v2 2/4] drm/uAPI: Add "force color format" drm property as setting for userspace

2024-01-17 Thread Andri Yngvason
mið., 17. jan. 2024 kl. 09:21 skrifaði Pekka Paalanen :
>
> On Tue, 16 Jan 2024 14:11:43 +
> Andri Yngvason  wrote:
>
> > þri., 16. jan. 2024 kl. 13:29 skrifaði Sebastian Wick
> > :
> > >
> > > On Tue, Jan 16, 2024 at 01:13:13PM +, Andri Yngvason wrote:
> > [...]
> > > > şri., 16. jan. 2024 kl. 11:42 skrifaği Sebastian Wick
> > > > :
> > > > >
> > > > > On Mon, Jan 15, 2024 at 04:05:52PM +, Andri Yngvason wrote:
> > > > > > From: Werner Sembach 
> > > > > >
> > > > > > Add a new general drm property "force color format" which can be 
> > > > > > used
> > > > > > by userspace to tell the graphics driver which color format to use.
> > > > >
> > > > > I don't like the "force" in the name. This just selects the color
> > > > > format, let's just call it "color format" then.
> > > > >
> > > >
> > > > In previous revisions, this was "preferred color format" and "actual
> > > > color format", of which the latter has been dropped. I recommend
> > > > reading the discussion for previous revisions.
> > >
> > > Please don't imply that I didn't read the thread I'm answering to.
>
> FYI, I have not read this thread.
>

pq, You did not read this summary?
https://lore.kernel.org/dri-devel/cafnqbqwjejax6b4oewpgasmud5_nxzymxufdog294ctvgbt...@mail.gmail.com/

You partook in the discussion on IRC. Please read it and tell me if I
misunderstood anything.

Sebastian, I apologise. You clearly read it as you even replied to it!

> > >
> > > > There are arguments for adding "actual color format" later and if it
> > > > is added later, we'd end up with "color format" and "actual color
> > > > format", which might be confusing, and it is why I chose to call it
> > > > "force color format" because it clearly communicates intent and
> > > > disambiguates it from "actual color format".
> > >
> > > There is no such thing as "actual color format" in upstream though.
> > > Basing your naming on discarded ideas is not useful. The thing that sets
> > > the color space for example is called "Colorspace", not "force
> > > colorspace".
> > >
> >
> > Sure, I'm happy with calling it whatever people want. Maybe we can
> > have a vote on it?
>
> It would sound strange to say "force color format" = "auto". Just drop
> the "force" of it.
>
> If and when we need the feedback counterpart, it could be an immutable
> prop called "active color format" where "auto" is not a valid value, or
> something in the new "output properties" design Sima has been thinking
> of.

There seems to be consensus for calling it "color format"

>
> > > > [...]
> > > > > > @@ -1396,6 +1404,15 @@ static const u32 dp_colorspaces =
> > > > > >   *   drm_connector_attach_max_bpc_property() to create and attach 
> > > > > > the
> > > > > >   *   property to the connector during initialization.
> > > > > >   *
> > > > > > + * force color format:
> > > > > > + *   This property is used by userspace to change the used color 
> > > > > > format. When
> > > > > > + *   used the driver will use the selected format if valid for the 
> > > > > > hardware,
> > > > >
> > > > > All properties are always "used", they just can have different values.
> > > > > You probably want to talk about the auto mode here.
> > > >
> > > > Maybe we can say something like: If userspace does not set the
> > > > property or if it is explicitly set to zero, the driver will select
> > > > the appropriate color format based on other constraints.
> > >
> > > The property can be in any state without involvement from user space.
> > > Don't talk about setting it, talk about the state it is in:
> > >
> > >   When the color format is auto, the driver will select a format.
> > >
> >
> > Ok.
> >
> > > > >
> > > > > > + *   sink, and current resolution and refresh rate combination. 
> > > > > > Drivers to
> > > > >
> > > > > If valid? So when a value is not actually supported user space can 
> > > > > still
> > > > > set it? What happens then? How should user space figure out if the
> > > > > driver and the sink support the format?
> > > >
> > > > The kernel does not expose this property unless it's implemented in the 
> > > > driver.
> > >
> > > If the driver simply doesn't support *one format*, the enum value for
> > > that format should not be exposed, period. This isn't about the property
> > > on its own.
> >
> > Right, understood. You mean that enum should only contain values that
> > are supported by the driver.
>
> Yes. When a driver installs a property, it can choose which of the enum
> entries are exposed. That cannot be changed later though, so the list
> cannot live by the currently connected sink, only by what the driver
> and display controlled could ever do.

Yes, and I think that basing it also on the connected sink's
capabilities would just add complexity for very little gain. In fact,
I think that limiting it based on the driver's capabilities is also
over-engineering, but I don't mind adding it if that's what people
really want.

>
> > > > This was originally "preferred color format". 

Re: [PATCH 1/2] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit after writing rptr

2024-01-17 Thread Christian König

Am 16.01.24 um 11:31 schrieb Friedrich Vock:

On 16.01.24 08:03, Christian König wrote:

Am 15.01.24 um 12:18 schrieb Friedrich Vock:

[SNIP]

+    if (ih->overflow) {
+    tmp = RREG32(mmIH_RB_CNTL);
+    tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK;
+    WREG32(mmIH_RB_CNTL, tmp);
+    ih->overflow = false;
+    }


Well that is an extremely bad idea. We already reset the overflow
after reading the WPTR.


This is not resetting the overflow bit. This is resetting a "clear
overflow" bit. I don't have the hardware docs, but the name (and my
observations) strongly suggest that setting this bit actually prevents
the hardware from setting the overflow bit ever again.


Well that doesn't make any sense at all. The hardware documentation
clearly states that this bit is write only and should always read as
zero.

Setting this bit will clear the overflow flag in the WPTR register and
clearing it has no effect at all.

I could only ping the hw engineer responsible for this block to double
check if the documentation is somehow outdated, but I really doubt so.


I see. I wish I had access to the documentation,


Well, doesn't Valve has an NDA in place?


but I don't, so all I
can do is tell you what I observe the hardware doing. I've tested this
on both a Steam Deck (OSSYS 5.2.0) and an RX 6700 XT (OSSYS 5.0.3). On
both systems, launching a bunch of shaders that cause page faults leads
to lots of "[gfxhub] page fault" messages in dmesg, followed by an
"amdgpu: IH ring buffer overflow".


Well that is certainly a bug, maybe even the same thing we have seen on 
Vega and MI.


What we could do is to try to apply the same workaround to re-route the 
page faults to a different IH ring.


See those patches here as well:

commit 516bc3d8dd7965f1a8a3ea453857f14d95971e62
Author: Christian König 
Date:   Fri Nov 2 15:00:16 2018 +0100

    drm/amdgpu: reroute VMC and UMD to IH ring 1

    Page faults can easily overwhelm the interrupt handler.

    So to make sure that we never lose valuable interrupts on the 
primary ring

    we re-route page faults to IH ring 1.

commit b849aaa41c914a0fd88003f88cb04420a873c624
Author: Christian König 
Date:   Mon Mar 4 19:34:34 2019 +0100

    drm/amdgpu: also reroute VMC and UMD to IH ring 1 on Vega 20

    Same patch we alredy did for Vega10. Just re-route page faults to a 
separate

    ring to avoid drowning in interrupts.



If I re-launch the same set of shaders after the GPU has soft-recovered,
the "amdgpu: IH ring buffer overflow" message is missing, even though
the same amount of page faults should've been triggered at roughly the
same rate. Running with this patch applied makes more "amdgpu: IH ring
buffer overflow" messages appear after relaunching the faulting shaders
(but not when processing any non-faulting work).


That is actually the expected behavior. There should be a limit on the 
number of faults written to the ring so that the ring never overflows.




The only possible conclusion I can draw from this is that clearing that
bit *does* have an effect, and I don't think it's far-fetched to assume
the IH ring buffer overflows still happen after re-launching the
faulting shaders but go undetected so far.


Well that can only mean that the hw documentation is incorrect.

Either the value is not write only trigger bit as documented or we need 
an additional read of the register for it to take effect or something 
like this.



Right now, IH overflows, even if they occur repeatedly, only get
registered once. If not registering IH overflows can trivially lead to
system crashes, it's amdgpu's current handling that is broken.


It's years that we last tested this but according to the HW
documentation this should work fine.

What could potentially happen is that the IH has silenced the source
of the overflow. We never implemented resetting those, but in this
case that here won't help either.


If the IH silenced the page faults (which quite clearly cause the
overflow here), then how are the page faults still logged in dmesg?


There should be a hardware rate limit for the page faults, e.g. there 
can only be X faults reported in N clock cycles and then a delay is 
inserted.




The possibility of a repeated IH overflow in between reading the wptr
and updating the rptr is a good point, but how can we detect that at
all? It seems to me like we can't set the OVERFLOW_CLEAR bit at all
then, because we're guaranteed to miss any overflows that happen while
the bit is set.


When an IH overflow is signaled we clear that flag by writing 1 into
the OVERFLOW_CLEAR bit and skip one entry in the IH ring buffer.

What can of course happen is that the IH ring buffer overflows more
than this single entry and we process IVs which are potentially
corrupted, but we won't miss any additional overflows since we only
start processing after resetting the flag.

An IH overflow is also something you should *never* see in a
production system. This is purely for driver bringup and as 

RE: [PATCH 2/2] update check condition of query for ras page retire

2024-01-17 Thread Zhang, Hawking
[AMD Official Use Only - General]

static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context *smu,
void *table)
 {
-   /* Support ecc info by default */
-   return 0;
+   /* we use debug mode flag instead of this interface */
+   return -EOPNOTSUPP;
 }

Shall we just drop the callback implementation? smu_get_ecc_info will return 
-EOPNOTSUPP if the callback is not supported.

Regards,
Hawking

-Original Message-
From: amd-gfx  On Behalf Of Tao Zhou
Sent: Wednesday, January 17, 2024 17:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao 
Subject: [PATCH 2/2] update check condition of query for ras page retire

Support page retirement handling in debug mode.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  | 9 +++--
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 41139bac7643..6df32f0afd89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -90,12 +90,16 @@ static void amdgpu_umc_handle_bad_pages(struct 
amdgpu_device *adev,  {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   unsigned int error_query_mode;
int ret = 0;

+   amdgpu_ras_get_error_query_mode(adev, _query_mode);
+
mutex_lock(>page_retirement_lock);

ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
-   if (ret == -EOPNOTSUPP) {
+   if (ret == -EOPNOTSUPP &&
+   error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
adev->umc.ras->ras_block.hw_ops->query_ras_error_count)

adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status); 
@@ -119,7 +123,8 @@ static void amdgpu_umc_handle_bad_pages(struct 
amdgpu_device *adev,
 */

adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, 
ras_error_status);
}
-   } else if (!ret) {
+   } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
+   (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
if (adev->umc.ras &&
adev->umc.ras->ecc_info_query_ras_error_count)
adev->umc.ras->ecc_info_query_ras_error_count(adev, 
ras_error_status); diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index c560f4af214d..d86c9e7fc64b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2909,8 +2909,8 @@ static int smu_v13_0_6_select_xgmi_plpd_policy(struct 
smu_context *smu,  static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context 
*smu,
void *table)
 {
-   /* Support ecc info by default */
-   return 0;
+   /* we use debug mode flag instead of this interface */
+   return -EOPNOTSUPP;
 }

 static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
--
2.35.1



RE: [PATCH 1/2] update error condition check for umc_v12_0_query_error_address

2024-01-17 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Tao Zhou
Sent: Wednesday, January 17, 2024 17:14
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao 
Subject: [PATCH 1/2] update error condition check for 
umc_v12_0_query_error_address

Deferred error is also taken into account.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 10edf818acf5..2e0bd4312f2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -337,10 +337,7 @@ static int umc_v12_0_query_error_address(struct 
amdgpu_device *adev,
}

/* calculate error address if ue error is detected */
-   if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 
&&
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 
1 &&
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1) 
{
-
+   if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) {
mc_umc_addrt0 =
SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

--
2.35.1



RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread Zhang, Hawking
[AMD Official Use Only - General]


Please check my comments inline

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 
poisoning

Use asynchronous polling to handle umc_v12_0 poisoning.

Signed-off-by: YiPeng Chai mailto:yipeng.c...@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
 3 files changed, 120 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 856206e95842..44929281840e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
+
 enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_read(>page_retirement_req_cnt));

atomic_dec(>page_retirement_req_cnt);
+
+   amdgpu_umc_poison_retire_page_polling_timeout(adev,
+   false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}

return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9d1cf41cf483..2dde29cb807d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -23,6 +23,7 @@

 #include "amdgpu.h"
 #include "umc_v6_7.h"
+#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms

 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t 
err_addr, @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct 
amdgpu_device *adev,
return ret;
 }

-static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
-   void *ras_error_status,
-   struct amdgpu_iv_entry *entry,
-   bool reset)
+static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+   void *ras_error_status)
 {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
unsigned long err_count;
-
-   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   mutex_lock(>page_retirement_lock);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && @@ 
-163,19 +161,86 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
con->update_channel_flag = false;
}
}
-
-   if (reset) {
-   /* use mode-2 reset for poison consumption */
-   if (!entry)
-   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
-   amdgpu_ras_reset_gpu(adev);
-   }
}

kfree(err_data->err_addr);
+
+   mutex_unlock(>page_retirement_lock);
+}
+
+static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+   void *ras_error_status,
+   struct amdgpu_iv_entry *entry,
+   bool reset)
+{
+   struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   amdgpu_umc_handle_bad_pages(adev, ras_error_status);
+
+   if (err_data->ue_count && reset) {
+   /* use mode-2 reset for poison consumption */
+   if (!entry)
+   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;

[Hawking]: Shall we do further check on con->poison_supported flag to decide 
issuing mode-2 or mode-1.

+   amdgpu_ras_reset_gpu(adev);
+   }
+
return AMDGPU_RAS_SUCCESS;
 }

+int amdgpu_umc_poison_retire_page_polling_timeout(struct amdgpu_device *adev,
+   bool reset, uint32_t timeout_ms)
[Hawking] int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, 
boot reset, uint32_t timeout_ms)
+{
+   struct ras_err_data err_data;
+   struct ras_common_if head = {
+   .block = AMDGPU_RAS_BLOCK__UMC,
+   };
+   struct 

RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Zhang, Hawking
[AMD Official Use Only - General]

Please ignore my first comment. It doesn't necessarily associated with socket  
id in UMC MCA status log at this stage.

Regards,
Hawking

-Original Message-
From: amd-gfx  On Behalf Of Zhang, 
Hawking
Sent: Wednesday, January 17, 2024 19:12
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Li, Candice 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

[AMD Official Use Only - General]

+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,

Please also print out socket id for UMC MCA status.

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);
+

This seems redundant or was added for debugging purpose. We can drop this print 
since there is log to cover failures.

Regards,
Hawking


-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct 
smu_context *smu, enum amdgpu_

ret = smu_cmn_send_smc_msg(smu, msg, count);
if (ret) {
+   dev_err(smu->adev->dev, "%s(%d) failed to query %s MCA count, 
ret:%d\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+

RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Zhang, Hawking
[AMD Official Use Only - General]

+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,

Please also print out socket id for UMC MCA status.

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);
+

This seems redundant or was added for debugging purpose. We can drop this print 
since there is log to cover failures.

Regards,
Hawking


-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct 
smu_context *smu, enum amdgpu_

ret = smu_cmn_send_smc_msg(smu, msg, count);
if (ret) {
+   dev_err(smu->adev->dev, "%s(%d) failed to query %s MCA count, 
ret:%d\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   ret);
*count = 0;
return ret;
}

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);
+

Re: [PATCH] drm/amdgpu/pm: Fix the power source flag error

2024-01-17 Thread Lazar, Lijo

On 1/17/2024 2:22 PM, Ma Jun wrote:

The power source flag should be updated when
[1] System receives an interrupt indicating that the power source
has changed.
[2] System resumes from suspend or runtime suspend

Signed-off-by: Ma Jun 
---
  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 24 +++
  .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  2 ++
  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  2 ++
  3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index c16703868e5c..e16d22e30a8a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -24,6 +24,7 @@
  
  #include 

  #include 
+#include 
  #include 
  
  #include "amdgpu.h"

@@ -793,6 +794,17 @@ static int smu_apply_default_config_table_settings(struct 
smu_context *smu)
return smu_set_config_table(smu, >pm.config_table);
  }
  
+static void smu_update_power_source(struct amdgpu_device *adev)

+{
+   if (power_supply_is_system_supplied() > 0)
+   adev->pm.ac_power = true;
+   else
+   adev->pm.ac_power = false;
+
+   if (is_support_sw_smu(adev))
+   smu_set_ac_dc(adev->powerplay.pp_handle);
+}
+
  static int smu_late_init(void *handle)
  {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -817,16 +829,8 @@ static int smu_late_init(void *handle)
 * handle the switch automatically. Driver involvement
 * is unnecessary.
 */
-   if (!smu->dc_controlled_by_gpio) {
-   ret = smu_set_power_source(smu,
-  adev->pm.ac_power ? 
SMU_POWER_SOURCE_AC :
-  SMU_POWER_SOURCE_DC);
-   if (ret) {
-   dev_err(adev->dev, "Failed to switch to %s mode!\n",
-   adev->pm.ac_power ? "AC" : "DC");
-   return ret;
-   }
-   }


For this part of the change - driver already updates FW with the initial 
detected power state or the last detected power state before going for 
suspend. Isn't that good enough?


Thanks,
Lijo


+   if (!smu->dc_controlled_by_gpio)
+   smu_update_power_source(adev);
  
  	if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 1)) ||

(amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 3)))
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index 2e7f8d5cfc28..8047150fddd4 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -1442,10 +1442,12 @@ static int smu_v11_0_irq_process(struct amdgpu_device 
*adev,
case 0x3:
dev_dbg(adev->dev, "Switched to AC mode!\n");
schedule_work(>interrupt_work);
+   adev->pm.ac_power = true;
break;
case 0x4:
dev_dbg(adev->dev, "Switched to DC mode!\n");
schedule_work(>interrupt_work);
+   adev->pm.ac_power = false;
break;
case 0x7:
/*
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 771a3d457c33..c486182ff275 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -1379,10 +1379,12 @@ static int smu_v13_0_irq_process(struct amdgpu_device 
*adev,
case 0x3:
dev_dbg(adev->dev, "Switched to AC mode!\n");
smu_v13_0_ack_ac_dc_interrupt(smu);
+   adev->pm.ac_power = true;
break;
case 0x4:
dev_dbg(adev->dev, "Switched to DC mode!\n");
smu_v13_0_ack_ac_dc_interrupt(smu);
+   adev->pm.ac_power = false;
break;
case 0x7:
/*




Re: [PATCH v2 2/4] drm/uAPI: Add "force color format" drm property as setting for userspace

2024-01-17 Thread Andri Yngvason
Hi Sebastian,

þri., 16. jan. 2024 kl. 11:42 skrifaði Sebastian Wick
:
>
> On Mon, Jan 15, 2024 at 04:05:52PM +, Andri Yngvason wrote:
> > From: Werner Sembach 
> >
> > Add a new general drm property "force color format" which can be used
> > by userspace to tell the graphics driver which color format to use.
>
> I don't like the "force" in the name. This just selects the color
> format, let's just call it "color format" then.
>

In previous revisions, this was "preferred color format" and "actual
color format", of which the latter has been dropped. I recommend
reading the discussion for previous revisions.

There are arguments for adding "actual color format" later and if it
is added later, we'd end up with "color format" and "actual color
format", which might be confusing, and it is why I chose to call it
"force color format" because it clearly communicates intent and
disambiguates it from "actual color format".

[...]
> > @@ -1396,6 +1404,15 @@ static const u32 dp_colorspaces =
> >   *   drm_connector_attach_max_bpc_property() to create and attach the
> >   *   property to the connector during initialization.
> >   *
> > + * force color format:
> > + *   This property is used by userspace to change the used color format. 
> > When
> > + *   used the driver will use the selected format if valid for the 
> > hardware,
>
> All properties are always "used", they just can have different values.
> You probably want to talk about the auto mode here.

Maybe we can say something like: If userspace does not set the
property or if it is explicitly set to zero, the driver will select
the appropriate color format based on other constraints.

>
> > + *   sink, and current resolution and refresh rate combination. Drivers to
>
> If valid? So when a value is not actually supported user space can still
> set it? What happens then? How should user space figure out if the
> driver and the sink support the format?

The kernel does not expose this property unless it's implemented in the driver.

This was originally "preferred color format". Perhaps the
documentation should better reflect that it is now a mandatory
constraint which fails the modeset if not satisfied.

>
> For the Colorspace prop, the kernel just exposes all formats it supports
> (independent of the sink) and then makes it the job of user space to
> figure out if the sink supports it.
>
> The same could be done here. Property value is exposed if the driver
> supports it in general, commits can fail if the driver can't support it
> for a specific commit because e.g. the resolution or refresh rate. User
> space must look at the EDID/DisplayID/mode to figure out the supported
> format for the sink.

Yes, we can make it possible for userspace to discover which modes are
supported by the monitor, but there are other constraints that need to
be satisfied. This was discussed in the previous revision.

In any case, these things can be added later and need not be a part of
this change set.

[...]

Regards,
Andri


Re: [PATCH 0/2] drm/atomic: Allow drivers to write their own plane check for async

2024-01-17 Thread Xaver Hugl
My plan is to require support for IN_FENCE_FD at least. If the driver
doesn't
allow tearing with that, then tearing just doesn't happen.

For overlay planes though, it depends on how the compositor prioritizes
things.
If the compositor prioritizes overlay planes and would like to do tearing
if possible,
then this patch works.
If the compositor prioritizes tearing and would like to do overlay planes
if possible,
it would have to know that switching to synchronous commits for a single
frame,
setting up the overlay planes and then switching back to async commits
works, and
that can't be figured out with TEST_ONLY commits.
So I think having a CAP or immutable plane property to signal that async
commits
with overlay and/or cursor planes is supported would be useful.

Am Di., 16. Jan. 2024 um 14:35 Uhr schrieb André Almeida <
andrealm...@igalia.com>:

> + Joshua
>
> Em 16/01/2024 10:14, Pekka Paalanen escreveu:
> > On Tue, 16 Jan 2024 08:50:59 -0300
> > André Almeida  wrote:
> >
> >> Hi Pekka,
> >>
> >> Em 16/01/2024 06:45, Pekka Paalanen escreveu:
> >>> On Tue, 16 Jan 2024 01:51:57 -0300
> >>> André Almeida  wrote:
> >>>
>  Hi,
> 
>  AMD hardware can do more on the async flip path than just the primary
> plane, so
>  to lift up the current restrictions, this patchset allows drivers to
> write their
>  own check for planes for async flips.
> >>>
> >>> Hi,
> >>>
> >>> what's the userspace story for this, how could userspace know it could
> do more?
> >>> What kind of userspace would take advantage of this and in what
> situations?
> >>>
> >>> Or is this not meant for generic userspace?
> >>
> >> Sorry, I forgot to document this. So the idea is that userspace will
> >> query what they can do here with DRM_MODE_ATOMIC_TEST_ONLY calls,
> >> instead of having capabilities for each prop.
> >
> > That's the theory, but do you have a practical example?
> >
> > What other planes and props would one want change in some specific use
> > case?
> >
> > Is it just "all or nothing", or would there be room to choose and pick
> > which props you change and which you don't based on what the driver
> > supports? If the latter, then relying on TEST_ONLY might be yet another
> > combinatorial explosion to iterate through.
> >
>
> That's a good question, maybe Simon, Xaver or Joshua can share how they
> were planning to use this on Gamescope or Kwin.
>
> >
> > Thanks,
> > pq
> >
>  I'm not sure if adding something new to drm_plane_funcs is the right
> way to do,
>  because if we want to expand the other object types (crtc, connector)
> we would
>  need to add their own drm_XXX_funcs, so feedbacks are welcome!
> 
> André
> 
>  André Almeida (2):
>  drm/atomic: Allow drivers to write their own plane check for async
>    flips
>  drm/amdgpu: Implement check_async_props for planes
> 
> .../amd/display/amdgpu_dm/amdgpu_dm_plane.c   | 30 +
> drivers/gpu/drm/drm_atomic_uapi.c | 62
> ++-
> include/drm/drm_atomic_uapi.h | 12 
> include/drm/drm_plane.h   |  5 ++
> 4 files changed, 92 insertions(+), 17 deletions(-)
> 
> >>>
> >
>


Re: [PATCH v2 2/4] drm/uAPI: Add "force color format" drm property as setting for userspace

2024-01-17 Thread Andri Yngvason
þri., 16. jan. 2024 kl. 13:29 skrifaði Sebastian Wick
:
>
> On Tue, Jan 16, 2024 at 01:13:13PM +, Andri Yngvason wrote:
[...]
> > şri., 16. jan. 2024 kl. 11:42 skrifaği Sebastian Wick
> > :
> > >
> > > On Mon, Jan 15, 2024 at 04:05:52PM +, Andri Yngvason wrote:
> > > > From: Werner Sembach 
> > > >
> > > > Add a new general drm property "force color format" which can be used
> > > > by userspace to tell the graphics driver which color format to use.
> > >
> > > I don't like the "force" in the name. This just selects the color
> > > format, let's just call it "color format" then.
> > >
> >
> > In previous revisions, this was "preferred color format" and "actual
> > color format", of which the latter has been dropped. I recommend
> > reading the discussion for previous revisions.
>
> Please don't imply that I didn't read the thread I'm answering to.
>
> > There are arguments for adding "actual color format" later and if it
> > is added later, we'd end up with "color format" and "actual color
> > format", which might be confusing, and it is why I chose to call it
> > "force color format" because it clearly communicates intent and
> > disambiguates it from "actual color format".
>
> There is no such thing as "actual color format" in upstream though.
> Basing your naming on discarded ideas is not useful. The thing that sets
> the color space for example is called "Colorspace", not "force
> colorspace".
>

Sure, I'm happy with calling it whatever people want. Maybe we can
have a vote on it?

> > [...]
> > > > @@ -1396,6 +1404,15 @@ static const u32 dp_colorspaces =
> > > >   *   drm_connector_attach_max_bpc_property() to create and attach the
> > > >   *   property to the connector during initialization.
> > > >   *
> > > > + * force color format:
> > > > + *   This property is used by userspace to change the used color 
> > > > format. When
> > > > + *   used the driver will use the selected format if valid for the 
> > > > hardware,
> > >
> > > All properties are always "used", they just can have different values.
> > > You probably want to talk about the auto mode here.
> >
> > Maybe we can say something like: If userspace does not set the
> > property or if it is explicitly set to zero, the driver will select
> > the appropriate color format based on other constraints.
>
> The property can be in any state without involvement from user space.
> Don't talk about setting it, talk about the state it is in:
>
>   When the color format is auto, the driver will select a format.
>

Ok.

> > >
> > > > + *   sink, and current resolution and refresh rate combination. 
> > > > Drivers to
> > >
> > > If valid? So when a value is not actually supported user space can still
> > > set it? What happens then? How should user space figure out if the
> > > driver and the sink support the format?
> >
> > The kernel does not expose this property unless it's implemented in the 
> > driver.
>
> If the driver simply doesn't support *one format*, the enum value for
> that format should not be exposed, period. This isn't about the property
> on its own.

Right, understood. You mean that enum should only contain values that
are supported by the driver.

>
> > This was originally "preferred color format". Perhaps the
> > documentation should better reflect that it is now a mandatory
> > constraint which fails the modeset if not satisfied.
>
> That would definitely help.
>
> > >
> > > For the Colorspace prop, the kernel just exposes all formats it supports
> > > (independent of the sink) and then makes it the job of user space to
> > > figure out if the sink supports it.
> > >
> > > The same could be done here. Property value is exposed if the driver
> > > supports it in general, commits can fail if the driver can't support it
> > > for a specific commit because e.g. the resolution or refresh rate. User
> > > space must look at the EDID/DisplayID/mode to figure out the supported
> > > format for the sink.
> >
> > Yes, we can make it possible for userspace to discover which modes are
> > supported by the monitor, but there are other constraints that need to
> > be satisfied. This was discussed in the previous revision.
>
> I mean, yes, that's what I said. User space would then only be
> responsible for checking the sink capabilities and the atomic check
> would take into account other (non-sink) constraints.

Since we need to probe using TEST_ONLY anyway, we'll end up with two
mechanisms to do the same thing where one of them depends on the other
for completeness.

>
> > In any case, these things can be added later and need not be a part of
> > this change set.
>
> No, this is the contract between the kernel and user space and has to be
> figured out before we can merge new uAPI.
>
> >
> > [...]
> >

Thanks,
Andri


Re: [PATCH v2 2/4] drm/uAPI: Add "force color format" drm property as setting for userspace

2024-01-17 Thread Pekka Paalanen
On Tue, 16 Jan 2024 14:11:43 +
Andri Yngvason  wrote:

> þri., 16. jan. 2024 kl. 13:29 skrifaði Sebastian Wick
> :
> >
> > On Tue, Jan 16, 2024 at 01:13:13PM +, Andri Yngvason wrote:  
> [...]
> > > şri., 16. jan. 2024 kl. 11:42 skrifaği Sebastian Wick
> > > :  
> > > >
> > > > On Mon, Jan 15, 2024 at 04:05:52PM +, Andri Yngvason wrote:  
> > > > > From: Werner Sembach 
> > > > >
> > > > > Add a new general drm property "force color format" which can be used
> > > > > by userspace to tell the graphics driver which color format to use.  
> > > >
> > > > I don't like the "force" in the name. This just selects the color
> > > > format, let's just call it "color format" then.
> > > >  
> > >
> > > In previous revisions, this was "preferred color format" and "actual
> > > color format", of which the latter has been dropped. I recommend
> > > reading the discussion for previous revisions.  
> >
> > Please don't imply that I didn't read the thread I'm answering to.

FYI, I have not read this thread.

> >  
> > > There are arguments for adding "actual color format" later and if it
> > > is added later, we'd end up with "color format" and "actual color
> > > format", which might be confusing, and it is why I chose to call it
> > > "force color format" because it clearly communicates intent and
> > > disambiguates it from "actual color format".  
> >
> > There is no such thing as "actual color format" in upstream though.
> > Basing your naming on discarded ideas is not useful. The thing that sets
> > the color space for example is called "Colorspace", not "force
> > colorspace".
> >  
> 
> Sure, I'm happy with calling it whatever people want. Maybe we can
> have a vote on it?

It would sound strange to say "force color format" = "auto". Just drop
the "force" of it.

If and when we need the feedback counterpart, it could be an immutable
prop called "active color format" where "auto" is not a valid value, or
something in the new "output properties" design Sima has been thinking
of.

> > > [...]  
> > > > > @@ -1396,6 +1404,15 @@ static const u32 dp_colorspaces =
> > > > >   *   drm_connector_attach_max_bpc_property() to create and attach the
> > > > >   *   property to the connector during initialization.
> > > > >   *
> > > > > + * force color format:
> > > > > + *   This property is used by userspace to change the used color 
> > > > > format. When
> > > > > + *   used the driver will use the selected format if valid for the 
> > > > > hardware,  
> > > >
> > > > All properties are always "used", they just can have different values.
> > > > You probably want to talk about the auto mode here.  
> > >
> > > Maybe we can say something like: If userspace does not set the
> > > property or if it is explicitly set to zero, the driver will select
> > > the appropriate color format based on other constraints.  
> >
> > The property can be in any state without involvement from user space.
> > Don't talk about setting it, talk about the state it is in:
> >
> >   When the color format is auto, the driver will select a format.
> >  
> 
> Ok.
> 
> > > >  
> > > > > + *   sink, and current resolution and refresh rate combination. 
> > > > > Drivers to  
> > > >
> > > > If valid? So when a value is not actually supported user space can still
> > > > set it? What happens then? How should user space figure out if the
> > > > driver and the sink support the format?  
> > >
> > > The kernel does not expose this property unless it's implemented in the 
> > > driver.  
> >
> > If the driver simply doesn't support *one format*, the enum value for
> > that format should not be exposed, period. This isn't about the property
> > on its own.  
> 
> Right, understood. You mean that enum should only contain values that
> are supported by the driver.

Yes. When a driver installs a property, it can choose which of the enum
entries are exposed. That cannot be changed later though, so the list
cannot live by the currently connected sink, only by what the driver
and display controlled could ever do.

> > > This was originally "preferred color format". Perhaps the
> > > documentation should better reflect that it is now a mandatory
> > > constraint which fails the modeset if not satisfied.  
> >
> > That would definitely help.
> >  
> > > >
> > > > For the Colorspace prop, the kernel just exposes all formats it supports
> > > > (independent of the sink) and then makes it the job of user space to
> > > > figure out if the sink supports it.
> > > >
> > > > The same could be done here. Property value is exposed if the driver
> > > > supports it in general, commits can fail if the driver can't support it
> > > > for a specific commit because e.g. the resolution or refresh rate. User
> > > > space must look at the EDID/DisplayID/mode to figure out the supported
> > > > format for the sink.  
> > >
> > > Yes, we can make it possible for userspace to discover which modes are
> > > supported by the monitor, but there are other constraints that 

[PATCH 2/2] update check condition of query for ras page retire

2024-01-17 Thread Tao Zhou
Support page retirement handling in debug mode.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  | 9 +++--
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 41139bac7643..6df32f0afd89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -90,12 +90,16 @@ static void amdgpu_umc_handle_bad_pages(struct 
amdgpu_device *adev,
 {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   unsigned int error_query_mode;
int ret = 0;
 
+   amdgpu_ras_get_error_query_mode(adev, _query_mode);
+
mutex_lock(>page_retirement_lock);
 
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
-   if (ret == -EOPNOTSUPP) {
+   if (ret == -EOPNOTSUPP &&
+   error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
adev->umc.ras->ras_block.hw_ops->query_ras_error_count)

adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
@@ -119,7 +123,8 @@ static void amdgpu_umc_handle_bad_pages(struct 
amdgpu_device *adev,
 */

adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, 
ras_error_status);
}
-   } else if (!ret) {
+   } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
+   (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
if (adev->umc.ras &&
adev->umc.ras->ecc_info_query_ras_error_count)
adev->umc.ras->ecc_info_query_ras_error_count(adev, 
ras_error_status);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index c560f4af214d..d86c9e7fc64b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2909,8 +2909,8 @@ static int smu_v13_0_6_select_xgmi_plpd_policy(struct 
smu_context *smu,
 static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context *smu,
void *table)
 {
-   /* Support ecc info by default */
-   return 0;
+   /* we use debug mode flag instead of this interface */
+   return -EOPNOTSUPP;
 }
 
 static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
-- 
2.35.1



[PATCH 1/2] update error condition check for umc_v12_0_query_error_address

2024-01-17 Thread Tao Zhou
Deferred error is also taken into account.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 10edf818acf5..2e0bd4312f2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -337,10 +337,7 @@ static int umc_v12_0_query_error_address(struct 
amdgpu_device *adev,
}
 
/* calculate error address if ue error is detected */
-   if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 
&&
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 
1 &&
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1) 
{
-
+   if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) {
mc_umc_addrt0 =
SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
 
-- 
2.35.1



Re: [PATCH 0/2] drm/atomic: Allow drivers to write their own plane check for async

2024-01-17 Thread Pekka Paalanen
On Tue, 16 Jan 2024 17:10:18 +0100
Xaver Hugl  wrote:

> My plan is to require support for IN_FENCE_FD at least. If the driver
> doesn't
> allow tearing with that, then tearing just doesn't happen.

That's an excellent point. I think this is important enough in its own
right, that it should be called out in the patch series.

Is it important enough to be special-cased, e.g. to be always allowed
with async commits?

Now that I think of it, if userspace needs to wait for the in-fence
itself before kicking KMS async, that would defeat much of the async's
point, right? And cases where in-fence is not necessary are so rare
they might not even exist?

So if driver/hardware cannot do IN_FENCE_FD with async, is there any
use of supporting async to begin with?

> For overlay planes though, it depends on how the compositor prioritizes
> things.
> If the compositor prioritizes overlay planes and would like to do tearing
> if possible,
> then this patch works.

Ok, I can see that.

> If the compositor prioritizes tearing and would like to do overlay planes
> if possible,
> it would have to know that switching to synchronous commits for a single
> frame,
> setting up the overlay planes and then switching back to async commits
> works, and
> that can't be figured out with TEST_ONLY commits.

I had to ponder a bit why. So I guess the synchronous commit in between
is because driver/hardware may not be able to enable/disable extra
planes in async, so you need a synchronous commit to set them up, but
afterwards updates can tear.

The comment about Intel needing one more synchronous commit when
switching from sync to async updates comes to mind as well, would that
be a problem?

> So I think having a CAP or immutable plane property to signal that async
> commits
> with overlay and/or cursor planes is supported would be useful.

Async cursor planes a good point, particularly moving them around. I'm
not too informed about the prior/on-going efforts to allow cursor
movement more often than refresh rate, I recall something about
amending atomic commits? How would these interact?

I suppose the kernel still prevents any new async commit while a
previous commit is not finished, so amending commits would still be
necessary for cursor plane motion? Or would it, if you time "big
commits" to finish quickly and then spam async "cursor commits" in the
mean time?


Thanks,
pq

> Am Di., 16. Jan. 2024 um 14:35 Uhr schrieb André Almeida <
> andrealm...@igalia.com>:  
> 
> > + Joshua
> >
> > Em 16/01/2024 10:14, Pekka Paalanen escreveu:  
> > > On Tue, 16 Jan 2024 08:50:59 -0300
> > > André Almeida  wrote:
> > >  
> > >> Hi Pekka,
> > >>
> > >> Em 16/01/2024 06:45, Pekka Paalanen escreveu:  
> > >>> On Tue, 16 Jan 2024 01:51:57 -0300
> > >>> André Almeida  wrote:
> > >>>  
> >  Hi,
> > 
> >  AMD hardware can do more on the async flip path than just the primary  
> > plane, so  
> >  to lift up the current restrictions, this patchset allows drivers to  
> > write their  
> >  own check for planes for async flips.  
> > >>>
> > >>> Hi,
> > >>>
> > >>> what's the userspace story for this, how could userspace know it could  
> > do more?  
> > >>> What kind of userspace would take advantage of this and in what  
> > situations?  
> > >>>
> > >>> Or is this not meant for generic userspace?  
> > >>
> > >> Sorry, I forgot to document this. So the idea is that userspace will
> > >> query what they can do here with DRM_MODE_ATOMIC_TEST_ONLY calls,
> > >> instead of having capabilities for each prop.  
> > >
> > > That's the theory, but do you have a practical example?
> > >
> > > What other planes and props would one want change in some specific use
> > > case?
> > >
> > > Is it just "all or nothing", or would there be room to choose and pick
> > > which props you change and which you don't based on what the driver
> > > supports? If the latter, then relying on TEST_ONLY might be yet another
> > > combinatorial explosion to iterate through.
> > >  
> >
> > That's a good question, maybe Simon, Xaver or Joshua can share how they
> > were planning to use this on Gamescope or Kwin.
> >  
> > >
> > > Thanks,
> > > pq
> > >  
> >  I'm not sure if adding something new to drm_plane_funcs is the right  
> > way to do,  
> >  because if we want to expand the other object types (crtc, connector)  
> > we would  
> >  need to add their own drm_XXX_funcs, so feedbacks are welcome!
> > 
> > André
> > 
> >  André Almeida (2):
> >  drm/atomic: Allow drivers to write their own plane check for async
> >    flips
> >  drm/amdgpu: Implement check_async_props for planes
> > 
> > .../amd/display/amdgpu_dm/amdgpu_dm_plane.c   | 30 +
> > drivers/gpu/drm/drm_atomic_uapi.c | 62  
> > ++-  
> > include/drm/drm_atomic_uapi.h | 12 
> > include/drm/drm_plane.h   |  5 ++
> > 4 files 

[PATCH] drm/amdgpu/pm: Fix the power source flag error

2024-01-17 Thread Ma Jun
The power source flag should be updated when
[1] System receives an interrupt indicating that the power source
has changed.
[2] System resumes from suspend or runtime suspend

Signed-off-by: Ma Jun 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 24 +++
 .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  2 ++
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  2 ++
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index c16703868e5c..e16d22e30a8a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -24,6 +24,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -793,6 +794,17 @@ static int smu_apply_default_config_table_settings(struct 
smu_context *smu)
return smu_set_config_table(smu, >pm.config_table);
 }
 
+static void smu_update_power_source(struct amdgpu_device *adev)
+{
+   if (power_supply_is_system_supplied() > 0)
+   adev->pm.ac_power = true;
+   else
+   adev->pm.ac_power = false;
+
+   if (is_support_sw_smu(adev))
+   smu_set_ac_dc(adev->powerplay.pp_handle);
+}
+
 static int smu_late_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -817,16 +829,8 @@ static int smu_late_init(void *handle)
 * handle the switch automatically. Driver involvement
 * is unnecessary.
 */
-   if (!smu->dc_controlled_by_gpio) {
-   ret = smu_set_power_source(smu,
-  adev->pm.ac_power ? 
SMU_POWER_SOURCE_AC :
-  SMU_POWER_SOURCE_DC);
-   if (ret) {
-   dev_err(adev->dev, "Failed to switch to %s mode!\n",
-   adev->pm.ac_power ? "AC" : "DC");
-   return ret;
-   }
-   }
+   if (!smu->dc_controlled_by_gpio)
+   smu_update_power_source(adev);
 
if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 1)) ||
(amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 3)))
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index 2e7f8d5cfc28..8047150fddd4 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -1442,10 +1442,12 @@ static int smu_v11_0_irq_process(struct amdgpu_device 
*adev,
case 0x3:
dev_dbg(adev->dev, "Switched to AC mode!\n");
schedule_work(>interrupt_work);
+   adev->pm.ac_power = true;
break;
case 0x4:
dev_dbg(adev->dev, "Switched to DC mode!\n");
schedule_work(>interrupt_work);
+   adev->pm.ac_power = false;
break;
case 0x7:
/*
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 771a3d457c33..c486182ff275 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -1379,10 +1379,12 @@ static int smu_v13_0_irq_process(struct amdgpu_device 
*adev,
case 0x3:
dev_dbg(adev->dev, "Switched to AC mode!\n");
smu_v13_0_ack_ac_dc_interrupt(smu);
+   adev->pm.ac_power = true;
break;
case 0x4:
dev_dbg(adev->dev, "Switched to DC mode!\n");
smu_v13_0_ack_ac_dc_interrupt(smu);
+   adev->pm.ac_power = false;
break;
case 0x7:
/*
-- 
2.34.1