date:20230626

[PATCH 1/2] drm/amd/pm: expose swctf threshold setting for legacy powerplay

2023-06-26 Thread Evan Quan

Preparation for an upcoming optimization which eliminates the influence of
momentary GPU temperature fluctuations.

Signed-off-by: Evan Quan 
---
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h|  2 ++
 .../gpu/drm/amd/pm/powerplay/hwmgr/hardwaremanager.c   |  4 +++-
 drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c|  2 ++
 drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c  | 10 ++
 drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c  |  4 
 drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_hwmgr.c  |  4 
 drivers/gpu/drm/amd/pm/powerplay/inc/power_state.h |  1 +
 7 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index a9161f3da8b5..7faad759a6cc 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -90,6 +90,8 @@ struct amdgpu_dpm_thermal {
intmax_mem_crit_temp;
/* memory max emergency(shutdown) temp */
intmax_mem_emergency_temp;
+   /* SWCTF threshold */
+   intsw_ctf_threshold;
/* was last interrupt low to high or high to low */
bool   high_to_low;
/* interrupt source */
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/hardwaremanager.c 
b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/hardwaremanager.c
index 981dc8c7112d..90452b66e107 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/hardwaremanager.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/hardwaremanager.c
@@ -241,7 +241,8 @@ int phm_start_thermal_controller(struct pp_hwmgr *hwmgr)
TEMP_RANGE_MAX,
TEMP_RANGE_MIN,
TEMP_RANGE_MAX,
-   TEMP_RANGE_MAX};
+   TEMP_RANGE_MAX,
+   0};
struct amdgpu_device *adev = hwmgr->adev;
 
if (!hwmgr->not_vf)
@@ -265,6 +266,7 @@ int phm_start_thermal_controller(struct pp_hwmgr *hwmgr)
adev->pm.dpm.thermal.min_mem_temp = range.mem_min;
adev->pm.dpm.thermal.max_mem_crit_temp = range.mem_crit_max;
adev->pm.dpm.thermal.max_mem_emergency_temp = range.mem_emergency_max;
+   adev->pm.dpm.thermal.sw_ctf_threshold = range.sw_ctf_threshold;
 
return ret;
 }
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c 
b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
index d82767866ac1..6d887ead2967 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
@@ -5433,6 +5433,8 @@ static int smu7_get_thermal_temperature_range(struct 
pp_hwmgr *hwmgr,
thermal_data->max = 
data->thermal_temp_setting.temperature_shutdown *
PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
 
+   thermal_data->sw_ctf_threshold = thermal_data->max;
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c 
b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c
index 6f5161738bf8..d8cd23438b76 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c
@@ -5242,6 +5242,9 @@ static int vega10_get_thermal_temperature_range(struct 
pp_hwmgr *hwmgr,
 {
struct vega10_hwmgr *data = hwmgr->backend;
PPTable_t *pp_table = &(data->smc_state_table.pp_table);
+   struct phm_ppt_v2_information *pp_table_info =
+   (struct phm_ppt_v2_information *)(hwmgr->pptable);
+   struct phm_tdp_table *tdp_table = pp_table_info->tdp_table;
 
memcpy(thermal_data, &SMU7ThermalWithDelayPolicy[0], sizeof(struct 
PP_TemperatureRange));
 
@@ -5258,6 +5261,13 @@ static int vega10_get_thermal_temperature_range(struct 
pp_hwmgr *hwmgr,
thermal_data->mem_emergency_max = (pp_table->ThbmLimit + 
CTF_OFFSET_HBM)*
PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
 
+   if (tdp_table->usSoftwareShutdownTemp > pp_table->ThotspotLimit &&
+   tdp_table->usSoftwareShutdownTemp < 
VEGA10_THERMAL_MAXIMUM_ALERT_TEMP)
+   thermal_data->sw_ctf_threshold = 
tdp_table->usSoftwareShutdownTemp;
+   else
+   thermal_data->sw_ctf_threshold = 
VEGA10_THERMAL_MAXIMUM_ALERT_TEMP;
+   thermal_data->sw_ctf_threshold *= PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c 
b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c
index 33f31461ea6c..1069eaaae2f8 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c
@@ -2764,6 +2764,8 @@ static int vega12_notify_cac_buffer_info(struct pp_hwmgr 
*hwmgr,
 static int vega12_get_thermal_temperature_range(struct pp_hwmgr *hwmgr,
struct PP_TemperatureRange *thermal_data)
 {
+   struct phm_ppt_v3_information *pptable_information =
+   (struct phm_ppt_v3_information *)hwmgr->ppt

[PATCH 2/2] drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation

2023-06-26 Thread Evan Quan

An intentional delay is added when soft CTF is triggered. The GPU
temperature is then checked again before any further action is
taken. This avoids unintended shutdowns caused by momentary
temperature fluctuations.

Signed-off-by: Evan Quan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  3 ++
 .../gpu/drm/amd/pm/powerplay/amd_powerplay.c  | 48 +++
 .../drm/amd/pm/powerplay/hwmgr/smu_helper.c   | 27 ---
 drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h  |  2 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 34 +
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  2 +
 .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  9 +---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  9 +---
 8 files changed, 102 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e459381dc759..5ef1f31e703c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -287,6 +287,9 @@ extern int amdgpu_user_partt_mode;
 #define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
 #define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)
 
+/* Extra time delay(in ms) to eliminate the influence of temperature momentary 
fluctuation */
+#define AMDGPU_SWCTF_EXTRA_DELAY   50
+
 struct amdgpu_xcp_mgr;
 struct amdgpu_device;
 struct amdgpu_irq_src;
diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c 
b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
index 11b7b4cffaae..ff360c699171 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "amd_shared.h"
 #include "amd_powerplay.h"
 #include "power_state.h"
@@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
return 0;
 }
 
+static void pp_swctf_delayed_work_handler(struct work_struct *work)
+{
+   struct pp_hwmgr *hwmgr =
+   container_of(work, struct pp_hwmgr, swctf_delayed_work.work);
+   struct amdgpu_device *adev = hwmgr->adev;
+   struct amdgpu_dpm_thermal *range =
+   &adev->pm.dpm.thermal;
+   uint32_t gpu_temperature, size;
+   int ret;
+
+   /*
+* If the hotspot/edge temperature is confirmed as below SW CTF setting 
point
+* after the delay enforced, nothing will be done.
+* Otherwise, a graceful shutdown will be performed to prevent further 
damage.
+*/
+   if (range->sw_ctf_threshold &&
+   hwmgr->hwmgr_func->read_sensor) {
+   ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+
AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
+&gpu_temperature,
+&size);
+   /*
+* For some legacy ASICs, hotspot temperature retrieving might 
be not
+* supported. Check the edge temperature instead then.
+*/
+   if (ret == -EOPNOTSUPP)
+   ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+
AMDGPU_PP_SENSOR_EDGE_TEMP,
+&gpu_temperature,
+&size);
+   if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold)
+   return;
+   }
+
+   dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) 
detected!\n");
+   dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW 
CTF!\n");
+   orderly_poweroff(true);
+}
+
 static int pp_sw_init(void *handle)
 {
struct amdgpu_device *adev = handle;
@@ -101,6 +141,10 @@ static int pp_sw_init(void *handle)
 
pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully");
 
+   if (!ret)
+   INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
+ pp_swctf_delayed_work_handler);
+
return ret;
 }
 
@@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle)
struct amdgpu_device *adev = handle;
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
 
+   cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
+
hwmgr_hw_fini(hwmgr);
 
return 0;
@@ -221,6 +267,8 @@ static int pp_suspend(void *handle)
struct amdgpu_device *adev = handle;
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
 
+   cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
+
return hwmgr_suspend(hwmgr);
 }
 
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c 
b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
index bfe80ac0ad8c..d0b1ab6c4523 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
@@ -603,21 +603,17 @@ int phm_irq_process(struct a

Re: [PATCH] drm: Remove the deprecated drm_put_dev() function

2023-06-26 Thread Jani Nikula

On Sun, 25 Jun 2023, Sui Jingfeng  wrote:
> As this function can be replaced with drm_dev_unregister() + drm_dev_put(),
> it is already marked as deprecated, so remove it. No functional change.
>
> Signed-off-by: Sui Jingfeng 
> ---
>  drivers/gpu/drm/drm_drv.c   | 28 
>  drivers/gpu/drm/drm_pci.c   |  3 ++-
>  drivers/gpu/drm/radeon/radeon_drv.c |  3 ++-
>  include/drm/drm_drv.h   |  1 -
>  4 files changed, 4 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
> index 12687dd9e1ac..5057307fe22a 100644
> --- a/drivers/gpu/drm/drm_drv.c
> +++ b/drivers/gpu/drm/drm_drv.c
> @@ -406,34 +406,6 @@ void drm_minor_release(struct drm_minor *minor)
>   * possibly leaving the hardware enabled.
>   */
>  
> -/**
> - * drm_put_dev - Unregister and release a DRM device
> - * @dev: DRM device
> - *
> - * Called at module unload time or when a PCI device is unplugged.
> - *
> - * Cleans up all DRM device, calling drm_lastclose().
> - *
> - * Note: Use of this function is deprecated. It will eventually go away
> - * completely.  Please use drm_dev_unregister() and drm_dev_put() explicitly
> - * instead to make sure that the device isn't userspace accessible any more
> - * while teardown is in progress, ensuring that userspace can't access an
> - * inconsistent state.

The last sentence is the crucial one. While the patch has no functional
changes, I believe the goal never was to just mechanically replace one
call with the two.

BR,
Jani.


> - */
> -void drm_put_dev(struct drm_device *dev)
> -{
> - DRM_DEBUG("\n");
> -
> - if (!dev) {
> - DRM_ERROR("cleanup called no dev\n");
> - return;
> - }
> -
> - drm_dev_unregister(dev);
> - drm_dev_put(dev);
> -}
> -EXPORT_SYMBOL(drm_put_dev);
> -
>  /**
>   * drm_dev_enter - Enter device critical section
>   * @dev: DRM device
> diff --git a/drivers/gpu/drm/drm_pci.c b/drivers/gpu/drm/drm_pci.c
> index 39d35fc3a43b..b3a68a92eaa6 100644
> --- a/drivers/gpu/drm/drm_pci.c
> +++ b/drivers/gpu/drm/drm_pci.c
> @@ -257,7 +257,8 @@ void drm_legacy_pci_exit(const struct drm_driver *driver,
>legacy_dev_list) {
>   if (dev->driver == driver) {
>   list_del(&dev->legacy_dev_list);
> - drm_put_dev(dev);
> + drm_dev_unregister(dev);
> + drm_dev_put(dev);
>   }
>   }
>   mutex_unlock(&legacy_dev_list_lock);
> diff --git a/drivers/gpu/drm/radeon/radeon_drv.c 
> b/drivers/gpu/drm/radeon/radeon_drv.c
> index e4374814f0ef..a4955ae10659 100644
> --- a/drivers/gpu/drm/radeon/radeon_drv.c
> +++ b/drivers/gpu/drm/radeon/radeon_drv.c
> @@ -357,7 +357,8 @@ radeon_pci_remove(struct pci_dev *pdev)
>  {
>   struct drm_device *dev = pci_get_drvdata(pdev);
>  
> - drm_put_dev(dev);
> + drm_dev_unregister(dev);
> + drm_dev_put(dev);
>  }
>  
>  static void
> diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
> index 89e2706cac56..289c97b12e82 100644
> --- a/include/drm/drm_drv.h
> +++ b/include/drm/drm_drv.h
> @@ -511,7 +511,6 @@ void drm_dev_unregister(struct drm_device *dev);
>  
>  void drm_dev_get(struct drm_device *dev);
>  void drm_dev_put(struct drm_device *dev);
> -void drm_put_dev(struct drm_device *dev);
>  bool drm_dev_enter(struct drm_device *dev, int *idx);
>  void drm_dev_exit(int idx);
>  void drm_dev_unplug(struct drm_device *dev);

-- 
Jani Nikula, Intel Open Source Graphics Center

Re: [PATCH] drm: Remove the deprecated drm_put_dev() function

2023-06-26 Thread Thomas Zimmermann


Hi

Am 25.06.23 um 07:09 schrieb Sui Jingfeng:

As this function can be replaced with drm_dev_unregister() + drm_dev_put(),
it is already marked as deprecated, so remove it. No functional change.

Signed-off-by: Sui Jingfeng 
---
  drivers/gpu/drm/drm_drv.c   | 28 
  drivers/gpu/drm/drm_pci.c   |  3 ++-
  drivers/gpu/drm/radeon/radeon_drv.c |  3 ++-
  include/drm/drm_drv.h   |  1 -
  4 files changed, 4 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 12687dd9e1ac..5057307fe22a 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -406,34 +406,6 @@ void drm_minor_release(struct drm_minor *minor)
   * possibly leaving the hardware enabled.
   */
  
-/**

- * drm_put_dev - Unregister and release a DRM device
- * @dev: DRM device
- *
- * Called at module unload time or when a PCI device is unplugged.
- *
- * Cleans up all DRM device, calling drm_lastclose().
- *
- * Note: Use of this function is deprecated. It will eventually go away
- * completely.  Please use drm_dev_unregister() and drm_dev_put() explicitly
- * instead to make sure that the device isn't userspace accessible any more
- * while teardown is in progress, ensuring that userspace can't access an
- * inconsistent state.
- */
-void drm_put_dev(struct drm_device *dev)
-{
-   DRM_DEBUG("\n");
-
-   if (!dev) {
-   DRM_ERROR("cleanup called no dev\n");
-   return;
-   }
-
-   drm_dev_unregister(dev);
-   drm_dev_put(dev);
-}
-EXPORT_SYMBOL(drm_put_dev);
-
  /**
   * drm_dev_enter - Enter device critical section
   * @dev: DRM device
diff --git a/drivers/gpu/drm/drm_pci.c b/drivers/gpu/drm/drm_pci.c
index 39d35fc3a43b..b3a68a92eaa6 100644
--- a/drivers/gpu/drm/drm_pci.c
+++ b/drivers/gpu/drm/drm_pci.c
@@ -257,7 +257,8 @@ void drm_legacy_pci_exit(const struct drm_driver *driver,
 legacy_dev_list) {
if (dev->driver == driver) {
list_del(&dev->legacy_dev_list);
-   drm_put_dev(dev);
+   drm_dev_unregister(dev);
+   drm_dev_put(dev);
}
}
mutex_unlock(&legacy_dev_list_lock);
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c 
b/drivers/gpu/drm/radeon/radeon_drv.c
index e4374814f0ef..a4955ae10659 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -357,7 +357,8 @@ radeon_pci_remove(struct pci_dev *pdev)
  {
struct drm_device *dev = pci_get_drvdata(pdev);
  
-	drm_put_dev(dev);


Did you verify that dev cannot be NULL here? There was a check in 
drm_put_dev() for !dev.


Best regards
Thomas


+   drm_dev_unregister(dev);
+   drm_dev_put(dev);
  }
  
  static void

diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index 89e2706cac56..289c97b12e82 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -511,7 +511,6 @@ void drm_dev_unregister(struct drm_device *dev);
  
  void drm_dev_get(struct drm_device *dev);

  void drm_dev_put(struct drm_device *dev);
-void drm_put_dev(struct drm_device *dev);
  bool drm_dev_enter(struct drm_device *dev, int *idx);
  void drm_dev_exit(int idx);
  void drm_dev_unplug(struct drm_device *dev);


--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstrasse 146, 90461 Nuernberg, Germany
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman
HRB 36809 (AG Nuernberg)


OpenPGP_signature
Description: OpenPGP digital signature

RE: [PATCH 1/2] drm/amdgpu: make mcbp a per device setting

2023-06-26 Thread Zhu, Jiadong

[AMD Official Use Only - General]

Reviewed-and-tested-by: Jiadong Zhu 

-Original Message-
From: amd-gfx  On Behalf Of Alex Deucher
Sent: Saturday, June 17, 2023 5:10 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: [PATCH 1/2] drm/amdgpu: make mcbp a per device setting

So we can selectively enable it on certain devices.  No intended functional 
change.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   |  3 ---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c |  2 +-
 7 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f39db4a2c2cf..78c6265fe79b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2551,7 +2551,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
adev->ip_blocks[i].status.hw = true;

/* right after GMC hw init, we create CSA */
-   if (amdgpu_mcbp) {
+   if (adev->gfx.mcbp) {
r = amdgpu_allocate_static_csa(adev, 
&adev->virt.csa_obj,
   
AMDGPU_GEM_DOMAIN_VRAM |
   
AMDGPU_GEM_DOMAIN_GTT,
@@ -3672,6 +3672,18 @@ static const struct attribute *amdgpu_dev_attributes[] = 
{
NULL
 };

+static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) {
+   if (amdgpu_mcbp == 1)
+   adev->gfx.mcbp = true;
+
+   if (amdgpu_sriov_vf(adev))
+   adev->gfx.mcbp = true;
+
+   if (adev->gfx.mcbp)
+   DRM_INFO("MCBP is enabled\n");
+}
+
 /**
  * amdgpu_device_init - initialize the driver
  *
@@ -3823,9 +3835,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);

-   if (amdgpu_mcbp)
-   DRM_INFO("MCBP is enabled\n");
-
/*
 * Reset domain needs to be present early, before XGMI hive discovered
 * (if any) and intitialized to use reset sem and in_gpu reset flag @@ 
-3851,6 +3860,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (r)
return r;

+   amdgpu_device_set_mcbp(adev);
+
/* Get rid of things like offb */
r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, 
&amdgpu_kms_driver);
if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index ce0f7a8ad4b8..a4ff515ce896 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -434,6 +434,7 @@ struct amdgpu_gfx {
uint16_txcc_mask;
uint32_tnum_xcc_per_xcp;
struct mutexpartition_mutex;
+   boolmcbp; /* mid command buffer preemption 
*/
 };

 struct amdgpu_gfx_ras_reg_entry {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index e3531aa3c8bd..cca5a495611f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -805,7 +805,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
dev_info->ids_flags = 0;
if (adev->flags & AMD_IS_APU)
dev_info->ids_flags |= AMDGPU_IDS_FLAGS_FUSION;
-   if (amdgpu_mcbp)
+   if (adev->gfx.mcbp)
dev_info->ids_flags |= AMDGPU_IDS_FLAGS_PREEMPTION;
if (amdgpu_is_tmz(adev))
dev_info->ids_flags |= AMDGPU_IDS_FLAGS_TMZ; @@ -1247,7 
+1247,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file 
*file_priv)
goto error_vm;
}

-   if (amdgpu_mcbp) {
+   if (adev->gfx.mcbp) {
uint64_t csa_addr = amdgpu_csa_vaddr(adev) & 
AMDGPU_GMC_HOLE_MASK;

r = amdgpu_map_static_csa(adev, &fpriv->vm, adev->virt.csa_obj, 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 78ec3420ef85..dacf281d2b21 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -72,7 +72,7 @@ uint64_t amdgpu_sdma_get_csa_mc_addr(struct amdgpu_ring *ring,
int r;

/* don't enable OS preemption on SDMA under SRIOV */
-   if (amdgpu_sriov_vf(adev) || vmid == 0 |

RE: [PATCH 2/2] drm/amdgpu: enable mcbp by default on gfx9

2023-06-26 Thread Zhu, Jiadong

[AMD Official Use Only - General]

Reviewed-and-tested-by: Jiadong Zhu 

-Original Message-
From: amd-gfx  On Behalf Of Alex Deucher
Sent: Saturday, June 17, 2023 5:10 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: [PATCH 2/2] drm/amdgpu: enable mcbp by default on gfx9

It's required for high priority queues.

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2535
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 6 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 78c6265fe79b..3eb370b77ad9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3677,6 +3677,11 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device 
*adev)
if (amdgpu_mcbp == 1)
adev->gfx.mcbp = true;

+   if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) &&
+   (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) &&
+   adev->gfx.num_gfx_rings)
+   adev->gfx.mcbp = true;
+
if (amdgpu_sriov_vf(adev))
adev->gfx.mcbp = true;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 03874371af60..308149dd7d00 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -180,7 +180,7 @@ uint amdgpu_dc_feature_mask = 2;  uint 
amdgpu_dc_debug_mask;  uint amdgpu_dc_visual_confirm;  int 
amdgpu_async_gfx_ring = 1; -int amdgpu_mcbp;
+int amdgpu_mcbp = -1;
 int amdgpu_discovery = -1;
 int amdgpu_mes;
 int amdgpu_mes_kiq;
@@ -635,10 +635,10 @@ module_param_named(async_gfx_ring, amdgpu_async_gfx_ring, 
int, 0444);

 /**
  * DOC: mcbp (int)
- * It is used to enable mid command buffer preemption. (0 = disabled 
(default), 1 = enabled)
+ * It is used to enable mid command buffer preemption. (0 = disabled, 1
+ = enabled, -1 auto (default))
  */
 MODULE_PARM_DESC(mcbp,
-   "Enable Mid-command buffer preemption (0 = disabled (default), 1 = 
enabled)");
+   "Enable Mid-command buffer preemption (0 = disabled, 1 = enabled), -1
+= auto (default)");
 module_param_named(mcbp, amdgpu_mcbp, int, 0444);

 /**
--
2.40.1

Re: [PATCH 3/3] drm/amdgpu: add new INFO ioctl query for the last GPU page fault

2023-06-26 Thread Christian König


Am 25.05.23 um 18:52 schrieb Alex Deucher:

Add an interface to query the last GPU page fault for the process.
Useful for debugging context lost errors.

v2: split vmhub representation between kernel and userspace

Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23238
libdrm MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23238

Cc: samuel.pitoi...@gmail.com
Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 16 
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 16 +++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  | 13 ++---
  include/uapi/drm/amdgpu_drm.h   | 16 
  5 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 7300df2a342c..7e17b285decc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -112,9 +112,10 @@
   *gl1c_cache_size, gl2c_cache_size, mall_size, 
enabled_rb_pipes_mask_hi
   *   3.53.0 - Support for GFX11 CP GFX shadowing
   *   3.54.0 - Add AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS support
+ * - 3.55.0 - Add AMDGPU_INFO_GPUVM_FAULT query
   */
  #define KMS_DRIVER_MAJOR  3
-#define KMS_DRIVER_MINOR   54
+#define KMS_DRIVER_MINOR   55
  #define KMS_DRIVER_PATCHLEVEL 0
  
  unsigned int amdgpu_vram_limit = UINT_MAX;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 41d047e5de69..bca2a56046ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1163,6 +1163,22 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file *filp)
return copy_to_user(out, max_ibs,
min((size_t)size, sizeof(max_ibs))) ? 
-EFAULT : 0;
}
+   case AMDGPU_INFO_GPUVM_FAULT: {
+   struct amdgpu_fpriv *fpriv = filp->driver_priv;
+   struct amdgpu_vm *vm = &fpriv->vm;
+   struct drm_amdgpu_info_gpuvm_fault gpuvm_fault;
+
+   if (!vm)
+   return -EINVAL;
+
+   memset(&gpuvm_fault, 0, sizeof(gpuvm_fault));
+   gpuvm_fault.addr = vm->fault_info.addr;
+   gpuvm_fault.status = vm->fault_info.status;
+   gpuvm_fault.vmhub = vm->fault_info.vmhub;


You need something to provide locking and a barrier here. I suggest 
just grabbing the xa lock.



+
+   return copy_to_user(out, &gpuvm_fault,
+   min((size_t)size, sizeof(gpuvm_fault))) ? 
-EFAULT : 0;
+   }
default:
DRM_DEBUG_KMS("Invalid request %d\n", info->query);
return -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 73e022f3daa4..c1b0c5f3c1f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2657,7 +2657,21 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device 
*adev,
if (vm) {
vm->fault_info.addr = addr;
vm->fault_info.status = status;
-   vm->fault_info.vmhub = vmhub;
+   if (AMDGPU_IS_GFXHUB(vmhub)) {
+   vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX;
+   vm->fault_info.vmhub |=
+   (vmhub - AMDGPU_GFXHUB_START) << 
AMDGPU_VMHUB_IDX_SHIFT;
+   } else if (AMDGPU_IS_MMHUB0(vmhub)) {
+   vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_MM0;
+   vm->fault_info.vmhub |=
+   (vmhub - AMDGPU_MMHUB0_START) << 
AMDGPU_VMHUB_IDX_SHIFT;
+   } else if (AMDGPU_IS_MMHUB1(vmhub)) {
+   vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_MM1;
+   vm->fault_info.vmhub |=
+   (vmhub - AMDGPU_MMHUB1_START) << 
AMDGPU_VMHUB_IDX_SHIFT;
+   } else {
+   WARN_ONCE(1, "Invalid vmhub %u\n", vmhub);
+   }
}
xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index fb66a413110c..1a34fea9acb9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -116,9 +116,16 @@ struct amdgpu_mem_stats;
   * layout: max 8 GFXHUB + 4 MMHUB0 + 1 MMHUB1
   */
  #define AMDGPU_MAX_VMHUBS 13
-#define AMDGPU_GFXHUB(x)   (x)
-#define AMDGPU_MMHUB0(x)   (8 + x)
-#define AMDGPU_MMHUB1(x)   (8 + 4 + x)
+#define AMDGPU_GFXHUB_START0
+#define AMDGPU_MMHUB0_START8
+#define AMDGPU_MMHUB1_START12
+#define AM

Re: [PATCH 2/2] drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation

2023-06-26 Thread Lazar, Lijo





On 6/26/2023 1:17 PM, Evan Quan wrote:

An intentional delay is added when soft CTF is triggered. The GPU
temperature is then checked again before any further action is
taken. This avoids unintended shutdowns caused by momentary
temperature fluctuations.

Signed-off-by: Evan Quan 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  3 ++
  .../gpu/drm/amd/pm/powerplay/amd_powerplay.c  | 48 +++
  .../drm/amd/pm/powerplay/hwmgr/smu_helper.c   | 27 ---
  drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h  |  2 +
  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 34 +
  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  2 +
  .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  9 +---
  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  9 +---
  8 files changed, 102 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e459381dc759..5ef1f31e703c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -287,6 +287,9 @@ extern int amdgpu_user_partt_mode;
  #define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
  #define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)
  
+/* Extra time delay(in ms) to eliminate the influence of temperature momentary fluctuation */

+#define AMDGPU_SWCTF_EXTRA_DELAY   50


I think a delay of 10-15ms is good enough to filter out any spike.

With that change, the series is
Reviewed-by: Lijo Lazar 

Thanks,
Lijo


+
  struct amdgpu_xcp_mgr;
  struct amdgpu_device;
  struct amdgpu_irq_src;
diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c 
b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
index 11b7b4cffaae..ff360c699171 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
@@ -26,6 +26,7 @@
  #include 
  #include 
  #include 
+#include 
  #include "amd_shared.h"
  #include "amd_powerplay.h"
  #include "power_state.h"
@@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
return 0;
  }
  
+static void pp_swctf_delayed_work_handler(struct work_struct *work)

+{
+   struct pp_hwmgr *hwmgr =
+   container_of(work, struct pp_hwmgr, swctf_delayed_work.work);
+   struct amdgpu_device *adev = hwmgr->adev;
+   struct amdgpu_dpm_thermal *range =
+   &adev->pm.dpm.thermal;
+   uint32_t gpu_temperature, size;
+   int ret;
+
+   /*
+* If the hotspot/edge temperature is confirmed as below SW CTF setting 
point
+* after the delay enforced, nothing will be done.
+* Otherwise, a graceful shutdown will be performed to prevent further 
damage.
+*/
+   if (range->sw_ctf_threshold &&
+   hwmgr->hwmgr_func->read_sensor) {
+   ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+
AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
+&gpu_temperature,
+&size);
+   /*
+* For some legacy ASICs, hotspot temperature retrieving might 
be not
+* supported. Check the edge temperature instead then.
+*/
+   if (ret == -EOPNOTSUPP)
+   ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+
AMDGPU_PP_SENSOR_EDGE_TEMP,
+&gpu_temperature,
+&size);
+   if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold)
+   return;
+   }
+
+   dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) 
detected!\n");
+   dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW 
CTF!\n");
+   orderly_poweroff(true);
+}
+
  static int pp_sw_init(void *handle)
  {
struct amdgpu_device *adev = handle;
@@ -101,6 +141,10 @@ static int pp_sw_init(void *handle)
  
  	pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully");
  
+	if (!ret)

+   INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
+ pp_swctf_delayed_work_handler);
+
return ret;
  }
  
@@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle)

struct amdgpu_device *adev = handle;
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
  
+	cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);

+
hwmgr_hw_fini(hwmgr);
  
  	return 0;

@@ -221,6 +267,8 @@ static int pp_suspend(void *handle)
struct amdgpu_device *adev = handle;
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
  
+	cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);

+
return hwmgr_suspend(hwmgr);
  }
  
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c

Re: [PATCH] drm/amdgpu: add VISIBLE info in amdgpu_bo_print_info

2023-06-26 Thread Pelloux-Prayer, Pierre-Eric

[Public]

Thanks Christian for the review. I'll remove the leading blanks before 
submitting the patch.

Pierre-Eric

From: Koenig, Christian 
Sent: Wednesday, June 21, 2023 5:00 PM
To: Pelloux-Prayer, Pierre-Eric ; 
amd-gfx@lists.freedesktop.org 
Subject: Re: [PATCH] drm/amdgpu: add VISIBLE info in amdgpu_bo_print_info

Am 21.06.23 um 16:35 schrieb Pierre-Eric Pelloux-Prayer:
> This allows tools to distinguish between VRAM and visible VRAM.
>
> Use the opportunity to fix locking before accessing bo.
>
> Signed-off-by: Pierre-Eric Pelloux-Prayer 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 33 ++
>   1 file changed, 21 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index ff73cc11d47e..f12f019d7f99 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -1583,18 +1583,27 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo 
> *bo, struct seq_file *m)
>unsigned int pin_count;
>u64 size;
>
> - domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
> - switch (domain) {
> - case AMDGPU_GEM_DOMAIN_VRAM:
> - placement = "VRAM";
> - break;
> - case AMDGPU_GEM_DOMAIN_GTT:
> - placement = " GTT";
> - break;
> - case AMDGPU_GEM_DOMAIN_CPU:
> - default:
> - placement = " CPU";
> - break;
> + if (dma_resv_trylock(bo->tbo.base.resv)) {
> + unsigned int domain;
> + domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
> + switch (domain) {
> + case AMDGPU_GEM_DOMAIN_VRAM:
> + if (amdgpu_bo_in_cpu_visible_vram(bo))
> + placement = "VRAM VISIBLE";
> + else
> + placement = "VRAM";
> + break;
> + case AMDGPU_GEM_DOMAIN_GTT:
> + placement = " GTT";

We can probably drop the leading blank here and

> + break;
> + case AMDGPU_GEM_DOMAIN_CPU:
> + default:
> + placement = " CPU";

here when we don't keep the strings at the same length anyway.

With that fixed the change is Reviewed-by: Christian König


Regards,
Christian.

> + break;
> + }
> + dma_resv_unlock(bo->tbo.base.resv);
> + } else {
> + placement = "UNKNOWN";
>}
>
>size = amdgpu_bo_size(bo);

[PATCH] drm/amd/pm: Enable pp_feature attribute

2023-06-26 Thread Lijo Lazar

on APUs with GFX v9.4.3

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 9ec51f50fc52..9ef88a0b1b57 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2083,7 +2083,9 @@ static int default_attr_update(struct amdgpu_device 
*adev, struct amdgpu_device_
*states = ATTR_STATE_UNSUPPORTED;
}
} else if (DEVICE_ATTR_IS(pp_features)) {
-   if (adev->flags & AMD_IS_APU || gc_ver < IP_VERSION(9, 0, 0))
+   if ((adev->flags & AMD_IS_APU &&
+gc_ver != IP_VERSION(9, 4, 3)) ||
+   gc_ver < IP_VERSION(9, 0, 0))
*states = ATTR_STATE_UNSUPPORTED;
} else if (DEVICE_ATTR_IS(gpu_metrics)) {
if (gc_ver < IP_VERSION(9, 1, 0))
-- 
2.25.1

[PATCH v3] drm/amd/display: Remove unnecessary casts in amdgpu_dm_helpers.c

2023-06-26 Thread Srinivasan Shanmugam

Fixes the following category of checkpatch complaints:

WARNING: unnecessary cast may hide bugs, see 
http://c-faq.com/malloc/mallocnocast.html
+   char *buf = (char *)kvcalloc(total, sizeof(char), GFP_KERNEL);

Cc: Rodrigo Siqueira 
Cc: Aurabindo Pillai 
Signed-off-by: Srinivasan Shanmugam 
---

v3:
 - Same as v1: only the cast is removed, so that the variable "buf" remains
   local to the block in which it is declared.

 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index cd20cfc04996..4590deca25f8 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
@@ -400,7 +400,7 @@ void dm_dtn_log_append_v(struct dc_context *ctx,
total = log_ctx->pos + n + 1;
 
if (total > log_ctx->size) {
-   char *buf = (char *)kvcalloc(total, sizeof(char), GFP_KERNEL);
+   char *buf = kvcalloc(total, sizeof(char), GFP_KERNEL);
 
if (buf) {
memcpy(buf, log_ctx->buf, log_ctx->pos);
-- 
2.25.1

Re: [PATCH] Revert "drm/amdgpu: Enable VM_CONTEXT1_CNTL after page table addr is set."

2023-06-26 Thread Christian König


Am 31.05.23 um 16:39 schrieb Alex Deucher:

This reverts commit f57a74f5b42d1627bd5366f88952d42819e91146.

After talking this over with Christian, the original programming
sequence was correct.  The enable bit needs to be set before
programming the rest of the context.

Signed-off-by: Alex Deucher 
Cc: Zibin Liu 


Sorry for the delay, I'm only now catching up on mails from last month.

Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c   | 5 +
  drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c   | 5 +
  drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c   | 5 +
  drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c   | 5 +
  drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c   | 5 +
  drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0_3.c | 5 +
  drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c| 5 +
  drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c| 5 +
  drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c| 6 +-
  drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c| 5 +
  drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c| 5 +
  drivers/gpu/drm/amd/amdgpu/mmhub_v3_0.c| 5 +
  drivers/gpu/drm/amd/amdgpu/mmhub_v3_0_1.c  | 5 +
  drivers/gpu/drm/amd/amdgpu/mmhub_v3_0_2.c  | 5 +
  drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c| 7 +--
  15 files changed, 15 insertions(+), 63 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
index 52a1e79ee4d8..d94cc1ec7242 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
@@ -261,7 +261,7 @@ static void gfxhub_v1_0_setup_vmid_config(struct 
amdgpu_device *adev)
  
  	for (i = 0; i <= 14; i++) {

tmp = RREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i);
-   tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, ENABLE_CONTEXT, 0);
+   tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, ENABLE_CONTEXT, 1);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, PAGE_TABLE_DEPTH,
num_level);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
@@ -302,9 +302,6 @@ static void gfxhub_v1_0_setup_vmid_config(struct 
amdgpu_device *adev)
WREG32_SOC15_OFFSET(GC, 0, 
mmVM_CONTEXT1_PAGE_TABLE_END_ADDR_HI32,
i * hub->ctx_addr_distance,
upper_32_bits(adev->vm_manager.max_pfn - 
1));
-   tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, ENABLE_CONTEXT, 1);
-   WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL,
-   i * hub->ctx_distance, tmp);
}
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c

index 108674f6eef0..4dabf910334b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
@@ -330,7 +330,7 @@ static void gfxhub_v1_2_xcc_setup_vmid_config(struct 
amdgpu_device *adev,
hub = &adev->vmhub[AMDGPU_GFXHUB(j)];
for (i = 0; i <= 14; i++) {
tmp = RREG32_SOC15_OFFSET(GC, GET_INST(GC, j), 
regVM_CONTEXT1_CNTL, i);
-   tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, 
ENABLE_CONTEXT, 0);
+   tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, 
ENABLE_CONTEXT, 1);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, 
PAGE_TABLE_DEPTH,
num_level);
tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
@@ -377,9 +377,6 @@ static void gfxhub_v1_2_xcc_setup_vmid_config(struct 
amdgpu_device *adev,

regVM_CONTEXT1_PAGE_TABLE_END_ADDR_HI32,
i * hub->ctx_addr_distance,

upper_32_bits(adev->vm_manager.max_pfn - 1));
-   tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, 
ENABLE_CONTEXT, 1);
-   WREG32_SOC15_OFFSET(GC, GET_INST(GC, j), 
regVM_CONTEXT1_CNTL,
-   i * hub->ctx_distance, tmp);
}
}
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
index 502cb6e1fe84..f173a61c6c15 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
@@ -288,7 +288,7 @@ static void gfxhub_v2_0_setup_vmid_config(struct 
amdgpu_device *adev)
  
  	for (i = 0; i <= 14; i++) {

tmp = RREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i);
-   tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL, ENABLE_CONTEXT, 0);
+   tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL, ENABLE_CONTEXT, 1);
tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL, PAGE_TABLE_DEPTH,
adev->vm_manager.num_level);
tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,
@@ -324,9 +324,6 @@ static void gfxh

Re: [PATCH] drm/amd/pm: Enable pp_feature attribute

2023-06-26 Thread Alex Deucher

Acked-by: Alex Deucher 

On Mon, Jun 26, 2023 at 8:35 AM Lijo Lazar  wrote:
>
> on APUs with GFX v9.4.3
>
> Signed-off-by: Lijo Lazar 
> ---
>  drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
> b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> index 9ec51f50fc52..9ef88a0b1b57 100644
> --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> @@ -2083,7 +2083,9 @@ static int default_attr_update(struct amdgpu_device 
> *adev, struct amdgpu_device_
> *states = ATTR_STATE_UNSUPPORTED;
> }
> } else if (DEVICE_ATTR_IS(pp_features)) {
> -   if (adev->flags & AMD_IS_APU || gc_ver < IP_VERSION(9, 0, 0))
> +   if ((adev->flags & AMD_IS_APU &&
> +gc_ver != IP_VERSION(9, 4, 3)) ||
> +   gc_ver < IP_VERSION(9, 0, 0))
> *states = ATTR_STATE_UNSUPPORTED;
> } else if (DEVICE_ATTR_IS(gpu_metrics)) {
> if (gc_ver < IP_VERSION(9, 1, 0))
> --
> 2.25.1
>

Re: [PATCH 1/3] Revert "drm/amdgpu: change the reference clock for raven/raven2"

2023-06-26 Thread Christian König


Hi guys,

Vitaly and Jasber have been recently working on disabling the IGT tests 
for the TSC query on RV/RV2 (which I'm not very keen on).


And in addition to that we have this RADV merge request here:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23481


What exactly is going on here? That sounds like we are working around a
FW bug, or more specifically a GFXOFF bug, in userspace.


Regards,
Christian.

Am 05.06.23 um 10:57 schrieb Michel Dänzer:

On 6/2/23 20:43, Alex Deucher wrote:

This reverts commit fbc24293ca16b3b9ef891fe32ccd04735a6f8dc1.

This results in inconsistent timing reported via asynchronous
GPU queries.

Link: https://lists.freedesktop.org/archives/amd-gfx/2023-May/093731.html
Cc: jesse.zh...@amd.com
Cc: mic...@daenzer.net
Signed-off-by: Alex Deucher 

The series is

Reviewed-by: Michel Dänzer 

Thanks!

Re: [PATCH] drm/amd/display: Clean up warnings in amdgpu_dm _mst_types, _plane, _psr.c

2023-06-26 Thread Harry Wentland




On 6/23/23 23:49, Srinivasan Shanmugam wrote:
> Fix the following warnings reported by checkpatch:
> 
> WARNING: Missing a blank line after declarations
> WARNING: Prefer 'unsigned int' to bare use of 'unsigned'
> 
> Cc: Rodrigo Siqueira 
> Cc: Aurabindo Pillai 
> Signed-off-by: Srinivasan Shanmugam 

Hi Srini,

I've seen a lot of these minor fixes from you. It's great. But please
put them in a patchset when sending so (sensible) email clients can
organize them. Ideally with a cover letter that describes overall what
the patch set is trying to accomplish:

git format-patch --cover-letter ..

Thanks,
Harry

> ---
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 1 +
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c | 4 ++--
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c   | 1 +
>  3 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
> index 46d0a8f57e55..95eefa6b4f2f 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
> @@ -296,6 +296,7 @@ static int dm_dp_mst_get_modes(struct drm_connector 
> *connector)
>  
>   if (!aconnector->edid) {
>   struct edid *edid;
> +
>   edid = drm_dp_mst_get_edid(connector, 
> &aconnector->mst_root->mst_mgr, aconnector->mst_output_port);
>  
>   if (!edid) {
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
> index 322668973747..de1c7026ffcd 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
> @@ -164,7 +164,7 @@ static bool modifier_has_dcc(uint64_t modifier)
>   return IS_AMD_FMT_MOD(modifier) && AMD_FMT_MOD_GET(DCC, modifier);
>  }
>  
> -static unsigned modifier_gfx9_swizzle_mode(uint64_t modifier)
> +static unsigned int modifier_gfx9_swizzle_mode(uint64_t modifier)
>  {
>   if (modifier == DRM_FORMAT_MOD_LINEAR)
>   return 0;
> @@ -581,7 +581,7 @@ static void add_gfx11_modifiers(struct amdgpu_device 
> *adev,
>   int pkrs = 0;
>   u32 gb_addr_config;
>   u8 i = 0;
> - unsigned swizzle_r_x;
> + unsigned int swizzle_r_x;
>   uint64_t modifier_r_x;
>   uint64_t modifier_dcc_best;
>   uint64_t modifier_dcc_4k;
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c
> index d647f68fd563..be63d34400d4 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c
> @@ -165,6 +165,7 @@ bool amdgpu_dm_psr_enable(struct dc_stream_state *stream)
>*/
>   if (vsync_rate_hz != 0) {
>   unsigned int frame_time_microsec = 100 / vsync_rate_hz;
> +
>   num_frames_static = (3 / frame_time_microsec) + 1;
>   }
>

Re: [PATCH 1/3] Revert "drm/amdgpu: change the reference clock for raven/raven2"

2023-06-26 Thread Alex Deucher

On Mon, Jun 26, 2023 at 9:58 AM Christian König
 wrote:
>
> Hi guys,
>
> Vitaly and Jasber have been recently working on disabling the IGT tests
> for the TSC query on RV/RV2 (which I'm not very keen on).
>
> And additional to that we have this RADV merge request here:
> https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23481
>
> What exactly is going on here? That sounds like we are working around a
> FW or more specific GFXOFF bug in userspace.

The clock source used by the gfx firmware on Raven is part of the gfx
domain so it gets powered off when gfxoff turns off the graphics
block.  There is a clock source in an always on domain, but the gfx
block doesn't use it.  I don't know off hand if the clock source used
by gfx can be changed or not on raven (IIRC, I don't think it can,
which is why this was never fixed on RV).  Since they are different
clock sources, the GPU timestamp doesn't match the CPU timestamp.  On
renoir and newer APUs, gfx uses the always on clock source so it's
always consistent between CPU and GPU.

Alex

>
> Regards,
> Christian.
>
> Am 05.06.23 um 10:57 schrieb Michel Dänzer:
> > On 6/2/23 20:43, Alex Deucher wrote:
> >> This reverts commit fbc24293ca16b3b9ef891fe32ccd04735a6f8dc1.
> >>
> >> This results in inconsistent timing reported via asynchronous
> >> GPU queries.
> >>
> >> Link: https://lists.freedesktop.org/archives/amd-gfx/2023-May/093731.html
> >> Cc: jesse.zh...@amd.com
> >> Cc: mic...@daenzer.net
> >> Signed-off-by: Alex Deucher 
> > The series is
> >
> > Reviewed-by: Michel Dänzer 
> >
> > Thanks!
> >
> >
>

[PATCH 1/2] drm/amdgpu:update kernel vcn ring test

2023-06-26 Thread Saleemkhan Jamadar

add session context buffer to decoder ring test.

v2 - put the buffer at the end of the IB (Christian)

Signed-off-by: Saleemkhan Jamadar 
Acked-by: Leo Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 30 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  3 +++
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 2d94f1b63bd6..04daaaf6ab34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -691,7 +691,8 @@ static void amdgpu_vcn_unified_ring_ib_checksum(uint32_t 
**ib_checksum,
 
 static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring,
  struct amdgpu_ib *ib_msg,
- struct dma_fence **fence)
+ struct dma_fence **fence,
+ uint64_t session_ctx_buf_gaddr)
 {
struct amdgpu_vcn_decode_buffer *decode_buffer = NULL;
unsigned int ib_size_dw = 64;
@@ -730,6 +731,14 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring 
*ring,
ib->length_dw += sizeof(struct amdgpu_vcn_decode_buffer) / 4;
memset(decode_buffer, 0, sizeof(struct amdgpu_vcn_decode_buffer));
 
+   if (session_ctx_buf_gaddr) {
+   decode_buffer->valid_buf_flag |=
+   
cpu_to_le32(AMDGPU_VCN_CMD_FLAG_SESSION_CONTEXT_BUFFER);
+   decode_buffer->session_context_buffer_address_hi =
+   
cpu_to_le32(session_ctx_buf_gaddr >> 32);
+   decode_buffer->session_context_buffer_address_lo =
+   
cpu_to_le32(session_ctx_buf_gaddr);
+   }
decode_buffer->valid_buf_flag |= 
cpu_to_le32(AMDGPU_VCN_CMD_FLAG_MSG_BUFFER);
decode_buffer->msg_buffer_address_hi = cpu_to_le32(addr >> 32);
decode_buffer->msg_buffer_address_lo = cpu_to_le32(addr);
@@ -763,20 +772,34 @@ int amdgpu_vcn_dec_sw_ring_test_ib(struct amdgpu_ring 
*ring, long timeout)
 {
struct dma_fence *fence = NULL;
struct amdgpu_ib ib;
+   struct amdgpu_bo *session_ctx_buf = NULL;
+   void *cpu_addr = NULL;
+   uint64_t gpu_addr = 0;
long r;
 
+   r = amdgpu_bo_create_kernel(ring->adev, 128*1024, PAGE_SIZE,
+   AMDGPU_GEM_DOMAIN_VRAM |
+   AMDGPU_GEM_DOMAIN_GTT,
+   &session_ctx_buf,
+   &gpu_addr,
+   &cpu_addr);
+   if (r) {
+   dev_err(ring->adev->dev, "VCN ib test:%ld failed to allocate 
session ctx bo\n", r);
+   return r;
+   }
+
r = amdgpu_vcn_dec_get_create_msg(ring, 1, &ib);
if (r)
goto error;
 
-   r = amdgpu_vcn_dec_sw_send_msg(ring, &ib, NULL);
+   r = amdgpu_vcn_dec_sw_send_msg(ring, &ib, NULL, gpu_addr);
if (r)
goto error;
r = amdgpu_vcn_dec_get_destroy_msg(ring, 1, &ib);
if (r)
goto error;
 
-   r = amdgpu_vcn_dec_sw_send_msg(ring, &ib, &fence);
+   r = amdgpu_vcn_dec_sw_send_msg(ring, &ib, &fence, gpu_addr);
if (r)
goto error;
 
@@ -788,6 +811,7 @@ int amdgpu_vcn_dec_sw_ring_test_ib(struct amdgpu_ring 
*ring, long timeout)
 
dma_fence_put(fence);
 error:
+   amdgpu_bo_free_kernel(&session_ctx_buf, &gpu_addr, &cpu_addr);
return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index f1397ef66fd7..06f9ee91a1e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -166,6 +166,7 @@
 
 #define AMDGPU_VCN_IB_FLAG_DECODE_BUFFER   0x0001
 #define AMDGPU_VCN_CMD_FLAG_MSG_BUFFER 0x0001
+#define AMDGPU_VCN_CMD_FLAG_SESSION_CONTEXT_BUFFER 0x0010
 
 #define VCN_CODEC_DISABLE_MASK_AV1  (1 << 0)
 #define VCN_CODEC_DISABLE_MASK_VP9  (1 << 1)
@@ -357,6 +358,8 @@ struct amdgpu_vcn_decode_buffer {
uint32_t valid_buf_flag;
uint32_t msg_buffer_address_hi;
uint32_t msg_buffer_address_lo;
+   unsigned int session_context_buffer_address_hi;
+   unsigned int session_context_buffer_address_lo;
uint32_t pad[30];
 };
 
-- 
2.25.1

[PATCH 2/2] drm/amdgpu:update kernel vcn ring test

2023-06-26 Thread Saleemkhan Jamadar

add session context buffer to decoder ring test for vcn v1 to v3.

Signed-off-by: Saleemkhan Jamadar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 43 ++---
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 04daaaf6ab34..3e9c023e6c42 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -518,9 +518,11 @@ int amdgpu_vcn_dec_sw_ring_test_ring(struct amdgpu_ring 
*ring)
 
 static int amdgpu_vcn_dec_send_msg(struct amdgpu_ring *ring,
   struct amdgpu_ib *ib_msg,
-  struct dma_fence **fence)
+  struct dma_fence **fence,
+  uint64_t session_ctx_buf_gaddr)
 {
u64 addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg->gpu_addr);
+struct amdgpu_vcn_decode_buffer *decode_buffer = NULL;
struct amdgpu_device *adev = ring->adev;
struct dma_fence *f = NULL;
struct amdgpu_job *job;
@@ -534,6 +536,22 @@ static int amdgpu_vcn_dec_send_msg(struct amdgpu_ring 
*ring,
goto err;
 
ib = &job->ibs[0];
+   ib->length_dw = 0;
+ib->ptr[ib->length_dw++] = sizeof(struct amdgpu_vcn_decode_buffer) + 8;
+ib->ptr[ib->length_dw++] = 
cpu_to_le32(AMDGPU_VCN_IB_FLAG_DECODE_BUFFER);
+decode_buffer = (struct amdgpu_vcn_decode_buffer 
*)&(ib->ptr[ib->length_dw]);
+ib->length_dw += sizeof(struct amdgpu_vcn_decode_buffer) / 4;
+memset(decode_buffer, 0, sizeof(struct amdgpu_vcn_decode_buffer));
+
+if (session_ctx_buf_gaddr) {
+decode_buffer->valid_buf_flag |=
+
cpu_to_le32(AMDGPU_VCN_CMD_FLAG_SESSION_CONTEXT_BUFFER);
+decode_buffer->session_context_buffer_address_hi =
+
cpu_to_le32(session_ctx_buf_gaddr >> 32);
+decode_buffer->session_context_buffer_address_lo =
+
cpu_to_le32(session_ctx_buf_gaddr);
+}
+
ib->ptr[0] = PACKET0(adev->vcn.internal.data0, 0);
ib->ptr[1] = addr;
ib->ptr[2] = PACKET0(adev->vcn.internal.data1, 0);
@@ -544,7 +562,7 @@ static int amdgpu_vcn_dec_send_msg(struct amdgpu_ring *ring,
ib->ptr[i] = PACKET0(adev->vcn.internal.nop, 0);
ib->ptr[i+1] = 0;
}
-   ib->length_dw = 16;
+   ib->length_dw += 16;
 
r = amdgpu_job_submit_direct(job, ring, &f);
if (r)
@@ -631,20 +649,34 @@ int amdgpu_vcn_dec_ring_test_ib(struct amdgpu_ring *ring, 
long timeout)
 {
struct dma_fence *fence = NULL;
struct amdgpu_ib ib;
+   struct amdgpu_bo *session_ctx_buf = NULL;
+void *cpu_addr = NULL;
+uint64_t gpu_addr = 0;
long r;
 
+   r = amdgpu_bo_create_kernel(ring->adev, 128*1024, PAGE_SIZE,
+   AMDGPU_GEM_DOMAIN_VRAM |
+   AMDGPU_GEM_DOMAIN_GTT,
+   &session_ctx_buf,
+   &gpu_addr,
+   &cpu_addr);
+   if (r) {
+   dev_err(ring->adev->dev, "VCN ib test:%ld failed to allocate 
session ctx bo\n", r);
+   return r;
+   }
+
r = amdgpu_vcn_dec_get_create_msg(ring, 1, &ib);
if (r)
goto error;
 
-   r = amdgpu_vcn_dec_send_msg(ring, &ib, NULL);
+   r = amdgpu_vcn_dec_send_msg(ring, &ib, NULL, gpu_addr);
if (r)
goto error;
r = amdgpu_vcn_dec_get_destroy_msg(ring, 1, &ib);
if (r)
goto error;
 
-   r = amdgpu_vcn_dec_send_msg(ring, &ib, &fence);
+   r = amdgpu_vcn_dec_send_msg(ring, &ib, &fence, gpu_addr);
if (r)
goto error;
 
@@ -656,6 +688,7 @@ int amdgpu_vcn_dec_ring_test_ib(struct amdgpu_ring *ring, 
long timeout)
 
dma_fence_put(fence);
 error:
+   amdgpu_bo_free_kernel(&session_ctx_buf, &gpu_addr, &cpu_addr);
return r;
 }
 
@@ -692,7 +725,7 @@ static void amdgpu_vcn_unified_ring_ib_checksum(uint32_t 
**ib_checksum,
 static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring,
  struct amdgpu_ib *ib_msg,
  struct dma_fence **fence,
- uint64_t session_ctx_buf_gaddr)
+ uint64_t session_ctx_buf_gaddr)
 {
struct amdgpu_vcn_decode_buffer *decode_buffer = NULL;
unsigned int ib_size_dw = 64;
-- 
2.25.1

[PATCH] drm/amd: Fix a documentation warning about excess parameters

2023-06-26 Thread Mario Limonciello

`pcie_index` and `pcie_data` aren't used by
amdgpu_device_indirect_wreg() since commit 65ba96e91b68
("drm/amdgpu: Move to common indirect reg access helper") but
the documentation wasn't updated. This causes a warning while
building documentation.

Fixes: 65ba96e91b68 ("drm/amdgpu: Move to common indirect reg access helper")
Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 65fe0f3488679..a3dae8ffbdb10 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -747,8 +747,6 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device 
*adev,
  * amdgpu_device_indirect_wreg - write an indirect register address
  *
  * @adev: amdgpu_device pointer
- * @pcie_index: mmio register offset
- * @pcie_data: mmio register offset
  * @reg_addr: indirect register offset
  * @reg_data: indirect register data
  *
@@ -778,8 +776,6 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
  *
  * @adev: amdgpu_device pointer
- * @pcie_index: mmio register offset
- * @pcie_data: mmio register offset
  * @reg_addr: indirect register offset
  * @reg_data: indirect register data
  *
-- 
2.34.1

[PATCH 1/5] drm/amd: Don't initialize PSP twice for Navi3x

2023-06-26 Thread Mario Limonciello

PSP functions are already set by psp_early_init() so initializing
them a second time is unnecessary.
No intended functional changes.

Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a150b7a4b4aae..eb687a338a1bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -3621,10 +3621,6 @@ int amdgpu_psp_sysfs_init(struct amdgpu_device *adev)
switch (adev->ip_versions[MP0_HWIP][0]) {
case IP_VERSION(13, 0, 0):
case IP_VERSION(13, 0, 7):
-   if (!psp->adev) {
-   psp->adev = adev;
-   psp_v13_0_set_psp_funcs(psp);
-   }
ret = sysfs_create_bin_file(&adev->dev->kobj, 
&psp_vbflash_bin_attr);
if (ret)
dev_err(adev->dev, "Failed to create device file 
psp_vbflash");
-- 
2.34.1

[PATCH 3/5] drm/amd: Make flashing messages quieter

2023-06-26 Thread Mario Limonciello

Informational messages emitted while the kernel flashes an updated
IFWI are needlessly noisy and also confusing.

Downgrade them to debug level and reword them to clarify what is
actually happening.

Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 4286c0b4beb90..93d014e69ee39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -3531,7 +3531,7 @@ static ssize_t amdgpu_psp_vbflash_write(struct file 
*filp, struct kobject *kobj,
adev->psp.vbflash_image_size += count;
mutex_unlock(&adev->psp.mutex);
 
-   dev_info(adev->dev, "VBIOS flash write PSP done");
+   dev_dbg(adev->dev, "IFWI staged for update");
 
return count;
 }
@@ -3551,7 +3551,7 @@ static ssize_t amdgpu_psp_vbflash_read(struct file *filp, 
struct kobject *kobj,
if (adev->psp.vbflash_image_size == 0)
return -EINVAL;
 
-   dev_info(adev->dev, "VBIOS flash to PSP started");
+   dev_dbg(adev->dev, "PSP IFWI flash process initiated");
 
ret = amdgpu_bo_create_kernel(adev, adev->psp.vbflash_image_size,
AMDGPU_GPU_PAGE_SIZE,
@@ -3576,11 +3576,11 @@ static ssize_t amdgpu_psp_vbflash_read(struct file 
*filp, struct kobject *kobj,
adev->psp.vbflash_image_size = 0;
 
if (ret) {
-   dev_err(adev->dev, "Failed to load VBIOS FW, err = %d", ret);
+   dev_err(adev->dev, "Failed to load IFWI, err = %d", ret);
return ret;
}
 
-   dev_info(adev->dev, "VBIOS flash to PSP done");
+   dev_dbg(adev->dev, "PSP IFWI flash process done");
return 0;
 }
 
-- 
2.34.1

[PATCH 4/5] drm/amd: Convert USB-C PD F/W attributes into groups

2023-06-26 Thread Mario Limonciello

Rather than special casing the creation of the file, special case
the visibility to the supported dGPUs.

Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 40 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  1 -
 2 files changed, 9 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 93d014e69ee39..7872004ed7f9b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -45,9 +45,6 @@
 
 #define AMD_VBIOS_FILE_MAX_SIZE_B  (1024*1024*3)
 
-static int psp_sysfs_init(struct amdgpu_device *adev);
-static void psp_sysfs_fini(struct amdgpu_device *adev);
-
 static int psp_load_smu_fw(struct psp_context *psp);
 static int psp_rap_terminate(struct psp_context *psp);
 static int psp_securedisplay_terminate(struct psp_context *psp);
@@ -456,14 +453,6 @@ static int psp_sw_init(void *handle)
}
}
 
-   if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(11, 0, 0) ||
-   adev->ip_versions[MP0_HWIP][0] == IP_VERSION(11, 0, 7)) {
-   ret= psp_sysfs_init(adev);
-   if (ret) {
-   return ret;
-   }
-   }
-
ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
  amdgpu_sriov_vf(adev) ?
  AMDGPU_GEM_DOMAIN_VRAM : 
AMDGPU_GEM_DOMAIN_GTT,
@@ -513,10 +502,6 @@ static int psp_sw_fini(void *handle)
amdgpu_ucode_release(&psp->cap_fw);
amdgpu_ucode_release(&psp->toc_fw);
 
-   if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(11, 0, 0) ||
-   adev->ip_versions[MP0_HWIP][0] == IP_VERSION(11, 0, 7))
-   psp_sysfs_fini(adev);
-
kfree(cmd);
cmd = NULL;
 
@@ -3612,6 +3597,7 @@ static DEVICE_ATTR(psp_vbflash_status, 0440, 
amdgpu_psp_vbflash_status, NULL);
 static struct attribute *flash_attrs[] = {
&dev_attr_psp_vbflash_status.attr,
&psp_vbflash_bin_attr.attr,
+   &dev_attr_usbc_pd_fw.attr,
NULL
 };
 
@@ -3625,9 +3611,16 @@ static umode_t amdgpu_flash_attr_is_visible(struct 
kobject *kobj, struct attribu
return 0;
 
switch (adev->ip_versions[MP0_HWIP][0]) {
+   case IP_VERSION(11, 0, 0):
+   case IP_VERSION(11, 0, 7):
+   if (attr == &dev_attr_usbc_pd_fw.attr)
+   return 0660;
+   return 0;
case IP_VERSION(13, 0, 0):
case IP_VERSION(13, 0, 7):
-   if (attr == &psp_vbflash_bin_attr.attr)
+   if (attr == &dev_attr_usbc_pd_fw.attr)
+   return 0;
+   else if (attr == &psp_vbflash_bin_attr.attr)
return 0660;
return 0440;
default:
@@ -3658,21 +3651,6 @@ const struct amd_ip_funcs psp_ip_funcs = {
.set_powergating_state = psp_set_powergating_state,
 };
 
-static int psp_sysfs_init(struct amdgpu_device *adev)
-{
-   int ret = device_create_file(adev->dev, &dev_attr_usbc_pd_fw);
-
-   if (ret)
-   DRM_ERROR("Failed to create USBC PD FW control file!");
-
-   return ret;
-}
-
-static void psp_sysfs_fini(struct amdgpu_device *adev)
-{
-   device_remove_file(adev->dev, &dev_attr_usbc_pd_fw);
-}
-
 const struct amdgpu_ip_block_version psp_v3_1_ip_block =
 {
.type = AMD_IP_BLOCK_TYPE_PSP,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index b441c07e5a16f..619b27e891b5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -521,5 +521,4 @@ void psp_copy_fw(struct psp_context *psp, uint8_t 
*start_addr, uint32_t bin_size
 
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
-int amdgpu_psp_sysfs_init(struct amdgpu_device *adev);
 #endif
-- 
2.34.1

[PATCH 2/5] drm/amd: Use attribute groups for PSP flashing attributes

2023-06-26 Thread Mario Limonciello

Individually creating attributes can be racy. Instead, create them via
attribute groups and control their visibility with an is_visible
callback so that they are only shown on the appropriate products.

Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 49 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  1 -
 5 files changed, 27 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 02b827785e399..a7ef43e25c758 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1004,7 +1004,6 @@ struct amdgpu_device {
boolhas_pr3;
 
boolucode_sysfs_en;
-   boolpsp_sysfs_en;
 
/* Chip product information */
charproduct_number[20];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5c7d40873ee20..65fe0f3488679 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3907,14 +3907,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
} else
adev->ucode_sysfs_en = true;
 
-   r = amdgpu_psp_sysfs_init(adev);
-   if (r) {
-   adev->psp_sysfs_en = false;
-   if (!amdgpu_sriov_vf(adev))
-   DRM_ERROR("Creating psp sysfs failed\n");
-   } else
-   adev->psp_sysfs_en = true;
-
/*
 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
 * Otherwise the mgpu fan boost feature will be skipped due to the
@@ -4064,8 +4056,6 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_pm_sysfs_fini(adev);
if (adev->ucode_sysfs_en)
amdgpu_ucode_sysfs_fini(adev);
-   if (adev->psp_sysfs_en)
-   amdgpu_psp_sysfs_fini(adev);
sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
 
/* disable ras feature must before hw fini */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 393b6fb7a71d3..99b8d3113d6af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2827,11 +2827,13 @@ static struct pci_error_handlers amdgpu_pci_err_handler 
= {
 extern const struct attribute_group amdgpu_vram_mgr_attr_group;
 extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
 extern const struct attribute_group amdgpu_vbios_version_attr_group;
+extern const struct attribute_group amdgpu_flash_attr_group;
 
 static const struct attribute_group *amdgpu_sysfs_groups[] = {
&amdgpu_vram_mgr_attr_group,
&amdgpu_gtt_mgr_attr_group,
&amdgpu_vbios_version_attr_group,
+   &amdgpu_flash_attr_group,
NULL,
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index eb687a338a1bd..4286c0b4beb90 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -3584,6 +3584,13 @@ static ssize_t amdgpu_psp_vbflash_read(struct file 
*filp, struct kobject *kobj,
return 0;
 }
 
+static struct bin_attribute psp_vbflash_bin_attr = {
+   .attr = {.name = "psp_vbflash", .mode = 0660},
+   .size = 0,
+   .write = amdgpu_psp_vbflash_write,
+   .read = amdgpu_psp_vbflash_read,
+};
+
 static ssize_t amdgpu_psp_vbflash_status(struct device *dev,
 struct device_attribute *attr,
 char *buf)
@@ -3600,39 +3607,39 @@ static ssize_t amdgpu_psp_vbflash_status(struct device 
*dev,
 
return sysfs_emit(buf, "0x%x\n", vbflash_status);
 }
+static DEVICE_ATTR(psp_vbflash_status, 0440, amdgpu_psp_vbflash_status, NULL);
 
-static const struct bin_attribute psp_vbflash_bin_attr = {
-   .attr = {.name = "psp_vbflash", .mode = 0660},
-   .size = 0,
-   .write = amdgpu_psp_vbflash_write,
-   .read = amdgpu_psp_vbflash_read,
+static struct attribute *flash_attrs[] = {
+   &dev_attr_psp_vbflash_status.attr,
+   &psp_vbflash_bin_attr.attr,
+   NULL
 };
 
-static DEVICE_ATTR(psp_vbflash_status, 0440, amdgpu_psp_vbflash_status, NULL);
-
-int amdgpu_psp_sysfs_init(struct amdgpu_device *adev)
+static umode_t amdgpu_flash_attr_is_visible(struct kobject *kobj, struct 
attribute *attr, int idx)
 {
-   int ret = 0;
-   struct psp_context *psp = &adev->psp;
+   struct device *dev = kobj_to_dev(kobj);
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
 
if (amdgpu_sriov_vf(adev))
-   return -E

[PATCH 5/5] drm/amd: Add documentation for how to flash a dGPU

2023-06-26 Thread Mario Limonciello

Signed-off-by: Mario Limonciello 
---
 Documentation/gpu/amdgpu/flashing.rst   | 33 +
 Documentation/gpu/amdgpu/index.rst  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 16 
 3 files changed, 50 insertions(+)
 create mode 100644 Documentation/gpu/amdgpu/flashing.rst

diff --git a/Documentation/gpu/amdgpu/flashing.rst 
b/Documentation/gpu/amdgpu/flashing.rst
new file mode 100644
index 0..bd745c42a538f
--- /dev/null
+++ b/Documentation/gpu/amdgpu/flashing.rst
@@ -0,0 +1,33 @@
+===
+ dGPU firmware flashing
+===
+
+IFWI
+
+Flashing the dGPU integrated firmware image (IFWI) is supported by GPUs that
+use the PSP to orchestrate the update (Navi3x or newer GPUs).
+For supported GPUs, `amdgpu` will export a series of sysfs files that can be
+used for the flash process.
+
+The IFWI flash process is:
+
+1. Ensure the IFWI image is intended for the dGPU on the system.
+2. "Write" the IFWI image to the sysfs file `psp_vbflash`. This will stage the 
IFWI in memory.
+3. "Read" from the `psp_vbflash` sysfs file to initiate the flash process.
+4. Poll the `psp_vbflash_status` sysfs file to determine when the flash 
process completes.
+
+USB-C PD F/W
+
+On GPUs that support flashing an updated USB-C PD firmware image, the process
+is done using the `usbc_pd_fw` sysfs file.
+
+* Reading the file will provide the current firmware version.
+* Writing the name of a firmware payload stored in `/lib/firmware/amdgpu` to 
the sysfs file will initiate the flash process.
+
+The firmware payload stored in `/lib/firmware/amdgpu` can be named any name
+as long as it doesn't conflict with other existing binaries that are used by
+`amdgpu`.
+
+sysfs files
+---
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
diff --git a/Documentation/gpu/amdgpu/index.rst 
b/Documentation/gpu/amdgpu/index.rst
index 03c2966cae798..912e699fd3731 100644
--- a/Documentation/gpu/amdgpu/index.rst
+++ b/Documentation/gpu/amdgpu/index.rst
@@ -10,6 +10,7 @@ Next (GCN), Radeon DNA (RDNA), and Compute DNA (CDNA) 
architectures.
module-parameters
driver-core
display/index
+   flashing
xgmi
ras
thermal
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 7872004ed7f9b..047760bafcc23 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -3476,6 +3476,11 @@ void psp_copy_fw(struct psp_context *psp, uint8_t 
*start_addr, uint32_t bin_size
drm_dev_exit(idx);
 }
 
+/**
+ * DOC: usbc_pd_fw
+ * Reading from this file will retrieve the USB-C PD firmware version. Writing 
to
+ * this file will trigger the update process.
+ */
 static DEVICE_ATTR(usbc_pd_fw, S_IRUGO | S_IWUSR,
   psp_usbc_pd_fw_sysfs_read,
   psp_usbc_pd_fw_sysfs_write);
@@ -3569,6 +3574,11 @@ static ssize_t amdgpu_psp_vbflash_read(struct file 
*filp, struct kobject *kobj,
return 0;
 }
 
+/**
+ * DOC: psp_vbflash
+ * Writing to this file will stage an IFWI for update. Reading from this file
+ * will trigger the update process.
+ */
 static struct bin_attribute psp_vbflash_bin_attr = {
.attr = {.name = "psp_vbflash", .mode = 0660},
.size = 0,
@@ -3576,6 +3586,12 @@ static struct bin_attribute psp_vbflash_bin_attr = {
.read = amdgpu_psp_vbflash_read,
 };
 
+/**
+ * DOC: psp_vbflash_status
+ * The status of the flash process.
+ * 0: IFWI flash not complete.
+ * 1: IFWI flash complete.
+ */
 static ssize_t amdgpu_psp_vbflash_status(struct device *dev,
 struct device_attribute *attr,
 char *buf)
-- 
2.34.1

Re: [PATCH] drm/amd: Fix a documentation warning about excess parameters

2023-06-26 Thread Alex Deucher

On Mon, Jun 26, 2023 at 11:00 AM Mario Limonciello
 wrote:
>
> `pcie_index` and `pcie_data` aren't used by
> amdgpu_device_indirect_wreg() since commit 65ba96e91b68
> ("drm/amdgpu: Move to common indirect reg access helper") but
> the documentation wasn't updated. This causes a warning while
> building documentation.
>
> Fixes: 65ba96e91b68 ("drm/amdgpu: Move to common indirect reg access helper")
> Signed-off-by: Mario Limonciello 

Reviewed-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 
>  1 file changed, 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 65fe0f3488679..a3dae8ffbdb10 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -747,8 +747,6 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device 
> *adev,
>   * amdgpu_device_indirect_wreg - write an indirect register address
>   *
>   * @adev: amdgpu_device pointer
> - * @pcie_index: mmio register offset
> - * @pcie_data: mmio register offset
>   * @reg_addr: indirect register offset
>   * @reg_data: indirect register data
>   *
> @@ -778,8 +776,6 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device 
> *adev,
>   * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
>   *
>   * @adev: amdgpu_device pointer
> - * @pcie_index: mmio register offset
> - * @pcie_data: mmio register offset
>   * @reg_addr: indirect register offset
>   * @reg_data: indirect register data
>   *
> --
> 2.34.1
>

Re: [PATCH] drm/amd: Fix a documentation warning about excess parameters

2023-06-26 Thread Limonciello, Mario




On 6/26/2023 10:05 AM, Alex Deucher wrote:

On Mon, Jun 26, 2023 at 11:00 AM Mario Limonciello
 wrote:

`pcie_index` and `pcie_data` aren't used by
amdgpu_device_indirect_wreg() since commit 65ba96e91b68
("drm/amdgpu: Move to common indirect reg access helper") but
the documentation wasn't updated. This causes a warning while
building documentation.

Fixes: 65ba96e91b68 ("drm/amdgpu: Move to common indirect reg access helper")
Signed-off-by: Mario Limonciello 

Reviewed-by: Alex Deucher 

It turns out that the exact same patch already landed in ASDN as:

fbdfbe84aaf4 ("drm/amdgpu: Fix up kdoc in amdgpu_device.c")

and I missed this.  Sorry for that.




---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 
  1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 65fe0f3488679..a3dae8ffbdb10 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -747,8 +747,6 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device 
*adev,
   * amdgpu_device_indirect_wreg - write an indirect register address
   *
   * @adev: amdgpu_device pointer
- * @pcie_index: mmio register offset
- * @pcie_data: mmio register offset
   * @reg_addr: indirect register offset
   * @reg_data: indirect register data
   *
@@ -778,8 +776,6 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
   * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
   *
   * @adev: amdgpu_device pointer
- * @pcie_index: mmio register offset
- * @pcie_data: mmio register offset
   * @reg_addr: indirect register offset
   * @reg_data: indirect register data
   *
--
2.34.1

Re: [PATCH v3] drm/amd/display: Remove unnecessary casts in amdgpu_dm_helpers.c

2023-06-26 Thread Christian König


Am 26.06.23 um 14:43 schrieb Srinivasan Shanmugam:

Fixes the following category of checkpatch complaints:

WARNING: unnecessary cast may hide bugs, see 
http://c-faq.com/malloc/mallocnocast.html
+   char *buf = (char *)kvcalloc(total, sizeof(char), GFP_KERNEL);

Cc: Rodrigo Siqueira 
Cc: Aurabindo Pillai 
Signed-off-by: Srinivasan Shanmugam 


Reviewed-by: Christian König 


---

v3:
  - Keeping the same as v1, so that the variable "buf" remains local to the
block in which it is declared; only the cast has been removed.

  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index cd20cfc04996..4590deca25f8 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
@@ -400,7 +400,7 @@ void dm_dtn_log_append_v(struct dc_context *ctx,
total = log_ctx->pos + n + 1;
  
  	if (total > log_ctx->size) {

-   char *buf = (char *)kvcalloc(total, sizeof(char), GFP_KERNEL);
+   char *buf = kvcalloc(total, sizeof(char), GFP_KERNEL);
  
  		if (buf) {

memcpy(buf, log_ctx->buf, log_ctx->pos);

Re: [PATCH 1/2] drm/amdgpu:update kernel vcn ring test

2023-06-26 Thread Christian König


Am 26.06.23 um 16:50 schrieb Saleemkhan Jamadar:

add session context buffer to decoder ring test.

v2 - put the buffer at the end of the IB (Christian)

Signed-off-by: Saleemkhan Jamadar 
Acked-by: Leo Liu 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 30 ++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  3 +++
  2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 2d94f1b63bd6..04daaaf6ab34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -691,7 +691,8 @@ static void amdgpu_vcn_unified_ring_ib_checksum(uint32_t 
**ib_checksum,
  
  static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring,

  struct amdgpu_ib *ib_msg,
- struct dma_fence **fence)
+ struct dma_fence **fence,
+ uint64_t session_ctx_buf_gaddr)


That looks like it isn't correctly indented.


  {
struct amdgpu_vcn_decode_buffer *decode_buffer = NULL;
unsigned int ib_size_dw = 64;
@@ -730,6 +731,14 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring 
*ring,
ib->length_dw += sizeof(struct amdgpu_vcn_decode_buffer) / 4;
memset(decode_buffer, 0, sizeof(struct amdgpu_vcn_decode_buffer));
  
+	if (session_ctx_buf_gaddr) {

+   decode_buffer->valid_buf_flag |=
+   
cpu_to_le32(AMDGPU_VCN_CMD_FLAG_SESSION_CONTEXT_BUFFER);
+   decode_buffer->session_context_buffer_address_hi =
+   
cpu_to_le32(session_ctx_buf_gaddr >> 32);
+   decode_buffer->session_context_buffer_address_lo =
+   
cpu_to_le32(session_ctx_buf_gaddr);
+   }
decode_buffer->valid_buf_flag |= 
cpu_to_le32(AMDGPU_VCN_CMD_FLAG_MSG_BUFFER);
decode_buffer->msg_buffer_address_hi = cpu_to_le32(addr >> 32);
decode_buffer->msg_buffer_address_lo = cpu_to_le32(addr);
@@ -763,20 +772,34 @@ int amdgpu_vcn_dec_sw_ring_test_ib(struct amdgpu_ring 
*ring, long timeout)
  {
struct dma_fence *fence = NULL;
struct amdgpu_ib ib;
+   struct amdgpu_bo *session_ctx_buf = NULL;
+   void *cpu_addr = NULL;
+   uint64_t gpu_addr = 0;
long r;
  
+	r = amdgpu_bo_create_kernel(ring->adev, 128*1024, PAGE_SIZE,

+   AMDGPU_GEM_DOMAIN_VRAM |
+   AMDGPU_GEM_DOMAIN_GTT,
+   &session_ctx_buf,
+   &gpu_addr,
+   &cpu_addr);


That still creates a buffer for the session ctx instead of putting it 
into the IB.



+   if (r) {
+   dev_err(ring->adev->dev, "VCN ib test:%ld failed to allocate session 
ctx bo\n", r);
+   return r;
+   }
+
r = amdgpu_vcn_dec_get_create_msg(ring, 1, &ib);
if (r)
goto error;
  
-	r = amdgpu_vcn_dec_sw_send_msg(ring, &ib, NULL);

+   r = amdgpu_vcn_dec_sw_send_msg(ring, &ib, NULL, gpu_addr);
if (r)
goto error;
r = amdgpu_vcn_dec_get_destroy_msg(ring, 1, &ib);
if (r)
goto error;
  
-	r = amdgpu_vcn_dec_sw_send_msg(ring, &ib, &fence);

+   r = amdgpu_vcn_dec_sw_send_msg(ring, &ib, &fence, gpu_addr);
if (r)
goto error;
  
@@ -788,6 +811,7 @@ int amdgpu_vcn_dec_sw_ring_test_ib(struct amdgpu_ring *ring, long timeout)
  
  	dma_fence_put(fence);

  error:
+   amdgpu_bo_free_kernel(&session_ctx_buf, &gpu_addr, 
&cpu_addr);


Which is freed here.


return r;
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h

index f1397ef66fd7..06f9ee91a1e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -166,6 +166,7 @@
  
  #define AMDGPU_VCN_IB_FLAG_DECODE_BUFFER	0x0001

  #define AMDGPU_VCN_CMD_FLAG_MSG_BUFFER0x0001
+#define AMDGPU_VCN_CMD_FLAG_SESSION_CONTEXT_BUFFER 0x0010
  
  #define VCN_CODEC_DISABLE_MASK_AV1  (1 << 0)

  #define VCN_CODEC_DISABLE_MASK_VP9  (1 << 1)
@@ -357,6 +358,8 @@ struct amdgpu_vcn_decode_buffer {
uint32_t valid_buf_flag;
uint32_t msg_buffer_address_hi;
uint32_t msg_buffer_address_lo;
+   unsigned int session_context_buffer_address_hi;
+   unsigned int session_context_buffer_address_lo;
uint32_t pad[30];


that here looks incorrect as well.

Christian.


  };

Re: [RFC PATCH v3 1/4] drm/doc: Document DRM device reset expectations

2023-06-26 Thread André Almeida


Em 22/06/2023 05:12, Pekka Paalanen escreveu:

On Wed, 21 Jun 2023 13:28:34 -0300
André Almeida  wrote:


Em 21/06/2023 04:58, Pekka Paalanen escreveu:

On Tue, 20 Jun 2023 21:57:16 -0300
André Almeida  wrote:
   

Create a section that specifies how to deal with DRM device resets for
kernel and userspace drivers.

Signed-off-by: André Almeida 


Hi André,

nice to see this! I ended up giving lots of grammar comments, but I'm
not a native speaker. Generally it looks good to me.


Thank you for your feedback :)

   

---
   Documentation/gpu/drm-uapi.rst | 65 ++
   1 file changed, 65 insertions(+)

diff --git a/Documentation/gpu/drm-uapi.rst b/Documentation/gpu/drm-uapi.rst
index 65fb3036a580..da4f8a694d8d 100644
--- a/Documentation/gpu/drm-uapi.rst
+++ b/Documentation/gpu/drm-uapi.rst
@@ -285,6 +285,71 @@ for GPU1 and GPU2 from different vendors, and a third 
handler for
   mmapped regular files. Threads cause additional pain with signal
   handling as well.
   
+Device reset

+
+
+The GPU stack is really complex and is prone to errors, from hardware bugs,
+faulty applications and everything in between the many layers. To recover
+from this kind of state, sometimes is needed to reset the device. This section


It seems unclear what "this kind of state" refers to, so maybe just write 
"errors"?

Maybe:

Some errors require resetting the device in order to make the
device usable again.

I presume that recovery does not mean that the failed job could recover.
   

+describes what's the expectations for DRM and usermode drivers when a device
+resets and how to propagate the reset status.
+
+Kernel Mode Driver
+--
+
+The KMD is responsible for checking if the device needs a reset, and to perform
+it as needed. Usually a hung is detected when a job gets stuck executing. KMD


s/hung/hang/ ?
   

+then update it's internal reset tracking to be ready when userspace asks the


updates its

"update reset tracking"... do you mean that KMD records information
about the reset in case userspace asks for it later?


Yes, kernel drivers do annotate whenever a reset happens, so it can
report to userspace when it asks about resets.

For instance, this is the amdgpu implementation of
AMDGPU_CTX_OP_QUERY_STATE2:

https://elixir.bootlin.com/linux/v6.3.8/source/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c#L548


You can see there stored information about resets.


Hi André,

right. What I mean is, if I have to ask this, then that implies that
the wording could be more clear.

I don't know if "reset tracking" is some sub-system that is turned on
and off as needed or what updating it would mean.



Understood, I'll rewrite it to be more clear.

   

+kernel about reset information. Drivers should implement the 
DRM_IOCTL_GET_RESET
+for that.


At this point, I'm not sure what "reset tracking" or "reset
information" entails. Could something be said about those?
  >> +

+User Mode Driver
+
+
+The UMD should check before submitting new commands to the KMD if the device 
has
+been reset, and this can be checked more often if it requires to. The
+DRM_IOCTL_GET_RESET is the default interface for those kind of checks. After
+detecting a reset, UMD will then proceed to report it to the application using
+the appropriated API error code, as explained in the bellow section about


s/bellow/below/
   

+robustness.
+
+Robustness
+--
+
+The only way to try to keep an application working after a reset is if it
+complies with the robustness aspects of the graphical API that is using.


that it is using.
   

+
+Graphical APIs provide ways to application to deal with device resets. However,


provide ways for applications to deal with
   

+there's no guarantee that the app will be correctly using such features, and 
UMD
+can implement policies to close the app if it's a repeating offender, likely in
+a broken loop. This is done to ensure that it doesn't keeps blocking the user


does not keep

I think contractions are usually avoided in documents, but I'm not
bothering to flag them all.
   

+interface to be correctly displayed.


interface from being correctly displayed.
   

+
+OpenGL
+~~
+
+Apps using OpenGL can rely on ``GL_ARB_robustness`` to be robust. This 
extension
+tells if a reset has happened, and if so, all the context state is considered
+lost and the app proceeds by creating new ones. If robustness isn't in use, UMD
+will terminate the app when a reset is detected, giving that the contexts are
+lost and the app won't be able to figure this out and recreate the contexts.


What about GL ES? Is GL_ARB_robustness implemented or even defined there?
   


I found this:
https://registry.khronos.org/OpenGL/extensions/EXT/EXT_robustness.txt

"Since this is intended to be a version of ARB_robustness for OpenGL ES,
it should be named accordingly."

I can add this to this paragraph.


Yes, please!

I suppose there could be even more extensio

[PATCH] Revert "drm/amd/display: edp do not add non-edid timings"

2023-06-26 Thread Hersen Wu

This change causes a regression when eDP and an external display are in mirror
mode. When the external display supports a lower resolution than eDP, using eDP
timings to drive the external display may cause corruption on the external
display.

This reverts commit aa9704d5127f06c9ffedb0480d2788b87fecedfb.

Signed-off-by: Hersen Wu 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index a46b8b47b756..073bf00c6fdc 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -7258,13 +7258,7 @@ static int amdgpu_dm_connector_get_modes(struct 
drm_connector *connector)
drm_add_modes_noedid(connector, 1920, 1080);
} else {
amdgpu_dm_connector_ddc_get_modes(connector, edid);
-   /* most eDP supports only timings from its edid,
-* usually only detailed timings are available
-* from eDP edid. timings which are not from edid
-* may damage eDP
-*/
-   if (connector->connector_type != DRM_MODE_CONNECTOR_eDP)
-   amdgpu_dm_connector_add_common_modes(encoder, 
connector);
+   amdgpu_dm_connector_add_common_modes(encoder, connector);
amdgpu_dm_connector_add_freesync_modes(connector, edid);
}
amdgpu_dm_fbc_init(connector);
-- 
2.25.1

RE: [PATCH] Revert "drm/amd/display: edp do not add non-edid timings"

2023-06-26 Thread Limonciello, Mario

[Public]

> This change causes regression when eDP and external display in mirror
> mode. When external display supports low resolution than eDP, use eDP
> timing to driver external display may cause corruption on external
> display.
>
> This reverts commit aa9704d5127f06c9ffedb0480d2788b87fecedfb.
>
> Signed-off-by: Hersen Wu 

The original commit CC to stable, we need this to go to stable too.

Here's some tags to pick up when merging.

Cc: sta...@vger.kernel.org
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2655
Reviewed-by: Mario Limonciello 

> ---
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 8 +---
>  1 file changed, 1 insertion(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index a46b8b47b756..073bf00c6fdc 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -7258,13 +7258,7 @@ static int
> amdgpu_dm_connector_get_modes(struct drm_connector *connector)
>   drm_add_modes_noedid(connector, 1920,
> 1080);
>   } else {
>   amdgpu_dm_connector_ddc_get_modes(connector, edid);
> - /* most eDP supports only timings from its edid,
> -  * usually only detailed timings are available
> -  * from eDP edid. timings which are not from edid
> -  * may damage eDP
> -  */
> - if (connector->connector_type !=
> DRM_MODE_CONNECTOR_eDP)
> -
>   amdgpu_dm_connector_add_common_modes(encoder, connector);
> + amdgpu_dm_connector_add_common_modes(encoder,
> connector);
>   amdgpu_dm_connector_add_freesync_modes(connector,
> edid);
>   }
>   amdgpu_dm_fbc_init(connector);
> --
> 2.25.1

RE: [PATCH] Revert "drm/amd/display: edp do not add non-edid timings"

2023-06-26 Thread Limonciello, Mario

[Public]

> -Original Message-
> From: Limonciello, Mario
> Sent: Monday, June 26, 2023 12:45 PM
> To: Hersen Wu ; amd-gfx@lists.freedesktop.org;
> Wentland, Harry 
> Cc: Wu, Hersen 
> Subject: RE: [PATCH] Revert "drm/amd/display: edp do not add non-edid
> timings"
>
> > This change causes regression when eDP and external display in mirror
> > mode. When external display supports low resolution than eDP, use eDP
> > timing to driver external display may cause corruption on external
> > display.
> >
> > This reverts commit aa9704d5127f06c9ffedb0480d2788b87fecedfb.

One more thing - although this is the correct hash for ASDN, this merged
into Linus' tree as e749dd10e5f292061ad63d2b030194bf7d7d452c.

As this has to go back to stable trees properly, I think the hash should
reflect what's in Linus' tree instead of what's in ASDN.

> >
> > Signed-off-by: Hersen Wu 
>
> The original commit CC to stable, we need this to go to stable too.
>
> Here's some tags to pick up when merging.
>
> Cc: sta...@vger.kernel.org
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2655
> Reviewed-by: Mario Limonciello 
>
> > ---
> >  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 8 +---
> >  1 file changed, 1 insertion(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > index a46b8b47b756..073bf00c6fdc 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > @@ -7258,13 +7258,7 @@ static int
> > amdgpu_dm_connector_get_modes(struct drm_connector *connector)
> > drm_add_modes_noedid(connector, 1920,
> > 1080);
> > } else {
> > amdgpu_dm_connector_ddc_get_modes(connector, edid);
> > -   /* most eDP supports only timings from its edid,
> > -* usually only detailed timings are available
> > -* from eDP edid. timings which are not from edid
> > -* may damage eDP
> > -*/
> > -   if (connector->connector_type !=
> > DRM_MODE_CONNECTOR_eDP)
> > -
> > amdgpu_dm_connector_add_common_modes(encoder, connector);
> > +   amdgpu_dm_connector_add_common_modes(encoder,
> > connector);
> > amdgpu_dm_connector_add_freesync_modes(connector,
> > edid);
> > }
> > amdgpu_dm_fbc_init(connector);
> > --
> > 2.25.1

[PATCH v4 1/1] drm/doc: Document DRM device reset expectations

2023-06-26 Thread André Almeida

Create a section that specifies how to deal with DRM device resets for
kernel and userspace drivers.

Signed-off-by: André Almeida 
---
 Documentation/gpu/drm-uapi.rst | 68 ++
 1 file changed, 68 insertions(+)

diff --git a/Documentation/gpu/drm-uapi.rst b/Documentation/gpu/drm-uapi.rst
index 65fb3036a580..25a11b9b98fa 100644
--- a/Documentation/gpu/drm-uapi.rst
+++ b/Documentation/gpu/drm-uapi.rst
@@ -285,6 +285,74 @@ for GPU1 and GPU2 from different vendors, and a third 
handler for
 mmapped regular files. Threads cause additional pain with signal
 handling as well.
 
+Device reset
+
+
+The GPU stack is really complex and is prone to errors, from hardware bugs,
+faulty applications and everything in between the many layers. Some errors
+require resetting the device in order to make the device usable again. This
+section describes what the expectations are for DRM and usermode drivers when a
+device resets and how to propagate the reset status.
+
+Kernel Mode Driver
+--
+
+The KMD is responsible for checking if the device needs a reset, and to perform
+it as needed. Usually a hang is detected when a job gets stuck executing. KMD
+should keep track of resets, because userspace can query any time about the
+reset stats for a specific context. This is needed to propagate to the rest of
+the stack that a reset has happened. Currently, this is implemented by each
+driver separately, with no common DRM interface.
+
+User Mode Driver
+
+
+The UMD should check before submitting new commands to the KMD if the device 
has
+been reset, and this can be checked more often if it requires to. After
+detecting a reset, UMD will then proceed to report it to the application using
+the appropriate API error code, as explained in the section below about
+robustness.
+
+Robustness
+--
+
+The only way to try to keep an application working after a reset is if it
+complies with the robustness aspects of the graphical API that it is using.
+
+Graphical APIs provide ways for applications to deal with device resets. However,
+there is no guarantee that the app will be correctly using such features, and
+UMD can implement policies to close the app if it is a repeating offender,
+likely in a broken loop. This is done to ensure that it does not keep blocking
+the user interface from being correctly displayed. This should be done even if
+the app is correct but happens to trigger some bug in the hardware/driver.
+
+OpenGL
+~~
+
+Apps using OpenGL should use the available robust interfaces, like the
+extension ``GL_ARB_robustness`` (or ``GL_EXT_robustness`` for OpenGL ES). This
+interface tells if a reset has happened, and if so, all the context state is
+considered lost and the app proceeds by creating new ones. If it is possible to
+determine that robustness is not in use, UMD will terminate the app when a 
reset
+is detected, given that the contexts are lost and the app won't be able to
+figure this out and recreate the contexts.
+
+Vulkan
+~~
+
+Apps using Vulkan should check for ``VK_ERROR_DEVICE_LOST`` for submissions.
+This error code means, among other things, that a device reset has happened and
+it needs to recreate the contexts to keep going.
+
+Reporting resets causes
+---
+
+Apart from propagating the reset through the stack so apps can recover, it's
+really useful for driver developers to learn more about what caused the reset 
in
+the first place. DRM devices should make use of devcoredump to store relevant
+information about the reset, so this information can be added to user bug
+reports.
+
 .. _drm_driver_ioctl:
 
 IOCTL Support on Device Nodes
-- 
2.41.0

[PATCH v4 0/1] drm/doc: Document DRM device reset expectations

2023-06-26 Thread André Almeida

This v4 removes the common DRM ioctl, and adds just the documentation for now,
given that the lack of a common "DRM context" infrastructure makes it hard to implement.

v3: https://lore.kernel.org/lkml/20230621005719.836857-1-andrealm...@igalia.com/

Changes: 
 - Drop the ioctl
 - Addressed comments from Pekka, such as making the documentation more clear and
 consistent.

André Almeida (1):
  drm/doc: Document DRM device reset expectations

 Documentation/gpu/drm-uapi.rst | 68 ++
 1 file changed, 68 insertions(+)

-- 
2.41.0

Re: [PATCH 5/5] drm/amd: Add documentation for how to flash a dGPU

2023-06-26 Thread Alex Deucher

On Mon, Jun 26, 2023 at 11:04 AM Mario Limonciello
 wrote:
>

Needs a basic patch description.  Even just "add documentation"

With that fixed, the series is:
Reviewed-by: Alex Deucher 

> Signed-off-by: Mario Limonciello 
> ---
>  Documentation/gpu/amdgpu/flashing.rst   | 33 +
>  Documentation/gpu/amdgpu/index.rst  |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 16 
>  3 files changed, 50 insertions(+)
>  create mode 100644 Documentation/gpu/amdgpu/flashing.rst
>
> diff --git a/Documentation/gpu/amdgpu/flashing.rst 
> b/Documentation/gpu/amdgpu/flashing.rst
> new file mode 100644
> index 0..bd745c42a538f
> --- /dev/null
> +++ b/Documentation/gpu/amdgpu/flashing.rst
> @@ -0,0 +1,33 @@
> +===
> + dGPU firmware flashing
> +===
> +
> +IFWI
> +
> +Flashing the dGPU integrated firmware image (IFWI) is supported by GPUs that
> +use the PSP to orchestrate the update (Navi3x or newer GPUs).
> +For supported GPUs, `amdgpu` will export a series of sysfs files that can be
> +used for the flash process.
> +
> +The IFWI flash process is:
> +
> +1. Ensure the IFWI image is intended for the dGPU on the system.
> +2. "Write" the IFWI image to the sysfs file `psp_vbflash`. This will stage 
> the IFWI in memory.
> +3. "Read" from the `psp_vbflash` sysfs file to initiate the flash process.
> +4. Poll the `psp_vbflash_status` sysfs file to determine when the flash 
> process completes.
> +
> +USB-C PD F/W
> +
> +On GPUs that support flashing an updated USB-C PD firmware image, the process
> +is done using the `usbc_pd_fw` sysfs file.
> +
> +* Reading the file will provide the current firmware version.
> +* Writing the name of a firmware payload stored in `/lib/firmware/amdgpu` to 
> the sysfs file will initiate the flash process.
> +
> +The firmware payload stored in `/lib/firmware/amdgpu` can be named any name
> +as long as it doesn't conflict with other existing binaries that are used by
> +`amdgpu`.
> +
> +sysfs files
> +---
> +.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> diff --git a/Documentation/gpu/amdgpu/index.rst 
> b/Documentation/gpu/amdgpu/index.rst
> index 03c2966cae798..912e699fd3731 100644
> --- a/Documentation/gpu/amdgpu/index.rst
> +++ b/Documentation/gpu/amdgpu/index.rst
> @@ -10,6 +10,7 @@ Next (GCN), Radeon DNA (RDNA), and Compute DNA (CDNA) 
> architectures.
> module-parameters
> driver-core
> display/index
> +   flashing
> xgmi
> ras
> thermal
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 7872004ed7f9b..047760bafcc23 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -3476,6 +3476,11 @@ void psp_copy_fw(struct psp_context *psp, uint8_t 
> *start_addr, uint32_t bin_size
> drm_dev_exit(idx);
>  }
>
> +/**
> + * DOC: usbc_pd_fw
> + * Reading from this file will retrieve the USB-C PD firmware version. 
> Writing to
> + * this file will trigger the update process.
> + */
>  static DEVICE_ATTR(usbc_pd_fw, S_IRUGO | S_IWUSR,
>psp_usbc_pd_fw_sysfs_read,
>psp_usbc_pd_fw_sysfs_write);
> @@ -3569,6 +3574,11 @@ static ssize_t amdgpu_psp_vbflash_read(struct file 
> *filp, struct kobject *kobj,
> return 0;
>  }
>
> +/**
> + * DOC: psp_vbflash
> + * Writing to this file will stage an IFWI for update. Reading from this file
> + * will trigger the update process.
> + */
>  static struct bin_attribute psp_vbflash_bin_attr = {
> .attr = {.name = "psp_vbflash", .mode = 0660},
> .size = 0,
> @@ -3576,6 +3586,12 @@ static struct bin_attribute psp_vbflash_bin_attr = {
> .read = amdgpu_psp_vbflash_read,
>  };
>
> +/**
> + * DOC: psp_vbflash_status
> + * The status of the flash process.
> + * 0: IFWI flash not complete.
> + * 1: IFWI flash complete.
> + */
>  static ssize_t amdgpu_psp_vbflash_status(struct device *dev,
>  struct device_attribute *attr,
>  char *buf)
> --
> 2.34.1
>

Re: [PATCH v2 2/4] drm/amd/display: Set minimum requirement for using PSR-SU on Rembrandt

2023-06-26 Thread Leo Li





On 6/23/23 11:05, Mario Limonciello wrote:

A number of parade TCONs are causing system hangs when utilized with
older DMUB firmware and PSR-SU. Some changes have been introduced into
DMUB firmware to add resilience against these failures.

Don't allow running PSR-SU unless on the newer firmware.

Cc: Sean Wang 
Cc: Marc Rossi 
Cc: Hamza Mahfooz 
Cc: Tsung-hua (Ryan) Lin 
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2443
Signed-off-by: Mario Limonciello 


Reviewed-by: Leo Li 


---
v1->v2:
 * Fix a s/dcn314/dcn31/ mixup
---
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c |  3 ++-
  drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c  |  7 +++
  drivers/gpu/drm/amd/display/dc/dc_dmub_srv.h  |  1 +
  drivers/gpu/drm/amd/display/dmub/dmub_srv.h   |  2 ++
  drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.c |  5 +
  drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.h |  2 ++
  drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c   | 10 ++
  7 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c
index d647f68fd563..4f61d4f257cd 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c
@@ -24,6 +24,7 @@
   */
  
  #include "amdgpu_dm_psr.h"

+#include "dc_dmub_srv.h"
  #include "dc.h"
  #include "dm_helpers.h"
  #include "amdgpu_dm.h"
@@ -50,7 +51,7 @@ static bool link_supports_psrsu(struct dc_link *link)
!link->dpcd_caps.psr_info.psr2_su_y_granularity_cap)
return false;
  
-	return true;

+   return dc_dmub_check_min_version(dc->ctx->dmub_srv->dmub);
  }
  
  /*

diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c 
b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
index c52c40b16387..c753c6f30dd7 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
+++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
@@ -1011,3 +1011,10 @@ void dc_send_update_cursor_info_to_dmu(
dm_execute_dmub_cmd_list(pCtx->stream->ctx, 2, cmd, 
DM_DMUB_WAIT_TYPE_WAIT);
}
  }
+
+bool dc_dmub_check_min_version(struct dmub_srv *srv)
+{
+   if (!srv->hw_funcs.is_psrsu_supported)
+   return true;
+   return srv->hw_funcs.is_psrsu_supported(srv);
+}
diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.h 
b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.h
index a5196a9292b3..099f94b6107c 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.h
+++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.h
@@ -86,4 +86,5 @@ void dc_dmub_setup_subvp_dmub_command(struct dc *dc, struct 
dc_state *context, b
  void dc_dmub_srv_log_diagnostic_data(struct dc_dmub_srv *dc_dmub_srv);
  
  void dc_send_update_cursor_info_to_dmu(struct pipe_ctx *pCtx, uint8_t pipe_idx);

+bool dc_dmub_check_min_version(struct dmub_srv *srv);
  #endif /* _DMUB_DC_SRV_H_ */
diff --git a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h 
b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h
index 2a66a305679a..4585e0419da6 100644
--- a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h
+++ b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h
@@ -367,6 +367,8 @@ struct dmub_srv_hw_funcs {
  
  	bool (*is_supported)(struct dmub_srv *dmub);
  
+	bool (*is_psrsu_supported)(struct dmub_srv *dmub);

+
bool (*is_hw_init)(struct dmub_srv *dmub);
  
  	void (*enable_dmub_boot_options)(struct dmub_srv *dmub,

diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.c 
b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.c
index ebf7aeec4029..c8445d474107 100644
--- a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.c
+++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.c
@@ -302,6 +302,11 @@ bool dmub_dcn31_is_supported(struct dmub_srv *dmub)
return supported;
  }
  
+bool dmub_dcn31_is_psrsu_supported(struct dmub_srv *dmub)

+{
+   return dmub->fw_version >= DMUB_FW_VERSION(4, 0, 58);
+}
+
  void dmub_dcn31_set_gpint(struct dmub_srv *dmub,
  union dmub_gpint_data_register reg)
  {
diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.h 
b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.h
index 7d5c10ee539b..89c5a948b67d 100644
--- a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.h
+++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn31.h
@@ -221,6 +221,8 @@ bool dmub_dcn31_is_hw_init(struct dmub_srv *dmub);
  
  bool dmub_dcn31_is_supported(struct dmub_srv *dmub);
  
+bool dmub_dcn31_is_psrsu_supported(struct dmub_srv *dmub);

+
  void dmub_dcn31_set_gpint(struct dmub_srv *dmub,
  union dmub_gpint_data_register reg);
  
diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c

index 9e9a6a44a7ac..7a31e3e27bab 100644
--- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c
+++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c
@@ -226,14 +226,16 @@ static bool dmub_srv_hw_setup

Re: [PATCH v2 3/4] drm/amd/display: Set minimum requirement for using PSR-SU on Phoenix

2023-06-26 Thread Leo Li





On 6/23/23 11:05, Mario Limonciello wrote:

The same parade TCON issue can potentially happen on Phoenix, and the same
PSR resilience changes have been ported into the DMUB firmware.

Don't allow running PSR-SU unless on the newer firmware.

Cc: Sean Wang 
Cc: Marc Rossi 
Cc: Hamza Mahfooz 
Cc: Tsung-hua (Ryan) Lin 
Signed-off-by: Mario Limonciello 


Reviewed-by: Leo Li 


---
v1->v2:
 * Fix a s/dcn31/dcn314/ mixup
---
  drivers/gpu/drm/amd/display/dmub/src/dmub_dcn314.c | 5 +
  drivers/gpu/drm/amd/display/dmub/src/dmub_dcn314.h | 2 ++
  drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c| 1 +
  3 files changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn314.c 
b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn314.c
index 48a06dbd9be7..f161aeb7e7c4 100644
--- a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn314.c
+++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn314.c
@@ -60,3 +60,8 @@ const struct dmub_srv_dcn31_regs dmub_srv_dcn314_regs = {
{ DMUB_DCN31_FIELDS() },
  #undef DMUB_SF
  };
+
+bool dmub_dcn314_is_psrsu_supported(struct dmub_srv *dmub)
+{
+   return dmub->fw_version >= DMUB_FW_VERSION(8, 0, 16);
+}
diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn314.h 
b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn314.h
index 674267a2940e..f213bd82c911 100644
--- a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn314.h
+++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn314.h
@@ -30,4 +30,6 @@
  
  extern const struct dmub_srv_dcn31_regs dmub_srv_dcn314_regs;
  
+bool dmub_dcn314_is_psrsu_supported(struct dmub_srv *dmub);

+
  #endif /* _DMUB_DCN314_H_ */
diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c 
b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c
index 7a31e3e27bab..bdaf43892f47 100644
--- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c
+++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c
@@ -228,6 +228,7 @@ static bool dmub_srv_hw_setup(struct dmub_srv *dmub, enum 
dmub_asic asic)
case DMUB_ASIC_DCN316:
if (asic == DMUB_ASIC_DCN314) {
dmub->regs_dcn31 = &dmub_srv_dcn314_regs;
+   funcs->is_psrsu_supported = 
dmub_dcn314_is_psrsu_supported;
} else if (asic == DMUB_ASIC_DCN315) {
dmub->regs_dcn31 = &dmub_srv_dcn315_regs;
} else if (asic == DMUB_ASIC_DCN316) {

[PATCH] drm/amdgpu: rename psp_update_vcn_sram to a common name

2023-06-26 Thread Lang Yu

Rename psp_update_vcn_sram to psp_execute_ucode_loading
so that other clients can feel free to use it.

Signed-off-by: Lang Yu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 19 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  7 +--
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c   |  7 ---
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   |  8 +---
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c   |  6 --
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   |  6 --
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  5 +++--
 7 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index aa69269169a1..33f8b8389979 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2931,15 +2931,16 @@ int psp_rlc_autoload_start(struct psp_context *psp)
return ret;
 }
 
-int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx,
-   uint64_t cmd_gpu_addr, int cmd_size)
-{
-   struct amdgpu_firmware_info ucode = {0};
-
-   ucode.ucode_id = inst_idx ? AMDGPU_UCODE_ID_VCN1_RAM :
-   AMDGPU_UCODE_ID_VCN0_RAM;
-   ucode.mc_addr = cmd_gpu_addr;
-   ucode.ucode_size = cmd_size;
+int psp_execute_ucode_loading(struct amdgpu_device *adev,
+ enum AMDGPU_UCODE_ID ucode_id,
+ uint64_t ucode_gpu_addr,
+ int ucode_size)
+{
+   struct amdgpu_firmware_info ucode = {
+   .ucode_id = ucode_id,
+   .mc_addr = ucode_gpu_addr,
+   .ucode_size = ucode_size,
+   };
 
return psp_execute_non_psp_fw_load(&adev->psp, &ucode);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 2cae0b1a0b8a..93849db18696 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -459,8 +459,11 @@ extern int psp_wait_for_spirom_update(struct psp_context 
*psp, uint32_t reg_inde
uint32_t field_val, uint32_t mask, uint32_t 
msec_timeout);
 
 int psp_gpu_reset(struct amdgpu_device *adev);
-int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx,
-   uint64_t cmd_gpu_addr, int cmd_size);
+
+int psp_execute_ucode_loading(struct amdgpu_device *adev,
+ enum AMDGPU_UCODE_ID ucode_id,
+ uint64_t ucode_gpu_addr,
+ int ucode_size);
 
 int psp_ta_init_shared_buf(struct psp_context *psp,
  struct ta_mem_context *mem_ctx);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index c975aed2f6c7..e89b1e76 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -881,9 +881,10 @@ static int vcn_v2_0_start_dpg_mode(struct amdgpu_device 
*adev, bool indirect)
UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
 
if (indirect)
-   psp_update_vcn_sram(adev, 0, adev->vcn.inst->dpg_sram_gpu_addr,
-   
(uint32_t)((uintptr_t)adev->vcn.inst->dpg_sram_curr_addr -
-  
(uintptr_t)adev->vcn.inst->dpg_sram_cpu_addr));
+   psp_execute_ucode_loading(adev, AMDGPU_UCODE_ID_VCN0_RAM,
+   adev->vcn.inst->dpg_sram_gpu_addr,
+   
(uint32_t)((uintptr_t)adev->vcn.inst->dpg_sram_curr_addr -
+  
(uintptr_t)adev->vcn.inst->dpg_sram_cpu_addr));
 
/* force RBC into idle state */
rb_bufsz = order_base_2(ring->ring_size);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index bb1875f926f1..e5df190a79a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -912,9 +912,11 @@ static int vcn_v2_5_start_dpg_mode(struct amdgpu_device 
*adev, int inst_idx, boo
UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
 
if (indirect)
-   psp_update_vcn_sram(adev, inst_idx, 
adev->vcn.inst[inst_idx].dpg_sram_gpu_addr,
-   
(uint32_t)((uintptr_t)adev->vcn.inst[inst_idx].dpg_sram_curr_addr -
-  
(uintptr_t)adev->vcn.inst[inst_idx].dpg_sram_cpu_addr));
+   psp_execute_ucode_loading(adev,
+   inst_idx ? AMDGPU_UCODE_ID_VCN1_RAM : 
AMDGPU_UCODE_ID_VCN0_RAM,
+   adev->vcn.inst[inst_idx].dpg_sram_gpu_addr,
+   
(uint32_t)((uintptr_t)adev->vcn.inst[inst_idx].dpg_sram_curr_addr -
+  
(uintptr_t)adev->vcn.inst[inst_idx].dpg_sram_cpu_addr));
 
ring = &adev->vcn.inst[inst_idx].ring_dec;
/* force RBC into idle state */
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c 
b/dr

Re: [PATCH] drm/amdgpu: rename psp_update_vcn_sram to a common name

2023-06-26 Thread Lang Yu

Please ignore this patch, will send a new one.

Regards,
Lang

On 06/27/ , Lang Yu wrote:
> Rename psp_update_vcn_sram to psp_execute_ucode_loading
> so that other clients can feel free to use it.
> 
> Signed-off-by: Lang Yu 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 19 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  7 +--
>  drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c   |  7 ---
>  drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   |  8 +---
>  drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c   |  6 --
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   |  6 --
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  5 +++--
>  7 files changed, 35 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index aa69269169a1..33f8b8389979 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -2931,15 +2931,16 @@ int psp_rlc_autoload_start(struct psp_context *psp)
>   return ret;
>  }
>  
> -int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx,
> - uint64_t cmd_gpu_addr, int cmd_size)
> -{
> - struct amdgpu_firmware_info ucode = {0};
> -
> - ucode.ucode_id = inst_idx ? AMDGPU_UCODE_ID_VCN1_RAM :
> - AMDGPU_UCODE_ID_VCN0_RAM;
> - ucode.mc_addr = cmd_gpu_addr;
> - ucode.ucode_size = cmd_size;
> +int psp_execute_ucode_loading(struct amdgpu_device *adev,
> +   enum AMDGPU_UCODE_ID ucode_id,
> +   uint64_t ucode_gpu_addr,
> +   int ucode_size)
> +{
> + struct amdgpu_firmware_info ucode = {
> + .ucode_id = ucode_id,
> + .mc_addr = ucode_gpu_addr,
> + .ucode_size = ucode_size,
> + };
>  
>   return psp_execute_non_psp_fw_load(&adev->psp, &ucode);
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> index 2cae0b1a0b8a..93849db18696 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> @@ -459,8 +459,11 @@ extern int psp_wait_for_spirom_update(struct psp_context 
> *psp, uint32_t reg_inde
>   uint32_t field_val, uint32_t mask, uint32_t 
> msec_timeout);
>  
>  int psp_gpu_reset(struct amdgpu_device *adev);
> -int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx,
> - uint64_t cmd_gpu_addr, int cmd_size);
> +
> +int psp_execute_ucode_loading(struct amdgpu_device *adev,
> +   enum AMDGPU_UCODE_ID ucode_id,
> +   uint64_t ucode_gpu_addr,
> +   int ucode_size);
>  
>  int psp_ta_init_shared_buf(struct psp_context *psp,
> struct ta_mem_context *mem_ctx);
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c 
> b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
> index c975aed2f6c7..e89b1e76 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
> @@ -881,9 +881,10 @@ static int vcn_v2_0_start_dpg_mode(struct amdgpu_device 
> *adev, bool indirect)
>   UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
>  
>   if (indirect)
> - psp_update_vcn_sram(adev, 0, adev->vcn.inst->dpg_sram_gpu_addr,
> - 
> (uint32_t)((uintptr_t)adev->vcn.inst->dpg_sram_curr_addr -
> -
> (uintptr_t)adev->vcn.inst->dpg_sram_cpu_addr));
> + psp_execute_ucode_loading(adev, AMDGPU_UCODE_ID_VCN0_RAM,
> + adev->vcn.inst->dpg_sram_gpu_addr,
> + 
> (uint32_t)((uintptr_t)adev->vcn.inst->dpg_sram_curr_addr -
> +
> (uintptr_t)adev->vcn.inst->dpg_sram_cpu_addr));
>  
>   /* force RBC into idle state */
>   rb_bufsz = order_base_2(ring->ring_size);
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c 
> b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> index bb1875f926f1..e5df190a79a0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> @@ -912,9 +912,11 @@ static int vcn_v2_5_start_dpg_mode(struct amdgpu_device 
> *adev, int inst_idx, boo
>   UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
>  
>   if (indirect)
> - psp_update_vcn_sram(adev, inst_idx, 
> adev->vcn.inst[inst_idx].dpg_sram_gpu_addr,
> - 
> (uint32_t)((uintptr_t)adev->vcn.inst[inst_idx].dpg_sram_curr_addr -
> -
> (uintptr_t)adev->vcn.inst[inst_idx].dpg_sram_cpu_addr));
> + psp_execute_ucode_loading(adev,
> + inst_idx ? AMDGPU_UCODE_ID_VCN1_RAM : 
> AMDGPU_UCODE_ID_VCN0_RAM,
> + adev->vcn.inst[inst_idx].dpg_sram_gpu_addr,
> + 
> (uint32_t)((uintptr_t)adev->vcn.inst[inst_idx].dpg_sram_curr_addr -
> +

RE: [PATCH 2/2] drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation

2023-06-26 Thread Quan, Evan

[AMD Official Use Only - General]

> -Original Message-
> From: Lazar, Lijo 
> Sent: Monday, June 26, 2023 7:54 PM
> To: Quan, Evan ; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander 
> Subject: Re: [PATCH 2/2] drm/amd/pm: avoid unintentional shutdown due to
> temperature momentary fluctuation
>
>
>
> On 6/26/2023 1:17 PM, Evan Quan wrote:
> > An intentional delay is added on soft ctf triggered. Then there will
> > be a double check for the GPU temperature before taking further
> > action. This can avoid unintended shutdown due to temperature
> > momentary fluctuation.
> >
> > Signed-off-by: Evan Quan 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  3 ++
> >   .../gpu/drm/amd/pm/powerplay/amd_powerplay.c  | 48
> +++
> >   .../drm/amd/pm/powerplay/hwmgr/smu_helper.c   | 27 ---
> >   drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h  |  2 +
> >   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 34
> +
> >   drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  2 +
> >   .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  9 +---
> >   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  9 +---
> >   8 files changed, 102 insertions(+), 32 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index e459381dc759..5ef1f31e703c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -287,6 +287,9 @@ extern int amdgpu_user_partt_mode;
> >   #define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
> >   #define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)
> >
> > +/* Extra time delay(in ms) to eliminate the influence of temperature
> momentary fluctuation */
> > +#define AMDGPU_SWCTF_EXTRA_DELAY   50
>
> I think a delay of 10-15ms is good enough to filter out any spike.
50ms is required by our CE team for supporting the customer. It is also aligned 
with the Windows side.
Considering we cannot guarantee that 10-15ms is good enough, I think it's better to stick 
to the recommended 50ms setting.
What do you think?

Evan
>
> With that change, the series is
>   Reviewed-by: Lijo Lazar 
>
> Thanks,
> Lijo
>
> > +
> >   struct amdgpu_xcp_mgr;
> >   struct amdgpu_device;
> >   struct amdgpu_irq_src;
> > diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> > b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> > index 11b7b4cffaae..ff360c699171 100644
> > --- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> > +++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> > @@ -26,6 +26,7 @@
> >   #include 
> >   #include 
> >   #include 
> > +#include 
> >   #include "amd_shared.h"
> >   #include "amd_powerplay.h"
> >   #include "power_state.h"
> > @@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
> > return 0;
> >   }
> >
> > +static void pp_swctf_delayed_work_handler(struct work_struct *work) {
> > +   struct pp_hwmgr *hwmgr =
> > +   container_of(work, struct pp_hwmgr,
> swctf_delayed_work.work);
> > +   struct amdgpu_device *adev = hwmgr->adev;
> > +   struct amdgpu_dpm_thermal *range =
> > +   &adev->pm.dpm.thermal;
> > +   uint32_t gpu_temperature, size;
> > +   int ret;
> > +
> > +   /*
> > +* If the hotspot/edge temperature is confirmed as below SW CTF
> setting point
> > +* after the delay enforced, nothing will be done.
> > +* Otherwise, a graceful shutdown will be performed to prevent
> further damage.
> > +*/
> > +   if (range->sw_ctf_threshold &&
> > +   hwmgr->hwmgr_func->read_sensor) {
> > +   ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
> > +
> AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
> > +&gpu_temperature,
> > +&size);
> > +   /*
> > +* For some legacy ASICs, hotspot temperature retrieving
> might be not
> > +* supported. Check the edge temperature instead then.
> > +*/
> > +   if (ret == -EOPNOTSUPP)
> > +   ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
> > +
> AMDGPU_PP_SENSOR_EDGE_TEMP,
> > +
> &gpu_temperature,
> > +&size);
> > +   if (!ret && gpu_temperature / 1000 < range-
> >sw_ctf_threshold)
> > +   return;
> > +   }
> > +
> > +   dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW
> CTF) detected!\n");
> > +   dev_emerg(adev->dev, "ERROR: System is going to shutdown due to
> GPU SW CTF!\n");
> > +   orderly_poweroff(true);
> > +}
> > +
> >   static int pp_sw_init(void *handle)
> >   {
> > struct amdgpu_device *adev = handle; @@ -101,6 +141,10 @@ static
> > int pp_sw_init(void *handle)
> >
> > pr_debug("powerplay sw init %s\n", ret ? "failed" :
> > "successfully");
> >
> > +   if (!ret)
> > +   INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
> > + pp_swctf_delayed_work_handler);
> > +
> > return ret;
> >   }
> >
>

[PATCH 1/1] drm/amdgpu: remove duplicated doorbell range init for sdma v4.4.2

2023-06-26 Thread Le Ma

The doorbell range init is already handled in an earlier phase.

Signed-off-by: Le Ma 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 6be19ffc502b..f413898dda37 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -902,11 +902,6 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device 
*adev,
WREG32_SDMA(i, regSDMA_CNTL, temp);
 
if (!amdgpu_sriov_vf(adev)) {
-   ring = &adev->sdma.instance[i].ring;
-   adev->nbio.funcs->sdma_doorbell_range(adev, i,
-   ring->use_doorbell, ring->doorbell_index,
-   adev->doorbell_index.sdma_doorbell_range);
-
if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
/* unhalt engine */
temp = RREG32_SDMA(i, regSDMA_F32_CNTL);
-- 
2.38.1

RE: [PATCH 1/1] drm/amdgpu: remove duplicated doorbell range init for sdma v4.4.2

2023-06-26 Thread Zhang, Hawking

[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Le Ma
Sent: Tuesday, June 27, 2023 11:56
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 1/1] drm/amdgpu: remove duplicated doorbell range init for sdma 
v4.4.2

Handled in earlier phase

Signed-off-by: Le Ma 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 6be19ffc502b..f413898dda37 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -902,11 +902,6 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device 
*adev,
WREG32_SDMA(i, regSDMA_CNTL, temp);

if (!amdgpu_sriov_vf(adev)) {
-   ring = &adev->sdma.instance[i].ring;
-   adev->nbio.funcs->sdma_doorbell_range(adev, i,
-   ring->use_doorbell, ring->doorbell_index,
-   adev->doorbell_index.sdma_doorbell_range);
-
if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
/* unhalt engine */
temp = RREG32_SDMA(i, regSDMA_F32_CNTL);
--
2.38.1

RE: [PATCH] drm/amd/pm: Add GFX v9.4.3 unique id to sysfs

2023-06-26 Thread Lazar, Lijo

[AMD Official Use Only - General]



Thanks,
Lijo

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Wednesday, June 21, 2023 6:49 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Ma, Le ; 
Kasiviswanathan, Harish ; Kamal, Asad 
; Zhang, Hawking 
Subject: [PATCH] drm/amd/pm: Add GFX v9.4.3 unique id to sysfs

Expose unique id of GFX v9.4.3 ASICs as device attribute.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 386ccf11e657..9ec51f50fc52 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2072,6 +2072,7 @@ static int default_attr_update(struct amdgpu_device 
*adev, struct amdgpu_device_
case IP_VERSION(9, 4, 0):
case IP_VERSION(9, 4, 1):
case IP_VERSION(9, 4, 2):
+   case IP_VERSION(9, 4, 3):
case IP_VERSION(10, 3, 0):
case IP_VERSION(11, 0, 0):
case IP_VERSION(11, 0, 1):
--
2.25.1

[PATCH 2/2] drm/amdgpu: use psp_execute_load_ip_fw_cmd_buf instead

2023-06-26 Thread Lang Yu

Replace the old ones with psp_execute_load_ip_fw_cmd_buf.

Signed-off-by: Lang Yu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 31 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c |  9 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c   |  4 +---
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   |  4 +---
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c   |  4 +---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   |  4 +---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  4 +---
 9 files changed, 20 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a1cb541f315f..b61963112118 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2474,21 +2474,11 @@ int psp_execute_load_ip_fw_cmd_buf(struct amdgpu_device 
*adev,
return ret;
 }
 
-static int psp_execute_non_psp_fw_load(struct psp_context *psp,
- struct amdgpu_firmware_info *ucode)
+static inline
+int psp_execute_non_psp_fw_load(struct psp_context *psp,
+   struct amdgpu_firmware_info *ucode)
 {
-   int ret = 0;
-   struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
-
-   ret = psp_prep_load_ip_fw_cmd_buf(ucode, cmd);
-   if (!ret) {
-   ret = psp_cmd_submit_buf(psp, ucode, cmd,
-psp->fence_buf_mc_addr);
-   }
-
-   release_psp_cmd_buf(psp);
-
-   return ret;
+   return psp_execute_load_ip_fw_cmd_buf(psp->adev, ucode, 0, 0, 0);
 }
 
 static int psp_load_smu_fw(struct psp_context *psp)
@@ -2946,19 +2936,6 @@ int psp_rlc_autoload_start(struct psp_context *psp)
return ret;
 }
 
-int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx,
-   uint64_t cmd_gpu_addr, int cmd_size)
-{
-   struct amdgpu_firmware_info ucode = {0};
-
-   ucode.ucode_id = inst_idx ? AMDGPU_UCODE_ID_VCN1_RAM :
-   AMDGPU_UCODE_ID_VCN0_RAM;
-   ucode.mc_addr = cmd_gpu_addr;
-   ucode.ucode_size = cmd_size;
-
-   return psp_execute_non_psp_fw_load(&adev->psp, &ucode);
-}
-
 int psp_ring_cmd_submit(struct psp_context *psp,
uint64_t cmd_buf_mc_addr,
uint64_t fence_mc_addr,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index bd324fed6237..e49984a9d570 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -459,8 +459,6 @@ extern int psp_wait_for_spirom_update(struct psp_context 
*psp, uint32_t reg_inde
uint32_t field_val, uint32_t mask, uint32_t 
msec_timeout);
 
 int psp_gpu_reset(struct amdgpu_device *adev);
-int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx,
-   uint64_t cmd_gpu_addr, int cmd_size);
 
 int psp_execute_load_ip_fw_cmd_buf(struct amdgpu_device *adev,
   struct amdgpu_firmware_info *ucode,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index d37ebd4402ef..1805cd042d34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1257,3 +1257,12 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev)
 
return 0;
 }
+
+int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx)
+{
+   return psp_execute_load_ip_fw_cmd_buf(adev, NULL,
+   inst_idx ? AMDGPU_UCODE_ID_VCN1_RAM : AMDGPU_UCODE_ID_VCN0_RAM,
+   adev->vcn.inst[inst_idx].dpg_sram_gpu_addr,
+   
(uint32_t)((uintptr_t)adev->vcn.inst[inst_idx].dpg_sram_curr_addr -
+  
(uintptr_t)adev->vcn.inst[inst_idx].dpg_sram_cpu_addr));
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 92d5534df5f4..3ac5ad91ed08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -414,4 +414,6 @@ int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block);
 int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 
+int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index c975aed2f6c7..74cd1522067c 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -881,9 +881,7 @@ static int vcn_v2_0_start_dpg_mode(struct amdgpu_device 
*adev, bool indirect)
UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
 
if (indirect)
-   psp_update_vcn_sram(adev, 0, adev->vcn.inst->dpg_sram_gpu_addr,
-   
(uint32_t)((uintptr_t)adev->vcn.inst->dpg_sram_curr_addr

[PATCH 1/2] drm/amdgpu: extract a PSP function to execute IP FW loading commands

2023-06-26 Thread Lang Yu

This function is more general and easier for more clients to use.

Signed-off-by: Lang Yu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 29 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  6 +
 2 files changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a33c155dddcf..a1cb541f315f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2445,6 +2445,35 @@ static int psp_prep_load_ip_fw_cmd_buf(struct 
amdgpu_firmware_info *ucode,
return ret;
 }
 
+int psp_execute_load_ip_fw_cmd_buf(struct amdgpu_device *adev,
+  struct amdgpu_firmware_info *ucode,
+  uint32_t ucode_id,
+  uint64_t cmd_buf_gpu_addr,
+  int cmd_buf_size)
+{
+   struct amdgpu_firmware_info fw_info = {
+   .ucode_id = ucode_id,
+   .mc_addr = cmd_buf_gpu_addr,
+   .ucode_size = cmd_buf_size,
+   };
+   struct psp_context *psp = &adev->psp;
+   struct psp_gfx_cmd_resp *cmd =
+   acquire_psp_cmd_buf(psp);
+   int ret;
+
+   if (!ucode)
+   ucode = &fw_info;
+
+   ret = psp_prep_load_ip_fw_cmd_buf(ucode, cmd);
+   if (!ret)
+   ret = psp_cmd_submit_buf(psp, ucode, cmd,
+psp->fence_buf_mc_addr);
+
+   release_psp_cmd_buf(psp);
+
+   return ret;
+}
+
 static int psp_execute_non_psp_fw_load(struct psp_context *psp,
  struct amdgpu_firmware_info *ucode)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 4847aacdf9dc..bd324fed6237 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -462,6 +462,12 @@ int psp_gpu_reset(struct amdgpu_device *adev);
 int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx,
uint64_t cmd_gpu_addr, int cmd_size);
 
+int psp_execute_load_ip_fw_cmd_buf(struct amdgpu_device *adev,
+  struct amdgpu_firmware_info *ucode,
+  uint32_t ucode_id,
+  uint64_t cmd_buf_gpu_addr,
+  int cmd_buf_size);
+
 int psp_ta_init_shared_buf(struct psp_context *psp,
  struct ta_mem_context *mem_ctx);
 void psp_ta_free_shared_buf(struct ta_mem_context *mem_ctx);
-- 
2.25.1

Re: [PATCH 2/2] drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation

2023-06-26 Thread Lazar, Lijo





On 6/27/2023 9:02 AM, Quan, Evan wrote:

[AMD Official Use Only - General]


-Original Message-
From: Lazar, Lijo 
Sent: Monday, June 26, 2023 7:54 PM
To: Quan, Evan ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: Re: [PATCH 2/2] drm/amd/pm: avoid unintentional shutdown due to
temperature momentary fluctuation



On 6/26/2023 1:17 PM, Evan Quan wrote:

An intentional delay is added when soft CTF is triggered. The GPU
temperature is then checked again before any further action is
taken. This avoids an unintended shutdown caused by a momentary
temperature fluctuation.

Signed-off-by: Evan Quan 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  3 ++
   .../gpu/drm/amd/pm/powerplay/amd_powerplay.c  | 48

+++

   .../drm/amd/pm/powerplay/hwmgr/smu_helper.c   | 27 ---
   drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h  |  2 +
   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 34

+

   drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  2 +
   .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c|  9 +---
   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  9 +---
   8 files changed, 102 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e459381dc759..5ef1f31e703c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -287,6 +287,9 @@ extern int amdgpu_user_partt_mode;
   #define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
   #define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)

+/* Extra time delay(in ms) to eliminate the influence of temperature

momentary fluctuation */

+#define AMDGPU_SWCTF_EXTRA_DELAY   50


I think a delay of 10-15ms is good enough to filter out any spike.

50ms is required by our CE team to support the customer. It is also aligned
with the Windows side.
Since we cannot guarantee that 10-15ms is good enough, I think it's better to stick
to the recommended 50ms setting.
What do you think?



IMO, a temperature reading consistently remaining high for 10-15 ms
shouldn't be considered a spike, since the thermal controller (given its
clock) would have taken multiple readings by that time for the same sensor.


I'm fine if you want to align with the Windows side.

Thanks,
Lijo


Evan


With that change, the series is
   Reviewed-by: Lijo Lazar 

Thanks,
Lijo


+
   struct amdgpu_xcp_mgr;
   struct amdgpu_device;
   struct amdgpu_irq_src;
diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
index 11b7b4cffaae..ff360c699171 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
@@ -26,6 +26,7 @@
   #include 
   #include 
   #include 
+#include 
   #include "amd_shared.h"
   #include "amd_powerplay.h"
   #include "power_state.h"
@@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
 return 0;
   }

+static void pp_swctf_delayed_work_handler(struct work_struct *work) {
+   struct pp_hwmgr *hwmgr =
+   container_of(work, struct pp_hwmgr,

swctf_delayed_work.work);

+   struct amdgpu_device *adev = hwmgr->adev;
+   struct amdgpu_dpm_thermal *range =
+   &adev->pm.dpm.thermal;
+   uint32_t gpu_temperature, size;
+   int ret;
+
+   /*
+* If the hotspot/edge temperature is confirmed as below SW CTF

setting point

+* after the delay enforced, nothing will be done.
+* Otherwise, a graceful shutdown will be performed to prevent

further damage.

+*/
+   if (range->sw_ctf_threshold &&
+   hwmgr->hwmgr_func->read_sensor) {
+   ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+

AMDGPU_PP_SENSOR_HOTSPOT_TEMP,

+&gpu_temperature,
+&size);
+   /*
+* For some legacy ASICs, hotspot temperature retrieving

might be not

+* supported. Check the edge temperature instead then.
+*/
+   if (ret == -EOPNOTSUPP)
+   ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+

AMDGPU_PP_SENSOR_EDGE_TEMP,

+

&gpu_temperature,

+&size);
+   if (!ret && gpu_temperature / 1000 < range-
sw_ctf_threshold)
+   return;
+   }
+
+   dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW

CTF) detected!\n");

+   dev_emerg(adev->dev, "ERROR: System is going to shutdown due to

GPU SW CTF!\n");

+   orderly_poweroff(true);
+}
+
   static int pp_sw_init(void *handle)
   {
 struct amdgpu_device *adev = handle; @@ -101,6 +141,10 @@ static
int pp_sw_init(void *handle)

 pr_debug("powerplay sw init %s\n", ret ? "failed" :
"successfully");

+   if (!ret)
+   INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
+ pp_swctf_delayed_work_handler);
+
 return ret;
   }

@@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle)
 struct amdgpu_de

RE: [PATCH] drm/amd/pm: Add GFX v9.4.3 unique id to sysfs

2023-06-26 Thread Wang, Yang(Kevin)

[AMD Official Use Only - General]

Reviewed-by: Yang Wang 

Best Regards,
Kevin

-Original Message-
From: amd-gfx  On Behalf Of Lazar, Lijo
Sent: Tuesday, June 27, 2023 12:39 PM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Ma, Le ; 
Kasiviswanathan, Harish ; Kamal, Asad 
; Zhang, Hawking 
Subject: RE: [PATCH] drm/amd/pm: Add GFX v9.4.3 unique id to sysfs

[AMD Official Use Only - General]

[AMD Official Use Only - General]



Thanks,
Lijo

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Wednesday, June 21, 2023 6:49 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Ma, Le ; 
Kasiviswanathan, Harish ; Kamal, Asad 
; Zhang, Hawking 
Subject: [PATCH] drm/amd/pm: Add GFX v9.4.3 unique id to sysfs

Expose unique id of GFX v9.4.3 ASICs as device attribute.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 386ccf11e657..9ec51f50fc52 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2072,6 +2072,7 @@ static int default_attr_update(struct amdgpu_device 
*adev, struct amdgpu_device_
case IP_VERSION(9, 4, 0):
case IP_VERSION(9, 4, 1):
case IP_VERSION(9, 4, 2):
+   case IP_VERSION(9, 4, 3):
case IP_VERSION(10, 3, 0):
case IP_VERSION(11, 0, 0):
case IP_VERSION(11, 0, 1):
--
2.25.1

<>

[PATCH] drm/amdgpu: Keep non-psp path for partition switch

2023-06-26 Thread Lijo Lazar

When the PSP block is not present, use direct programming.

Signed-off-by: Lijo Lazar 
Acked-by: Mangesh Gadre 
Tested-by: Mangesh Gadre 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 28 +++--
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 76b189bd244a..9e3b835bdbb2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -623,12 +623,28 @@ static void gfx_v9_4_3_select_me_pipe_q(struct 
amdgpu_device *adev,
 static int gfx_v9_4_3_switch_compute_partition(struct amdgpu_device *adev,
int num_xccs_per_xcp)
 {
-   int ret;
-
-   ret = psp_spatial_partition(&adev->psp, NUM_XCC(adev->gfx.xcc_mask) /
-   num_xccs_per_xcp);
-   if (ret)
-   return ret;
+   int ret, i, num_xcc;
+   u32 tmp = 0;
+
+   if (adev->psp.funcs) {
+   ret = psp_spatial_partition(&adev->psp,
+   NUM_XCC(adev->gfx.xcc_mask) /
+   num_xccs_per_xcp);
+   if (ret)
+   return ret;
+   } else {
+   num_xcc = NUM_XCC(adev->gfx.xcc_mask);
+
+   for (i = 0; i < num_xcc; i++) {
+   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP,
+   num_xccs_per_xcp);
+   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID,
+   i % num_xccs_per_xcp);
+   WREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL,
+tmp);
+   }
+   ret = 0;
+   }
 
adev->gfx.num_xcc_per_xcp = num_xccs_per_xcp;
 
-- 
2.25.1

RE: [PATCH] drm/amdgpu: Keep non-psp path for partition switch

2023-06-26 Thread Zhang, Hawking

[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 27, 2023 13:19
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Gadre, Mangesh 
Subject: [PATCH] drm/amdgpu: Keep non-psp path for partition switch

When the PSP block is not present, use direct programming.

Signed-off-by: Lijo Lazar 
Acked-by: Mangesh Gadre 
Tested-by: Mangesh Gadre 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 28 +++--
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 76b189bd244a..9e3b835bdbb2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -623,12 +623,28 @@ static void gfx_v9_4_3_select_me_pipe_q(struct 
amdgpu_device *adev,  static int gfx_v9_4_3_switch_compute_partition(struct 
amdgpu_device *adev,
int num_xccs_per_xcp)
 {
-   int ret;
-
-   ret = psp_spatial_partition(&adev->psp, NUM_XCC(adev->gfx.xcc_mask) /
-   num_xccs_per_xcp);
-   if (ret)
-   return ret;
+   int ret, i, num_xcc;
+   u32 tmp = 0;
+
+   if (adev->psp.funcs) {
+   ret = psp_spatial_partition(&adev->psp,
+   NUM_XCC(adev->gfx.xcc_mask) /
+   num_xccs_per_xcp);
+   if (ret)
+   return ret;
+   } else {
+   num_xcc = NUM_XCC(adev->gfx.xcc_mask);
+
+   for (i = 0; i < num_xcc; i++) {
+   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP,
+   num_xccs_per_xcp);
+   tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID,
+   i % num_xccs_per_xcp);
+   WREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL,
+tmp);
+   }
+   ret = 0;
+   }

adev->gfx.num_xcc_per_xcp = num_xccs_per_xcp;

--
2.25.1

RE: [PATCH] drm/amd/pm: Add GFX v9.4.3 unique id to sysfs

2023-06-26 Thread Zhang, Hawking

[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 27, 2023 12:39
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Ma, Le ; 
Kasiviswanathan, Harish ; Kamal, Asad 
; Zhang, Hawking 
Subject: RE: [PATCH] drm/amd/pm: Add GFX v9.4.3 unique id to sysfs

[AMD Official Use Only - General]



Thanks,
Lijo

-Original Message-
From: amd-gfx  On Behalf Of Lijo Lazar
Sent: Wednesday, June 21, 2023 6:49 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Ma, Le ; 
Kasiviswanathan, Harish ; Kamal, Asad 
; Zhang, Hawking 
Subject: [PATCH] drm/amd/pm: Add GFX v9.4.3 unique id to sysfs

Expose unique id of GFX v9.4.3 ASICs as device attribute.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 386ccf11e657..9ec51f50fc52 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2072,6 +2072,7 @@ static int default_attr_update(struct amdgpu_device 
*adev, struct amdgpu_device_
case IP_VERSION(9, 4, 0):
case IP_VERSION(9, 4, 1):
case IP_VERSION(9, 4, 2):
+   case IP_VERSION(9, 4, 3):
case IP_VERSION(10, 3, 0):
case IP_VERSION(11, 0, 0):
case IP_VERSION(11, 0, 1):
--
2.25.1

50 matches

Mail list logo