RE: [PATCH 2/2] drm/amdgpu: fix documentation errors in gmc v12.0

2024-05-14 Thread Zhang, Hawking
[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Alex Deucher
Sent: Wednesday, May 15, 2024 00:44
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: [PATCH 2/2] drm/amdgpu: fix documentation errors in gmc v12.0

Fix up parameter descriptions.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index 34e751b9b7003..c12c96f5bbaae 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -282,6 +282,8 @@ static void gmc_v12_0_flush_vm_hub(struct amdgpu_device 
*adev, uint32_t vmid,
  *
  * @adev: amdgpu_device pointer
  * @vmid: vm instance to flush
+ * @vmhub: which hub to flush
+ * @flush_type: the flush type
  *
  * Flush the TLB for the requested page table.
  */
@@ -321,6 +323,9 @@ static void gmc_v12_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
  *
  * @adev: amdgpu_device pointer
  * @pasid: pasid to be flush
+ * @flush_type: the flush type
+ * @all_hub: flush all hubs
+ * @inst: is used to select which instance of KIQ to use for the invalidation
  *
  * Flush the TLB for the requested pasid.
  */
--
2.45.0



RE: [PATCH] drm/amdgpu: fix compiler 'side-effect' check issue for RAS_EVENT_LOG()

2024-05-14 Thread Zhang, Hawking
[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Wang, Yang(Kevin) 
Sent: Tuesday, May 14, 2024 08:03
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: fix compiler 'side-effect' check issue for 
RAS_EVENT_LOG()

create a new helper function to avoid compiler 'side-effect'
check about RAS_EVENT_LOG() macro.

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 18 ++  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 13 ++---
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1dd13ed3b7b5..c04e6ced1af3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4504,3 +4504,21 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, 
uint64_t pfn)

return ret;
 }
+
+void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
+   const char *fmt, ...)
+{
+   struct va_format vaf;
+   va_list args;
+
+   va_start(args, fmt);
+   vaf.fmt = fmt;
+   vaf.va = 
+
+   if (amdgpu_ras_event_id_is_valid(adev, event_id))
+   dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, );
+   else
+   dev_printk(KERN_INFO, adev->dev, "%pV", );
+
+   va_end(args);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index c8980d5f6540..6a8c7b1609df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -67,13 +67,8 @@ struct amdgpu_iv_entry;
 /* The high three bits indicates socketid */  #define 
AMDGPU_RAS_GET_FEATURES(val)  ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)

-#define RAS_EVENT_LOG(_adev, _id, _fmt, ...)   \
-do {   \
-   if (amdgpu_ras_event_id_is_valid((_adev), (_id)))   
\
-   dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__);
\
-   else\
-   dev_info((_adev)->dev, _fmt, ##__VA_ARGS__);
\
-} while (0)
+#define RAS_EVENT_LOG(adev, id, fmt, ...)  \
+   amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__);

 enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
@@ -956,4 +951,8 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset);

+__printf(3, 4)
+void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
+   const char *fmt, ...);
+
 #endif
--
2.34.1



[PATCH v3] drm/amdkfd: Remove bo NULL check in gmc_v12_0_get_vm_pte() function

2024-05-14 Thread Sreekant Somasekharan
Remove bo NULL check in amdgpu/gmc_v12_0.c:gmc_v12_0_get_vm_pte() function
to fix smatch warning:

'drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c:518 gmc_v12_0_get_vm_pte()
warn: variable dereferenced before check 'bo' (see line 500)'

Signed-off-by: Sreekant Somasekharan 
Suggested-by: Dan Carpenter 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index 2b7b67916c1d..0fadebec9019 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -515,13 +515,13 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device 
*adev,
*flags &= ~AMDGPU_PTE_VALID;
}
 
-   if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
+   if (bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
   AMDGPU_GEM_CREATE_UNCACHED))
*flags = (*flags & ~AMDGPU_PTE_MTYPE_GFX12_MASK) |
 AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC);
 
/* WA for HW bug */
-   if ((bo && is_system) || ((bo_adev != adev) && coherent))
+   if (is_system || ((bo_adev != adev) && coherent))
*flags |= AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC);
 
 }
-- 
2.34.1



RE: [PATCH] drm/amdgpu: fix compiler 'side-effect' check issue for RAS_EVENT_LOG()

2024-05-14 Thread Wang, Yang(Kevin)
[AMD Official Use Only - AMD Internal Distribution Only]

Ping...

Best Regards,
Kevin

-Original Message-
From: amd-gfx  On Behalf Of Yang Wang
Sent: Tuesday, May 14, 2024 8:03 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Kamal, Asad 
Subject: [PATCH] drm/amdgpu: fix compiler 'side-effect' check issue for 
RAS_EVENT_LOG()

create a new helper function to avoid compiler 'side-effect'
check about RAS_EVENT_LOG() macro.

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 18 ++  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 13 ++---
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1dd13ed3b7b5..c04e6ced1af3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4504,3 +4504,21 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, 
uint64_t pfn)

return ret;
 }
+
+void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
+   const char *fmt, ...)
+{
+   struct va_format vaf;
+   va_list args;
+
+   va_start(args, fmt);
+   vaf.fmt = fmt;
+   vaf.va = 
+
+   if (amdgpu_ras_event_id_is_valid(adev, event_id))
+   dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, );
+   else
+   dev_printk(KERN_INFO, adev->dev, "%pV", );
+
+   va_end(args);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index c8980d5f6540..6a8c7b1609df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -67,13 +67,8 @@ struct amdgpu_iv_entry;
 /* The high three bits indicates socketid */  #define 
AMDGPU_RAS_GET_FEATURES(val)  ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)

-#define RAS_EVENT_LOG(_adev, _id, _fmt, ...)   \
-do {   \
-   if (amdgpu_ras_event_id_is_valid((_adev), (_id)))   
\
-   dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__);
\
-   else\
-   dev_info((_adev)->dev, _fmt, ##__VA_ARGS__);
\
-} while (0)
+#define RAS_EVENT_LOG(adev, id, fmt, ...)  \
+   amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__);

 enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
@@ -956,4 +951,8 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset);

+__printf(3, 4)
+void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
+   const char *fmt, ...);
+
 #endif
--
2.34.1



[PATCH v2] drm/amdkfd: Remove bo NULL check in gmc_v12_0_get_vm_pte() function

2024-05-14 Thread Sreekant Somasekharan
Remove bo NULL check in amdgpu/gmc_v12_0.c:gmc_v12_0_get_vm_pte() function
to fix smatch warning:

'drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c:518 gmc_v12_0_get_vm_pte()
warn: variable dereferenced before check 'bo' (see line 500)'

Signed-off-by: Sreekant Somasekharan 
Suggested-by: Dan Carpenter 
Reviewed-by: Kent Russell 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index 2b7b67916c1d..0fadebec9019 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -515,13 +515,13 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device 
*adev,
*flags &= ~AMDGPU_PTE_VALID;
}
 
-   if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
+   if (bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
   AMDGPU_GEM_CREATE_UNCACHED))
*flags = (*flags & ~AMDGPU_PTE_MTYPE_GFX12_MASK) |
 AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC);
 
/* WA for HW bug */
-   if ((bo && is_system) || ((bo_adev != adev) && coherent))
+   if (is_system || ((bo_adev != adev) && coherent))
*flags |= AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC);
 
 }
-- 
2.34.1



RE: [PATCH] drm/amdgpu: fix Kconfig for ISP v2

2024-05-14 Thread Nirujogi, Pratap
[AMD Official Use Only - AMD Internal Distribution Only]

Thanks Alex!

-Original Message-
From: Deucher, Alexander 
Sent: Tuesday, May 14, 2024 5:28 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Nirujogi, Pratap 

Subject: [PATCH] drm/amdgpu: fix Kconfig for ISP v2

Add new config option and set proper dependencies for ISP.

v2: add missed guards, drop separate Kconfig

Signed-off-by: Alex Deucher 
Reviewed-by: Pratap Nirujogi 

Cc: Pratap Nirujogi 
---
 drivers/gpu/drm/amd/amdgpu/Kconfig| 11 +++
 drivers/gpu/drm/amd/amdgpu/Makefile   |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  4 
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c |  6 ++
 4 files changed, 23 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig 
b/drivers/gpu/drm/amd/amdgpu/Kconfig
index 22d88f8ef5279..0cd9d29394072 100644
--- a/drivers/gpu/drm/amd/amdgpu/Kconfig
+++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
@@ -70,6 +70,17 @@ config DRM_AMDGPU_USERPTR
  This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it
  isn't already selected to enabled full userptr support.

+config DRM_AMD_ISP
+   bool "Enable AMD Image Signal Processor IP support"
+   depends on DRM_AMDGPU
+   select MFD_CORE
+   select PM_GENERIC_DOMAINS if PM
+   help
+   Choose this option to enable ISP IP support for AMD SOCs.
+   This adds the ISP (Image Signal Processor) IP driver and wires
+   it up into the amdgpu driver.  It is required for cameras
+   on APUs which utilize mipi cameras.
+
 config DRM_AMDGPU_WERROR
bool "Force the compiler to throw an error instead of a warning when 
compiling"
depends on DRM_AMDGPU
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 12ba76025cb7c..c95ec19a38264 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -325,6 +325,8 @@ amdgpu-y += $(AMD_DISPLAY_FILES)  endif

 # add isp block
+ifneq ($(CONFIG_DRM_AMD_ISP),)
 amdgpu-y += amdgpu_isp.o
+endif

 obj-$(CONFIG_DRM_AMDGPU)+= amdgpu.o
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 846c3550fbda8..936ed3c10c884 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -113,7 +113,9 @@
 #include "amdgpu_seq64.h"
 #include "amdgpu_reg_state.h"
 #include "amdgpu_umsch_mm.h"
+#if defined(CONFIG_DRM_AMD_ISP)
 #include "amdgpu_isp.h"
+#endif

 #define MAX_GPU_INSTANCE   64

@@ -1049,8 +1051,10 @@ struct amdgpu_device {
/* display related functionality */
struct amdgpu_display_manager dm;

+#if defined(CONFIG_DRM_AMD_ISP)
/* isp */
struct amdgpu_isp   isp;
+#endif

/* mes */
boolenable_mes;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 378d5a5cda917..1bab8dd37d621 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -107,7 +107,9 @@
 #include "jpeg_v5_0_0.h"

 #include "amdgpu_vpe.h"
+#if defined(CONFIG_DRM_AMD_ISP)
 #include "amdgpu_isp.h"
+#endif

 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
@@ -712,10 +714,12 @@ static void 
amdgpu_discovery_read_from_harvest_table(struct amdgpu_device *adev,
adev->sdma.sdma_mask &=
~(1U << harvest_info->list[i].number_instance);
break;
+#if defined(CONFIG_DRM_AMD_ISP)
case ISP_HWID:
adev->isp.harvest_config |=
~(1U << harvest_info->list[i].number_instance);
break;
+#endif
default:
break;
}
@@ -2402,6 +2406,7 @@ static int amdgpu_discovery_set_umsch_mm_ip_blocks(struct 
amdgpu_device *adev)

 static int amdgpu_discovery_set_isp_ip_blocks(struct amdgpu_device *adev)  {
+#if defined(CONFIG_DRM_AMD_ISP)
switch (amdgpu_ip_version(adev, ISP_HWIP, 0)) {
case IP_VERSION(4, 1, 0):
case IP_VERSION(4, 1, 1):
@@ -2410,6 +2415,7 @@ static int amdgpu_discovery_set_isp_ip_blocks(struct 
amdgpu_device *adev)
default:
break;
}
+#endif

return 0;
 }
--
2.45.0



Re: [PATCH] drm/amdgpu: fix Kconfig for ISP

2024-05-14 Thread Alex Deucher
Ignore this.  Better, functional patch sent out.

Alex

On Tue, May 14, 2024 at 5:12 PM Alex Deucher  wrote:
>
> Add new config option and set proper dependencies for ISP.
>
> Signed-off-by: Alex Deucher 
> Cc: Pratap Nirujogi 
> ---
>  drivers/gpu/drm/amd/amdgpu/Kconfig  |  1 +
>  drivers/gpu/drm/amd/amdgpu/Makefile |  2 ++
>  drivers/gpu/drm/amd/isp/Kconfig | 17 +
>  3 files changed, 20 insertions(+)
>  create mode 100644 drivers/gpu/drm/amd/isp/Kconfig
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig 
> b/drivers/gpu/drm/amd/amdgpu/Kconfig
> index 22d88f8ef5279..aa037ac7ef24f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Kconfig
> +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
> @@ -83,3 +83,4 @@ config DRM_AMDGPU_WERROR
>  source "drivers/gpu/drm/amd/acp/Kconfig"
>  source "drivers/gpu/drm/amd/display/Kconfig"
>  source "drivers/gpu/drm/amd/amdkfd/Kconfig"
> +source "drivers/gpu/drm/amd/isp/Kconfig"
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 12ba76025cb7c..c95ec19a38264 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -325,6 +325,8 @@ amdgpu-y += $(AMD_DISPLAY_FILES)
>  endif
>
>  # add isp block
> +ifneq ($(CONFIG_DRM_AMD_ISP),)
>  amdgpu-y += amdgpu_isp.o
> +endif
>
>  obj-$(CONFIG_DRM_AMDGPU)+= amdgpu.o
> diff --git a/drivers/gpu/drm/amd/isp/Kconfig b/drivers/gpu/drm/amd/isp/Kconfig
> new file mode 100644
> index 0..afa3579901009
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/isp/Kconfig
> @@ -0,0 +1,17 @@
> +# SPDX-License-Identifier: MIT
> +menu "ISP (Image Signal Processor) Configuration"
> +   depends on DRM_AMDGPU
> +
> +config DRM_AMD_ISP
> +   bool "Enable AMD Image Signal Processor IP support"
> +   depends on DRM_AMDGPU
> +   select MFD_CORE
> +   select PM_GENERIC_DOMAINS if PM
> +   help
> +   Choose this option to enable ISP IP support for AMD SOCs.
> +   This adds the ISP (Image Signal Processor) IP driver and wires
> +   it up into the amdgpu driver.  The ACP block provides the DMA
> +   engine for the V4L mipi driver. It is required for camera
> +   on APUs which utilize a mipi camera.
> +
> +endmenu
> --
> 2.45.0
>


[PATCH] drm/amdgpu: fix Kconfig for ISP v2

2024-05-14 Thread Alex Deucher
Add new config option and set proper dependencies for ISP.

v2: add missed guards, drop separate Kconfig

Signed-off-by: Alex Deucher 
Cc: Pratap Nirujogi 
---
 drivers/gpu/drm/amd/amdgpu/Kconfig| 11 +++
 drivers/gpu/drm/amd/amdgpu/Makefile   |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  4 
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c |  6 ++
 4 files changed, 23 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig 
b/drivers/gpu/drm/amd/amdgpu/Kconfig
index 22d88f8ef5279..0cd9d29394072 100644
--- a/drivers/gpu/drm/amd/amdgpu/Kconfig
+++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
@@ -70,6 +70,17 @@ config DRM_AMDGPU_USERPTR
  This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it
  isn't already selected to enabled full userptr support.
 
+config DRM_AMD_ISP
+   bool "Enable AMD Image Signal Processor IP support"
+   depends on DRM_AMDGPU
+   select MFD_CORE
+   select PM_GENERIC_DOMAINS if PM
+   help
+   Choose this option to enable ISP IP support for AMD SOCs.
+   This adds the ISP (Image Signal Processor) IP driver and wires
+   it up into the amdgpu driver.  It is required for cameras
+   on APUs which utilize mipi cameras.
+
 config DRM_AMDGPU_WERROR
bool "Force the compiler to throw an error instead of a warning when 
compiling"
depends on DRM_AMDGPU
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 12ba76025cb7c..c95ec19a38264 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -325,6 +325,8 @@ amdgpu-y += $(AMD_DISPLAY_FILES)
 endif
 
 # add isp block
+ifneq ($(CONFIG_DRM_AMD_ISP),)
 amdgpu-y += amdgpu_isp.o
+endif
 
 obj-$(CONFIG_DRM_AMDGPU)+= amdgpu.o
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 846c3550fbda8..936ed3c10c884 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -113,7 +113,9 @@
 #include "amdgpu_seq64.h"
 #include "amdgpu_reg_state.h"
 #include "amdgpu_umsch_mm.h"
+#if defined(CONFIG_DRM_AMD_ISP)
 #include "amdgpu_isp.h"
+#endif
 
 #define MAX_GPU_INSTANCE   64
 
@@ -1049,8 +1051,10 @@ struct amdgpu_device {
/* display related functionality */
struct amdgpu_display_manager dm;
 
+#if defined(CONFIG_DRM_AMD_ISP)
/* isp */
struct amdgpu_isp   isp;
+#endif
 
/* mes */
boolenable_mes;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 378d5a5cda917..1bab8dd37d621 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -107,7 +107,9 @@
 #include "jpeg_v5_0_0.h"
 
 #include "amdgpu_vpe.h"
+#if defined(CONFIG_DRM_AMD_ISP)
 #include "amdgpu_isp.h"
+#endif
 
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
@@ -712,10 +714,12 @@ static void 
amdgpu_discovery_read_from_harvest_table(struct amdgpu_device *adev,
adev->sdma.sdma_mask &=
~(1U << harvest_info->list[i].number_instance);
break;
+#if defined(CONFIG_DRM_AMD_ISP)
case ISP_HWID:
adev->isp.harvest_config |=
~(1U << harvest_info->list[i].number_instance);
break;
+#endif
default:
break;
}
@@ -2402,6 +2406,7 @@ static int amdgpu_discovery_set_umsch_mm_ip_blocks(struct 
amdgpu_device *adev)
 
 static int amdgpu_discovery_set_isp_ip_blocks(struct amdgpu_device *adev)
 {
+#if defined(CONFIG_DRM_AMD_ISP)
switch (amdgpu_ip_version(adev, ISP_HWIP, 0)) {
case IP_VERSION(4, 1, 0):
case IP_VERSION(4, 1, 1):
@@ -2410,6 +2415,7 @@ static int amdgpu_discovery_set_isp_ip_blocks(struct 
amdgpu_device *adev)
default:
break;
}
+#endif
 
return 0;
 }
-- 
2.45.0



[PATCH] drm/amdgpu: fix Kconfig for ISP

2024-05-14 Thread Alex Deucher
Add new config option and set proper dependencies for ISP.

Signed-off-by: Alex Deucher 
Cc: Pratap Nirujogi 
---
 drivers/gpu/drm/amd/amdgpu/Kconfig  |  1 +
 drivers/gpu/drm/amd/amdgpu/Makefile |  2 ++
 drivers/gpu/drm/amd/isp/Kconfig | 17 +
 3 files changed, 20 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/isp/Kconfig

diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig 
b/drivers/gpu/drm/amd/amdgpu/Kconfig
index 22d88f8ef5279..aa037ac7ef24f 100644
--- a/drivers/gpu/drm/amd/amdgpu/Kconfig
+++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
@@ -83,3 +83,4 @@ config DRM_AMDGPU_WERROR
 source "drivers/gpu/drm/amd/acp/Kconfig"
 source "drivers/gpu/drm/amd/display/Kconfig"
 source "drivers/gpu/drm/amd/amdkfd/Kconfig"
+source "drivers/gpu/drm/amd/isp/Kconfig"
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 12ba76025cb7c..c95ec19a38264 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -325,6 +325,8 @@ amdgpu-y += $(AMD_DISPLAY_FILES)
 endif
 
 # add isp block
+ifneq ($(CONFIG_DRM_AMD_ISP),)
 amdgpu-y += amdgpu_isp.o
+endif
 
 obj-$(CONFIG_DRM_AMDGPU)+= amdgpu.o
diff --git a/drivers/gpu/drm/amd/isp/Kconfig b/drivers/gpu/drm/amd/isp/Kconfig
new file mode 100644
index 0..afa3579901009
--- /dev/null
+++ b/drivers/gpu/drm/amd/isp/Kconfig
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: MIT
+menu "ISP (Image Signal Processor) Configuration"
+   depends on DRM_AMDGPU
+
+config DRM_AMD_ISP
+   bool "Enable AMD Image Signal Processor IP support"
+   depends on DRM_AMDGPU
+   select MFD_CORE
+   select PM_GENERIC_DOMAINS if PM
+   help
+   Choose this option to enable ISP IP support for AMD SOCs.
+   This adds the ISP (Image Signal Processor) IP driver and wires
+   it up into the amdgpu driver.  The ACP block provides the DMA
+   engine for the V4L mipi driver. It is required for camera
+   on APUs which utilize a mipi camera.
+
+endmenu
-- 
2.45.0



RE: [PATCH] drm/amdgpu: Add documentation for AMD_IP_BLOCK_TYPE_ISP

2024-05-14 Thread Nirujogi, Pratap
[Public]

Acked, thanks Alex.

-Original Message-
From: Deucher, Alexander 
Sent: Tuesday, May 14, 2024 3:23 PM
To: amd-gfx@lists.freedesktop.org; Nirujogi, Pratap 
Cc: Stephen Rothwell 
Subject: RE: [PATCH] drm/amdgpu: Add documentation for AMD_IP_BLOCK_TYPE_ISP

[Public]

+ Pratap

> -Original Message-
> From: Deucher, Alexander 
> Sent: Tuesday, May 14, 2024 11:14 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Stephen Rothwell
> 
> Subject: [PATCH] drm/amdgpu: Add documentation for
> AMD_IP_BLOCK_TYPE_ISP
>
> Add missing documentation for the IP block.
>
> Fixes: a83048bfa402 ("drm/amd/amdgpu: Add ISP support to
> amdgpu_discovery")
> Reported-by: Stephen Rothwell 
> Signed-off-by: Alex Deucher 
Acked-by: Pratap Nirujogi 
> ---
>  drivers/gpu/drm/amd/include/amd_shared.h | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h
> b/drivers/gpu/drm/amd/include/amd_shared.h
> index 8bc2134cdd6b8..f5b725f10a7ce 100644
> --- a/drivers/gpu/drm/amd/include/amd_shared.h
> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> @@ -86,6 +86,7 @@ enum amd_apu_flags {
>  * @AMD_IP_BLOCK_TYPE_JPEG: JPEG Engine
>  * @AMD_IP_BLOCK_TYPE_VPE: Video Processing Engine
>  * @AMD_IP_BLOCK_TYPE_UMSCH_MM: User Mode Schduler for Multimedia
> +* @AMD_IP_BLOCK_TYPE_ISP: Image Signal Processor
>  * @AMD_IP_BLOCK_TYPE_NUM: Total number of IP block types  */  enum
> amd_ip_block_type {
> --
> 2.45.0




RE: [PATCH] drm/amdgpu: Add documentation for AMD_IP_BLOCK_TYPE_ISP

2024-05-14 Thread Deucher, Alexander
[Public]

+ Pratap

> -Original Message-
> From: Deucher, Alexander 
> Sent: Tuesday, May 14, 2024 11:14 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Stephen Rothwell
> 
> Subject: [PATCH] drm/amdgpu: Add documentation for
> AMD_IP_BLOCK_TYPE_ISP
>
> Add missing documentation for the IP block.
>
> Fixes: a83048bfa402 ("drm/amd/amdgpu: Add ISP support to
> amdgpu_discovery")
> Reported-by: Stephen Rothwell 
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/include/amd_shared.h | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h
> b/drivers/gpu/drm/amd/include/amd_shared.h
> index 8bc2134cdd6b8..f5b725f10a7ce 100644
> --- a/drivers/gpu/drm/amd/include/amd_shared.h
> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> @@ -86,6 +86,7 @@ enum amd_apu_flags {
>  * @AMD_IP_BLOCK_TYPE_JPEG: JPEG Engine
>  * @AMD_IP_BLOCK_TYPE_VPE: Video Processing Engine
>  * @AMD_IP_BLOCK_TYPE_UMSCH_MM: User Mode Schduler for Multimedia
> +* @AMD_IP_BLOCK_TYPE_ISP: Image Signal Processor
>  * @AMD_IP_BLOCK_TYPE_NUM: Total number of IP block types  */  enum
> amd_ip_block_type {
> --
> 2.45.0



[PATCH 2/2] drm/amdgpu: fix documentation errors in gmc v12.0

2024-05-14 Thread Alex Deucher
Fix up parameter descriptions.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index 34e751b9b7003..c12c96f5bbaae 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -282,6 +282,8 @@ static void gmc_v12_0_flush_vm_hub(struct amdgpu_device 
*adev, uint32_t vmid,
  *
  * @adev: amdgpu_device pointer
  * @vmid: vm instance to flush
+ * @vmhub: which hub to flush
+ * @flush_type: the flush type
  *
  * Flush the TLB for the requested page table.
  */
@@ -321,6 +323,9 @@ static void gmc_v12_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
  *
  * @adev: amdgpu_device pointer
  * @pasid: pasid to be flush
+ * @flush_type: the flush type
+ * @all_hub: flush all hubs
+ * @inst: is used to select which instance of KIQ to use for the invalidation
  *
  * Flush the TLB for the requested pasid.
  */
-- 
2.45.0



[PATCH 1/2] drm/amdgpu: fix documentation errors in sdma v7.0

2024-05-14 Thread Alex Deucher
Fix up parameter descriptions.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 7db53a96cff0a..4a5252e088838 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -224,7 +224,9 @@ static void sdma_v7_0_ring_insert_nop(struct amdgpu_ring 
*ring, uint32_t count)
  * sdma_v7_0_ring_emit_ib - Schedule an IB on the DMA engine
  *
  * @ring: amdgpu ring pointer
+ * @job: job to retrieve vmid from
  * @ib: IB object to schedule
+ * @flags: unused
  *
  * Schedule an IB in the DMA ring.
  */
@@ -260,8 +262,6 @@ static void sdma_v7_0_ring_emit_ib(struct amdgpu_ring *ring,
  * sdma_v7_0_ring_emit_mem_sync - flush the IB by graphics cache rinse
  *
  * @ring: amdgpu ring pointer
- * @job: job to retrieve vmid from
- * @ib: IB object to schedule
  *
  * flush the IB by graphics cache rinse.
  */
@@ -313,7 +313,9 @@ static void sdma_v7_0_ring_emit_hdp_flush(struct 
amdgpu_ring *ring)
  * sdma_v7_0_ring_emit_fence - emit a fence on the DMA ring
  *
  * @ring: amdgpu ring pointer
- * @fence: amdgpu fence object
+ * @addr: address
+ * @seq: fence seq number
+ * @flags: fence flags
  *
  * Add a DMA fence packet to the ring to write
  * the fence seq number and DMA trap packet to generate
@@ -915,6 +917,7 @@ static int sdma_v7_0_ring_test_ring(struct amdgpu_ring 
*ring)
  * sdma_v7_0_ring_test_ib - test an IB on the DMA engine
  *
  * @ring: amdgpu_ring structure holding ring information
+ * @timeout: timeout value in jiffies, or MAX_SCHEDULE_TIMEOUT
  *
  * Test a simple IB in the DMA ring.
  * Returns 0 on success, error on failure.
@@ -1038,10 +1041,9 @@ static void sdma_v7_0_vm_copy_pte(struct amdgpu_ib *ib,
  *
  * @ib: indirect buffer to fill with commands
  * @pe: addr of the page entry
- * @addr: dst addr to write into pe
+ * @value: dst addr to write into pe
  * @count: number of page entries to update
  * @incr: increase next addr by incr bytes
- * @flags: access flags
  *
  * Update PTEs by writing them manually using sDMA.
  */
@@ -1095,6 +1097,8 @@ static void sdma_v7_0_vm_set_pte_pde(struct amdgpu_ib *ib,
 
 /**
  * sdma_v7_0_ring_pad_ib - pad the IB
+ *
+ * @ring: amdgpu ring pointer
  * @ib: indirect buffer to fill with padding
  *
  * Pad the IB with NOPs to a boundary multiple of 8.
@@ -1145,7 +1149,8 @@ static void sdma_v7_0_ring_emit_pipeline_sync(struct 
amdgpu_ring *ring)
  * sdma_v7_0_ring_emit_vm_flush - vm flush using sDMA
  *
  * @ring: amdgpu_ring pointer
- * @vm: amdgpu_vm pointer
+ * @vmid: vmid number to use
+ * @pd_addr: address
  *
  * Update the page table base and flush the VM TLB
  * using sDMA.
@@ -1549,11 +1554,11 @@ static void sdma_v7_0_set_irq_funcs(struct 
amdgpu_device *adev)
 /**
  * sdma_v7_0_emit_copy_buffer - copy buffer using the sDMA engine
  *
- * @ring: amdgpu_ring structure holding ring information
+ * @ib: indirect buffer to fill with commands
  * @src_offset: src GPU address
  * @dst_offset: dst GPU address
  * @byte_count: number of bytes to xfer
- * @copy_flags: flags for the copy
+ * @copy_flags: copy flags for the buffers
  *
  * Copy GPU buffers using the DMA engine.
  * Used by the amdgpu ttm implementation to move pages if
@@ -1579,7 +1584,7 @@ static void sdma_v7_0_emit_copy_buffer(struct amdgpu_ib 
*ib,
 /**
  * sdma_v7_0_emit_fill_buffer - fill buffer using the sDMA engine
  *
- * @ring: amdgpu_ring structure holding ring information
+ * @ib: indirect buffer to fill
  * @src_data: value to write to buffer
  * @dst_offset: dst GPU address
  * @byte_count: number of bytes to xfer
-- 
2.45.0



Re: [RFC 0/5] Discussion around eviction improvements

2024-05-14 Thread Christian König

Am 14.05.24 um 17:14 schrieb Tvrtko Ursulin:


On 13/05/2024 14:49, Tvrtko Ursulin wrote:


On 09/05/2024 13:40, Tvrtko Ursulin wrote:


On 08/05/2024 19:09, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Last few days I was looking at the situation with VRAM over 
subscription, what
happens versus what perhaps should happen. Browsing through the 
driver and

running some simple experiments.

I ended up with this patch series which, as a disclaimer, may be 
completely
wrong but as I found some suspicious things, to me at least, I 
thought it was a

good point to stop and request some comments.

To perhaps summarise what are the main issues I think I found:

  * Migration rate limiting does not bother knowing if actual 
migration happened

    and so can over-account and unfairly penalise.

  * Migration rate limiting does not even work, at least not for 
the common case
    where userspace configures VRAM+GTT. It thinks it can stop 
migration attempts
    by playing with bo->allowed_domains vs bo->preferred domains 
but, both from
    the code, and from empirical experiments, I see that not 
working at all. Both

    masks are identical so fiddling with them achieves nothing.

  * Idea of the fallback placement only works when VRAM has free 
space. As soon
    as it does not, ttm_resource_compatible is happy to leave the 
buffers in the

    secondary placement forever.

  * Driver thinks it will be re-validating evicted buffers on the 
next submission
    but it does not for the very common case of VRAM+GTT because it 
only checks

    if current placement is *none* of the preferred placements.

All those problems are addressed in individual patches.

End result of this series appears to be driver which will try 
harder to move
buffers back into VRAM, but will be (more) correctly throttled in 
doing so by

the existing rate limiting logic.

I have run a quick benchmark of Cyberpunk 2077 and cannot say that 
I saw a
change but that could be a good thing too. At least I did not break 
anything,
perhaps.. On one occasion I did see the rate limiting logic get 
confused while
for a period of few minutes it went to a mode where it was 
constantly giving a
high migration budget. But that recovered itself when I switched 
clients and did
not come back so I don't know. If there is something wrong there I 
don't think

it would be caused by any patches in this series.


Since yesterday I also briefly tested with Far Cry New Dawn. One run 
each so possibly doesn't mean anything apart that there isn't a 
regression aka migration throttling is keeping things at bay even 
with increased requests to migrate things back to VRAM:


  before after
min/avg/max fps    36/44/54    37/45/55

Cyberpunk 2077 from yesterday was similarly close:

 26.96/29.59/30.40    29.70/30.00/30.32

I guess the real story is proper DGPU where misplaced buffers have a 
real cost.


I found one game which regresses spectacularly badly with this series 
- Assassin's Creed Valhalla. The built-in benchmark at least. The game 
appears to have a working set much larger than the other games I 
tested, around 5GiB total during the benchmark. And for some reason 
migration throttling totally fails to put it in check. I will be 
investigating this shortly.


I think that the conclusion is everything I attempted to add relating 
to TTM_PL_PREFERRED does not really work as I initially thought it 
did. Therefore please imagine this series as only containing patches 
1, 2 and 5.


Noted (and I had just started to wrap my head around that idea).



(And FWIW it was quite annoying to get to the bottom of since for some 
reason the system exhibits some sort of a latching behaviour, where on 
some boots and/or some minutes of runtime things were fine, and then 
it would latch onto a mode where the TTM_PL_PREFERRED induced breakage 
would show. And sometimes this breakage would appear straight away. Odd.)


Welcome to my world. You improve one use case and four other get a 
penalty. Even when you know the code and potential use cases inside out 
it's really hard to predict how some applications and the core memory 
management behave sometimes.




I still need to test though if the subset of patches manage to achieve 
some positive improvement on their own. It is possible, as patch 5 
marks more buffers for re-validation so once overcommit subsides they 
would get promoted to preferred placement straight away. And 1&2 are 
notionally fixes for migration throttling so at least in broad sense 
should be still valid as discussion points.


Yeah, especially 5 kind of makes sense but could potentially lead to 
higher overhead. Need to see how we can better handle that.


Regards,
Christian.



Regards,

Tvrtko

Series is probably rough but should be good enough for discussion. 
I am curious
to hear if I identified at least something correctly as a real 
problem.


It would also be good to hear what are the suggested games to check 

[PATCH] drm/amdgpu: Add documentation for AMD_IP_BLOCK_TYPE_ISP

2024-05-14 Thread Alex Deucher
Add missing documentation for the IP block.

Fixes: a83048bfa402 ("drm/amd/amdgpu: Add ISP support to amdgpu_discovery")
Reported-by: Stephen Rothwell 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/include/amd_shared.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/include/amd_shared.h 
b/drivers/gpu/drm/amd/include/amd_shared.h
index 8bc2134cdd6b8..f5b725f10a7ce 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -86,6 +86,7 @@ enum amd_apu_flags {
 * @AMD_IP_BLOCK_TYPE_JPEG: JPEG Engine
 * @AMD_IP_BLOCK_TYPE_VPE: Video Processing Engine
 * @AMD_IP_BLOCK_TYPE_UMSCH_MM: User Mode Schduler for Multimedia
+* @AMD_IP_BLOCK_TYPE_ISP: Image Signal Processor
 * @AMD_IP_BLOCK_TYPE_NUM: Total number of IP block types
 */
 enum amd_ip_block_type {
-- 
2.45.0



Re: [RFC 0/5] Discussion around eviction improvements

2024-05-14 Thread Tvrtko Ursulin



On 13/05/2024 14:49, Tvrtko Ursulin wrote:


On 09/05/2024 13:40, Tvrtko Ursulin wrote:


On 08/05/2024 19:09, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Last few days I was looking at the situation with VRAM over 
subscription, what
happens versus what perhaps should happen. Browsing through the 
driver and

running some simple experiments.

I ended up with this patch series which, as a disclaimer, may be 
completely
wrong but as I found some suspicious things, to me at least, I 
thought it was a

good point to stop and request some comments.

To perhaps summarise what are the main issues I think I found:

  * Migration rate limiting does not bother knowing if actual 
migration happened

    and so can over-account and unfairly penalise.

  * Migration rate limiting does not even work, at least not for the 
common case
    where userspace configures VRAM+GTT. It thinks it can stop 
migration attempts
    by playing with bo->allowed_domains vs bo->preferred domains but, 
both from
    the code, and from empirical experiments, I see that not working 
at all. Both

    masks are identical so fiddling with them achieves nothing.

  * Idea of the fallback placement only works when VRAM has free 
space. As soon
    as it does not, ttm_resource_compatible is happy to leave the 
buffers in the

    secondary placement forever.

  * Driver thinks it will be re-validating evicted buffers on the 
next submission
    but it does not for the very common case of VRAM+GTT because it 
only checks

    if current placement is *none* of the preferred placements.

All those problems are addressed in individual patches.

End result of this series appears to be driver which will try harder 
to move
buffers back into VRAM, but will be (more) correctly throttled in 
doing so by

the existing rate limiting logic.

I have run a quick benchmark of Cyberpunk 2077 and cannot say that I 
saw a
change but that could be a good thing too. At least I did not break 
anything,
perhaps. On one occasion I did see the rate limiting logic get 
confused while
for a period of few minutes it went to a mode where it was constantly 
giving a
high migration budget. But that recovered itself when I switched 
clients and did
not come back so I don't know. If there is something wrong there I 
don't think

it would be caused by any patches in this series.


Since yesterday I also briefly tested with Far Cry New Dawn. One run 
each so possibly doesn't mean anything apart that there isn't a 
regression aka migration throttling is keeping things at bay even with 
increased requests to migrate things back to VRAM:


  before after
min/avg/max fps    36/44/54    37/45/55

Cyberpunk 2077 from yesterday was similarly close:

 26.96/29.59/30.40    29.70/30.00/30.32

I guess the real story is proper DGPU where misplaced buffers have a 
real cost.


I found one game which regresses spectacularly badly with this series - 
Assassin's Creed Valhalla. The built-in benchmark at least. The game 
appears to have a working set much larger than the other games I tested, 
around 5GiB total during the benchmark. And for some reason migration 
throttling totally fails to put it in check. I will be investigating 
this shortly.


I think that the conclusion is everything I attempted to add relating to 
TTM_PL_PREFERRED does not really work as I initially thought it did. 
Therefore please imagine this series as only containing patches 1, 2 and 5.


(And FWIW it was quite annoying to get to the bottom of since for some 
reason the system exhibits some sort of a latching behaviour, where on 
some boots and/or some minutes of runtime things were fine, and then it 
would latch onto a mode where the TTM_PL_PREFERRED induced breakage 
would show. And sometimes this breakage would appear straight away. Odd.)


I still need to test though if the subset of patches manage to achieve 
some positive improvement on their own. It is possible, as patch 5 marks 
more buffers for re-validation so once overcommit subsides they would 
get promoted to preferred placement straight away. And 1&2 are 
notionally fixes for migration throttling so at least in broad sense 
should be still valid as discussion points.


Regards,

Tvrtko

Series is probably rough but should be good enough for discussion. I 
am curious

to hear if I identified at least something correctly as a real problem.

It would also be good to hear what are the suggested games to check 
and see

whether there is any improvement.

Cc: Christian König 
Cc: Friedrich Vock 

Tvrtko Ursulin (5):
   drm/amdgpu: Fix migration rate limiting accounting
   drm/amdgpu: Actually respect buffer migration budget
   drm/ttm: Add preferred placement flag
   drm/amdgpu: Use preferred placement for VRAM+GTT
   drm/amdgpu: Re-validate evicted buffers

  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 38 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |  8 +++--
  

[PATCH v3 1/2] drm/buddy: Fix the range bias clear memory allocation issue

2024-05-14 Thread Arunpravin Paneer Selvam
Problem statement: During system boot, when an application requests a
bulk volume of cleared range-bias memory while clear_avail is zero, we
don't fall back to the normal allocation method because of an
unnecessary clear_avail check. This blocks the fallback path, leading
to an fb allocation failure, after which the system becomes unresponsive.

Solution: Remove the unnecessary clear_avail check in the range bias
allocation function.

v2: add a kunit for this corner case (Daniel Vetter)

Signed-off-by: Arunpravin Paneer Selvam 
Fixes: 96950929eb23 ("drm/buddy: Implement tracking clear page feature")
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/drm_buddy.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 284ebae71cc4..1daf778cf6fa 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -249,6 +249,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 
chunk_size)
 
mm->size = size;
mm->avail = size;
+   mm->clear_avail = 0;
mm->chunk_size = chunk_size;
mm->max_order = ilog2(size) - ilog2(chunk_size);
 
@@ -574,7 +575,7 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm,
 
block = __alloc_range_bias(mm, start, end, order,
   flags, fallback);
-   if (IS_ERR(block) && mm->clear_avail)
+   if (IS_ERR(block))
return __alloc_range_bias(mm, start, end, order,
  flags, !fallback);
 
-- 
2.25.1



[PATCH v3 2/2] drm/tests: Add a unit test for range bias allocation

2024-05-14 Thread Arunpravin Paneer Selvam
Allocate cleared blocks in the bias range when the DRM
buddy's clear avail is zero. This will validate the bias
range allocation in scenarios like system boot when no
cleared blocks are available and exercise the fallback
path too. The resulting blocks should always be dirty.

v1:(Matthew)
  - move the size to the variable declaration section.
  - move the mm.clear_avail init to allocator init.

Signed-off-by: Arunpravin Paneer Selvam 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/tests/drm_buddy_test.c | 36 +-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c 
b/drivers/gpu/drm/tests/drm_buddy_test.c
index e3b50e240d36..b3be68b03610 100644
--- a/drivers/gpu/drm/tests/drm_buddy_test.c
+++ b/drivers/gpu/drm/tests/drm_buddy_test.c
@@ -23,9 +23,11 @@ static inline u64 get_size(int order, u64 chunk_size)
 
 static void drm_test_buddy_alloc_range_bias(struct kunit *test)
 {
-   u32 mm_size, ps, bias_size, bias_start, bias_end, bias_rem;
+   u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem;
DRM_RND_STATE(prng, random_seed);
unsigned int i, count, *order;
+   struct drm_buddy_block *block;
+   unsigned long flags;
struct drm_buddy mm;
LIST_HEAD(allocated);
 
@@ -222,6 +224,38 @@ static void drm_test_buddy_alloc_range_bias(struct kunit 
*test)
 
drm_buddy_free_list(, , 0);
drm_buddy_fini();
+
+   /*
+* Allocate cleared blocks in the bias range when the DRM buddy's clear 
avail is
+* zero. This will validate the bias range allocation in scenarios like 
system boot
+* when no cleared blocks are available and exercise the fallback path 
too. The resulting
+* blocks should always be dirty.
+*/
+
+   KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(, mm_size, ps),
+  "buddy_init failed\n");
+
+   bias_start = round_up(prandom_u32_state() % (mm_size - ps), ps);
+   bias_end = round_up(bias_start + prandom_u32_state() % (mm_size - 
bias_start), ps);
+   bias_end = max(bias_end, bias_start + ps);
+   bias_rem = bias_end - bias_start;
+
+   flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION;
+   size = max(round_up(prandom_u32_state() % bias_rem, ps), ps);
+
+   KUNIT_ASSERT_FALSE_MSG(test,
+  drm_buddy_alloc_blocks(, bias_start,
+ bias_end, size, ps,
+ ,
+ flags),
+  "buddy_alloc failed with bias(%x-%x), size=%u, 
ps=%u\n",
+  bias_start, bias_end, size, ps);
+
+   list_for_each_entry(block, , link)
+   KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false);
+
+   drm_buddy_free_list(, , 0);
+   drm_buddy_fini();
 }
 
 static void drm_test_buddy_alloc_clear(struct kunit *test)
-- 
2.25.1



Re: [PATCH] drm/amdgpu: Check if NBIO funcs are NULL in amdgpu_device_baco_exit

2024-05-14 Thread Alex Deucher
Applied.  Thanks!

On Tue, May 14, 2024 at 3:27 AM Christian König
 wrote:
>
> Am 14.05.24 um 09:06 schrieb Friedrich Vock:
> > The special case for VM passthrough doesn't check adev->nbio.funcs
> > before dereferencing it. If GPUs that don't have an NBIO block are
> > passed through, this leads to a NULL pointer dereference on startup.
> >
> > Signed-off-by: Friedrich Vock 
>
> Acked-by: Christian König 
>
> >
> > Fixes: 1bece222eab ("drm/amdgpu: Clear doorbell interrupt status for Sienna 
> > Cichlid")
> > Cc: Alex Deucher 
> > Cc: Christian König 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
> >   1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 861ccff78af95..83c4533ee75c8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -6165,7 +6165,7 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
> >   adev->nbio.funcs->enable_doorbell_interrupt)
> >   adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
> >
> > - if (amdgpu_passthrough(adev) &&
> > + if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
> >   adev->nbio.funcs->clear_doorbell_interrupt)
> >   adev->nbio.funcs->clear_doorbell_interrupt(adev);
> >
> > --
> > 2.45.0
> >
>


Re: [PATCH] drm/amdgpu/mes: use mc address for wptr in add queue packet

2024-05-14 Thread Alex Deucher
Acked-by: Alex Deucher 

On Tue, May 14, 2024 at 5:07 AM Min, Frank  wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> From: Frank Min 
>
> use mc address for wptr in add queue packet
>
> Signed-off-by: Frank Min 
> ---
>  drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 6 +-
>  1 file changed, 1 insertion(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 5519655fd70a..6256b21884ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -267,11 +267,7 @@ static int mes_v12_0_add_hw_queue(struct amdgpu_mes *mes,
> mes_add_queue_pkt.doorbell_offset = input->doorbell_offset;
> mes_add_queue_pkt.mqd_addr = input->mqd_addr;
>
> -   if (((adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) >>
> -   AMDGPU_MES_API_VERSION_SHIFT) >= 2)
> -   mes_add_queue_pkt.wptr_addr = input->wptr_mc_addr;
> -   else
> -   mes_add_queue_pkt.wptr_addr = input->wptr_addr;
> +   mes_add_queue_pkt.wptr_addr = input->wptr_mc_addr;
>
> mes_add_queue_pkt.queue_type =
> convert_to_mes_queue_type(input->queue_type);
> --
> 2.34.1
>


[PATCH v4 05/10] drm/amd/pm: Add xgmi plpd to SMU v13.0.6 pm_policy

2024-05-14 Thread Lijo Lazar
On SOCs with SMU v13.0.6, allow changing xgmi plpd policy through
pm_policy sysfs interface.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
v2, v3: No change
v4: Use macro for XGMI policy type name

 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 17 ++-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 51 +--
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c| 27 ++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h|  1 +
 4 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index df9ff377ebfd..2a8d853e6346 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -1197,6 +1197,9 @@ static void smu_swctf_delayed_work_handler(struct 
work_struct *work)
 
 static void smu_init_xgmi_plpd_mode(struct smu_context *smu)
 {
+   struct smu_dpm_context *dpm_ctxt = &(smu->smu_dpm);
+   struct smu_dpm_policy_ctxt *policy_ctxt;
+
if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(11, 0, 2)) {
smu->plpd_mode = XGMI_PLPD_DEFAULT;
return;
@@ -1204,10 +1207,20 @@ static void smu_init_xgmi_plpd_mode(struct smu_context 
*smu)
 
/* PMFW put PLPD into default policy after enabling the feature */
if (smu_feature_is_enabled(smu,
-  SMU_FEATURE_XGMI_PER_LINK_PWR_DWN_BIT))
+  SMU_FEATURE_XGMI_PER_LINK_PWR_DWN_BIT)) {
+   struct smu_dpm_policy *policy;
+
smu->plpd_mode = XGMI_PLPD_DEFAULT;
-   else
+   policy = smu_get_pm_policy(smu, PP_PM_POLICY_XGMI_PLPD);
+   if (policy)
+   policy->current_level = XGMI_PLPD_DEFAULT;
+   } else {
smu->plpd_mode = XGMI_PLPD_NONE;
+   policy_ctxt = dpm_ctxt->dpm_policies;
+   if (policy_ctxt)
+   policy_ctxt->policy_mask &=
+   ~BIT(PP_PM_POLICY_XGMI_PLPD);
+   }
 }
 
 static int smu_sw_init(void *handle)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 0ed0b5326d35..173c5599279b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -403,9 +403,45 @@ static int smu_v13_0_6_select_policy_soc_pstate(struct 
smu_context *smu,
return ret;
 }
 
+static int smu_v13_0_6_select_plpd_policy(struct smu_context *smu, int level)
+{
+   struct amdgpu_device *adev = smu->adev;
+   int ret, param;
+
+   switch (level) {
+   case XGMI_PLPD_DEFAULT:
+   param = PPSMC_PLPD_MODE_DEFAULT;
+   break;
+   case XGMI_PLPD_OPTIMIZED:
+   param = PPSMC_PLPD_MODE_OPTIMIZED;
+   break;
+   case XGMI_PLPD_DISALLOW:
+   param = 0;
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   if (level == XGMI_PLPD_DISALLOW)
+   ret = smu_cmn_send_smc_msg_with_param(
+   smu, SMU_MSG_GmiPwrDnControl, param, NULL);
+   else
+   /* change xgmi per-link power down policy */
+   ret = smu_cmn_send_smc_msg_with_param(
+   smu, SMU_MSG_SelectPLPDMode, param, NULL);
+
+   if (ret)
+   dev_err(adev->dev,
+   "select xgmi per-link power down policy %d failed\n",
+   level);
+
+   return ret;
+}
+
 static int smu_v13_0_6_allocate_dpm_context(struct smu_context *smu)
 {
struct smu_dpm_context *smu_dpm = >smu_dpm;
+   struct smu_dpm_policy *policy;
 
smu_dpm->dpm_context =
kzalloc(sizeof(struct smu_13_0_dpm_context), GFP_KERNEL);
@@ -413,11 +449,9 @@ static int smu_v13_0_6_allocate_dpm_context(struct 
smu_context *smu)
return -ENOMEM;
smu_dpm->dpm_context_size = sizeof(struct smu_13_0_dpm_context);
 
+   smu_dpm->dpm_policies =
+   kzalloc(sizeof(struct smu_dpm_policy_ctxt), GFP_KERNEL);
if (!(smu->adev->flags & AMD_IS_APU)) {
-   struct smu_dpm_policy *policy;
-
-   smu_dpm->dpm_policies =
-   kzalloc(sizeof(struct smu_dpm_policy_ctxt), GFP_KERNEL);
policy = &(smu_dpm->dpm_policies->policies[0]);
 
policy->policy_type = PP_PM_POLICY_SOC_PSTATE;
@@ -430,6 +464,15 @@ static int smu_v13_0_6_allocate_dpm_context(struct 
smu_context *smu)
smu_dpm->dpm_policies->policy_mask |=
BIT(PP_PM_POLICY_SOC_PSTATE);
}
+   policy = &(smu_dpm->dpm_policies->policies[1]);
+
+   policy->policy_type = PP_PM_POLICY_XGMI_PLPD;
+   policy->level_mask = BIT(XGMI_PLPD_DISALLOW) | BIT(XGMI_PLPD_DEFAULT) |
+BIT(XGMI_PLPD_OPTIMIZED);

[PATCH v4 09/10] drm/amd/pm: Remove unused interface to set plpd

2024-05-14 Thread Lijo Lazar
Remove unused callback to set PLPD policy and its implementation from
arcturus, aldebaran and SMUv13.0.6 SOCs.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  6 ---
 .../gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 22 ---
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 24 
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 39 ---
 4 files changed, 91 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index 356231fd976d..ba457d15ea14 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -876,12 +876,6 @@ struct pptable_funcs {
 */
int (*set_df_cstate)(struct smu_context *smu, enum pp_df_cstate state);
 
-   /**
-* @select_xgmi_plpd_policy: Select xgmi per-link power down policy.
-*/
-   int (*select_xgmi_plpd_policy)(struct smu_context *smu,
-  enum pp_xgmi_plpd_mode mode);
-
/**
 * @update_pcie_parameters: Update and upload the system's PCIe
 *  capabilites to the SMU.
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
index 84f7d4139bda..c0f6b59369b7 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
@@ -,27 +,6 @@ static int arcturus_set_df_cstate(struct smu_context 
*smu,
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_DFCstateControl, 
state, NULL);
 }
 
-static int arcturus_select_xgmi_plpd_policy(struct smu_context *smu,
-   enum pp_xgmi_plpd_mode mode)
-{
-   /* PPSMC_MSG_GmiPwrDnControl is supported by 54.23.0 and onwards */
-   if (smu->smc_fw_version < 0x00361700) {
-   dev_err(smu->adev->dev, "XGMI power down control is only 
supported by PMFW 54.23.0 and onwards\n");
-   return -EINVAL;
-   }
-
-   if (mode == XGMI_PLPD_DEFAULT)
-   return smu_cmn_send_smc_msg_with_param(smu,
-  SMU_MSG_GmiPwrDnControl,
-  1, NULL);
-   else if (mode == XGMI_PLPD_DISALLOW)
-   return smu_cmn_send_smc_msg_with_param(smu,
-  SMU_MSG_GmiPwrDnControl,
-  0, NULL);
-   else
-   return -EINVAL;
-}
-
 static const struct throttling_logging_label {
uint32_t feature_mask;
const char *label;
@@ -2440,7 +2419,6 @@ static const struct pptable_funcs arcturus_ppt_funcs = {
.get_dpm_ultimate_freq = smu_v11_0_get_dpm_ultimate_freq,
.set_soft_freq_limited_range = smu_v11_0_set_soft_freq_limited_range,
.set_df_cstate = arcturus_set_df_cstate,
-   .select_xgmi_plpd_policy = arcturus_select_xgmi_plpd_policy,
.log_thermal_throttling_event = arcturus_log_thermal_throttling_event,
.get_pp_feature_mask = smu_cmn_get_pp_feature_mask,
.set_pp_feature_mask = smu_cmn_set_pp_feature_mask,
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 66d386ef1da9..e584e53e3760 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1642,29 +1642,6 @@ static int aldebaran_set_df_cstate(struct smu_context 
*smu,
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_DFCstateControl, 
state, NULL);
 }
 
-static int aldebaran_select_xgmi_plpd_policy(struct smu_context *smu,
-enum pp_xgmi_plpd_mode mode)
-{
-   struct amdgpu_device *adev = smu->adev;
-
-   /* The message only works on master die and NACK will be sent
-  back for other dies, only send it on master die */
-   if (adev->smuio.funcs->get_socket_id(adev) ||
-   adev->smuio.funcs->get_die_id(adev))
-   return 0;
-
-   if (mode == XGMI_PLPD_DEFAULT)
-   return smu_cmn_send_smc_msg_with_param(smu,
-  SMU_MSG_GmiPwrDnControl,
-  0, NULL);
-   else if (mode == XGMI_PLPD_DISALLOW)
-   return smu_cmn_send_smc_msg_with_param(smu,
-  SMU_MSG_GmiPwrDnControl,
-  1, NULL);
-   else
-   return -EINVAL;
-}
-
 static const struct throttling_logging_label {
uint32_t feature_mask;
const char *label;
@@ -2104,7 +2081,6 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
.set_soft_freq_limited_range = 

[PATCH v4 08/10] drm/amd/pm: Remove legacy interface for xgmi plpd

2024-05-14 Thread Lijo Lazar
Replace the legacy interface with amdgpu_dpm_set_pm_policy to set XGMI
PLPD mode. Also, xgmi_plpd sysfs node is not used by any client. Remove
that as well.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
v2: No change
v3: Rebase to remove device_attr_id__xgmi_plpd_policy

 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  |  4 +-
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c   | 43 
 drivers/gpu/drm/amd/pm/amdgpu_pm.c| 68 ---
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   |  5 --
 drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h|  1 -
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 27 
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  2 -
 7 files changed, 2 insertions(+), 148 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 4a14f9c1bfe8..821ba2309dec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1446,7 +1446,7 @@ static int amdgpu_ras_error_inject_xgmi(struct 
amdgpu_device *adev,
if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
dev_warn(adev->dev, "Failed to disallow df cstate");
 
-   ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW);
+   ret1 = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, 
XGMI_PLPD_DISALLOW);
if (ret1 && ret1 != -EOPNOTSUPP)
dev_warn(adev->dev, "Failed to disallow XGMI power down");
 
@@ -1455,7 +1455,7 @@ static int amdgpu_ras_error_inject_xgmi(struct 
amdgpu_device *adev,
if (amdgpu_ras_intr_triggered())
return ret2;
 
-   ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT);
+   ret1 = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, 
XGMI_PLPD_DEFAULT);
if (ret1 && ret1 != -EOPNOTSUPP)
dev_warn(adev->dev, "Failed to allow XGMI power down");
 
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index b443906484e7..cd169af35399 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -368,49 +368,6 @@ int amdgpu_dpm_set_df_cstate(struct amdgpu_device *adev,
return ret;
 }
 
-int amdgpu_dpm_get_xgmi_plpd_mode(struct amdgpu_device *adev, char **mode_desc)
-{
-   struct smu_context *smu = adev->powerplay.pp_handle;
-   int mode = XGMI_PLPD_NONE;
-
-   if (is_support_sw_smu(adev)) {
-   mode = smu->plpd_mode;
-   if (mode_desc == NULL)
-   return mode;
-   switch (smu->plpd_mode) {
-   case XGMI_PLPD_DISALLOW:
-   *mode_desc = "disallow";
-   break;
-   case XGMI_PLPD_DEFAULT:
-   *mode_desc = "default";
-   break;
-   case XGMI_PLPD_OPTIMIZED:
-   *mode_desc = "optimized";
-   break;
-   case XGMI_PLPD_NONE:
-   default:
-   *mode_desc = "none";
-   break;
-   }
-   }
-
-   return mode;
-}
-
-int amdgpu_dpm_set_xgmi_plpd_mode(struct amdgpu_device *adev, int mode)
-{
-   struct smu_context *smu = adev->powerplay.pp_handle;
-   int ret = -EOPNOTSUPP;
-
-   if (is_support_sw_smu(adev)) {
-   mutex_lock(>pm.mutex);
-   ret = smu_set_xgmi_plpd_mode(smu, mode);
-   mutex_unlock(>pm.mutex);
-   }
-
-   return ret;
-}
-
 ssize_t amdgpu_dpm_get_pm_policy_info(struct amdgpu_device *adev, char *buf)
 {
struct smu_context *smu = adev->powerplay.pp_handle;
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index b03c38d198ea..5c92c041d0bc 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2214,70 +2214,6 @@ static int pp_dpm_clk_default_attr_update(struct 
amdgpu_device *adev, struct amd
return 0;
 }
 
-/* Following items will be read out to indicate current plpd policy:
- *  - -1: none
- *  - 0: disallow
- *  - 1: default
- *  - 2: optimized
- */
-static ssize_t amdgpu_get_xgmi_plpd_policy(struct device *dev,
-  struct device_attribute *attr,
-  char *buf)
-{
-   struct drm_device *ddev = dev_get_drvdata(dev);
-   struct amdgpu_device *adev = drm_to_adev(ddev);
-   char *mode_desc = "none";
-   int mode;
-
-   if (amdgpu_in_reset(adev))
-   return -EPERM;
-   if (adev->in_suspend && !adev->in_runpm)
-   return -EPERM;
-
-   mode = amdgpu_dpm_get_xgmi_plpd_mode(adev, _desc);
-
-   return sysfs_emit(buf, "%d: %s\n", mode, mode_desc);
-}
-
-/* Following argument value is expected from user to change plpd policy
- *  - arg 0: disallow plpd
- *  - arg 1: default policy
- *  - arg 2: optimized policy
- */

[PATCH v4 10/10] Documentation/amdgpu: Add PM policy documentation

2024-05-14 Thread Lijo Lazar
Add documentation about the newly added pm_policy node in sysfs.

Signed-off-by: Lijo Lazar 
---
 Documentation/gpu/amdgpu/thermal.rst |  6 
 drivers/gpu/drm/amd/pm/amdgpu_pm.c   | 48 
 2 files changed, 54 insertions(+)

diff --git a/Documentation/gpu/amdgpu/thermal.rst 
b/Documentation/gpu/amdgpu/thermal.rst
index 2f6166f81e6a..6d942b5c58f0 100644
--- a/Documentation/gpu/amdgpu/thermal.rst
+++ b/Documentation/gpu/amdgpu/thermal.rst
@@ -49,6 +49,12 @@ pp_power_profile_mode
 .. kernel-doc:: drivers/gpu/drm/amd/pm/amdgpu_pm.c
:doc: pp_power_profile_mode
 
+pm_policy
+-
+
+.. kernel-doc:: drivers/gpu/drm/amd/pm/amdgpu_pm.c
+   :doc: pm_policy
+
 \*_busy_percent
 ---
 
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 5c92c041d0bc..be39276181a1 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2214,6 +2214,54 @@ static int pp_dpm_clk_default_attr_update(struct 
amdgpu_device *adev, struct amd
return 0;
 }
 
+/**
+ * DOC: pm_policy
+ *
+ * Certain SOCs can support different power policies to optimize application
+ * performance. However, this policy is provided only at SOC level and not at a
+ * per-process level. This is useful especially when entire SOC is utilized for
+ * dedicated workload.
+ *
+ * The amdgpu driver provides a sysfs API for selecting the policy. Presently,
+ * only two types of policies are supported through this interface.
+ *
+ *  Pstate Policy Selection - This is to select different Pstate profiles which
+ *  decides clock/throttling preferences.
+ *
+ *  XGMI PLPD Policy Selection - When multiple devices are connected over XGMI,
+ *  this helps to select policy to be applied for per link power down.
+ *
+ * The list of available policies and policy levels vary between SOCs. They can
+ * be viewed by reading the file. The policy level which is applied presently 
is
+ * denoted by * (asterisk). E.g.,
+ *
+ * .. code-block:: console
+ *
+ * cat /sys/bus/pci/devices/.../pm_policy
+ * soc_pstate
+ * 0 : soc_pstate_default
+ * 1 : soc_pstate_0
+ * 2 : soc_pstate_1*
+ * 3 : soc_pstate_2
+ * xgmi_plpd
+ * 0 : plpd_disallow
+ * 1 : plpd_default
+ * 2 : plpd_optimized*
+ *
+ * To apply a specific policy
+ *
+ * "echo   > /sys/bus/pci/devices/.../pm_policy"
+ *
+ * For the levels listed in the example above, to select "plpd_optimized" for
+ * XGMI and "soc_pstate_2" for soc pstate policy -
+ *
+ * .. code-block:: console
+ *
+ * echo "xgmi_plpd 2" > /sys/bus/pci/devices/.../pm_policy
+ * echo "soc_pstate 3" > /sys/bus/pci/devices/.../pm_policy
+ *
+ */
+
 static ssize_t amdgpu_get_pm_policy(struct device *dev,
struct device_attribute *attr, char *buf)
 {
-- 
2.25.1



[PATCH v4 07/10] drm/amd/pm: Add xgmi plpd to arcturus pm_policy

2024-05-14 Thread Lijo Lazar
On arcturus, allow changing xgmi plpd policy through pm_policy sysfs
interface.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c |  7 ++--
 .../gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 42 +++
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 2a8d853e6346..48b867f4cf04 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -1199,19 +1199,20 @@ static void smu_init_xgmi_plpd_mode(struct smu_context 
*smu)
 {
struct smu_dpm_context *dpm_ctxt = &(smu->smu_dpm);
struct smu_dpm_policy_ctxt *policy_ctxt;
+   struct smu_dpm_policy *policy;
 
+   policy = smu_get_pm_policy(smu, PP_PM_POLICY_XGMI_PLPD);
if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(11, 0, 2)) {
smu->plpd_mode = XGMI_PLPD_DEFAULT;
+   if (policy)
+   policy->current_level = XGMI_PLPD_DEFAULT;
return;
}
 
/* PMFW put PLPD into default policy after enabling the feature */
if (smu_feature_is_enabled(smu,
   SMU_FEATURE_XGMI_PER_LINK_PWR_DWN_BIT)) {
-   struct smu_dpm_policy *policy;
-
smu->plpd_mode = XGMI_PLPD_DEFAULT;
-   policy = smu_get_pm_policy(smu, PP_PM_POLICY_XGMI_PLPD);
if (policy)
policy->current_level = XGMI_PLPD_DEFAULT;
} else {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
index 623f6052f97e..84f7d4139bda 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
@@ -283,9 +283,29 @@ static int arcturus_tables_init(struct smu_context *smu)
return 0;
 }
 
+static int arcturus_select_plpd_policy(struct smu_context *smu, int level)
+{
+   /* PPSMC_MSG_GmiPwrDnControl is supported by 54.23.0 and onwards */
+   if (smu->smc_fw_version < 0x00361700) {
+   dev_err(smu->adev->dev,
+   "XGMI power down control is only supported by PMFW 
54.23.0 and onwards\n");
+   return -EINVAL;
+   }
+
+   if (level == XGMI_PLPD_DEFAULT)
+   return smu_cmn_send_smc_msg_with_param(
+   smu, SMU_MSG_GmiPwrDnControl, 1, NULL);
+   else if (level == XGMI_PLPD_DISALLOW)
+   return smu_cmn_send_smc_msg_with_param(
+   smu, SMU_MSG_GmiPwrDnControl, 0, NULL);
+   else
+   return -EINVAL;
+}
+
 static int arcturus_allocate_dpm_context(struct smu_context *smu)
 {
struct smu_dpm_context *smu_dpm = >smu_dpm;
+   struct smu_dpm_policy *policy;
 
smu_dpm->dpm_context = kzalloc(sizeof(struct smu_11_0_dpm_context),
   GFP_KERNEL);
@@ -293,6 +313,20 @@ static int arcturus_allocate_dpm_context(struct 
smu_context *smu)
return -ENOMEM;
smu_dpm->dpm_context_size = sizeof(struct smu_11_0_dpm_context);
 
+   smu_dpm->dpm_policies =
+   kzalloc(sizeof(struct smu_dpm_policy_ctxt), GFP_KERNEL);
+
+   if (!smu_dpm->dpm_policies)
+   return -ENOMEM;
+
+   policy = &(smu_dpm->dpm_policies->policies[0]);
+   policy->policy_type = PP_PM_POLICY_XGMI_PLPD;
+   policy->level_mask = BIT(XGMI_PLPD_DISALLOW) | BIT(XGMI_PLPD_DEFAULT);
+   policy->current_level = XGMI_PLPD_DEFAULT;
+   policy->set_policy = arcturus_select_plpd_policy;
+   smu_cmn_generic_plpd_policy_desc(policy);
+   smu_dpm->dpm_policies->policy_mask |= BIT(PP_PM_POLICY_XGMI_PLPD);
+
return 0;
 }
 
@@ -403,6 +437,14 @@ static int arcturus_set_default_dpm_table(struct 
smu_context *smu)
dpm_table->max = dpm_table->dpm_levels[0].value;
}
 
+   /* XGMI PLPD is supported by 54.23.0 and onwards */
+   if (smu->smc_fw_version < 0x00361700) {
+   struct smu_dpm_context *smu_dpm = >smu_dpm;
+
+   smu_dpm->dpm_policies->policy_mask &=
+   ~BIT(PP_PM_POLICY_XGMI_PLPD);
+   }
+
return 0;
 }
 
-- 
2.25.1



[PATCH v4 06/10] drm/amd/pm: Add xgmi plpd to aldebaran pm_policy

2024-05-14 Thread Lijo Lazar
On aldebaran, allow changing xgmi plpd policy through pm_policy sysfs
interface.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 35 +++
 1 file changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index a22eb6bbb05e..66d386ef1da9 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -266,9 +266,30 @@ static int aldebaran_tables_init(struct smu_context *smu)
return 0;
 }
 
+static int aldebaran_select_plpd_policy(struct smu_context *smu, int level)
+{
+   struct amdgpu_device *adev = smu->adev;
+
+   /* The message only works on master die and NACK will be sent
+  back for other dies, only send it on master die */
+   if (adev->smuio.funcs->get_socket_id(adev) ||
+   adev->smuio.funcs->get_die_id(adev))
+   return 0;
+
+   if (level == XGMI_PLPD_DEFAULT)
+   return smu_cmn_send_smc_msg_with_param(
+   smu, SMU_MSG_GmiPwrDnControl, 0, NULL);
+   else if (level == XGMI_PLPD_DISALLOW)
+   return smu_cmn_send_smc_msg_with_param(
+   smu, SMU_MSG_GmiPwrDnControl, 1, NULL);
+   else
+   return -EINVAL;
+}
+
 static int aldebaran_allocate_dpm_context(struct smu_context *smu)
 {
struct smu_dpm_context *smu_dpm = >smu_dpm;
+   struct smu_dpm_policy *policy;
 
smu_dpm->dpm_context = kzalloc(sizeof(struct smu_13_0_dpm_context),
   GFP_KERNEL);
@@ -276,6 +297,20 @@ static int aldebaran_allocate_dpm_context(struct 
smu_context *smu)
return -ENOMEM;
smu_dpm->dpm_context_size = sizeof(struct smu_13_0_dpm_context);
 
+   smu_dpm->dpm_policies =
+   kzalloc(sizeof(struct smu_dpm_policy_ctxt), GFP_KERNEL);
+
+   if (!smu_dpm->dpm_policies)
+   return -ENOMEM;
+
+   policy = &(smu_dpm->dpm_policies->policies[0]);
+   policy->policy_type = PP_PM_POLICY_XGMI_PLPD;
+   policy->level_mask = BIT(XGMI_PLPD_DISALLOW) | BIT(XGMI_PLPD_DEFAULT);
+   policy->current_level = XGMI_PLPD_DEFAULT;
+   policy->set_policy = aldebaran_select_plpd_policy;
+   smu_cmn_generic_plpd_policy_desc(policy);
+   smu_dpm->dpm_policies->policy_mask |= BIT(PP_PM_POLICY_XGMI_PLPD);
+
return 0;
 }
 
-- 
2.25.1



[PATCH v4 03/10] drm/amd/pm: Add support to select pstate policy

2024-05-14 Thread Lijo Lazar
Add support to select pstate policy in SOCs with SMUv13.0.6

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
v2,v3: No change
v4: Use macro for policy type name

 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  2 +
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 71 +++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c| 30 
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h|  1 +
 4 files changed, 104 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 9c0445fa9f9b..3a50076e44f0 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -531,10 +531,12 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
smu_table->watermarks_table = NULL;
smu_table->metrics_time = 0;
 
+   kfree(smu_dpm->dpm_policies);
kfree(smu_dpm->dpm_context);
kfree(smu_dpm->golden_dpm_context);
kfree(smu_dpm->dpm_current_power_state);
kfree(smu_dpm->dpm_request_power_state);
+   smu_dpm->dpm_policies = NULL;
smu_dpm->dpm_context = NULL;
smu_dpm->golden_dpm_context = NULL;
smu_dpm->dpm_context_size = 0;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index ed9c4866b6e4..0ed0b5326d35 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -174,6 +174,7 @@ static const struct cmn2asic_msg_mapping 
smu_v13_0_6_message_map[SMU_MSG_MAX_COU
MSG_MAP(McaBankCeDumpDW, PPSMC_MSG_McaBankCeDumpDW, 
SMU_MSG_RAS_PRI),
MSG_MAP(SelectPLPDMode,  PPSMC_MSG_SelectPLPDMode,  
0),
MSG_MAP(RmaDueToBadPageThreshold,
PPSMC_MSG_RmaDueToBadPageThreshold,0),
+   MSG_MAP(SelectPstatePolicy,  
PPSMC_MSG_SelectPstatePolicy,  0),
 };
 
 // clang-format on
@@ -369,6 +370,39 @@ static int smu_v13_0_6_tables_init(struct smu_context *smu)
return 0;
 }
 
+static int smu_v13_0_6_select_policy_soc_pstate(struct smu_context *smu,
+   int policy)
+{
+   struct amdgpu_device *adev = smu->adev;
+   int ret, param;
+
+   switch (policy) {
+   case SOC_PSTATE_DEFAULT:
+   param = 0;
+   break;
+   case SOC_PSTATE_0:
+   param = 1;
+   break;
+   case SOC_PSTATE_1:
+   param = 2;
+   break;
+   case SOC_PSTATE_2:
+   param = 3;
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SelectPstatePolicy,
+ param, NULL);
+
+   if (ret)
+   dev_err(adev->dev, "select soc pstate policy %d failed",
+   policy);
+
+   return ret;
+}
+
 static int smu_v13_0_6_allocate_dpm_context(struct smu_context *smu)
 {
struct smu_dpm_context *smu_dpm = >smu_dpm;
@@ -379,6 +413,24 @@ static int smu_v13_0_6_allocate_dpm_context(struct 
smu_context *smu)
return -ENOMEM;
smu_dpm->dpm_context_size = sizeof(struct smu_13_0_dpm_context);
 
+   if (!(smu->adev->flags & AMD_IS_APU)) {
+   struct smu_dpm_policy *policy;
+
+   smu_dpm->dpm_policies =
+   kzalloc(sizeof(struct smu_dpm_policy_ctxt), GFP_KERNEL);
+   policy = &(smu_dpm->dpm_policies->policies[0]);
+
+   policy->policy_type = PP_PM_POLICY_SOC_PSTATE;
+   policy->level_mask = BIT(SOC_PSTATE_DEFAULT) |
+BIT(SOC_PSTATE_0) | BIT(SOC_PSTATE_1) |
+BIT(SOC_PSTATE_2);
+   policy->current_level = SOC_PSTATE_DEFAULT;
+   policy->set_policy = smu_v13_0_6_select_policy_soc_pstate;
+   smu_cmn_generic_soc_policy_desc(policy);
+   smu_dpm->dpm_policies->policy_mask |=
+   BIT(PP_PM_POLICY_SOC_PSTATE);
+   }
+
return 0;
 }
 
@@ -639,6 +691,15 @@ static int smu_v13_0_6_get_dpm_level_count(struct 
smu_context *smu,
return ret;
 }
 
+static void smu_v13_0_6_pm_policy_init(struct smu_context *smu)
+{
+   struct smu_dpm_policy *policy;
+
+   policy = smu_get_pm_policy(smu, PP_PM_POLICY_SOC_PSTATE);
+   if (policy)
+   policy->current_level = SOC_PSTATE_DEFAULT;
+}
+
 static int smu_v13_0_6_set_default_dpm_table(struct smu_context *smu)
 {
struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context;
@@ -668,6 +729,16 @@ static int smu_v13_0_6_set_default_dpm_table(struct 
smu_context *smu)
 
smu_v13_0_6_setup_driver_pptable(smu);
 
+   /* DPM policy not supported in older 

[PATCH v4 04/10] drm/amd/pm: Add xgmi plpd policy to pm_policy

2024-05-14 Thread Lijo Lazar
Add support to set XGMI PLPD policy levels through pm_policy sysfs node.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
v2, v3: No change
v4: Use a macro for XGMI PLPD policy type

 drivers/gpu/drm/amd/include/kgd_pp_interface.h | 1 +
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h  | 1 +
 3 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h 
b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
index 8ed9aa9a990d..4b20e2274313 100644
--- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
@@ -276,6 +276,7 @@ enum pp_xgmi_plpd_mode {
 enum pp_pm_policy {
PP_PM_POLICY_NONE = -1,
PP_PM_POLICY_SOC_PSTATE = 0,
+   PP_PM_POLICY_XGMI_PLPD,
PP_PM_POLICY_NUM,
 };
 
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 6dab0b085239..b03c38d198ea 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2293,6 +2293,7 @@ static ssize_t amdgpu_get_pm_policy(struct device *dev,
 }
 
 #define STR_SOC_PSTATE_POLICY "soc_pstate"
+#define STR_XGMI_PLPD_POLICY "xgmi_plpd"
 
 static ssize_t amdgpu_set_pm_policy(struct device *dev,
struct device_attribute *attr,
@@ -2320,6 +2321,9 @@ static ssize_t amdgpu_set_pm_policy(struct device *dev,
if (strncmp(tmp, STR_SOC_PSTATE_POLICY, strlen(STR_SOC_PSTATE_POLICY)) 
== 0) {
policy_type = PP_PM_POLICY_SOC_PSTATE;
tmp += strlen(STR_SOC_PSTATE_POLICY);
+   } else if (strncmp(tmp, STR_XGMI_PLPD_POLICY, 
strlen(STR_XGMI_PLPD_POLICY)) == 0) {
+   policy_type = PP_PM_POLICY_XGMI_PLPD;
+   tmp += strlen(STR_XGMI_PLPD_POLICY);
} else {
return -EINVAL;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index ee5b9701038c..f304071adee1 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -1574,6 +1574,7 @@ typedef struct {
 } WifiBandEntryTable_t;
 
 #define STR_SOC_PSTATE_POLICY "soc_pstate"
+#define STR_XGMI_PLPD_POLICY "xgmi_plpd"
 
 struct smu_dpm_policy *smu_get_pm_policy(struct smu_context *smu,
 enum pp_pm_policy p_type);
-- 
2.25.1



[PATCH v4 02/10] drm/amd/pm: Update PMFW messages for SMUv13.0.6

2024-05-14 Thread Lijo Lazar
Add PMF message to select a Pstate policy in SOCs with SMU v13.0.6.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h | 3 ++-
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
index 86758051cb93..41cb681927e2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h
@@ -92,7 +92,8 @@
 #define PPSMC_MSG_McaBankCeDumpDW   0x3B
 #define PPSMC_MSG_SelectPLPDMode0x40
 #define PPSMC_MSG_RmaDueToBadPageThreshold  0x43
-#define PPSMC_Message_Count 0x44
+#define PPSMC_MSG_SelectPstatePolicy0x44
+#define PPSMC_Message_Count 0x45
 
 //PPSMC Reset Types for driver msg argument
 #define PPSMC_RESET_TYPE_DRIVER_MODE_1_RESET0x1
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index c48214e3dc8e..dff36bd7a17c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -272,7 +272,8 @@
__SMU_DUMMY_MAP(SetSoftMinVpe), \
__SMU_DUMMY_MAP(GetMetricsVersion), \
__SMU_DUMMY_MAP(EnableUCLKShadow), \
-   __SMU_DUMMY_MAP(RmaDueToBadPageThreshold),
+   __SMU_DUMMY_MAP(RmaDueToBadPageThreshold),\
+   __SMU_DUMMY_MAP(SelectPstatePolicy),
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)  SMU_MSG_##type
-- 
2.25.1



[PATCH v4 01/10] drm/amd/pm: Add support for DPM policies

2024-05-14 Thread Lijo Lazar
Add support to set/get information about different DPM policies. The
support is only available on SOCs which use swsmu architecture.

A DPM policy type may be defined with different levels. For example, a
policy may be defined to select Pstate preference and then later a
pstate preference may be chosen.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
v2: Add NULL checks before accessing smu_dpm_policy_ctxt
v3: Rebase to add device_attr_id__pm_policy
v4: Use macro to define policy type for consistency.

 .../gpu/drm/amd/include/kgd_pp_interface.h| 16 +++
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c   | 29 ++
 drivers/gpu/drm/amd/pm/amdgpu_pm.c| 94 ++
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   |  4 +
 drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h|  1 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 98 +++
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 31 ++
 7 files changed, 273 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h 
b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
index 805c9d37a2b4..8ed9aa9a990d 100644
--- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
@@ -273,6 +273,22 @@ enum pp_xgmi_plpd_mode {
XGMI_PLPD_COUNT,
 };
 
+enum pp_pm_policy {
+   PP_PM_POLICY_NONE = -1,
+   PP_PM_POLICY_SOC_PSTATE = 0,
+   PP_PM_POLICY_NUM,
+};
+
+enum pp_policy_soc_pstate {
+   SOC_PSTATE_DEFAULT = 0,
+   SOC_PSTATE_0,
+   SOC_PSTATE_1,
+   SOC_PSTATE_2,
+   SOC_PSTAT_COUNT,
+};
+
+#define PP_POLICY_MAX_LEVELS 5
+
 #define PP_GROUP_MASK0xF000
 #define PP_GROUP_SHIFT   28
 
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index eee919577b44..b443906484e7 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -411,6 +411,35 @@ int amdgpu_dpm_set_xgmi_plpd_mode(struct amdgpu_device 
*adev, int mode)
return ret;
 }
 
+ssize_t amdgpu_dpm_get_pm_policy_info(struct amdgpu_device *adev, char *buf)
+{
+   struct smu_context *smu = adev->powerplay.pp_handle;
+   int ret = -EOPNOTSUPP;
+
+   if (is_support_sw_smu(adev)) {
+   mutex_lock(>pm.mutex);
+   ret = smu_get_pm_policy_info(smu, buf);
+   mutex_unlock(>pm.mutex);
+   }
+
+   return ret;
+}
+
+int amdgpu_dpm_set_pm_policy(struct amdgpu_device *adev, int policy_type,
+int policy_level)
+{
+   struct smu_context *smu = adev->powerplay.pp_handle;
+   int ret = -EOPNOTSUPP;
+
+   if (is_support_sw_smu(adev)) {
+   mutex_lock(>pm.mutex);
+   ret = smu_set_pm_policy(smu, policy_type, policy_level);
+   mutex_unlock(>pm.mutex);
+   }
+
+   return ret;
+}
+
 int amdgpu_dpm_enable_mgpu_fan_boost(struct amdgpu_device *adev)
 {
void *pp_handle = adev->powerplay.pp_handle;
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 110f2fc31754..6dab0b085239 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2278,6 +2278,98 @@ static ssize_t amdgpu_set_xgmi_plpd_policy(struct device 
*dev,
return count;
 }
 
+static ssize_t amdgpu_get_pm_policy(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+
+   if (amdgpu_in_reset(adev))
+   return -EPERM;
+   if (adev->in_suspend && !adev->in_runpm)
+   return -EPERM;
+
+   return amdgpu_dpm_get_pm_policy_info(adev, buf);
+}
+
+#define STR_SOC_PSTATE_POLICY "soc_pstate"
+
+static ssize_t amdgpu_set_pm_policy(struct device *dev,
+   struct device_attribute *attr,
+   const char *buf, size_t count)
+{
+   struct drm_device *ddev = dev_get_drvdata(dev);
+   struct amdgpu_device *adev = drm_to_adev(ddev);
+   int policy_type, ret, num_params = 0;
+   char delimiter[] = " \n\t";
+   char tmp_buf[128];
+   char *tmp, *param;
+   long val;
+
+   if (amdgpu_in_reset(adev))
+   return -EPERM;
+   if (adev->in_suspend && !adev->in_runpm)
+   return -EPERM;
+
+   count = min(count, sizeof(tmp_buf));
+   memcpy(tmp_buf, buf, count);
+   tmp_buf[count - 1] = '\0';
+   tmp = tmp_buf;
+
+   tmp = skip_spaces(tmp);
+   if (strncmp(tmp, STR_SOC_PSTATE_POLICY, strlen(STR_SOC_PSTATE_POLICY)) 
== 0) {
+   policy_type = PP_PM_POLICY_SOC_PSTATE;
+   tmp += strlen(STR_SOC_PSTATE_POLICY);
+   } else {
+   return -EINVAL;
+   }
+
+   tmp = skip_spaces(tmp);
+   while ((param = strsep(, delimiter))) {
+   if (!strlen(param)) {
+   

[PATCH v4 00/10] Add PM policy interfaces

2024-05-14 Thread Lijo Lazar
This series adds APIs to get the supported PM policies and also set them. A PM
policy type is a predefined policy type supported by an SOC and each policy may
define two or more levels to choose from. A user can select the appropriate
level through amdgpu_dpm_set_pm_policy() or through sysfs node pm_policy. Based
on the specific PM functional area, multiple PM policies may be defined for an
SOC. For ex: a policy may be defined to set the right setting for XGMI per link
power down feature and another may be defined to select the SOC Pstate
preferences.
 
Presently, XGMI PLPD and SOC Pstate policy types are supported. It also removes
the legacy sysfs interface to set XGMI PLPD as it is not used by any client like
SMI tool.

v2:
 Add NULL checks to avoid access on SOCs which don't support any policy.

v3:
 Rebase and add documentation patch

v4:
 Use consistent policy type naming for read/write (Alex Deucher)

Lijo Lazar (10):
  drm/amd/pm: Add support for DPM policies
  drm/amd/pm: Update PMFW messages for SMUv13.0.6
  drm/amd/pm: Add support to select pstate policy
  drm/amd/pm: Add xgmi plpd policy to pm_policy
  drm/amd/pm: Add xgmi plpd to SMU v13.0.6 pm_policy
  drm/amd/pm: Add xgmi plpd to aldebaran pm_policy
  drm/amd/pm: Add xgmi plpd to arcturus pm_policy
  drm/amd/pm: Remove legacy interface for xgmi plpd
  drm/amd/pm: Remove unused interface to set plpd
  Documentation/amdgpu: Add PM policy documentation

 Documentation/gpu/amdgpu/thermal.rst  |   6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  |   4 +-
 .../gpu/drm/amd/include/kgd_pp_interface.h|  17 ++
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c   |  32 ++--
 drivers/gpu/drm/amd/pm/amdgpu_pm.c| 136 
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   |   9 +-
 drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h|   2 +-
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 113 +++--
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  40 -
 .../pm/swsmu/inc/pmfw_if/smu_v13_0_6_ppsmc.h  |   3 +-
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |   3 +-
 .../gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c |  64 +---
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c|  59 ---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|   2 +
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 153 +-
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c|  57 +++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h|   2 +
 17 files changed, 533 insertions(+), 169 deletions(-)

-- 
2.25.1



Re: [PATCH] drm/amd/display: Drop pixel_clock_mhz

2024-05-14 Thread Chung, ChiaHsuan (Tom)

This patch looks good to me.

Reviewed-by: Tom Chung 

On 5/9/2024 9:55 AM, Mario Limonciello wrote:

The pixel_clock_mhz property is populated in amdgpu_dm when Freesync is setup,
but it is not used anywhere in amdgpu_dm. Remove the dead code.

Cc:chiahsuan.ch...@amd.com
Signed-off-by: Mario Limonciello
---
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ---
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 1 -
  2 files changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index f80213b7e9f7..3054bf79fc99 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -11274,7 +11274,6 @@ void amdgpu_dm_update_freesync_caps(struct 
drm_connector *connector,
  
  		amdgpu_dm_connector->min_vfreq = 0;

amdgpu_dm_connector->max_vfreq = 0;
-   amdgpu_dm_connector->pixel_clock_mhz = 0;
connector->display_info.monitor_range.min_vfreq = 0;
connector->display_info.monitor_range.max_vfreq = 0;
freesync_capable = false;
@@ -11338,8 +11337,6 @@ void amdgpu_dm_update_freesync_caps(struct 
drm_connector *connector,

connector->display_info.monitor_range.min_vfreq;
amdgpu_dm_connector->max_vfreq =

connector->display_info.monitor_range.max_vfreq;
-   amdgpu_dm_connector->pixel_clock_mhz =
-   range->pixel_clock_mhz * 10;
  
  break;

}
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 09519b7abf67..67647bb5999b 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -678,7 +678,6 @@ struct amdgpu_dm_connector {
 * value is set to zero when there is no FreeSync support.
 */
int max_vfreq ;
-   int pixel_clock_mhz;
  
  	/* Audio instance - protected by audio_lock. */

int audio_inst;

RE: [PATCH 1/8] drm/amdgpu: support imu for gc 12_0_0

2024-05-14 Thread Zhang, Hawking
[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Gao, Likun
Sent: Tuesday, May 14, 2024 16:51
To: amd-gfx list 
Subject: [PATCH 1/8] drm/amdgpu: support imu for gc 12_0_0

[AMD Official Use Only - AMD Internal Distribution Only]

[AMD Official Use Only - AMD Internal Distribution Only]

From: Likun Gao 

Support IMU for ASIC with GC 12.0.0
Drop some unused function.

Signed-off-by: Likun Gao 
---
 drivers/gpu/drm/amd/amdgpu/imu_v12_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/imu_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/imu_v12_0.c
index 032ae12b2be2..0c8ef908d112 100644
--- a/drivers/gpu/drm/amd/amdgpu/imu_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/imu_v12_0.c
@@ -32,6 +32,7 @@
 #include "gc/gc_12_0_0_sh_mask.h"
 #include "mmhub/mmhub_4_1_0_offset.h"

+MODULE_FIRMWARE("amdgpu/gc_12_0_0_imu.bin");
 MODULE_FIRMWARE("amdgpu/gc_12_0_1_imu.bin");

 #define TRANSFER_RAM_MASK  0x001c
@@ -367,6 +368,7 @@ static void imu_v12_0_program_rlc_ram(struct amdgpu_device 
*adev)
WREG32_SOC15(GC, 0, regGFX_IMU_RLC_RAM_INDEX, 0x2);

switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+   case IP_VERSION(12, 0, 0):
case IP_VERSION(12, 0, 1):
if (!r)
program_imu_rlc_ram(adev, data, (const u32)size);
--
2.34.1



RE: [PATCH 3/3] drm/amdgpu: Use NPS ranges from discovery table

2024-05-14 Thread Zhang, Hawking
[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, May 14, 2024 16:36
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Koenig, Christian ; Ma, 
Le ; Ma, Le 
Subject: [PATCH 3/3] drm/amdgpu: Use NPS ranges from discovery table

Add GMC API to fetch NPS range information from discovery table. Use NPS range 
information in GMC 9.4.3 SOCs when available, otherwise fallback to software 
method.

Signed-off-by: Lijo Lazar 
Reviewed-by: Le Ma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 92 +++  
drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h |  5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c   | 76 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h   | 11 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 40 +---
 5 files changed, 212 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 43528ff50e72..afe8d12667f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -367,6 +367,35 @@ static void amdgpu_discovery_harvest_config_quirk(struct 
amdgpu_device *adev)
}
 }

+static int amdgpu_discovery_verify_npsinfo(struct amdgpu_device *adev,
+  struct binary_header *bhdr)
+{
+   struct table_info *info;
+   uint16_t checksum;
+   uint16_t offset;
+
+   info = >table_list[NPS_INFO];
+   offset = le16_to_cpu(info->offset);
+   checksum = le16_to_cpu(info->checksum);
+
+   struct nps_info_header *nhdr =
+   (struct nps_info_header *)(adev->mman.discovery_bin + offset);
+
+   if (le32_to_cpu(nhdr->table_id) != NPS_INFO_TABLE_ID) {
+   dev_dbg(adev->dev, "invalid ip discovery nps info table id\n");
+   return -EINVAL;
+   }
+
+   if (!amdgpu_discovery_verify_checksum(adev->mman.discovery_bin + offset,
+ le32_to_cpu(nhdr->size_bytes),
+ checksum)) {
+   dev_dbg(adev->dev, "invalid nps info data table checksum\n");
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static int amdgpu_discovery_init(struct amdgpu_device *adev)  {
struct table_info *info;
@@ -1681,6 +1710,69 @@ static int amdgpu_discovery_get_vcn_info(struct 
amdgpu_device *adev)
return 0;
 }

+union nps_info {
+   struct nps_info_v1_0 v1;
+};
+
+int amdgpu_discovery_get_nps_info(struct amdgpu_device *adev,
+ uint32_t *nps_type,
+ struct amdgpu_gmc_memrange **ranges,
+ int *range_cnt)
+{
+   struct amdgpu_gmc_memrange *mem_ranges;
+   struct binary_header *bhdr;
+   union nps_info *nps_info;
+   u16 offset;
+   int i;
+
+   if (!nps_type || !range_cnt || !ranges)
+   return -EINVAL;
+
+   if (!adev->mman.discovery_bin) {
+   dev_err(adev->dev,
+   "fetch mem range failed, ip discovery uninitialized\n");
+   return -EINVAL;
+   }
+
+   bhdr = (struct binary_header *)adev->mman.discovery_bin;
+   offset = le16_to_cpu(bhdr->table_list[NPS_INFO].offset);
+
+   if (!offset)
+   return -ENOENT;
+
+   /* If verification fails, return as if NPS table doesn't exist */
+   if (amdgpu_discovery_verify_npsinfo(adev, bhdr))
+   return -ENOENT;
+
+   nps_info = (union nps_info *)(adev->mman.discovery_bin + offset);
+
+   switch (le16_to_cpu(nps_info->v1.header.version_major)) {
+   case 1:
+   *nps_type = nps_info->v1.nps_type;
+   *range_cnt = nps_info->v1.count;
+   mem_ranges = kvzalloc(
+   *range_cnt * sizeof(struct amdgpu_gmc_memrange),
+   GFP_KERNEL);
+   for (i = 0; i < *range_cnt; i++) {
+   mem_ranges[i].base_address =
+   nps_info->v1.instance_info[i].base_address;
+   mem_ranges[i].limit_address =
+   nps_info->v1.instance_info[i].limit_address;
+   mem_ranges[i].nid_mask = -1;
+   mem_ranges[i].flags = 0;
+   }
+   *ranges = mem_ranges;
+   break;
+   default:
+   dev_err(adev->dev, "Unhandled NPS info table %d.%d\n",
+   le16_to_cpu(nps_info->v1.header.version_major),
+   le16_to_cpu(nps_info->v1.header.version_minor));
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static int amdgpu_discovery_set_common_ip_blocks(struct amdgpu_device *adev)  {
/* what IP to use for this? */
diff --git 

[PATCH] drm/amdgpu: add more device info to the devcoredump

2024-05-14 Thread Sunil Khatri
Adding more device information:
a. PCI info
b. VRAM and GTT info
c. GDS config

Also correct the print layout and section information
in the devcoredump.

Signed-off-by: Sunil Khatri 
---
 .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c  | 21 +--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
index c1cb62683695..f0a44d0dec27 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
@@ -224,12 +224,29 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, 
size_t count,
   coredump->reset_task_info.process_name,
   coredump->reset_task_info.pid);
 
-   /* GPU IP's information of the SOC */
-   drm_printf(, "\nIP Information\n");
+   /* SOC Information */
+   drm_printf(, "\nSOC Information\n");
+   drm_printf(, "SOC Device id: %d\n", coredump->adev->pdev->device);
+   drm_printf(, "SOC PCI Revision id: %d\n", 
coredump->adev->pdev->revision);
drm_printf(, "SOC Family: %d\n", coredump->adev->family);
drm_printf(, "SOC Revision id: %d\n", coredump->adev->rev_id);
drm_printf(, "SOC External Revision id: %d\n", 
coredump->adev->external_rev_id);
 
+   /* Memory Information */
+   drm_printf(, "\nSOC Memory Information\n");
+   drm_printf(, "real vram size: %llu\n", 
coredump->adev->gmc.real_vram_size);
+   drm_printf(, "visible vram size: %llu\n", 
coredump->adev->gmc.visible_vram_size);
+   drm_printf(, "visible vram size: %llu\n", 
coredump->adev->mman.gtt_mgr.manager.size);
+
+   /* GDS Config */
+   drm_printf(, "\nGDS Config\n");
+   drm_printf(, "gds: total size: %d\n", coredump->adev->gds.gds_size);
+   drm_printf(, "gds: compute partition size: %d\n", 
coredump->adev->gds.gds_size);
+   drm_printf(, "gds: gws per compute partition: %d\n", 
coredump->adev->gds.gws_size);
+   drm_printf(, "gds: os per compute partition: %d\n", 
coredump->adev->gds.oa_size);
+
+   /* HWIP Version Information */
+   drm_printf(, "\nHW IP Version Information\n");
for (int i = 1; i < MAX_HWIP; i++) {
for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
ver = coredump->adev->ip_versions[i][j];
-- 
2.34.1



[PATCH 2/2] drm/amd/pm: check specific index for aldebaran

2024-05-14 Thread Jesse Zhang
To avoid warning problems, drop index and
use PPSMC_MSG_GfxDriverReset instead of index for aldebaran.

Signed-off-by: Jesse Zhang 
Suggested-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index a22eb6bbb05e..2fc4ba036afe 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1880,17 +1880,18 @@ static int aldebaran_mode1_reset(struct smu_context 
*smu)
 
 static int aldebaran_mode2_reset(struct smu_context *smu)
 {
-   int ret = 0, index;
+   int ret = 0;
struct amdgpu_device *adev = smu->adev;
int timeout = 10;
 
-   index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
-   SMU_MSG_GfxDeviceDriverReset);
-   if (index < 0 )
-   return -EINVAL;
mutex_lock(>message_lock);
if (smu->smc_fw_version >= 0x00441400) {
-   ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, 
SMU_RESET_MODE_2);
+   ret = smu_cmn_send_msg_without_waiting(smu, 
PPSMC_MSG_GfxDriverReset,
+   
SMU_RESET_MODE_2);
+   if (ret) {
+   dev_err(smu->adev->dev, "Failed to mode2 reset!\n");
+   goto out;
+   }
/* This is similar to FLR, wait till max FLR timeout */
msleep(100);
dev_dbg(smu->adev->dev, "restore config space...\n");
-- 
2.25.1



[PATCH 1/2] drm/amd/pm: check specific index for smu13

2024-05-14 Thread Jesse Zhang
To avoid warning problems, drop index and use PPSMC_MSG_GfxDriverReset instead 
of index.

Signed-off-by: Jesse Zhang 
Suggested-by: Lijo Lazar 
---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 46ab70a244af..6d691edf74fa 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2330,20 +2330,15 @@ static void smu_v13_0_6_restore_pci_config(struct 
smu_context *smu)
 
 static int smu_v13_0_6_mode2_reset(struct smu_context *smu)
 {
-   int ret = 0, index;
+   int ret = 0;
struct amdgpu_device *adev = smu->adev;
int timeout = 10;
 
-   index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
-  SMU_MSG_GfxDeviceDriverReset);
-   if (index < 0)
-   return index;
-
mutex_lock(>message_lock);
-
-   ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
-  SMU_RESET_MODE_2);
-
+   ret = smu_cmn_send_msg_without_waiting(smu, PPSMC_MSG_GfxDriverReset,
+   SMU_RESET_MODE_2);
+   if (ret)
+   goto out;
/* Reset takes a bit longer, wait for 200ms. */
msleep(200);
 
-- 
2.25.1



Re: [PATCH] drm/amdgpu: Use the slab allocator to reduce job allocation fragmentation

2024-05-14 Thread Christian König

Am 14.05.24 um 10:13 schrieb Liang, Prike:

[AMD Official Use Only - AMD Internal Distribution Only]


From: Koenig, Christian 
Sent: Friday, May 10, 2024 5:31 PM
To: Liang, Prike ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: Re: [PATCH] drm/amdgpu: Use the slab allocator to reduce job
allocation fragmentation

Am 10.05.24 um 10:11 schrieb Prike Liang:

Using kzalloc() results in about 50% memory fragmentation, therefore
use the slab allocator to reduce memory fragmentation.

Signed-off-by: Prike Liang 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  1 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 26

-

   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 +
   3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ea14f1c8f430..3de1b42291b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -3040,6 +3040,7 @@ static void __exit amdgpu_exit(void)
 amdgpu_fence_slab_fini();
 mmu_notifier_synchronize();
 amdgpu_xcp_drv_release();
+   amdgpue_job_slab_fini();
   }

   module_init(amdgpu_init);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e4742b65032d..8327bf017a0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -31,6 +31,8 @@
   #include "amdgpu_trace.h"
   #include "amdgpu_reset.h"

+static struct kmem_cache *amdgpu_job_slab;
+
   static enum drm_gpu_sched_stat amdgpu_job_timedout(struct

drm_sched_job *s_job)

   {
 struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); @@ -

101,10

+103,19 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct

amdgpu_vm *vm,

 if (num_ibs == 0)
 return -EINVAL;

-   *job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL);
-   if (!*job)
+   amdgpu_job_slab = kmem_cache_create("amdgpu_job",
+   struct_size(*job, ibs, num_ibs), 0,
+   SLAB_HWCACHE_ALIGN, NULL);

Well you are declaring a global slab cache for a dynamic job size, then try to
set it up in the job allocation function which can be called concurrently with
different number of IBs.

To sum it up, this is completely racy and will go boom immediately in testing.
As far as I can see this suggestion is just utterly nonsense.

Regards,
Christian.


Hi, Christian

The num_ibs is calculated as 1 in amdgpu_cs_p1_ib() and from amdgpu_cs_pass1(), 
the num_ibs will be set to 1 as an input parameter at amdgpu_job_alloc(). 
Moreover, the num_ibs is only set from amdgpu_cs_p1_ib() and shouldn't have a 
chance to be overwritten from the user space driver side. Also, I checked a few 
GL and Vulkan applications and didn't find multiple IBs within one amdgpu job 
submission.


Well this is just bluntly incorrect. I have no idea where you looked to 
come to this conclusion.


Basically UMDs are allowed to submit multiple IBs with each job, so 
assuming that it's always 1 just because we use 1 as a simple case 
doesn't change that in any way.


See function amdgpu_ring_max_ibs() for the in kernel limit on how many 
IBs can be used for each ring type:


/**
 * amdgpu_ring_max_ibs - Return max IBs that fit in a single submission.
 *
 * @type: ring type for which to return the limit.
 *
 * Returns: the maximum number of indirect buffers (IBs) userspace may
 * attach to one job submission on a ring of the given type.
 */
unsigned int amdgpu_ring_max_ibs(enum amdgpu_ring_type type)
{
    switch (type) {
    case AMDGPU_RING_TYPE_GFX:
    /* Need to keep at least 192 on GFX7+ for old radv. */
    return 192;
    case AMDGPU_RING_TYPE_COMPUTE:
    return 125;
    case AMDGPU_RING_TYPE_VCN_JPEG:
    return 16;
    default:
    /* common limit for all remaining ring types */
    return 49;
    }
}


If there are still concerns about the IB array size on the amdgpu_job object 
allocated, we can remove the IBs member and decompose the IB with the job 
object. Then, we can export and access the IBs as a parameter from a new 
interface like amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p, struct 
amdgpu_job *job, struct amdgpu_ib *ib).


And how should that help? Then we have to allocate the IBs separately 
which adds even more overhead.



Regarding this patch, using kmem_cache_zalloc() instead of kzalloc() can save 
about 448 bytes of memory space for each amdgpu_job object allocated. 
Meanwhile, the job object allocation takes almost the same time, so it should 
have no side effect on the performance. If the idea is sensible, I will rework 
the patch by creating the job slab during the driver probe period.


Well that initializing global variables from a function which can be 
called from multiple threads at the same time without a lock is 
completely racy should be obvious and not something I explicitly need to 
point out.


Then this patch doesn't even bother to check if the slab was already 
allocated before, but instead just calls kmem_cache_create() 

RE: [PATCH 2/2 v2] drm/amd/pm: check specific index for aldebaran

2024-05-14 Thread Zhang, Jesse(Jie)
[AMD Official Use Only - AMD Internal Distribution Only]

Hi Lijo

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, May 14, 2024 4:19 PM
To: Zhang, Jesse(Jie) ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Koenig, Christian 
; Huang, Tim 
Subject: Re: [PATCH 2/2 v2] drm/amd/pm: check specific index for aldebaran



On 5/14/2024 12:28 PM, Jesse Zhang wrote:
> To avoid warning problems, drop index and use PPSMC_MSG_GfxDriverReset
> instead of index for aldebaran.
>
> Signed-off-by: Jesse Zhang 
> Suggested-by: Lijo Lazar 
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 13 +++--
>  1 file changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index a22eb6bbb05e..d671314c46c8 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -1880,17 +1880,18 @@ static int aldebaran_mode1_reset(struct
> smu_context *smu)
>
>  static int aldebaran_mode2_reset(struct smu_context *smu)  {
> - int ret = 0, index;
> + int ret = 0;
>   struct amdgpu_device *adev = smu->adev;
>   int timeout = 10;
>
> - index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
> - SMU_MSG_GfxDeviceDriverReset);
> - if (index < 0 )
> - return -EINVAL;
>   mutex_lock(&smu->message_lock);
>   if (smu->smc_fw_version >= 0x00441400) {
> - ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, 
> SMU_RESET_MODE_2);

For clarity, original comment is - retain this as it is, only replace index 
with PPSMC_MSG_GfxDriverReset.

Changing this to msg_with_param() breaks the reset sequence.

[Zhang, Jesse(Jie)] Sorry, I misunderstood you, I will update the patch again.
Thanks
Jesse

Thanks,
Lijo

> + ret = smu_cmn_send_smc_msg_with_param(smu, 
> PPSMC_MSG_GfxDriverReset,
> + 
> SMU_RESET_MODE_2, NULL);
> + if (ret) {
> + dev_err(smu->adev->dev, "Failed to mode2 reset!\n");
> + goto out;
> + }
>   /* This is similar to FLR, wait till max FLR timeout */
>   msleep(100);
>   dev_dbg(smu->adev->dev, "restore config space...\n");


[PATCH 8/8] drm/amdgpu: switch default mes to uni mes

2024-05-14 Thread Gao, Likun
[AMD Official Use Only - AMD Internal Distribution Only]

From: Likun Gao 

Switch the default mes to uni mes for gfx v12.
V2: remove uni_mes set for gfx v11.

Signed-off-by: Likun Gao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 82d064adaa49..e5fc100c9ce7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -2248,8 +2248,6 @@ static int amdgpu_discovery_set_mes_ip_blocks(struct 
amdgpu_device *adev)
amdgpu_device_ip_block_add(adev, _v11_0_ip_block);
adev->enable_mes = true;
adev->enable_mes_kiq = true;
-   if (amdgpu_uni_mes)
-   adev->enable_uni_mes = true;
break;
case IP_VERSION(12, 0, 0):
case IP_VERSION(12, 0, 1):
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index caf89d21b61c..82b02bb261b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -197,7 +197,7 @@ int amdgpu_discovery = -1;  int amdgpu_mes;  int 
amdgpu_mes_log_enable = 0;  int amdgpu_mes_kiq; -int amdgpu_uni_mes;
+int amdgpu_uni_mes = 1;
 int amdgpu_noretry = -1;
 int amdgpu_force_asic_type = -1;
 int amdgpu_tmz = -1; /* auto */
@@ -694,7 +694,7 @@ module_param_named(mes_kiq, amdgpu_mes_kiq, int, 0444);
  * (0 = disabled (default), 1 = enabled)
  */
 MODULE_PARM_DESC(uni_mes,
-   "Enable Unified Micro Engine Scheduler (0 = disabled (default), 1 = 
enabled)");
+   "Enable Unified Micro Engine Scheduler (0 = disabled, 1 =
+enabled(default)");
 module_param_named(uni_mes, amdgpu_uni_mes, int, 0444);

 /**
--
2.34.1



[PATCH 6/8] drm/amdgpu: enable gfxoff for gc v12.0.0

2024-05-14 Thread Gao, Likun
[AMD Official Use Only - AMD Internal Distribution Only]

From: Likun Gao 

Enable GFXOFF for GC v12.0.0.

Signed-off-by: Likun Gao 
Reviewed-by: Kenneth Feng 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 045731caa624..6419f98e32b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -3608,6 +3608,7 @@ static int gfx_v12_0_set_powergating_state(void *handle,
return 0;

switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+   case IP_VERSION(12, 0, 0):
case IP_VERSION(12, 0, 1):
amdgpu_gfx_off_ctrl(adev, enable);
break;
--
2.34.1



[PATCH 7/8] drm/amd/pm: add pp_dpm_dcefclk for smu 14.0.2/3

2024-05-14 Thread Gao, Likun
[AMD Official Use Only - AMD Internal Distribution Only]

From: Kenneth Feng 

add pp_dpm_dcefclk for smu 14.0.2/3

Signed-off-by: Kenneth Feng 
Reviewed-by: Jack Gui 
---
 .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c  | 24 +++
 1 file changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
index c22be56024d1..0e27cde82193 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
@@ -133,6 +133,7 @@ static struct cmn2asic_mapping 
smu_v14_0_2_clk_map[SMU_CLK_COUNT] = {
CLK_MAP(MCLK,   PPCLK_UCLK),
CLK_MAP(VCLK,   PPCLK_VCLK_0),
CLK_MAP(DCLK,   PPCLK_DCLK_0),
+   CLK_MAP(DCEFCLK,PPCLK_DCFCLK),
 };

 static struct cmn2asic_mapping smu_v14_0_2_feature_mask_map[SMU_FEATURE_COUNT] 
= { @@ -676,6 +677,22 @@ static int smu_v14_0_2_set_default_dpm_table(struct 
smu_context *smu)
pcie_table->num_of_link_levels++;
}

+   /* dcefclk dpm table setup */
+   dpm_table = &dpm_context->dpm_tables.dcef_table;
+   if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_DCN_BIT)) {
+   ret = smu_v14_0_set_single_dpm_table(smu,
+SMU_DCEFCLK,
+dpm_table);
+   if (ret)
+   return ret;
+   } else {
+   dpm_table->count = 1;
+   dpm_table->dpm_levels[0].value = 
smu->smu_table.boot_values.dcefclk / 100;
+   dpm_table->dpm_levels[0].enabled = true;
+   dpm_table->min = dpm_table->dpm_levels[0].value;
+   dpm_table->max = dpm_table->dpm_levels[0].value;
+   }
+
return 0;
 }

@@ -1000,6 +1017,9 @@ static int 
smu_v14_0_2_get_current_clk_freq_by_table(struct smu_context *smu,
case PPCLK_DCLK_0:
member_type = METRICS_AVERAGE_DCLK;
break;
+   case PPCLK_DCFCLK:
+   member_type = METRICS_CURR_DCEFCLK;
+   break;
default:
return -EINVAL;
}
@@ -1047,6 +1067,9 @@ static int smu_v14_0_2_print_clk_levels(struct 
smu_context *smu,
case SMU_DCLK1:
single_dpm_table = &(dpm_context->dpm_tables.dclk_table);
break;
+   case SMU_DCEFCLK:
+   single_dpm_table = &(dpm_context->dpm_tables.dcef_table);
+   break;
default:
break;
}
@@ -1060,6 +1083,7 @@ static int smu_v14_0_2_print_clk_levels(struct 
smu_context *smu,
case SMU_VCLK1:
case SMU_DCLK:
case SMU_DCLK1:
+   case SMU_DCEFCLK:
ret = smu_v14_0_2_get_current_clk_freq_by_table(smu, clk_type, 
_freq);
if (ret) {
dev_err(smu->adev->dev, "Failed to get current clock 
freq!");
--
2.34.1



[PATCH 5/8] drm/amd/amdgpu: enable mmhub and athub cg on gc 12.0.0

2024-05-14 Thread Gao, Likun
[AMD Official Use Only - AMD Internal Distribution Only]

From: Likun Gao 

Enable mmhub and athub cg on gc 12.0.0

Signed-off-by: Likun Gao 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/soc24.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c 
b/drivers/gpu/drm/amd/amdgpu/soc24.c
index e91da2d986da..b2b9e0f83bdf 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc24.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc24.c
@@ -404,7 +404,11 @@ static int soc24_common_early_init(void *handle)
AMD_CG_SUPPORT_GFX_3D_CGLS |
AMD_CG_SUPPORT_REPEATER_FGCG |
AMD_CG_SUPPORT_GFX_FGCG |
-   AMD_CG_SUPPORT_GFX_PERF_CLK;
+   AMD_CG_SUPPORT_GFX_PERF_CLK |
+   AMD_CG_SUPPORT_ATHUB_MGCG |
+   AMD_CG_SUPPORT_ATHUB_LS |
+   AMD_CG_SUPPORT_MC_MGCG |
+   AMD_CG_SUPPORT_MC_LS;
adev->pg_flags = AMD_PG_SUPPORT_VCN |
AMD_PG_SUPPORT_JPEG;
adev->external_rev_id = adev->rev_id + 0x40;
--
2.34.1



[PATCH] drm/amdgpu/mes: use mc address for wptr in add queue packet

2024-05-14 Thread Min, Frank
[AMD Official Use Only - AMD Internal Distribution Only]

From: Frank Min 

use mc address for wptr in add queue packet

Signed-off-by: Frank Min 
---
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index 5519655fd70a..6256b21884ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -267,11 +267,7 @@ static int mes_v12_0_add_hw_queue(struct amdgpu_mes *mes,
mes_add_queue_pkt.doorbell_offset = input->doorbell_offset;
mes_add_queue_pkt.mqd_addr = input->mqd_addr;

-   if (((adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) >>
-   AMDGPU_MES_API_VERSION_SHIFT) >= 2)
-   mes_add_queue_pkt.wptr_addr = input->wptr_mc_addr;
-   else
-   mes_add_queue_pkt.wptr_addr = input->wptr_addr;
+   mes_add_queue_pkt.wptr_addr = input->wptr_mc_addr;

mes_add_queue_pkt.queue_type =
convert_to_mes_queue_type(input->queue_type);
--
2.34.1



[PATCH 4/8] drm/amdgpu: enable some cg feature for gc 12.0.0

2024-05-14 Thread Gao, Likun
[AMD Official Use Only - AMD Internal Distribution Only]

From: Likun Gao 

Enable 3D cgcg, 3D cgls, sram fgcg, perfcounter mgcg, repeater fgcg for gc 
v12.0.0.

Signed-off-by: Likun Gao 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/soc24.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c 
b/drivers/gpu/drm/amd/amdgpu/soc24.c
index 5c796e974252..e91da2d986da 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc24.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc24.c
@@ -398,7 +398,13 @@ static int soc24_common_early_init(void *handle)
switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
case IP_VERSION(12, 0, 0):
adev->cg_flags = AMD_CG_SUPPORT_GFX_CGCG |
-   AMD_CG_SUPPORT_GFX_CGLS;
+   AMD_CG_SUPPORT_GFX_CGLS |
+   AMD_CG_SUPPORT_GFX_MGCG |
+   AMD_CG_SUPPORT_GFX_3D_CGCG |
+   AMD_CG_SUPPORT_GFX_3D_CGLS |
+   AMD_CG_SUPPORT_REPEATER_FGCG |
+   AMD_CG_SUPPORT_GFX_FGCG |
+   AMD_CG_SUPPORT_GFX_PERF_CLK;
adev->pg_flags = AMD_PG_SUPPORT_VCN |
AMD_PG_SUPPORT_JPEG;
adev->external_rev_id = adev->rev_id + 0x40;
--
2.34.1



[PATCH 3/8] drm/amdgpu: enable gfx cgcg for gfx v12_0_0

2024-05-14 Thread Gao, Likun
[AMD Official Use Only - AMD Internal Distribution Only]

From: Likun Gao 

Enable GFX CGCG and CGLS for gfx version 12.0.0.

Signed-off-by: Likun Gao 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 1 +
 drivers/gpu/drm/amd/amdgpu/soc24.c | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 924ab4a3a0d5..045731caa624 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -3857,6 +3857,7 @@ static int gfx_v12_0_set_clockgating_state(void *handle,
return 0;

switch (adev->ip_versions[GC_HWIP][0]) {
+   case IP_VERSION(12, 0, 0):
case IP_VERSION(12, 0, 1):
gfx_v12_0_update_gfx_clock_gating(adev,
  state == AMD_CG_STATE_GATE);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c 
b/drivers/gpu/drm/amd/amdgpu/soc24.c
index 3eb6574b777a..5c796e974252 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc24.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc24.c
@@ -397,7 +397,8 @@ static int soc24_common_early_init(void *handle)

switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
case IP_VERSION(12, 0, 0):
-   adev->cg_flags = 0;
+   adev->cg_flags = AMD_CG_SUPPORT_GFX_CGCG |
+   AMD_CG_SUPPORT_GFX_CGLS;
adev->pg_flags = AMD_PG_SUPPORT_VCN |
AMD_PG_SUPPORT_JPEG;
adev->external_rev_id = adev->rev_id + 0x40;
--
2.34.1



[PATCH 1/8] drm/amdgpu: support imu for gc 12_0_0

2024-05-14 Thread Gao, Likun
[AMD Official Use Only - AMD Internal Distribution Only]

From: Likun Gao 

Support IMU for ASIC with GC 12.0.0
Drop some unused function.

Signed-off-by: Likun Gao 
---
 drivers/gpu/drm/amd/amdgpu/imu_v12_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/imu_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/imu_v12_0.c
index 032ae12b2be2..0c8ef908d112 100644
--- a/drivers/gpu/drm/amd/amdgpu/imu_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/imu_v12_0.c
@@ -32,6 +32,7 @@
 #include "gc/gc_12_0_0_sh_mask.h"
 #include "mmhub/mmhub_4_1_0_offset.h"

+MODULE_FIRMWARE("amdgpu/gc_12_0_0_imu.bin");
 MODULE_FIRMWARE("amdgpu/gc_12_0_1_imu.bin");

 #define TRANSFER_RAM_MASK  0x001c
@@ -367,6 +368,7 @@ static void imu_v12_0_program_rlc_ram(struct amdgpu_device 
*adev)
WREG32_SOC15(GC, 0, regGFX_IMU_RLC_RAM_INDEX, 0x2);

switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+   case IP_VERSION(12, 0, 0):
case IP_VERSION(12, 0, 1):
if (!r)
program_imu_rlc_ram(adev, data, (const u32)size);
--
2.34.1



FW: [PATCH 2/8] drm/amdgpu/jpeg5: enable power gating

2024-05-14 Thread Gao, Likun
[AMD Official Use Only - AMD Internal Distribution Only]

From: Sonny Jiang 

Enable PG on JPEG5

Signed-off-by: Sonny Jiang 
Reviewed-by: Leo Liu 
---
 drivers/gpu/drm/amd/amdgpu/soc24.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c 
b/drivers/gpu/drm/amd/amdgpu/soc24.c
index 285d6af10f62..3eb6574b777a 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc24.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc24.c
@@ -398,7 +398,8 @@ static int soc24_common_early_init(void *handle)
switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
case IP_VERSION(12, 0, 0):
adev->cg_flags = 0;
-   adev->pg_flags = 0;
+   adev->pg_flags = AMD_PG_SUPPORT_VCN |
+   AMD_PG_SUPPORT_JPEG;
adev->external_rev_id = adev->rev_id + 0x40;
break;
case IP_VERSION(12, 0, 1):
--
2.34.1



[PATCH 3/3] drm/amdgpu: Use NPS ranges from discovery table

2024-05-14 Thread Lijo Lazar
Add GMC API to fetch NPS range information from discovery table. Use NPS
range information in GMC 9.4.3 SOCs when available, otherwise fall back
to the software method.

Signed-off-by: Lijo Lazar 
Reviewed-by: Le Ma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 92 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h |  5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c   | 76 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h   | 11 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 40 +---
 5 files changed, 212 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 43528ff50e72..afe8d12667f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -367,6 +367,35 @@ static void amdgpu_discovery_harvest_config_quirk(struct 
amdgpu_device *adev)
}
 }
 
+static int amdgpu_discovery_verify_npsinfo(struct amdgpu_device *adev,
+  struct binary_header *bhdr)
+{
+   struct table_info *info;
+   uint16_t checksum;
+   uint16_t offset;
+
+   info = &bhdr->table_list[NPS_INFO];
+   offset = le16_to_cpu(info->offset);
+   checksum = le16_to_cpu(info->checksum);
+
+   struct nps_info_header *nhdr =
+   (struct nps_info_header *)(adev->mman.discovery_bin + offset);
+
+   if (le32_to_cpu(nhdr->table_id) != NPS_INFO_TABLE_ID) {
+   dev_dbg(adev->dev, "invalid ip discovery nps info table id\n");
+   return -EINVAL;
+   }
+
+   if (!amdgpu_discovery_verify_checksum(adev->mman.discovery_bin + offset,
+ le32_to_cpu(nhdr->size_bytes),
+ checksum)) {
+   dev_dbg(adev->dev, "invalid nps info data table checksum\n");
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static int amdgpu_discovery_init(struct amdgpu_device *adev)
 {
struct table_info *info;
@@ -1681,6 +1710,69 @@ static int amdgpu_discovery_get_vcn_info(struct 
amdgpu_device *adev)
return 0;
 }
 
+union nps_info {
+   struct nps_info_v1_0 v1;
+};
+
+int amdgpu_discovery_get_nps_info(struct amdgpu_device *adev,
+ uint32_t *nps_type,
+ struct amdgpu_gmc_memrange **ranges,
+ int *range_cnt)
+{
+   struct amdgpu_gmc_memrange *mem_ranges;
+   struct binary_header *bhdr;
+   union nps_info *nps_info;
+   u16 offset;
+   int i;
+
+   if (!nps_type || !range_cnt || !ranges)
+   return -EINVAL;
+
+   if (!adev->mman.discovery_bin) {
+   dev_err(adev->dev,
+   "fetch mem range failed, ip discovery uninitialized\n");
+   return -EINVAL;
+   }
+
+   bhdr = (struct binary_header *)adev->mman.discovery_bin;
+   offset = le16_to_cpu(bhdr->table_list[NPS_INFO].offset);
+
+   if (!offset)
+   return -ENOENT;
+
+   /* If verification fails, return as if NPS table doesn't exist */
+   if (amdgpu_discovery_verify_npsinfo(adev, bhdr))
+   return -ENOENT;
+
+   nps_info = (union nps_info *)(adev->mman.discovery_bin + offset);
+
+   switch (le16_to_cpu(nps_info->v1.header.version_major)) {
+   case 1:
+   *nps_type = nps_info->v1.nps_type;
+   *range_cnt = nps_info->v1.count;
+   mem_ranges = kvzalloc(
+   *range_cnt * sizeof(struct amdgpu_gmc_memrange),
+   GFP_KERNEL);
+   for (i = 0; i < *range_cnt; i++) {
+   mem_ranges[i].base_address =
+   nps_info->v1.instance_info[i].base_address;
+   mem_ranges[i].limit_address =
+   nps_info->v1.instance_info[i].limit_address;
+   mem_ranges[i].nid_mask = -1;
+   mem_ranges[i].flags = 0;
+   }
+   *ranges = mem_ranges;
+   break;
+   default:
+   dev_err(adev->dev, "Unhandled NPS info table %d.%d\n",
+   le16_to_cpu(nps_info->v1.header.version_major),
+   le16_to_cpu(nps_info->v1.header.version_minor));
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static int amdgpu_discovery_set_common_ip_blocks(struct amdgpu_device *adev)
 {
/* what IP to use for this? */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
index 4d03cd5b3410..f5d36525ec3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
@@ -30,4 +30,9 @@
 void amdgpu_discovery_fini(struct amdgpu_device *adev);
 int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev);
 

[PATCH 2/3] drm/amdgpu: Add nps info table to IP discovery

2024-05-14 Thread Lijo Lazar
Add support to fetch NPS info table in IP discovery table.

Signed-off-by: Lijo Lazar 
Reviewed-by: Le Ma 
---
 drivers/gpu/drm/amd/include/discovery.h | 25 -
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/include/discovery.h 
b/drivers/gpu/drm/amd/include/discovery.h
index 0bc169f1ae0d..737d37cfd486 100644
--- a/drivers/gpu/drm/amd/include/discovery.h
+++ b/drivers/gpu/drm/amd/include/discovery.h
@@ -31,6 +31,7 @@
 #define HARVEST_TABLE_SIGNATURE 0x56524148
 #define VCN_INFO_TABLE_ID   0x004E4356
 #define MALL_INFO_TABLE_ID  0x4C4C414D
+#define NPS_INFO_TABLE_ID   0x0053504E
 
 typedef enum
 {
@@ -39,7 +40,7 @@ typedef enum
HARVEST_INFO,
VCN_INFO,
MALL_INFO,
-   RESERVED_1,
+   NPS_INFO,
TOTAL_TABLES = 6
 } table;
 
@@ -382,6 +383,28 @@ struct vcn_info_v1_0 {
uint32_t reserved[4];
 };
 
+#define NPS_INFO_TABLE_MAX_NUM_INSTANCES 12
+
+struct nps_info_header {
+uint32_t table_id;  /* table ID */
+uint16_t version_major; /* table version */
+uint16_t version_minor; /* table version */
+uint32_t size_bytes;/* size of the entire header+data in bytes = 
0x00D4 (212) */
+};
+
+struct nps_instance_info_v1_0
+{
+uint64_t base_address;
+uint64_t limit_address;
+};
+
+struct nps_info_v1_0 {
+struct   nps_info_header header;
+uint32_t nps_type;
+uint32_t count;
+struct   nps_instance_info_v1_0 
instance_info[NPS_INFO_TABLE_MAX_NUM_INSTANCES];
+};
+
 #pragma pack()
 
 #endif
-- 
2.25.1



[PATCH 1/3] drm/amdgpu: Fix memory range calculation

2024-05-14 Thread Lijo Lazar
Consider the 16M reserved region also before range calculation for GMC
9.4.3 SOCs.

Signed-off-by: Lijo Lazar 
Acked-by: Christian König 
Reviewed-by: Le Ma 

Fixes: a433f1f59484 ("drm/amdgpu: Initialize memory ranges for GC 9.4.3")
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 671a6766df5b..7c4e2adae7b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1918,7 +1918,7 @@ gmc_v9_0_init_sw_mem_ranges(struct amdgpu_device *adev,
break;
}
 
-   size = adev->gmc.real_vram_size >> AMDGPU_GPU_PAGE_SHIFT;
+   size = (adev->gmc.real_vram_size + SZ_16M) >> AMDGPU_GPU_PAGE_SHIFT;
size /= adev->gmc.num_mem_partitions;
 
for (i = 0; i < adev->gmc.num_mem_partitions; ++i) {
-- 
2.25.1



Re: [PATCH 2/2 v2] drm/amd/pm: check specific index for aldebaran

2024-05-14 Thread Lazar, Lijo



On 5/14/2024 12:28 PM, Jesse Zhang wrote:
> To avoid warning problems, drop index and
> use PPSMC_MSG_GfxDriverReset instead of index for aldebaran.
> 
> Signed-off-by: Jesse Zhang 
> Suggested-by: Lijo Lazar 
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 13 +++--
>  1 file changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index a22eb6bbb05e..d671314c46c8 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -1880,17 +1880,18 @@ static int aldebaran_mode1_reset(struct smu_context 
> *smu)
>  
>  static int aldebaran_mode2_reset(struct smu_context *smu)
>  {
> - int ret = 0, index;
> + int ret = 0;
>   struct amdgpu_device *adev = smu->adev;
>   int timeout = 10;
>  
> - index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
> - SMU_MSG_GfxDeviceDriverReset);
> - if (index < 0 )
> - return -EINVAL;
>   mutex_lock(&smu->message_lock);
>   if (smu->smc_fw_version >= 0x00441400) {
> - ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, 
> SMU_RESET_MODE_2);

For clarity, original comment is - retain this as it is, only replace
index with PPSMC_MSG_GfxDriverReset.

Changing this to msg_with_param() breaks the reset sequence.

Thanks,
Lijo

> + ret = smu_cmn_send_smc_msg_with_param(smu, 
> PPSMC_MSG_GfxDriverReset,
> + 
> SMU_RESET_MODE_2, NULL);
> + if (ret) {
> + dev_err(smu->adev->dev, "Failed to mode2 reset!\n");
> + goto out;
> + }
>   /* This is similar to FLR, wait till max FLR timeout */
>   msleep(100);
>   dev_dbg(smu->adev->dev, "restore config space...\n");


RE: [PATCH] drm/amdgpu: add debug flag to enable RAS ACA driver.

2024-05-14 Thread Zhang, Hawking
[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Wang, Yang(Kevin) 
Sent: Tuesday, May 14, 2024 16:15
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao 
Subject: [PATCH] drm/amdgpu: add debug flag to enable RAS ACA driver.

Use debug_mask=0x10 (BIT.4) param to help enable RAS ACA driver.
(RAS ACA is disabled by default.)

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 3 ++-  
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 846c3550fbda..550a42e3961f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1171,6 +1171,7 @@ struct amdgpu_device {
booldebug_largebar;
booldebug_disable_soft_recovery;
booldebug_use_vram_fw_buf;
+   booldebug_enable_ras_aca;
 };

 static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 987a1b4d4503..0b1b9911bd99 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -686,7 +686,8 @@ static void aca_manager_fini(struct aca_handle_manager *mgr)

 bool amdgpu_aca_is_enabled(struct amdgpu_device *adev)  {
-   return adev->aca.is_enabled;
+   return (adev->aca.is_enabled ||
+   adev->debug_enable_ras_aca);
 }

 int amdgpu_aca_init(struct amdgpu_device *adev) diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index caf89d21b61c..a2de55ab3a6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -129,6 +129,7 @@ enum AMDGPU_DEBUG_MASK {
AMDGPU_DEBUG_LARGEBAR = BIT(1),
AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
+   AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
 };

 unsigned int amdgpu_vram_limit = UINT_MAX; @@ -2192,6 +2193,11 @@ static void 
amdgpu_init_debug_options(struct amdgpu_device *adev)
pr_info("debug: place fw in vram for frontdoor loading\n");
adev->debug_use_vram_fw_buf = true;
}
+
+   if (amdgpu_debug_mask & AMDGPU_DEBUG_ENABLE_RAS_ACA) {
+   pr_info("debug: enable RAS ACA driver\n");
+   adev->debug_enable_ras_aca = true;
+   }
 }

 static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long 
flags)
--
2.34.1



[PATCH] drm/amdgpu: add debug flag to enable RAS ACA driver.

2024-05-14 Thread Yang Wang
Use debug_mask=0x10 (BIT.4) param to help enable RAS ACA driver.
(RAS ACA is disabled by default.)

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 846c3550fbda..550a42e3961f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1171,6 +1171,7 @@ struct amdgpu_device {
booldebug_largebar;
booldebug_disable_soft_recovery;
booldebug_use_vram_fw_buf;
+   booldebug_enable_ras_aca;
 };
 
 static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 987a1b4d4503..0b1b9911bd99 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -686,7 +686,8 @@ static void aca_manager_fini(struct aca_handle_manager *mgr)
 
 bool amdgpu_aca_is_enabled(struct amdgpu_device *adev)
 {
-   return adev->aca.is_enabled;
+   return (adev->aca.is_enabled ||
+   adev->debug_enable_ras_aca);
 }
 
 int amdgpu_aca_init(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index caf89d21b61c..a2de55ab3a6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -129,6 +129,7 @@ enum AMDGPU_DEBUG_MASK {
AMDGPU_DEBUG_LARGEBAR = BIT(1),
AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
+   AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
 };
 
 unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -2192,6 +2193,11 @@ static void amdgpu_init_debug_options(struct 
amdgpu_device *adev)
pr_info("debug: place fw in vram for frontdoor loading\n");
adev->debug_use_vram_fw_buf = true;
}
+
+   if (amdgpu_debug_mask & AMDGPU_DEBUG_ENABLE_RAS_ACA) {
+   pr_info("debug: enable RAS ACA driver\n");
+   adev->debug_enable_ras_aca = true;
+   }
 }
 
 static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long 
flags)
-- 
2.34.1



RE: [PATCH] drm/amdgpu: Use the slab allocator to reduce job allocation fragmentation

2024-05-14 Thread Liang, Prike
[AMD Official Use Only - AMD Internal Distribution Only]

> From: Koenig, Christian 
> Sent: Friday, May 10, 2024 5:31 PM
> To: Liang, Prike ; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander 
> Subject: Re: [PATCH] drm/amdgpu: Use the slab allocator to reduce job
> allocation fragmentation
>
> Am 10.05.24 um 10:11 schrieb Prike Liang:
> > Using kzalloc() results in about 50% memory fragmentation, therefore
> > use the slab allocator to reproduce memory fragmentation.
> >
> > Signed-off-by: Prike Liang 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  1 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 26
> -
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 +
> >   3 files changed, 23 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > index ea14f1c8f430..3de1b42291b6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > @@ -3040,6 +3040,7 @@ static void __exit amdgpu_exit(void)
> > amdgpu_fence_slab_fini();
> > mmu_notifier_synchronize();
> > amdgpu_xcp_drv_release();
> > +   amdgpue_job_slab_fini();
> >   }
> >
> >   module_init(amdgpu_init);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > index e4742b65032d..8327bf017a0e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > @@ -31,6 +31,8 @@
> >   #include "amdgpu_trace.h"
> >   #include "amdgpu_reset.h"
> >
> > +static struct kmem_cache *amdgpu_job_slab;
> > +
> >   static enum drm_gpu_sched_stat amdgpu_job_timedout(struct
> drm_sched_job *s_job)
> >   {
> > struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); @@ -
> 101,10
> > +103,19 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct
> amdgpu_vm *vm,
> > if (num_ibs == 0)
> > return -EINVAL;
> >
> > -   *job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL);
> > -   if (!*job)
> > +   amdgpu_job_slab = kmem_cache_create("amdgpu_job",
> > +   struct_size(*job, ibs, num_ibs), 0,
> > +   SLAB_HWCACHE_ALIGN, NULL);
>
> Well you are declaring a global slab cache for a dynamic job size, then try to
> set it up in the job allocation function which can be called concurrently with
> different number of IBs.
>
> To sum it up  this is completely racy and will go boom immediately in testing.
> As far as I can see this suggestion is just utterly nonsense.
>
> Regards,
> Christian.
>
Hi, Christian

The num_ibs is calculated as 1 in amdgpu_cs_p1_ib() and from amdgpu_cs_pass1(), 
the num_ibs will be set to 1 as an input parameter at amdgpu_job_alloc(). 
Moreover, the num_ibs is only set from amdgpu_cs_p1_ib() and shouldn't have a 
chance to be overwritten from the user space driver side. Also, I checked a few 
GL and Vulkan applications and didn't find multiple IBs within one amdgpu job 
submission.

If there are still concerns about the IB array size on the amdgpu_job object 
allocated, we can remove the IBs member and decompose the IB with the job 
object. Then, we can export and access the IBs as a parameter from a new 
interface like amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p, struct 
amdgpu_job *job, struct amdgpu_ib *ib).

Regarding this patch, using kmem_cache_zalloc() instead of kzalloc() can save 
about 448 bytes of memory space for each amdgpu_job object allocated. 
Meanwhile, the job object allocation takes almost the same time, so it should 
have no side effect on the performance. If the idea is sensible, I will rework 
the patch by creating the job slab during the driver probe period.

Thanks,
Prike
> > +   if (!amdgpu_job_slab) {
> > +   DRM_ERROR("create amdgpu_job cache failed\n");
> > return -ENOMEM;
> > +   }
> >
> > +   *job = kmem_cache_zalloc(amdgpu_job_slab, GFP_KERNEL);
> > +   if (!*job) {
> > +   kmem_cache_destroy(amdgpu_job_slab);
> > +   return -ENOMEM;
> > +   }
> > /*
> >  * Initialize the scheduler to at least some ring so that we always
> >  * have a pointer to adev.
> > @@ -138,7 +149,7 @@ int amdgpu_job_alloc_with_ib(struct
> amdgpu_device *adev,
> > if (r) {
> > if (entity)
> > drm_sched_job_cleanup(&(*job)->base);
> > -   kfree(*job);
> > +   kmem_cache_free(amdgpu_job_slab, job);
> > }
> >
> > return r;
> > @@ -179,6 +190,11 @@ void amdgpu_job_free_resources(struct
> amdgpu_job *job)
> > amdgpu_ib_free(ring->adev, &job->ibs[i], f);
> >   }
> >
> > +void amdgpue_job_slab_fini(void)
> > +{
> > +   kmem_cache_destroy(amdgpu_job_slab);
> > +}
> > +
> >   static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
> >   {
> > struct amdgpu_job *job = to_amdgpu_job(s_job); @@ -189,7 +205,7
> @@
> > static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
> >
> > /* 

Re: [PATCH] drm/amdgpu: Check if NBIO funcs are NULL in amdgpu_device_baco_exit

2024-05-14 Thread Christian König

Am 14.05.24 um 09:06 schrieb Friedrich Vock:

The special case for VM passthrough doesn't check adev->nbio.funcs
before dereferencing it. If GPUs that don't have an NBIO block are
passed through, this leads to a NULL pointer dereference on startup.

Signed-off-by: Friedrich Vock 


Acked-by: Christian König 



Fixes: 1bece222eab ("drm/amdgpu: Clear doorbell interrupt status for Sienna 
Cichlid")
Cc: Alex Deucher 
Cc: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 861ccff78af95..83c4533ee75c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6165,7 +6165,7 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
adev->nbio.funcs->enable_doorbell_interrupt)
adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

-   if (amdgpu_passthrough(adev) &&
+   if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
adev->nbio.funcs->clear_doorbell_interrupt)
adev->nbio.funcs->clear_doorbell_interrupt(adev);

--
2.45.0





Re: [PATCH 2/2] drm/amd/pm: check specific index for aldebaran

2024-05-14 Thread Lazar, Lijo



On 5/14/2024 12:37 PM, Wang, Yang(Kevin) wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
> 
> -Original Message-
> From: amd-gfx  On Behalf Of Lazar, Lijo
> Sent: Tuesday, May 14, 2024 2:07 PM
> To: Zhang, Jesse(Jie) ; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Koenig, Christian 
> ; Huang, Tim 
> Subject: Re: [PATCH 2/2] drm/amd/pm: check specific index for aldebaran
> 
> 
> 
> On 5/14/2024 11:34 AM, Jesse Zhang wrote:
>> To avoid warning problems, drop index and use PPSMC_MSG_GfxDriverReset
>> instead of index for aldebaran.
>>
>> Signed-off-by: Jesse Zhang 
>> Suggested-by: Lijo Lazar 
>> ---
>>  drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 13 +++--
>>  1 file changed, 7 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> index a22eb6bbb05e..d671314c46c8 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
>> @@ -1880,17 +1880,18 @@ static int aldebaran_mode1_reset(struct
>> smu_context *smu)
>>
>>  static int aldebaran_mode2_reset(struct smu_context *smu)  {
>> - int ret = 0, index;
>> + int ret = 0;
>>   struct amdgpu_device *adev = smu->adev;
>>   int timeout = 10;
>>
>> - index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
>> - SMU_MSG_GfxDeviceDriverReset);
>> - if (index < 0 )
>> - return -EINVAL;
>>   mutex_lock(&smu->message_lock);
>>   if (smu->smc_fw_version >= 0x00441400) {
>> - ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, 
>> SMU_RESET_MODE_2);
>> + ret = smu_cmn_send_smc_msg_with_param(smu,
>> +SMU_MSG_GfxDeviceDriverReset,
> 
> PPSMC_MSG_GfxDriverReset is different from SMU_MSG_GfxDeviceDriverReset.
> Use PPSMC_MSG_GfxDriverReset here (for both patches).
> 
> Thanks,
> Lijo
> 
> [Kevin]:
> 
> There is no interface here to directly use PPSMC_MSG_XXX to send messages to 
> smu/pmfw in the swSMU driver,
> and it is not recommended to do so to maintain code consistency.
> 

Thanks, didn't notice earlier that smu_cmn_send_msg_without_waiting got
changed as well with this patch. This API is a direct interface.

Please note not to change anything else other than what is specifically
requested in review comment. The original comment was only to replace
index with PPSMC_MSG_GfxDriverReset. Please stick to that, otherwise it
will break the entire sequence.

Thanks,
Lijo

> Best Regards,
> Kevin
> 
>> + 
>> SMU_RESET_MODE_2, NULL);
>> + if (ret) {
>> + dev_err(smu->adev->dev, "Failed to mode2 reset!\n");
>> + goto out;
>> + }
>>   /* This is similar to FLR, wait till max FLR timeout */
>>   msleep(100);
>>   dev_dbg(smu->adev->dev, "restore config space...\n");


RE: [PATCH 2/2] drm/amd/pm: check specific index for aldebaran

2024-05-14 Thread Wang, Yang(Kevin)
[AMD Official Use Only - AMD Internal Distribution Only]

-Original Message-
From: amd-gfx  On Behalf Of Lazar, Lijo
Sent: Tuesday, May 14, 2024 2:07 PM
To: Zhang, Jesse(Jie) ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Koenig, Christian 
; Huang, Tim 
Subject: Re: [PATCH 2/2] drm/amd/pm: check specific index for aldebaran



On 5/14/2024 11:34 AM, Jesse Zhang wrote:
> To avoid warning problems, drop index and use PPSMC_MSG_GfxDriverReset
> instead of index for aldebaran.
>
> Signed-off-by: Jesse Zhang 
> Suggested-by: Lijo Lazar 
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 13 +++--
>  1 file changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index a22eb6bbb05e..d671314c46c8 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -1880,17 +1880,18 @@ static int aldebaran_mode1_reset(struct
> smu_context *smu)
>
>  static int aldebaran_mode2_reset(struct smu_context *smu)  {
> - int ret = 0, index;
> + int ret = 0;
>   struct amdgpu_device *adev = smu->adev;
>   int timeout = 10;
>
> - index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
> - SMU_MSG_GfxDeviceDriverReset);
> - if (index < 0 )
> - return -EINVAL;
>   mutex_lock(&smu->message_lock);
>   if (smu->smc_fw_version >= 0x00441400) {
> - ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, 
> SMU_RESET_MODE_2);
> + ret = smu_cmn_send_smc_msg_with_param(smu,
> +SMU_MSG_GfxDeviceDriverReset,

PPSMC_MSG_GfxDriverReset is different from SMU_MSG_GfxDeviceDriverReset.
Use PPSMC_MSG_GfxDriverReset here (for both patches).

Thanks,
Lijo

[Kevin]:

There is no interface here to directly use PPSMC_MSG_XXX to send messages to 
smu/pmfw in the swSMU driver,
and it is not recommended to do so to maintain code consistency.

Best Regards,
Kevin

> + 
> SMU_RESET_MODE_2, NULL);
> + if (ret) {
> + dev_err(smu->adev->dev, "Failed to mode2 reset!\n");
> + goto out;
> + }
>   /* This is similar to FLR, wait till max FLR timeout */
>   msleep(100);
>   dev_dbg(smu->adev->dev, "restore config space...\n");


[PATCH] drm/amdgpu: Check if NBIO funcs are NULL in amdgpu_device_baco_exit

2024-05-14 Thread Friedrich Vock
The special case for VM passthrough doesn't check adev->nbio.funcs
before dereferencing it. If GPUs that don't have an NBIO block are
passed through, this leads to a NULL pointer dereference on startup.

Signed-off-by: Friedrich Vock 

Fixes: 1bece222eab ("drm/amdgpu: Clear doorbell interrupt status for Sienna 
Cichlid")
Cc: Alex Deucher 
Cc: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 861ccff78af95..83c4533ee75c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6165,7 +6165,7 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
adev->nbio.funcs->enable_doorbell_interrupt)
adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

-   if (amdgpu_passthrough(adev) &&
+   if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
adev->nbio.funcs->clear_doorbell_interrupt)
adev->nbio.funcs->clear_doorbell_interrupt(adev);

--
2.45.0



[PATCH 2/2 v2] drm/amd/pm: check specific index for aldebaran

2024-05-14 Thread Jesse Zhang
To avoid warning problems, drop index and
use PPSMC_MSG_GfxDriverReset instead of index for aldebaran.

Signed-off-by: Jesse Zhang 
Suggested-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index a22eb6bbb05e..d671314c46c8 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1880,17 +1880,18 @@ static int aldebaran_mode1_reset(struct smu_context 
*smu)
 
 static int aldebaran_mode2_reset(struct smu_context *smu)
 {
-   int ret = 0, index;
+   int ret = 0;
struct amdgpu_device *adev = smu->adev;
int timeout = 10;
 
-   index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
-   SMU_MSG_GfxDeviceDriverReset);
-   if (index < 0 )
-   return -EINVAL;
	mutex_lock(&smu->message_lock);
if (smu->smc_fw_version >= 0x00441400) {
-   ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, 
SMU_RESET_MODE_2);
+   ret = smu_cmn_send_smc_msg_with_param(smu, 
PPSMC_MSG_GfxDriverReset,
+   
SMU_RESET_MODE_2, NULL);
+   if (ret) {
+   dev_err(smu->adev->dev, "Failed to mode2 reset!\n");
+   goto out;
+   }
/* This is similar to FLR, wait till max FLR timeout */
msleep(100);
dev_dbg(smu->adev->dev, "restore config space...\n");
-- 
2.25.1



Re: [PATCH] drm/amdgpu/pm: Drop hard-code value of usTMax

2024-05-14 Thread Lazar, Lijo



On 5/14/2024 9:43 AM, Ma Jun wrote:
> Drop hard-code value of nsTmax because we read this
> value from fantable below.
> 
> Signed-off-by: Ma Jun 

Reviewed-by: Lijo Lazar 

Thanks,
Lijo

> ---
>  drivers/gpu/drm/amd/pm/powerplay/hwmgr/process_pptables_v1_0.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/process_pptables_v1_0.c 
> b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/process_pptables_v1_0.c
> index 17882f8dfdd3..6cfef1b295ab 100644
> --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/process_pptables_v1_0.c
> +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/process_pptables_v1_0.c
> @@ -977,8 +977,6 @@ static int init_thermal_controller(
>   = le16_to_cpu(tonga_fan_table->usPWMMed);
>   hwmgr->thermal_controller.advanceFanControlParameters.usPWMHigh
>   = le16_to_cpu(tonga_fan_table->usPWMHigh);
> - hwmgr->thermal_controller.advanceFanControlParameters.usTMax
> - = 10900;  /* hard coded */
>   hwmgr->thermal_controller.advanceFanControlParameters.usTMax
>   = le16_to_cpu(tonga_fan_table->usTMax);
>   
> hwmgr->thermal_controller.advanceFanControlParameters.ucFanControlMode


Re: [PATCH v2] drm/amdgpu: Fix the null pointer dereference to ras_manager

2024-05-14 Thread Lazar, Lijo



On 5/14/2024 9:42 AM, Ma Jun wrote:
> Check ras_manager before using it
> 
> Signed-off-by: Ma Jun 

Reviewed-by: Lijo Lazar 

Thanks,
Lijo

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 925ec65ac5ed..2bcf5c3b5d70 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2172,12 +2172,15 @@ static void 
> amdgpu_ras_interrupt_process_handler(struct work_struct *work)
>  int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
>   struct ras_dispatch_if *info)
>  {
> - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
> - struct ras_ih_data *data = &obj->ih_data;
> + struct ras_manager *obj;
> + struct ras_ih_data *data;
>  
> + obj = amdgpu_ras_find_obj(adev, &info->head);
>   if (!obj)
>   return -EINVAL;
>  
> + data = &obj->ih_data;
> +
>   if (data->inuse == 0)
>   return 0;
>  


[no subject]

2024-05-14 Thread Jesse Zhang
>From 3348a4bb465834b165de80dc42d11630ac5c6a83 Mon Sep 17 00:00:00 2001
From: Jesse Zhang 
Date: Tue, 14 May 2024 13:59:18 +0800
Subject: [PATCH 2/2 v2] drm/amd/pm: check specific index for aldebaran

To avoid warning problems, drop index and
use PPSMC_MSG_GfxDriverReset instead of index for aldebaran.

Signed-off-by: Jesse Zhang 
Suggested-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index a22eb6bbb05e..d671314c46c8 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1880,17 +1880,18 @@ static int aldebaran_mode1_reset(struct smu_context 
*smu)
 
 static int aldebaran_mode2_reset(struct smu_context *smu)
 {
-   int ret = 0, index;
+   int ret = 0;
struct amdgpu_device *adev = smu->adev;
int timeout = 10;
 
-   index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
-   SMU_MSG_GfxDeviceDriverReset);
-   if (index < 0 )
-   return -EINVAL;
	mutex_lock(&smu->message_lock);
if (smu->smc_fw_version >= 0x00441400) {
-   ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, 
SMU_RESET_MODE_2);
+   ret = smu_cmn_send_smc_msg_with_param(smu, 
PPSMC_MSG_GfxDriverReset,
+   
SMU_RESET_MODE_2, NULL);
+   if (ret) {
+   dev_err(smu->adev->dev, "Failed to mode2 reset!\n");
+   goto out;
+   }
/* This is similar to FLR, wait till max FLR timeout */
msleep(100);
dev_dbg(smu->adev->dev, "restore config space...\n");
-- 
2.25.1



[PATCH 1/2 v2] drm/amd/pm: check specific index for smu13

2024-05-14 Thread Jesse Zhang
To avoid warning problems, drop index and 
use PPSMC_MSG_GfxDriverReset instead of index for smu13.

Signed-off-by: Jesse Zhang 
Suggested-by: Lijo Lazar 
---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 46ab70a244af..27ec95a4e81d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2330,20 +2330,15 @@ static void smu_v13_0_6_restore_pci_config(struct 
smu_context *smu)
 
 static int smu_v13_0_6_mode2_reset(struct smu_context *smu)
 {
-   int ret = 0, index;
+   int ret = 0;
struct amdgpu_device *adev = smu->adev;
int timeout = 10;
 
-   index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
-  SMU_MSG_GfxDeviceDriverReset);
-   if (index < 0)
-   return index;
-
	mutex_lock(&smu->message_lock);
-
-   ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
-  SMU_RESET_MODE_2);
-
+   ret = smu_cmn_send_smc_msg_with_param(smu, PPSMC_MSG_GfxDriverReset,
+   SMU_RESET_MODE_2, NULL);
+   if (ret)
+   goto out;
/* Reset takes a bit longer, wait for 200ms. */
msleep(200);
 
-- 
2.25.1



RE: [PATCH 2/2] drm/amd/pm: Use gpu_metrics_v1_6 for SMUv13.0.6

2024-05-14 Thread Zhang, Hawking
[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Kamal, Asad 
Sent: Tuesday, May 14, 2024 14:23
To: amd-gfx@lists.freedesktop.org
Cc: Lazar, Lijo ; Zhang, Hawking ; 
Ma, Le ; Zhang, Morris ; Kamal, Asad 
; Cheung, Donald ; Khatir, Sepehr 
; Oliveira, Daniel ; Poag, 
Charis ; Liu, Shuzhou (Bill) 
Subject: [PATCH 2/2] drm/amd/pm: Use gpu_metrics_v1_6 for SMUv13.0.6

Use gpu_metrics_v1_6 for SMUv13.0.6 to fill gpu metric info

Signed-off-by: Asad Kamal 
Reviewed-by: Lijo Lazar 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 46ab70a244af..70e5589f6229 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -350,7 +350,7 @@ static int smu_v13_0_6_tables_init(struct smu_context *smu)
return -ENOMEM;
smu_table->metrics_time = 0;

-   smu_table->gpu_metrics_table_size = sizeof(struct gpu_metrics_v1_5);
+   smu_table->gpu_metrics_table_size = sizeof(struct gpu_metrics_v1_6);
smu_table->gpu_metrics_table =
kzalloc(smu_table->gpu_metrics_table_size, GFP_KERNEL);
if (!smu_table->gpu_metrics_table) {
@@ -2176,8 +2176,8 @@ static int smu_v13_0_6_get_current_pcie_link_speed(struct 
smu_context *smu)  static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
smu_context *smu, void **table)  {
	struct smu_table_context *smu_table = &smu->smu_table;
-   struct gpu_metrics_v1_5 *gpu_metrics =
-   (struct gpu_metrics_v1_5 *)smu_table->gpu_metrics_table;
+   struct gpu_metrics_v1_6 *gpu_metrics =
+   (struct gpu_metrics_v1_6 *)smu_table->gpu_metrics_table;
struct amdgpu_device *adev = smu->adev;
int ret = 0, xcc_id, inst, i, j;
MetricsTableX_t *metrics_x;
@@ -2193,7 +2193,7 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
smu_context *smu, void **table

metrics_a = (MetricsTableA_t *)metrics_x;

-   smu_cmn_init_soft_gpu_metrics(gpu_metrics, 1, 5);
+   smu_cmn_init_soft_gpu_metrics(gpu_metrics, 1, 6);

gpu_metrics->temperature_hotspot =
SMUQ10_ROUND(GET_METRIC_FIELD(MaxSocketTemperature));
@@ -2235,6 +2235,16 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
smu_context *smu, void **table

gpu_metrics->current_uclk = 
SMUQ10_ROUND(GET_METRIC_FIELD(UclkFrequency));

+   /* Total accumulated cycle counter */
+   gpu_metrics->accumulation_counter =
+GET_METRIC_FIELD(AccumulationCounter);
+
+   /* Accumulated throttler residencies */
+   gpu_metrics->prochot_residency_acc = 
GET_METRIC_FIELD(ProchotResidencyAcc);
+   gpu_metrics->ppt_residency_acc = GET_METRIC_FIELD(PptResidencyAcc);
+   gpu_metrics->socket_thm_residency_acc = 
GET_METRIC_FIELD(SocketThmResidencyAcc);
+   gpu_metrics->vr_thm_residency_acc = GET_METRIC_FIELD(VrThmResidencyAcc);
+   gpu_metrics->hbm_thm_residency_acc =
+GET_METRIC_FIELD(HbmThmResidencyAcc);
+
/* Throttle status is not reported through metrics now */
gpu_metrics->throttle_status = 0;

--
2.42.0



[PATCH 2/2] drm/amd/pm: Use gpu_metrics_v1_6 for SMUv13.0.6

2024-05-14 Thread Asad Kamal
Use gpu_metrics_v1_6 for SMUv13.0.6 to fill gpu metric info

Signed-off-by: Asad Kamal 
Reviewed-by: Lijo Lazar 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 46ab70a244af..70e5589f6229 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -350,7 +350,7 @@ static int smu_v13_0_6_tables_init(struct smu_context *smu)
return -ENOMEM;
smu_table->metrics_time = 0;
 
-   smu_table->gpu_metrics_table_size = sizeof(struct gpu_metrics_v1_5);
+   smu_table->gpu_metrics_table_size = sizeof(struct gpu_metrics_v1_6);
smu_table->gpu_metrics_table =
kzalloc(smu_table->gpu_metrics_table_size, GFP_KERNEL);
if (!smu_table->gpu_metrics_table) {
@@ -2176,8 +2176,8 @@ static int smu_v13_0_6_get_current_pcie_link_speed(struct 
smu_context *smu)
 static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void 
**table)
 {
	struct smu_table_context *smu_table = &smu->smu_table;
-   struct gpu_metrics_v1_5 *gpu_metrics =
-   (struct gpu_metrics_v1_5 *)smu_table->gpu_metrics_table;
+   struct gpu_metrics_v1_6 *gpu_metrics =
+   (struct gpu_metrics_v1_6 *)smu_table->gpu_metrics_table;
struct amdgpu_device *adev = smu->adev;
int ret = 0, xcc_id, inst, i, j;
MetricsTableX_t *metrics_x;
@@ -2193,7 +2193,7 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
smu_context *smu, void **table
 
metrics_a = (MetricsTableA_t *)metrics_x;
 
-   smu_cmn_init_soft_gpu_metrics(gpu_metrics, 1, 5);
+   smu_cmn_init_soft_gpu_metrics(gpu_metrics, 1, 6);
 
gpu_metrics->temperature_hotspot =
SMUQ10_ROUND(GET_METRIC_FIELD(MaxSocketTemperature));
@@ -2235,6 +2235,16 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
smu_context *smu, void **table
 
gpu_metrics->current_uclk = 
SMUQ10_ROUND(GET_METRIC_FIELD(UclkFrequency));
 
+   /* Total accumulated cycle counter */
+   gpu_metrics->accumulation_counter = 
GET_METRIC_FIELD(AccumulationCounter);
+
+   /* Accumulated throttler residencies */
+   gpu_metrics->prochot_residency_acc = 
GET_METRIC_FIELD(ProchotResidencyAcc);
+   gpu_metrics->ppt_residency_acc = GET_METRIC_FIELD(PptResidencyAcc);
+   gpu_metrics->socket_thm_residency_acc = 
GET_METRIC_FIELD(SocketThmResidencyAcc);
+   gpu_metrics->vr_thm_residency_acc = GET_METRIC_FIELD(VrThmResidencyAcc);
+   gpu_metrics->hbm_thm_residency_acc = 
GET_METRIC_FIELD(HbmThmResidencyAcc);
+
/* Throttle status is not reported through metrics now */
gpu_metrics->throttle_status = 0;
 
-- 
2.42.0



[PATCH 1/2] drm/amd/pm: Add gpu_metrics_v1_6

2024-05-14 Thread Asad Kamal
Add new gpu_metrics_v1_6 to acquire accumulated
throttler residencies

Signed-off-by: Asad Kamal 
Reviewed-by: Lijo Lazar 
---
 .../gpu/drm/amd/include/kgd_pp_interface.h| 89 +++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c|  3 +
 2 files changed, 92 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h 
b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
index 805c9d37a2b4..a0955cfe41ce 100644
--- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
@@ -854,6 +854,95 @@ struct gpu_metrics_v1_5 {
uint16_tpadding;
 };
 
+struct gpu_metrics_v1_6 {
+   struct metrics_table_header common_header;
+
+   /* Temperature (Celsius) */
+   uint16_ttemperature_hotspot;
+   uint16_ttemperature_mem;
+   uint16_ttemperature_vrsoc;
+
+   /* Power (Watts) */
+   uint16_tcurr_socket_power;
+
+   /* Utilization (%) */
+   uint16_taverage_gfx_activity;
+   uint16_taverage_umc_activity; // memory 
controller
+   uint16_tvcn_activity[NUM_VCN];
+   uint16_tjpeg_activity[NUM_JPEG_ENG];
+
+   /* Energy (15.259uJ (2^-16) units) */
+   uint64_tenergy_accumulator;
+
+   /* Driver attached timestamp (in ns) */
+   uint64_tsystem_clock_counter;
+
+   /* Accumulation cycle counter */
+   uint32_taccumulation_counter;
+
+   /* Accumulated throttler residencies */
+   uint32_tprochot_residency_acc;
+   uint32_tppt_residency_acc;
+   uint32_tsocket_thm_residency_acc;
+   uint32_tvr_thm_residency_acc;
+   uint32_thbm_thm_residency_acc;
+
+   /* Throttle status */
+   uint32_tthrottle_status;
+
+   /* Clock Lock Status. Each bit corresponds to clock instance */
+   uint32_tgfxclk_lock_status;
+
+   /* Link width (number of lanes) and speed (in 0.1 GT/s) */
+   uint16_tpcie_link_width;
+   uint16_tpcie_link_speed;
+
+   /* XGMI bus width and bitrate (in Gbps) */
+   uint16_txgmi_link_width;
+   uint16_txgmi_link_speed;
+
+   /* Utilization Accumulated (%) */
+   uint32_tgfx_activity_acc;
+   uint32_tmem_activity_acc;
+
+   /*PCIE accumulated bandwidth (Mbps) */
+   uint64_tpcie_bandwidth_acc;
+
+   /*PCIE instantaneous bandwidth (Mbps) */
+   uint64_tpcie_bandwidth_inst;
+
+   /* PCIE L0 to recovery state transition accumulated count */
+   uint64_tpcie_l0_to_recov_count_acc;
+
+   /* PCIE replay accumulated count */
+   uint64_tpcie_replay_count_acc;
+
+   /* PCIE replay rollover accumulated count */
+   uint64_tpcie_replay_rover_count_acc;
+
+   /* PCIE NAK sent  accumulated count */
+   uint32_tpcie_nak_sent_count_acc;
+
+   /* PCIE NAK received accumulated count */
+   uint32_tpcie_nak_rcvd_count_acc;
+
+   /* XGMI accumulated data transfer size(KiloBytes) */
+   uint64_txgmi_read_data_acc[NUM_XGMI_LINKS];
+   uint64_txgmi_write_data_acc[NUM_XGMI_LINKS];
+
+   /* PMFW attached timestamp (10ns resolution) */
+   uint64_tfirmware_timestamp;
+
+   /* Current clocks (Mhz) */
+   uint16_tcurrent_gfxclk[MAX_GFX_CLKS];
+   uint16_tcurrent_socclk[MAX_CLKS];
+   uint16_tcurrent_vclk0[MAX_CLKS];
+   uint16_tcurrent_dclk0[MAX_CLKS];
+   uint16_tcurrent_uclk;
+
+   uint16_tpadding;
+};
+
 /*
  * gpu_metrics_v2_0 is not recommended as it's not naturally aligned.
  * Use gpu_metrics_v2_1 or later instead.
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 602aa6941231..26d44a4370d2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -1052,6 +1052,9 @@ void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t 
frev, uint8_t crev)
case METRICS_VERSION(1, 5):
structure_size = sizeof(struct gpu_metrics_v1_5);
break;
+   case METRICS_VERSION(1, 6):
+   structure_size = sizeof(struct 

Re: [PATCH 2/2] drm/amd/pm: check specific index for aldebaran

2024-05-14 Thread Lazar, Lijo



On 5/14/2024 11:34 AM, Jesse Zhang wrote:
> To avoid warning problems, drop index and
> use PPSMC_MSG_GfxDriverReset instead of index for aldebaran.
> 
> Signed-off-by: Jesse Zhang 
> Suggested-by: Lijo Lazar 
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 13 +++--
>  1 file changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index a22eb6bbb05e..d671314c46c8 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -1880,17 +1880,18 @@ static int aldebaran_mode1_reset(struct smu_context 
> *smu)
>  
>  static int aldebaran_mode2_reset(struct smu_context *smu)
>  {
> - int ret = 0, index;
> + int ret = 0;
>   struct amdgpu_device *adev = smu->adev;
>   int timeout = 10;
>  
> - index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
> - SMU_MSG_GfxDeviceDriverReset);
> - if (index < 0 )
> - return -EINVAL;
>   mutex_lock(&smu->message_lock);
>   if (smu->smc_fw_version >= 0x00441400) {
> - ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, 
> SMU_RESET_MODE_2);
> + ret = smu_cmn_send_smc_msg_with_param(smu, 
> SMU_MSG_GfxDeviceDriverReset,

PPSMC_MSG_GfxDriverReset is different from SMU_MSG_GfxDeviceDriverReset.
Use PPSMC_MSG_GfxDriverReset here (for both patches).

Thanks,
Lijo

> + 
> SMU_RESET_MODE_2, NULL);
> + if (ret) {
> + dev_err(smu->adev->dev, "Failed to mode2 reset!\n");
> + goto out;
> + }
>   /* This is similar to FLR, wait till max FLR timeout */
>   msleep(100);
>   dev_dbg(smu->adev->dev, "restore config space...\n");


[PATCH 2/2] drm/amd/pm: check specific index for aldebaran

2024-05-14 Thread Jesse Zhang
To avoid warning problems, drop index and
use PPSMC_MSG_GfxDriverReset instead of index for aldebaran.

Signed-off-by: Jesse Zhang 
Suggested-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index a22eb6bbb05e..d671314c46c8 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1880,17 +1880,18 @@ static int aldebaran_mode1_reset(struct smu_context 
*smu)
 
 static int aldebaran_mode2_reset(struct smu_context *smu)
 {
-   int ret = 0, index;
+   int ret = 0;
struct amdgpu_device *adev = smu->adev;
int timeout = 10;
 
-   index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
-   SMU_MSG_GfxDeviceDriverReset);
-   if (index < 0 )
-   return -EINVAL;
mutex_lock(>message_lock);
if (smu->smc_fw_version >= 0x00441400) {
-   ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, 
SMU_RESET_MODE_2);
+   ret = smu_cmn_send_smc_msg_with_param(smu, 
SMU_MSG_GfxDeviceDriverReset,
+   
SMU_RESET_MODE_2, NULL);
+   if (ret) {
+   dev_err(smu->adev->dev, "Failed to mode2 reset!\n");
+   goto out;
+   }
/* This is similar to FLR, wait till max FLR timeout */
msleep(100);
dev_dbg(smu->adev->dev, "restore config space...\n");
-- 
2.25.1



[PATCH 1/2] drm/amd/pm: check specific index for smu13

2024-05-14 Thread Jesse Zhang
To avoid compiler warnings, drop the index lookup and 
use PPSMC_MSG_GfxDriverReset instead of the index for smu13.

Signed-off-by: Jesse Zhang 
Suggested-by: Lijo Lazar 
---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 46ab70a244af..27ec95a4e81d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2330,20 +2330,15 @@ static void smu_v13_0_6_restore_pci_config(struct 
smu_context *smu)
 
 static int smu_v13_0_6_mode2_reset(struct smu_context *smu)
 {
-   int ret = 0, index;
+   int ret = 0;
struct amdgpu_device *adev = smu->adev;
int timeout = 10;
 
-   index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
-  SMU_MSG_GfxDeviceDriverReset);
-   if (index < 0)
-   return index;
-
mutex_lock(>message_lock);
-
-   ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
-  SMU_RESET_MODE_2);
-
+   ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset,
+   SMU_RESET_MODE_2, NULL);
+   if (ret)
+   goto out;
/* Reset takes a bit longer, wait for 200ms. */
msleep(200);
 
-- 
2.25.1