RE: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-02 Thread Zhou1, Tao
[AMD Official Use Only - General]

The series is:

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Hawking Zhang 
> Sent: Tuesday, January 2, 2024 10:16 PM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang,
> Stanley ; Wang, Yang(Kevin)
> ; Chai, Thomas ; Li,
> Candice 
> Cc: Zhang, Hawking ; Deucher, Alexander
> ; Lazar, Lijo ; Ma, Le
> 
> Subject: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to
> amdgpu_ras_check_supported
>
> Move ras capability check to amdgpu_ras_check_supported.
> Driver will query ras capability through psp interface, or vbios interface, or 
> specific
> ip callbacks.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
>  1 file changed, 93 insertions(+), 77 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index a901b00d4949..2ee82baaf7d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -39,6 +39,7 @@
>  #include "nbio_v7_9.h"
>  #include "atom.h"
>  #include "amdgpu_reset.h"
> +#include "amdgpu_psp.h"
>
>  #ifdef CONFIG_X86_MCE_AMD
>  #include 
> @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct
> amdgpu_device *adev)
>   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);  }
>
> +/* Query ras capablity via atomfirmware interface */ static void
> +amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) {
> + /* mem_ecc cap */
> + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
> + dev_info(adev->dev, "MEM ECC is active.\n");
> + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
> +  1 << AMDGPU_RAS_BLOCK__DF);
> + } else {
> + dev_info(adev->dev, "MEM ECC is not presented.\n");
> + }
> +
> + /* sram_ecc cap */
> + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
> + dev_info(adev->dev, "SRAM ECC is active.\n");
> + if (!amdgpu_sriov_vf(adev))
> + adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> +   1 <<
> AMDGPU_RAS_BLOCK__DF);
> + else
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__PCIE_BIF |
> +  1 <<
> AMDGPU_RAS_BLOCK__SDMA |
> +  1 <<
> AMDGPU_RAS_BLOCK__GFX);
> +
> + /*
> +  * VCN/JPEG RAS can be supported on both bare metal and
> +  * SRIOV environment
> +  */
> + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6,
> 0) ||
> + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 0) ||
> + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 3))
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +  1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> + else
> + adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +   1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +
> + /*
> +  * XGMI RAS is not supported if xgmi num physical nodes
> +  * is zero
> +  */
> + if (!adev->gmc.xgmi.num_physical_nodes)
> + adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__XGMI_WAFL);
> + } else {
> + dev_info(adev->dev, "SRAM ECC is not presented.\n");
> + }
> +}
> +
> +/* Query poison mode from umc/df IP callbacks */ static void
> +amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + bool df_poison, umc_poison;
> +
> + /* poison setting is useless on SRIOV guest */
> + if (amdgpu_sriov_vf(adev) || !con)
> + return;
> +
> + /* Init poison supported flag, the default value is false */
> + if (adev->gmc.xgmi.connected_to_cpu ||
> + adev->gmc.is_app_apu) {
> + /* enabled by default when GPU is connected to CPU */
> + con->poison_supported = true;
> + } else if (adev->df.funcs &&
> + adev->df.funcs->query_ras_poison_mode &&
> + adev->umc.ras &&
> + adev->umc.ras->query_ras_poison_mode) {
> + df_poison =
> + adev->df.funcs->query_ras_poison_mode(adev);
> + umc_poison =
> + adev->umc.ras->query_ras_poison_mode(adev);
> +
> + /* Only poison is set in both DF and UMC, we can support it */
> + if (df_poison && umc_poison)
> + con->poison_supported = true;
> + else if (df_poison != umc_poison)
> + dev_warn(adev->dev,
> + "Poison setting is inconsistent in
> DF/UMC(%d:%d)!\n",
> + 

RE: [PATCH 2/5] drm/amdgpu: Init pcie_index/data address as fallback

2024-01-02 Thread Zhang, Hawking
[AMD Official Use Only - General]

Sure, will do in v2

Regards,
Hawking

-Original Message-
From: Alex Deucher 
Sent: Wednesday, January 3, 2024 03:44
To: Zhang, Hawking 
Cc: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, 
Stanley ; Wang, Yang(Kevin) ; 
Chai, Thomas ; Li, Candice ; Deucher, 
Alexander ; Ma, Le ; Lazar, Lijo 

Subject: Re: [PATCH 2/5] drm/amdgpu: Init pcie_index/data address as fallback

On Mon, Jan 1, 2024 at 10:50 PM Hawking Zhang  wrote:
>
> To allow using this helper for indirect access when nbio funcs is not
> available. For instance, in ip discovery phase.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++-
>  1 file changed, 15 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 001a35fa0f19..873419a5b9aa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -781,12 +781,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct 
> amdgpu_device *adev,
> void __iomem *pcie_index_hi_offset;
> void __iomem *pcie_data_offset;
>
> -   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
> -   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
> -   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
> -   pcie_index_hi = 
> adev->nbio.funcs->get_pcie_index_hi_offset(adev);
> -   else
> +   if (unlikely(!adev->nbio.funcs)) {
> +   pcie_index = (0x38 >> 2);
> +   pcie_data = (0x3C >> 2);
> +   } else {
> +   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
> +   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
> +   }
> +
> +   if (reg_addr >> 32) {
> +   if (unlikely(!adev->nbio.funcs))
> +   pcie_index_hi = (0x44 >> 2);

I'd still use a define for these, E.g.,

#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) etc.
Or something similar.

Alex

> +   else
> +   pcie_index_hi = 
> adev->nbio.funcs->get_pcie_index_hi_offset(adev);
> +   } else {
> pcie_index_hi = 0;
> +   }
>
> spin_lock_irqsave(>pcie_idx_lock, flags);
> pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index *
> 4;
> --
> 2.17.1
>


RE: [PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-02 Thread Zhang, Hawking
[AMD Official Use Only - General]

Yes, it is.

Regards,
Hawking

From: Deucher, Alexander 
Sent: Wednesday, January 3, 2024 03:45
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; 
Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Chai, Thomas ; Li, 
Candice 
Cc: Lazar, Lijo ; Ma, Le 
Subject: Re: [PATCH 4/5] drm/amdgpu: Query boot status if discovery failed


[AMD Official Use Only - General]

Is mmIP_DISCOVERY_VERSION at the same offset across ASIC families?

Alex


From: Hawking Zhang mailto:hawking.zh...@amd.com>>
Sent: Monday, January 1, 2024 10:43 PM
To: amd-gfx@lists.freedesktop.org 
mailto:amd-gfx@lists.freedesktop.org>>; Zhou1, 
Tao mailto:tao.zh...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Chai, Thomas 
mailto:yipeng.c...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>
Cc: Zhang, Hawking mailto:hawking.zh...@amd.com>>; 
Deucher, Alexander 
mailto:alexander.deuc...@amd.com>>; Lazar, Lijo 
mailto:lijo.la...@amd.com>>; Ma, Le 
mailto:le...@amd.com>>
Subject: [PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
mailto:hawking.zh...@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"

 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);

+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE0xde3
 #define mmMP0_SMN_C2PMSG_33 0x16061
 #define mmMM_INDEX  0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
 kfree(adev->mman.discovery_bin);
 adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
 return r;
 }

--
2.17.1


RE: [PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-02 Thread Zhang, Hawking
[AMD Official Use Only - General]

RE - I'm not sure about hard-coding 4 instances here. The code you dropped in 
patch 1 was using adev->aid_mask. But I guess that's not even initialized 
correctly if IP discovery failed. Will this work correctly on the APU version?

Yes aid_mask is not initialized. IP_DISCOVERY_VERSION is the only available 
fuse setting that can be used to identify or equivalent to 4 instances of aid 
in such case. We switched to a common mailbox reg that works for both APU and 
dGPU. The expectation is for APU, driver still reports fw boot status, while it 
gives next level information on the failures if boot fails on dGPU.

Regards,
Hawking

-Original Message-
From: Kuehling, Felix 
Sent: Wednesday, January 3, 2024 01:49
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; 
Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Chai, Thomas ; Li, 
Candice 
Cc: Deucher, Alexander ; Ma, Le ; 
Lazar, Lijo 
Subject: Re: [PATCH 4/5] drm/amdgpu: Query boot status if discovery failed


On 2024-01-02 09:07, Hawking Zhang wrote:
> Check and report boot status if discovery failed.
>
> Signed-off-by: Hawking Zhang 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
>   1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> index b8fde08aec8e..302b71e9f1e2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_discovery.h"
>   #include "soc15_hw_ip.h"
>   #include "discovery.h"
> +#include "amdgpu_ras.h"
>
>   #include "soc15.h"
>   #include "gfx_v9_0.h"
> @@ -98,6 +99,7 @@
>   #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
>   MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
>
> +#define mmIP_DISCOVERY_VERSION  0x16A00
>   #define mmRCC_CONFIG_MEMSIZE0xde3
>   #define mmMP0_SMN_C2PMSG_33 0x16061
>   #define mmMM_INDEX  0x0
> @@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device 
> *adev)
>   out:
>   kfree(adev->mman.discovery_bin);
>   adev->mman.discovery_bin = NULL;
> -
> + if ((amdgpu_discovery != 2) &&
> + (RREG32(mmIP_DISCOVERY_VERSION) == 4))
> + amdgpu_ras_query_boot_status(adev, 4);
I'm not sure about hard-coding 4 instances here. The code you dropped in patch 
1 was using adev->aid_mask. But I guess that's not even initialized correctly 
if IP discovery failed. Will this work correctly on the APU version?

Regards,
   Felix


>   return r;
>   }
>


Re: [PATCH v2] drm/amdkfd: Fix lock dependency warning

2024-01-02 Thread Philip Yang

  


On 2024-01-02 15:08, Felix Kuehling
  wrote:


  ==
WARNING: possible circular locking dependency detected
6.5.0-kfd-fkuehlin #276 Not tainted
--
kworker/8:2/2676 is trying to acquire lock:
9435aae95c88 ((work_completion)(_bo->eviction_work)){+.+.}-{0:0}, at: __flush_work+0x52/0x550

but task is already holding lock:
9435cd8e1720 (>lock){+.+.}-{3:3}, at: svm_range_deferred_list_work+0xe8/0x340 [amdgpu]

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (>lock){+.+.}-{3:3}:
   __mutex_lock+0x97/0xd30
   kfd_ioctl_alloc_memory_of_gpu+0x6d/0x3c0 [amdgpu]
   kfd_ioctl+0x1b2/0x5d0 [amdgpu]
   __x64_sys_ioctl+0x86/0xc0
   do_syscall_64+0x39/0x80
   entry_SYSCALL_64_after_hwframe+0x63/0xcd

-> #1 (>mmap_lock){}-{3:3}:
   down_read+0x42/0x160
   svm_range_evict_svm_bo_worker+0x8b/0x340 [amdgpu]
   process_one_work+0x27a/0x540
   worker_thread+0x53/0x3e0
   kthread+0xeb/0x120
   ret_from_fork+0x31/0x50
   ret_from_fork_asm+0x11/0x20

-> #0 ((work_completion)(_bo->eviction_work)){+.+.}-{0:0}:
   __lock_acquire+0x1426/0x2200
   lock_acquire+0xc1/0x2b0
   __flush_work+0x80/0x550
   __cancel_work_timer+0x109/0x190
   svm_range_bo_release+0xdc/0x1c0 [amdgpu]
   svm_range_free+0x175/0x180 [amdgpu]
   svm_range_deferred_list_work+0x15d/0x340 [amdgpu]
   process_one_work+0x27a/0x540
   worker_thread+0x53/0x3e0
   kthread+0xeb/0x120
   ret_from_fork+0x31/0x50
   ret_from_fork_asm+0x11/0x20

other info that might help us debug this:

Chain exists of:
  (work_completion)(&svm_bo->eviction_work) --> &mm->mmap_lock --> &prange->lock

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&prange->lock);
                               lock(&mm->mmap_lock);
                               lock(&prange->lock);
  lock((work_completion)(&svm_bo->eviction_work));

I believe this cannot really lead to a deadlock in practice, because
svm_range_evict_svm_bo_worker only takes the mmap_read_lock if the BO
refcount is non-0. That means it's impossible that svm_range_bo_release
is running concurrently. However, there is no good way to annotate this.

To avoid the problem, take a BO reference in
svm_range_schedule_evict_svm_bo instead of in the worker. That way it's
impossible for a BO to get freed while eviction work is pending and the
cancel_work_sync call in svm_range_bo_release can be eliminated.

v2: Use svm_bo_ref_unless_zero and explained why that's safe. Also
removed redundant checks that are already done in
amdkfd_fence_enable_signaling.

Signed-off-by: Felix Kuehling 

Reviewed-by: Philip Yang 

  
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 26 ++
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index afd98aace065..6b314e4d3ae0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -404,14 +404,9 @@ static void svm_range_bo_release(struct kref *kref)
 		spin_lock(_bo->list_lock);
 	}
 	spin_unlock(_bo->list_lock);
-	if (!dma_fence_is_signaled(_bo->eviction_fence->base)) {
-		/* We're not in the eviction worker.
-		 * Signal the fence and synchronize with any
-		 * pending eviction work.
-		 */
+	if (!dma_fence_is_signaled(_bo->eviction_fence->base))
+		/* We're not in the eviction worker. Signal the fence. */
 		dma_fence_signal(_bo->eviction_fence->base);
-		cancel_work_sync(_bo->eviction_work);
-	}
 	dma_fence_put(_bo->eviction_fence->base);
 	amdgpu_bo_unref(_bo->bo);
 	kfree(svm_bo);
@@ -3437,13 +3432,14 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
 
 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
 {
-	if (!fence)
-		return -EINVAL;
-
-	if (dma_fence_is_signaled(>base))
-		return 0;
-
-	if (fence->svm_bo) {
+	/* Dereferencing fence->svm_bo is safe here because the fence hasn't
+	 * signaled yet and we're under the protection of the fence->lock.
+	 * After the fence is signaled in svm_range_bo_release, we cannot get
+	 * here any more.
+	 *
+	 * Reference is dropped in svm_range_evict_svm_bo_worker.
+	 */
+	if (svm_bo_ref_unless_zero(fence->svm_bo)) {
 		WRITE_ONCE(fence->svm_bo->evicting, 1);
 		schedule_work(>svm_bo->eviction_work);
 	}
@@ -3458,8 +3454,6 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
 	int r = 0;
 
 	svm_bo = container_of(work, struct svm_range_bo, eviction_work);
-	if (!svm_bo_ref_unless_zero(svm_bo))
-		return; /* svm_bo was freed while eviction was pending */
 
 	if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
 		mm = svm_bo->eviction_fence->mm;


  



Re: [PATCH 2/2] drm/amdgpu: skip gpu_info fw loading on navi12

2024-01-02 Thread Deucher, Alexander
[AMD Official Use Only - General]

Ping on this series?

Alex

From: Deucher, Alexander 
Sent: Thursday, December 21, 2023 1:11 PM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander 
Subject: [PATCH 2/2] drm/amdgpu: skip gpu_info fw loading on navi12

It's no longer required.

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2318
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9c1ff893c03c..71e8fe2144b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2251,15 +2251,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct 
amdgpu_device *adev)

 adev->firmware.gpu_info_fw = NULL;

-   if (adev->mman.discovery_bin) {
-   /*
-* FIXME: The bounding box is still needed by Navi12, so
-* temporarily read it from gpu_info firmware. Should be dropped
-* when DAL no longer needs it.
-*/
-   if (adev->asic_type != CHIP_NAVI12)
-   return 0;
-   }
+   if (adev->mman.discovery_bin)
+   return 0;

 switch (adev->asic_type) {
 default:
--
2.42.0



[PATCH] drm/amdgpu: make a correction on comment

2024-01-02 Thread James Zhu
AMDGPU_VM_RESERVED_VRAM has been updated to 8M; update the comment to match.

Signed-off-by: James Zhu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index b6cd565562ad..b788067b9158 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -116,7 +116,7 @@ struct amdgpu_mem_stats;
 #define AMDGPU_VM_FAULT_STOP_FIRST 1
 #define AMDGPU_VM_FAULT_STOP_ALWAYS2
 
-/* Reserve 4MB VRAM for page tables */
+/* Reserve 8MB VRAM for page tables */
 #define AMDGPU_VM_RESERVED_VRAM(8ULL << 20)
 
 /*
-- 
2.25.1



Re: [PATCH] drm/amdgpu: Remove unreachable code in 'atom_skip_src_int()'

2024-01-02 Thread Deucher, Alexander
[Public]

Reviewed-by: Alex Deucher 

From: SHANMUGAM, SRINIVASAN 
Sent: Friday, December 29, 2023 4:43 AM
To: Deucher, Alexander ; Koenig, Christian 
; Kuehling, Felix 
Cc: amd-gfx@lists.freedesktop.org ; SHANMUGAM, 
SRINIVASAN 
Subject: [PATCH] drm/amdgpu: Remove unreachable code in 'atom_skip_src_int()'

Fixes the below:
drivers/gpu/drm/amd/amdgpu/atom.c:398 atom_skip_src_int() warn: ignoring 
unreachable code.

Cc: Christian König 
Cc: Alex Deucher 
Signed-off-by: Srinivasan Shanmugam 
---
 drivers/gpu/drm/amd/amdgpu/atom.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c 
b/drivers/gpu/drm/amd/amdgpu/atom.c
index 2c221000782c..a33e890c70d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/atom.c
+++ b/drivers/gpu/drm/amd/amdgpu/atom.c
@@ -395,7 +395,6 @@ static void atom_skip_src_int(atom_exec_context *ctx, 
uint8_t attr, int *ptr)
 (*ptr)++;
 return;
 }
-   return;
 }
 }

--
2.34.1



Re: [PATCH v3 23/24] drm/amdkfd: set debug trap bit when enabling PC Sampling

2024-01-02 Thread James Zhu


On 2023-12-15 10:59, James Zhu wrote:

From: David Yat Sin

We need the SPI_GDBG_PER_VMID_CNTL.TRAP_EN bit to be set during PC
Sampling so that the TTMP registers are valid inside the sampling data.
runtime_info.ttmp_setup will be cleared when the user application
does the AMDKFD_IOC_RUNTIME_ENABLE ioctl without
KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK flag on exit.

It is also not valid to have the debugger attached to a process while PC
sampling is enabled so adding some checks to prevent this.

Signed-off-by: David Yat Sin
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 31 --
  drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 22 ++
  drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  3 ++
  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 43 +---
  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |  4 +-
  5 files changed, 75 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 1a3a8ded9c93..f7a8794c2bde 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1775,7 +1775,7 @@ static int kfd_ioctl_pc_sample(struct file *filep,
pr_debug("failed to bind process %p with gpu id 0x%x", p, 
args->gpu_id);
ret = -ESRCH;
} else {
-   ret = kfd_pc_sample(pdd, args);
+   ret = kfd_pc_sample(p, pdd, args);
}
}
mutex_unlock(>mutex);
@@ -2808,26 +2808,9 @@ static int runtime_enable(struct kfd_process *p, 
uint64_t r_debug,
  
  	p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

p->runtime_info.r_debug = r_debug;
-   p->runtime_info.ttmp_setup = enable_ttmp_setup;
  
-	if (p->runtime_info.ttmp_setup) {

-   for (i = 0; i < p->n_pdds; i++) {
-   struct kfd_process_device *pdd = p->pdds[i];
-
-   if (!kfd_dbg_is_rlc_restore_supported(pdd->dev)) {
-   amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
-   pdd->dev->kfd2kgd->enable_debug_trap(
-   pdd->dev->adev,
-   true,
-   
pdd->dev->vm_info.last_vmid_kfd);
-   } else if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
-   pdd->spi_dbg_override = 
pdd->dev->kfd2kgd->enable_debug_trap(
-   pdd->dev->adev,
-   false,
-   0);
-   }
-   }
-   }
+   if (enable_ttmp_setup)
+   kfd_dbg_enable_ttmp_setup(p);
  
  retry:

if (p->debug_trap_enabled) {
@@ -2976,9 +2959,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
goto out;
}
  
-	/* Check if target is still PTRACED. */

rcu_read_lock();
-   if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
+
+   if (kfd_pc_sampling_enabled(target)) {
+   pr_debug("Cannot enable debug trap on PID:%d because PC Sampling 
active\n", args->pid);
+   r = -EBUSY;
+   /* Check if target is still PTRACED. */
+   } else if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
&& ptrace_parent(target->lead_thread) != 
current) {
pr_err("PID %i is not PTRACED and cannot be debugged\n", 
args->pid);
r = -EPERM;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 9ec750666382..092c2dc84d24 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -1118,3 +1118,25 @@ void kfd_dbg_set_enabled_debug_exception_mask(struct 
kfd_process *target,
  
  	mutex_unlock(>event_mutex);

  }
+
+void kfd_dbg_enable_ttmp_setup(struct kfd_process *p)
+{
+   int i;
+   p->runtime_info.ttmp_setup = true;
+   for (i = 0; i < p->n_pdds; i++) {
+   struct kfd_process_device *pdd = p->pdds[i];
+
+   if (!kfd_dbg_is_rlc_restore_supported(pdd->dev)) {
+   amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+   pdd->dev->kfd2kgd->enable_debug_trap(
+   pdd->dev->adev,
+   true,
+   pdd->dev->vm_info.last_vmid_kfd);
+   } else if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
+   pdd->spi_dbg_override = 
pdd->dev->kfd2kgd->enable_debug_trap(
+   pdd->dev->adev,
+   false,
+   0);
+   }
+   }
+}
\ No newline at end of file

[PATCH v2] drm/amdkfd: Fix lock dependency warning

2024-01-02 Thread Felix Kuehling
==
WARNING: possible circular locking dependency detected
6.5.0-kfd-fkuehlin #276 Not tainted
--
kworker/8:2/2676 is trying to acquire lock:
9435aae95c88 ((work_completion)(_bo->eviction_work)){+.+.}-{0:0}, at: 
__flush_work+0x52/0x550

but task is already holding lock:
9435cd8e1720 (>lock){+.+.}-{3:3}, at: 
svm_range_deferred_list_work+0xe8/0x340 [amdgpu]

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (>lock){+.+.}-{3:3}:
   __mutex_lock+0x97/0xd30
   kfd_ioctl_alloc_memory_of_gpu+0x6d/0x3c0 [amdgpu]
   kfd_ioctl+0x1b2/0x5d0 [amdgpu]
   __x64_sys_ioctl+0x86/0xc0
   do_syscall_64+0x39/0x80
   entry_SYSCALL_64_after_hwframe+0x63/0xcd

-> #1 (>mmap_lock){}-{3:3}:
   down_read+0x42/0x160
   svm_range_evict_svm_bo_worker+0x8b/0x340 [amdgpu]
   process_one_work+0x27a/0x540
   worker_thread+0x53/0x3e0
   kthread+0xeb/0x120
   ret_from_fork+0x31/0x50
   ret_from_fork_asm+0x11/0x20

-> #0 ((work_completion)(_bo->eviction_work)){+.+.}-{0:0}:
   __lock_acquire+0x1426/0x2200
   lock_acquire+0xc1/0x2b0
   __flush_work+0x80/0x550
   __cancel_work_timer+0x109/0x190
   svm_range_bo_release+0xdc/0x1c0 [amdgpu]
   svm_range_free+0x175/0x180 [amdgpu]
   svm_range_deferred_list_work+0x15d/0x340 [amdgpu]
   process_one_work+0x27a/0x540
   worker_thread+0x53/0x3e0
   kthread+0xeb/0x120
   ret_from_fork+0x31/0x50
   ret_from_fork_asm+0x11/0x20

other info that might help us debug this:

Chain exists of:
  (work_completion)(&svm_bo->eviction_work) --> &mm->mmap_lock --> &prange->lock

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&prange->lock);
                               lock(&mm->mmap_lock);
                               lock(&prange->lock);
  lock((work_completion)(&svm_bo->eviction_work));

I believe this cannot really lead to a deadlock in practice, because
svm_range_evict_svm_bo_worker only takes the mmap_read_lock if the BO
refcount is non-0. That means it's impossible that svm_range_bo_release
is running concurrently. However, there is no good way to annotate this.

To avoid the problem, take a BO reference in
svm_range_schedule_evict_svm_bo instead of in the worker. That way it's
impossible for a BO to get freed while eviction work is pending and the
cancel_work_sync call in svm_range_bo_release can be eliminated.

v2: Use svm_bo_ref_unless_zero and explained why that's safe. Also
removed redundant checks that are already done in
amdkfd_fence_enable_signaling.

Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 26 ++
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index afd98aace065..6b314e4d3ae0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -404,14 +404,9 @@ static void svm_range_bo_release(struct kref *kref)
spin_lock(_bo->list_lock);
}
spin_unlock(_bo->list_lock);
-   if (!dma_fence_is_signaled(_bo->eviction_fence->base)) {
-   /* We're not in the eviction worker.
-* Signal the fence and synchronize with any
-* pending eviction work.
-*/
+   if (!dma_fence_is_signaled(_bo->eviction_fence->base))
+   /* We're not in the eviction worker. Signal the fence. */
dma_fence_signal(_bo->eviction_fence->base);
-   cancel_work_sync(_bo->eviction_work);
-   }
dma_fence_put(_bo->eviction_fence->base);
amdgpu_bo_unref(_bo->bo);
kfree(svm_bo);
@@ -3437,13 +3432,14 @@ svm_range_trigger_migration(struct mm_struct *mm, 
struct svm_range *prange,
 
 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
 {
-   if (!fence)
-   return -EINVAL;
-
-   if (dma_fence_is_signaled(>base))
-   return 0;
-
-   if (fence->svm_bo) {
+   /* Dereferencing fence->svm_bo is safe here because the fence hasn't
+* signaled yet and we're under the protection of the fence->lock.
+* After the fence is signaled in svm_range_bo_release, we cannot get
+* here any more.
+*
+* Reference is dropped in svm_range_evict_svm_bo_worker.
+*/
+   if (svm_bo_ref_unless_zero(fence->svm_bo)) {
WRITE_ONCE(fence->svm_bo->evicting, 1);
schedule_work(>svm_bo->eviction_work);
}
@@ -3458,8 +3454,6 @@ static void svm_range_evict_svm_bo_worker(struct 
work_struct *work)
int r = 0;
 
svm_bo = container_of(work, struct svm_range_bo, eviction_work);
-   if (!svm_bo_ref_unless_zero(svm_bo))
-   return; /* svm_bo was freed while eviction was pending */

Re: [PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-02 Thread Deucher, Alexander
[AMD Official Use Only - General]

Is mmIP_DISCOVERY_VERSION at the same offset across ASIC families?

Alex


From: Hawking Zhang 
Sent: Monday, January 1, 2024 10:43 PM
To: amd-gfx@lists.freedesktop.org ; Zhou1, Tao 
; Yang, Stanley ; Wang, Yang(Kevin) 
; Chai, Thomas ; Li, Candice 

Cc: Zhang, Hawking ; Deucher, Alexander 
; Lazar, Lijo ; Ma, Le 

Subject: [PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"

 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);

+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE0xde3
 #define mmMP0_SMN_C2PMSG_33 0x16061
 #define mmMM_INDEX  0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
 kfree(adev->mman.discovery_bin);
 adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
 return r;
 }

--
2.17.1



Re: [PATCH 2/5] drm/amdgpu: Init pcie_index/data address as fallback

2024-01-02 Thread Alex Deucher
On Mon, Jan 1, 2024 at 10:50 PM Hawking Zhang  wrote:
>
> To allow using this helper for indirect access when
> nbio funcs is not available. For instance, in ip
> discovery phase.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++-
>  1 file changed, 15 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 001a35fa0f19..873419a5b9aa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -781,12 +781,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct 
> amdgpu_device *adev,
> void __iomem *pcie_index_hi_offset;
> void __iomem *pcie_data_offset;
>
> -   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
> -   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
> -   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
> -   pcie_index_hi = 
> adev->nbio.funcs->get_pcie_index_hi_offset(adev);
> -   else
> +   if (unlikely(!adev->nbio.funcs)) {
> +   pcie_index = (0x38 >> 2);
> +   pcie_data = (0x3C >> 2);
> +   } else {
> +   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
> +   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
> +   }
> +
> +   if (reg_addr >> 32) {
> +   if (unlikely(!adev->nbio.funcs))
> +   pcie_index_hi = (0x44 >> 2);

I'd still use a define for these, E.g.,

#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
etc.
Or something similar.

Alex

> +   else
> +   pcie_index_hi = 
> adev->nbio.funcs->get_pcie_index_hi_offset(adev);
> +   } else {
> pcie_index_hi = 0;
> +   }
>
> spin_lock_irqsave(>pcie_idx_lock, flags);
> pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
> --
> 2.17.1
>


Re: [PATCH] drm/amdkfd: fixes for HMM mem allocation

2024-01-02 Thread Felix Kuehling



On 2023-12-31 09:37, Dafna Hirschfeld wrote:

A few fixes to amdkfd and to the documentation of
devm_request_free_mem_region.

Signed-off-by: Dafna Hirschfeld 
---
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 6 +++---
  kernel/resource.c| 2 +-
  2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 6c25dab051d5..b8680e0753ca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -1021,7 +1021,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
} else {
res = devm_request_free_mem_region(adev->dev, _resource, 
size);
if (IS_ERR(res))
-   return -ENOMEM;
+   return PTR_ERR(res);
pgmap->range.start = res->start;
pgmap->range.end = res->end;
pgmap->type = MEMORY_DEVICE_PRIVATE;
@@ -1037,10 +1037,10 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
-   /* Disable SVM support capability */
-   pgmap->type = 0;
if (pgmap->type == MEMORY_DEVICE_PRIVATE)
devm_release_mem_region(adev->dev, res->start, 
resource_size(res));
+   /* Disable SVM support capability */
+   pgmap->type = 0;


Ooff, thanks for catching that. For the KFD driver changes you can add

Fixes: c83dee9b6394 ("drm/amdkfd: add SPM support for SVM")
Reviewed-by: Felix Kuehling 



return PTR_ERR(r);
}
  
diff --git a/kernel/resource.c b/kernel/resource.c

index 866ef3663a0b..fe890b874606 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1905,8 +1905,8 @@ get_free_mem_region(struct device *dev, struct resource 
*base,
   * devm_request_free_mem_region - find free region for device private memory
   *
   * @dev: device struct to bind the resource to
- * @size: size in bytes of the device memory to add
   * @base: resource tree to look in
+ * @size: size in bytes of the device memory to add
   *
   * This function tries to find an empty range of physical address big enough 
to
   * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE


Re: [PATCH] drm/amd/display: Fix sending VSC (+ colorimetry) packets for DP/eDP displays without PSR

2024-01-02 Thread Xaver Hugl
Hi,

I tested the patch and it fixes the issue for me too. Consider it
Tested-by: Xaver Hugl

- Xaver

Am Mo., 1. Jan. 2024 um 22:37 Uhr schrieb Joshua Ashton :

>  From the issue:
>
> ```
> Thank you for fixing this!
> I built a custom kernel with this patch on the fedora rawhide kernel
> (6.7.0-0.rc8.61.fc40.x86_64) and now the colors look correct. SDR
> content is now displayed as sRGB and HDR/WCG content can use the full
> capabilities of the display.
> I currently don't have a desktop mail client installed to comment on the
> mailing list directly, so I'll post it here (not sure if it counts or
> matters  )
>
> Tested-By: Simon Berz 
> ```
>
> - Joshie ✨
>
> On 1/1/24 18:28, Joshua Ashton wrote:
> > The check for sending the vsc infopacket to the display was gated behind
> > PSR (Panel Self Refresh) being enabled.
> >
> > The vsc infopacket also contains the colorimetry (specifically the
> > container color gamut) information for the stream on modern DP.
> >
> > PSR is typically only supported on mobile phone eDP displays, thus this
> > was not getting sent for typical desktop monitors or TV screens.
> >
> > This functionality is needed for proper HDR10 functionality on DP as it
> > wants BT2020 RGB/YCbCr for the container color space.
> >
> > Signed-off-by: Joshua Ashton 
> >
> > Cc: Harry Wentland 
> > Cc: Xaver Hugl 
> > Cc: Melissa Wen 
> > ---
> >   drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c   |  8 +---
> >   .../amd/display/modules/info_packet/info_packet.c   | 13 -
> >   2 files changed, 13 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > index 2845c884398e..6dff56408bf4 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > @@ -6233,8 +6233,9 @@ create_stream_for_sink(struct drm_connector
> *connector,
> >
> >   if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A)
> >   mod_build_hf_vsif_infopacket(stream,
> >vsp_infopacket);
> > -
> > - if (stream->link->psr_settings.psr_feature_enabled ||
> stream->link->replay_settings.replay_feature_enabled) {
> > + else if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT ||
> > +  stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST ||
> > +  stream->signal == SIGNAL_TYPE_EDP) {
> >   //
> >   // should decide stream support vsc sdp colorimetry
> capability
> >   // before building vsc info packet
> > @@ -6250,8 +6251,9 @@ create_stream_for_sink(struct drm_connector
> *connector,
> >   if (stream->out_transfer_func->tf ==
> TRANSFER_FUNCTION_GAMMA22)
> >   tf = TRANSFER_FUNC_GAMMA_22;
> >   mod_build_vsc_infopacket(stream, >vsc_infopacket,
> stream->output_color_space, tf);
> > - aconnector->psr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY;
> >
> > + if (stream->link->psr_settings.psr_feature_enabled)
> > + aconnector->psr_skip_count =
> AMDGPU_DM_PSR_ENTRY_DELAY;
> >   }
> >   finish:
> >   dc_sink_release(sink);
> > diff --git
> a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
> b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
> > index 84f9b412a4f1..738ee763f24a 100644
> > --- a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
> > +++ b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
> > @@ -147,12 +147,15 @@ void mod_build_vsc_infopacket(const struct
> dc_stream_state *stream,
> >   }
> >
> >   /* VSC packet set to 4 for PSR-SU, or 2 for PSR1 */
> > - if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_SU_1)
> > - vsc_packet_revision = vsc_packet_rev4;
> > - else if (stream->link->replay_settings.config.replay_supported)
> > + if (stream->link->psr_settings.psr_feature_enabled) {
> > + if (stream->link->psr_settings.psr_version ==
> DC_PSR_VERSION_SU_1)
> > + vsc_packet_revision = vsc_packet_rev4;
> > + else if (stream->link->psr_settings.psr_version ==
> DC_PSR_VERSION_1)
> > + vsc_packet_revision = vsc_packet_rev2;
> > + }
> > +
> > + if (stream->link->replay_settings.config.replay_supported)
> >   vsc_packet_revision = vsc_packet_rev4;
> > - else if (stream->link->psr_settings.psr_version ==
> DC_PSR_VERSION_1)
> > - vsc_packet_revision = vsc_packet_rev2;
> >
> >   /* Update to revision 5 for extended colorimetry support */
> >   if (stream->use_vsc_sdp_for_colorimetry)
>
>


Re: [PATCH] drm/amdgpu: change vm->task_info handling

2024-01-02 Thread Felix Kuehling



On 2024-01-02 06:12, Shashank Sharma wrote:

drm/amdgpu: change vm->task_info handling

This patch changes the handling and lifecycle of vm->task_info object.
The major changes are:
- vm->task_info is a dynamically allocated ptr now, and its usage is
   reference counted.
- introducing two new helper funcs for task_info lifecycle management
 - amdgpu_vm_get_task_info: reference counts up task_info before
   returning this info
 - amdgpu_vm_put_task_info: reference counts down task_info
- last put to task_info() frees task_info from the vm.

This patch also does logistical changes required for existing usage
of vm->task_info.

Cc: Christian Koenig 
Cc: Alex Deucher 
Cc: Felix Kuehling 
Signed-off-by: Shashank Sharma 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |   7 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  15 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  17 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 142 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  24 +++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   |   2 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  27 ++--
  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  |  28 ++--
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  26 ++--
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  28 ++--
  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c  |  20 +--
  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c|  19 +--
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  17 +--
  13 files changed, 259 insertions(+), 113 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index a4faea4fa0b5..111f8afb03a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1763,9 +1763,12 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file 
*m, void *unused)
list_for_each_entry(file, >filelist, lhead) {
struct amdgpu_fpriv *fpriv = file->driver_priv;
struct amdgpu_vm *vm = >vm;
+   struct amdgpu_task_info *ti;
+
+   ti = amdgpu_vm_get_task_info_vm(vm);
+   seq_printf(m, "pid:%d\tProcess:%s --\n", ti->pid, 
ti->process_name);
+   amdgpu_vm_put_task_info_vm(ti, vm);
  
-		seq_printf(m, "pid:%d\tProcess:%s --\n",

-   vm->task_info.pid, vm->task_info.process_name);
r = amdgpu_bo_reserve(vm->root.bo, true);
if (r)
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2b8356699f23..00516fa178b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4952,10 +4952,17 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
tmp_adev->reset_vram_lost = vram_lost;
memset(_adev->reset_task_info, 0,

sizeof(tmp_adev->reset_task_info));
-   if (reset_context->job && 
reset_context->job->vm)
-   tmp_adev->reset_task_info =
-   
reset_context->job->vm->task_info;
-   amdgpu_reset_capture_coredumpm(tmp_adev);
+   if (reset_context->job && 
reset_context->job->vm) {
+   struct amdgpu_task_info *ti;
+   struct amdgpu_vm *vm = 
reset_context->job->vm;
+
+   ti = amdgpu_vm_get_task_info_vm(vm);
+   if (ti) {
+   tmp_adev->reset_task_info = *ti;
+   
amdgpu_reset_capture_coredumpm(tmp_adev);
+   amdgpu_vm_put_task_info_vm(ti, 
vm);
+   }
+   }
  #endif
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU 
reset!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 78476bc75b4e..b89ee6ab7db9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
  {
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
-   struct amdgpu_task_info ti;
+   struct amdgpu_task_info *ti;
struct amdgpu_device *adev = ring->adev;
int idx;
int r;
@@ -58,12 +58,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
goto exit;
}
 

Re: [PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-02 Thread Felix Kuehling



On 2024-01-02 09:07, Hawking Zhang wrote:

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
  #include "amdgpu_discovery.h"
  #include "soc15_hw_ip.h"
  #include "discovery.h"
+#include "amdgpu_ras.h"
  
  #include "soc15.h"

  #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
  #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
  MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
  
+#define mmIP_DISCOVERY_VERSION  0x16A00

  #define mmRCC_CONFIG_MEMSIZE  0xde3
  #define mmMP0_SMN_C2PMSG_33   0x16061
  #define mmMM_INDEX0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
  out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
I'm not sure about hard-coding 4 instances here. The code you dropped in 
patch 1 was using adev->aid_mask. But I guess that's not even 
initialized correctly if IP discovery failed. Will this work correctly 
on the APU version?


Regards,
  Felix



return r;
  }
  


[PATCH 4/4] drm/amdgpu: Do not program VM_L2_CNTL under SRIOV

2024-01-02 Thread Victor Lu
VM_L2_CNTL* should not be programmed on driver unload under SRIOV.
These regs are skipped during SRIOV driver init.

Signed-off-by: Victor Lu 
---
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
index 55423ff1bb49..20e800bc0b68 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
@@ -454,10 +454,12 @@ static void gfxhub_v1_2_xcc_gart_disable(struct 
amdgpu_device *adev,
WREG32_SOC15_RLC(GC, GET_INST(GC, j), regMC_VM_MX_L1_TLB_CNTL, 
tmp);
 
/* Setup L2 cache */
-   tmp = RREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL);
-   tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
-   WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL, tmp);
-   WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL3, 0);
+   if (!amdgpu_sriov_vf(adev)) {
+   tmp = RREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL);
+   tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 
0);
+   WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL, tmp);
+   WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL3, 0);
+   }
}
 }
 
-- 
2.34.1



[PATCH 3/4] drm/amdgpu: Use correct SRIOV macro for gmc_v9_0_vm_fault_interrupt_state

2024-01-02 Thread Victor Lu
Under SRIOV, programming to VM_CONTEXT*_CNTL regs failed because the
current macro does not pass through the correct xcc instance.
Use the *REG32_XCC macro in this case.

The behaviour without SRIOV is the same.

Signed-off-by: Victor Lu 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 473a774294ce..e2e14d40109c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -496,14 +496,14 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct 
amdgpu_device *adev,
if (j >= AMDGPU_MMHUB0(0))
tmp = RREG32_SOC15_IP(MMHUB, reg);
else
-   tmp = RREG32_SOC15_IP(GC, reg);
+   tmp = RREG32_XCC(reg, j);
 
tmp &= ~bits;
 
if (j >= AMDGPU_MMHUB0(0))
WREG32_SOC15_IP(MMHUB, reg, tmp);
else
-   WREG32_SOC15_IP(GC, reg, tmp);
+   WREG32_XCC(reg, tmp, j);
}
}
break;
@@ -524,14 +524,14 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct 
amdgpu_device *adev,
if (j >= AMDGPU_MMHUB0(0))
tmp = RREG32_SOC15_IP(MMHUB, reg);
else
-   tmp = RREG32_SOC15_IP(GC, reg);
+   tmp = RREG32_XCC(reg, j);
 
tmp |= bits;
 
if (j >= AMDGPU_MMHUB0(0))
WREG32_SOC15_IP(MMHUB, reg, tmp);
else
-   WREG32_SOC15_IP(GC, reg, tmp);
+   WREG32_XCC(reg, tmp, j);
}
}
break;
-- 
2.34.1



[PATCH 2/4] drm/amdgpu: Do not program SQ_TIMEOUT_CONFIG in SRIOV

2024-01-02 Thread Victor Lu
VF should not program this register.

Signed-off-by: Victor Lu 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 00b21ece081f..30cc155f20d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3888,6 +3888,9 @@ static void gfx_v9_4_3_inst_enable_watchdog_timer(struct 
amdgpu_device *adev,
uint32_t i;
uint32_t data;
 
+   if (amdgpu_sriov_vf(adev))
+   return;
+
data = RREG32_SOC15(GC, GET_INST(GC, 0), regSQ_TIMEOUT_CONFIG);
data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
 amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : 
0);
-- 
2.34.1



[PATCH 1/4] drm/amdgpu: Improve error checking in amdgpu_virt_rlcg_reg_rw

2024-01-02 Thread Victor Lu
The current error detection only looks for a timeout.
This should be changed to also check scratch_reg1 for any errors
returned from RLCG.

Also add a new error value.

Signed-off-by: Victor Lu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 8 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 2 ++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 0dcff2889e25..3cd085569515 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -1022,7 +1022,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, 
u32 offset, u32 v, u32 f
 * SCRATCH_REG0 = read/write value
 * SCRATCH_REG1[30:28]  = command
 * SCRATCH_REG1[19:0]   = address in dword
-* SCRATCH_REG1[26:24]  = Error reporting
+* SCRATCH_REG1[27:24]  = Error reporting
 */
writel(v, scratch_reg0);
writel((offset | flag), scratch_reg1);
@@ -1036,7 +1036,8 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, 
u32 offset, u32 v, u32 f
udelay(10);
}
 
-   if (i >= timeout) {
+   tmp = readl(scratch_reg1);
+   if (i >= timeout || (tmp & AMDGPU_RLCG_SCRATCH1_ERROR_MASK) != 
0) {
if (amdgpu_sriov_rlcg_error_report_enabled(adev)) {
if (tmp & AMDGPU_RLCG_VFGATE_DISABLED) {
dev_err(adev->dev,
@@ -1047,6 +1048,9 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, 
u32 offset, u32 v, u32 f
} else if (tmp & AMDGPU_RLCG_REG_NOT_IN_RANGE) {
dev_err(adev->dev,
"register is not in range, rlcg 
failed to program reg: 0x%05x\n", offset);
+   } else if (tmp & 
AMDGPU_RLCG_INVALID_XCD_ACCESS) {
+   dev_err(adev->dev,
+   "invalid xcd access, rlcg 
failed to program reg: 0x%05x\n", offset);
} else {
dev_err(adev->dev,
"unknown error type, rlcg 
failed to program reg: 0x%05x\n", offset);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index d4207e44141f..447af2e4aef0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -40,11 +40,13 @@
 #define AMDGPU_RLCG_MMHUB_WRITE(0x2 << 28)
 
 /* error code for indirect register access path supported by rlcg for sriov */
+#define AMDGPU_RLCG_INVALID_XCD_ACCESS 0x800
 #define AMDGPU_RLCG_VFGATE_DISABLED0x400
 #define AMDGPU_RLCG_WRONG_OPERATION_TYPE   0x200
 #define AMDGPU_RLCG_REG_NOT_IN_RANGE   0x100
 
 #define AMDGPU_RLCG_SCRATCH1_ADDRESS_MASK  0xF
+#define AMDGPU_RLCG_SCRATCH1_ERROR_MASK0xF00
 
 /* all asic after AI use this offset */
 #define mmRCC_IOV_FUNC_IDENTIFIER 0xDE5
-- 
2.34.1



Re: [PATCH v2] drm/amdkfd: Confirm list is non-empty before utilizing list_first_entry in kfd_topology.c

2024-01-02 Thread Felix Kuehling

On 2023-12-22 06:53, Srinivasan Shanmugam wrote:

Before using list_first_entry, make sure to check that list is not
empty, if list is empty return -ENODATA.

Fixes the below:
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_topology.c:1347 
kfd_create_indirect_link_prop() warn: can 'gpu_link' even be NULL?
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_topology.c:1428 kfd_add_peer_prop() 
warn: can 'iolink1' even be NULL?
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_topology.c:1433 kfd_add_peer_prop() 
warn: can 'iolink2' even be NULL?

'Fixes: 0f28cca87e9a ("drm/amdkfd: Extend KFD device topology to surface
peer-to-peer links")'
Suggested-by: Lijo Lazar 
Suggested-by: Felix Kuehling 
Cc: Felix Kuehling 
Cc: Christian König 
Cc: Alex Deucher 
Signed-off-by: Srinivasan Shanmugam 


Reviewed-by: Felix Kuehling 



---

v2:
   Changed to "if (list_empty(>io_link_props)) return -ENODATA;"
(Lijo)

  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 21 -
  1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index dc7c8312e8c7..1fc9d4616564 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1342,10 +1342,11 @@ static int kfd_create_indirect_link_prop(struct 
kfd_topology_device *kdev, int g
num_cpu++;
}
  
+	if (list_empty(>io_link_props))

+   return -ENODATA;
+
gpu_link = list_first_entry(>io_link_props,
-   struct kfd_iolink_properties, list);
-   if (!gpu_link)
-   return -ENOMEM;
+   struct kfd_iolink_properties, list);
  
  	for (i = 0; i < num_cpu; i++) {

/* CPU <--> GPU */
@@ -1423,15 +1424,17 @@ static int kfd_add_peer_prop(struct kfd_topology_device 
*kdev,
peer->gpu->adev))
return ret;
  
+	if (list_empty(>io_link_props))

+   return -ENODATA;
+
iolink1 = list_first_entry(>io_link_props,
-   struct 
kfd_iolink_properties, list);
-   if (!iolink1)
-   return -ENOMEM;
+  struct kfd_iolink_properties, list);
+
+   if (list_empty(>io_link_props))
+   return -ENODATA;
  
  	iolink2 = list_first_entry(>io_link_props,

-   struct 
kfd_iolink_properties, list);
-   if (!iolink2)
-   return -ENOMEM;
+  struct kfd_iolink_properties, list);
  
  	props = kfd_alloc_struct(props);

if (!props)


Re: [PATCH] drm/amdkfd: Fix lock dependency warning

2024-01-02 Thread Felix Kuehling



On 2023-12-28 18:11, Philip Yang wrote:



On 2023-12-21 15:40, Felix Kuehling wrote:

==
WARNING: possible circular locking dependency detected
6.5.0-kfd-fkuehlin #276 Not tainted
--
kworker/8:2/2676 is trying to acquire lock:
9435aae95c88 ((work_completion)(_bo->eviction_work)){+.+.}-{0:0}, at: 
__flush_work+0x52/0x550

but task is already holding lock:
9435cd8e1720 (>lock){+.+.}-{3:3}, at: 
svm_range_deferred_list_work+0xe8/0x340 [amdgpu]

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (>lock){+.+.}-{3:3}:
__mutex_lock+0x97/0xd30
kfd_ioctl_alloc_memory_of_gpu+0x6d/0x3c0 [amdgpu]
kfd_ioctl+0x1b2/0x5d0 [amdgpu]
__x64_sys_ioctl+0x86/0xc0
do_syscall_64+0x39/0x80
entry_SYSCALL_64_after_hwframe+0x63/0xcd

-> #1 (>mmap_lock){}-{3:3}:
down_read+0x42/0x160
svm_range_evict_svm_bo_worker+0x8b/0x340 [amdgpu]
process_one_work+0x27a/0x540
worker_thread+0x53/0x3e0
kthread+0xeb/0x120
ret_from_fork+0x31/0x50
ret_from_fork_asm+0x11/0x20

-> #0 ((work_completion)(_bo->eviction_work)){+.+.}-{0:0}:
__lock_acquire+0x1426/0x2200
lock_acquire+0xc1/0x2b0
__flush_work+0x80/0x550
__cancel_work_timer+0x109/0x190
svm_range_bo_release+0xdc/0x1c0 [amdgpu]
svm_range_free+0x175/0x180 [amdgpu]
svm_range_deferred_list_work+0x15d/0x340 [amdgpu]
process_one_work+0x27a/0x540
worker_thread+0x53/0x3e0
kthread+0xeb/0x120
ret_from_fork+0x31/0x50
ret_from_fork_asm+0x11/0x20

other info that might help us debug this:

Chain exists of:
   (work_completion)(_bo->eviction_work) --> >mmap_lock --> >lock

  Possible unsafe locking scenario:

CPU0CPU1

   lock(>lock);
lock(>mmap_lock);
lock(>lock);
   lock((work_completion)(_bo->eviction_work));

I believe this cannot really lead to a deadlock in practice, because
svm_range_evict_svm_bo_worker only takes the mmap_read_lock if the BO
refcount is non-0. That means it's impossible that svm_range_bo_release
is running concurrently. However, there is no good way to annotate this.

To avoid the problem, take a BO reference in
svm_range_schedule_evict_svm_bo instead of in the worker. That way it's
impossible for a BO to get freed while eviction work is pending and the
cancel_work_sync call in svm_range_bo_release can be eliminated.

Signed-off-by: Felix Kuehling
---
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 13 -
  1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index afd98aace065..7683c96f0cbd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -404,14 +404,9 @@ static void svm_range_bo_release(struct kref *kref)
spin_lock(_bo->list_lock);
}
spin_unlock(_bo->list_lock);
-   if (!dma_fence_is_signaled(_bo->eviction_fence->base)) {
-   /* We're not in the eviction worker.
-* Signal the fence and synchronize with any
-* pending eviction work.
-*/
+   if (!dma_fence_is_signaled(_bo->eviction_fence->base))
+   /* We're not in the eviction worker. Signal the fence. */
dma_fence_signal(_bo->eviction_fence->base);
-   cancel_work_sync(_bo->eviction_work);
-   }
dma_fence_put(_bo->eviction_fence->base);
amdgpu_bo_unref(_bo->bo);
kfree(svm_bo);
@@ -3444,6 +3439,8 @@ int svm_range_schedule_evict_svm_bo(struct 
amdgpu_amdkfd_fence *fence)
return 0;
  
  	if (fence->svm_bo) {

+   /* Reference is dropped in svm_range_evict_svm_bo_worker */
+   kref_get(>svm_bo->kref);


I think there maybe racing condition if bo is already releasing,  get 
ref unless zero


If that's a concern, we should probably make sure that the fence->svm_bo 
pointer holds a reference itself. We shouldn't have eviction fences with 
dangling svm_bo pointers. And we should not let a BO refcount go to 0 
while there exists an unsignaled eviction fence pointing to it. I'll 
look into that.




/* if svm_bo is getting released, eviction fence will be signaled */
if (!svm_bo_ref_unless_zero(svm_bo))
return 0;

Another solution is to call svm_range_bo_unref_async from 
svm_range_vram_node_free.


That's not ideal either. The eviction worker is already a worker thread. 
Then we'd need to schedule another worker to actually release the 
memory, adding more latency and scheduling overhead.


Regards,
  Felix




Regards,
Philip

WRITE_ONCE(fence->svm_bo->evicting, 1);

Re: [PATCH] drm/amdkfd: Fix iterator used outside loop in 'kfd_add_peer_prop()'

2024-01-02 Thread Felix Kuehling

On 2023-12-29 04:43, Srinivasan Shanmugam wrote:

Fix the following about iterator use:
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_topology.c:1456 kfd_add_peer_prop() 
warn: iterator used outside loop: 'iolink3'

Cc: Felix Kuehling 
Cc: Christian König 
Cc: Alex Deucher 
Signed-off-by: Srinivasan Shanmugam 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 24 ---
  1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index dc7c8312e8c7..68640e46312f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1449,17 +1449,19 @@ static int kfd_add_peer_prop(struct kfd_topology_device 
*kdev,
/* CPU->CPU  link*/
cpu_dev = 
kfd_topology_device_by_proximity_domain(iolink1->node_to);
if (cpu_dev) {
-   list_for_each_entry(iolink3, _dev->io_link_props, 
list)
-   if (iolink3->node_to == iolink2->node_to)
-   break;
-
-   props->weight += iolink3->weight;
-   props->min_latency += iolink3->min_latency;
-   props->max_latency += iolink3->max_latency;
-   props->min_bandwidth = min(props->min_bandwidth,
-   iolink3->min_bandwidth);
-   props->max_bandwidth = min(props->max_bandwidth,
-   iolink3->max_bandwidth);
+   list_for_each_entry(iolink3, _dev->io_link_props, 
list) {
+   if (iolink3->node_to != iolink2->node_to)
+   continue;
+
+   props->weight += iolink3->weight;
+   props->min_latency += iolink3->min_latency;
+   props->max_latency += iolink3->max_latency;
+   props->min_bandwidth = min(props->min_bandwidth,
+  
iolink3->min_bandwidth);
+   props->max_bandwidth = min(props->max_bandwidth,
+  
iolink3->max_bandwidth);
+   break;
+   }
} else {
WARN(1, "CPU node not found");
}


Re: [PATCH v3] drm/amdkfd: Prefer kernel data types u8, u16, u32, u64 in amdkfd/kfd_priv.h

2024-01-02 Thread Felix Kuehling



On 2023-12-29 06:03, Srinivasan Shanmugam wrote:

Fix the following checks reported by checkpatch:

CHECK: Prefer kernel type 'u8' over 'uint8_t'
CHECK: Prefer kernel type 'u16' over 'uint16_t'
CHECK: Prefer kernel type 'u64' over 'uint64_t'
CHECK: Prefer kernel type 'u32' over 'uint32_t'


This is a lot of churn. Why do this now, and why specifically for KFD. I 
see a lot of uint..._t in amdgpu as well. The kernel header files define 
both data types. Why is one preferred over the other? If there is 
agreement that u8/16/32/64 is preferred, should this change be applied 
to amdgpu as well?


I also see a bunch of unrelated indentation changes in this patch.

Regards,
  Felix




Cc: Felix Kuehling 
Cc: Christian König 
Cc: Alex Deucher 
Signed-off-by: Srinivasan Shanmugam 
---
v3:
   - updated u32, u16, u64 for missed variables in v2

  drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 448 +-
  1 file changed, 226 insertions(+), 222 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 45366b4ca976..c161b5220fd7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -74,7 +74,7 @@
  #define KFD_MMAP_GPU_ID_SHIFT 46
  #define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
<< KFD_MMAP_GPU_ID_SHIFT)
-#define KFD_MMAP_GPU_ID(gpu_id) uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\
+#define KFD_MMAP_GPU_ID(gpu_id) u64)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\
& KFD_MMAP_GPU_ID_MASK)
  #define KFD_MMAP_GET_GPU_ID(offset)((offset & KFD_MMAP_GPU_ID_MASK) \
>> KFD_MMAP_GPU_ID_SHIFT)
@@ -91,7 +91,7 @@
  
  /* Macro for allocating structures */

  #define kfd_alloc_struct(ptr_to_struct)   \
-   ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL))
+   ((typeof(ptr_to_struct))kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL))
  
  #define KFD_MAX_NUM_OF_PROCESSES 512

  #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024
@@ -145,13 +145,13 @@ enum kfd_ioctl_flags {
 */
KFD_IOC_FLAG_CHECKPOINT_RESTORE = BIT(0),
  };
+
  /*
   * Kernel module parameter to specify maximum number of supported queues per
   * device
   */
  extern int max_num_of_queues_per_device;
  
-

  /* Kernel module parameter to specify the scheduling policy */
  extern int sched_policy;
  
@@ -212,24 +212,24 @@ struct kfd_node;
  
  struct kfd_event_interrupt_class {

bool (*interrupt_isr)(struct kfd_node *dev,
-   const uint32_t *ih_ring_entry, uint32_t *patched_ihre,
+ const u32 *ih_ring_entry, u32 *patched_ihre,
bool *patched_flag);
void (*interrupt_wq)(struct kfd_node *dev,
-   const uint32_t *ih_ring_entry);
+const u32 *ih_ring_entry);
  };
  
  struct kfd_device_info {

-   uint32_t gfx_target_version;
+   u32 gfx_target_version;
const struct kfd_event_interrupt_class *event_interrupt_class;
unsigned int max_pasid_bits;
unsigned int max_no_of_hqd;
unsigned int doorbell_size;
size_t ih_ring_entry_size;
-   uint8_t num_of_watch_points;
-   uint16_t mqd_size_aligned;
+   u8 num_of_watch_points;
+   u16 mqd_size_aligned;
bool supports_cwsr;
bool needs_pci_atomics;
-   uint32_t no_atomic_fw_version;
+   u32 no_atomic_fw_version;
unsigned int num_sdma_queues_per_engine;
unsigned int num_reserved_sdma_queues_per_engine;
DECLARE_BITMAP(reserved_sdma_queues_bitmap, KFD_MAX_SDMA_QUEUES);
@@ -239,17 +239,17 @@ unsigned int kfd_get_num_sdma_engines(struct kfd_node 
*kdev);
  unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_node *kdev);
  
  struct kfd_mem_obj {

-   uint32_t range_start;
-   uint32_t range_end;
-   uint64_t gpu_addr;
-   uint32_t *cpu_ptr;
+   u32 range_start;
+   u32 range_end;
+   u64 gpu_addr;
+   u32 *cpu_ptr;
void *gtt_mem;
  };
  
  struct kfd_vmid_info {

-   uint32_t first_vmid_kfd;
-   uint32_t last_vmid_kfd;
-   uint32_t vmid_num_kfd;
+   u32 first_vmid_kfd;
+   u32 last_vmid_kfd;
+   u32 vmid_num_kfd;
  };
  
  #define MAX_KFD_NODES	8

@@ -267,7 +267,7 @@ struct kfd_node {
  */
struct kfd_vmid_info vm_info;
unsigned int id;/* topology stub index */
-   uint32_t xcc_mask; /* Instance mask of XCCs present */
+   u32 xcc_mask; /* Instance mask of XCCs present */
struct amdgpu_xcp *xcp;
  
  	/* Interrupts */

@@ -281,7 +281,7 @@ struct kfd_node {
 * from the HW ring into a SW ring.
 */
bool interrupts_active;
-   uint32_t interrupt_bitmap; /* Only used for GFX 9.4.3 */
+   u32 interrupt_bitmap; /* Only used for GFX 9.4.3 */
  
  	/* QCM 

Re: [PATCH] drm/amdgpu: Drop 'fence' check in 'to_amdgpu_amdkfd_fence()'

2024-01-02 Thread Felix Kuehling

On 2023-12-29 10:12, Srinivasan Shanmugam wrote:

Return value of container_of(...) can't be null, so null check is not
required for 'fence'. Hence drop its NULL check.

Fixes the below:
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c:93 to_amdgpu_amdkfd_fence() 
warn: can 'fence' even be NULL?

Cc: Felix Kuehling 
Cc: Christian König 
Cc: Alex Deucher 
Signed-off-by: Srinivasan Shanmugam 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
index 469785d33791..1ef758ac5076 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
@@ -90,7 +90,7 @@ struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct 
dma_fence *f)
return NULL;
  
  	fence = container_of(f, struct amdgpu_amdkfd_fence, base);

-   if (fence && f->ops == _fence_ops)
+   if (f->ops == _fence_ops)
return fence;
  
  	return NULL;


[PATCH v2 2/3] drm/amdgpu: Query ras capablity from psp v2

2024-01-02 Thread Hawking Zhang
Instead of traditional atomfirmware interfaces for RAS
capability, host driver can query ras capability from
psp starting from psp v13_0_6.

v2: drop redundant local variable from get_ras_capability.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 26 +
 3 files changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 0dc8686e54f7..af3bc36aef18 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2128,6 +2128,16 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp)
+{
+   if (psp->funcs &&
+   psp->funcs->get_ras_capability) {
+   return psp->funcs->get_ras_capability(psp);
+   } else {
+   return false;
+   }
+}
+
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 09d1f8f72a9c..652b0a01854a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,7 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+   bool (*get_ras_capability)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 676bec2cc157..722b6066ce07 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -27,6 +27,7 @@
 #include "amdgpu_ucode.h"
 #include "soc15_common.h"
 #include "psp_v13_0.h"
+#include "amdgpu_ras.h"
 
 #include "mp/mp_13_0_2_offset.h"
 #include "mp/mp_13_0_2_sh_mask.h"
@@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
+static bool psp_v13_0_get_ras_capability(struct psp_context *psp)
+{
+   struct amdgpu_device *adev = psp->adev;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   u32 reg_data;
+
+   /* query ras cap should be done from host side */
+   if (amdgpu_sriov_vf(adev))
+   return false;
+
+   if (!con)
+   return false;
+
+   if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) &&
+   (!(adev->flags & AMD_IS_APU))) {
+   reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127);
+   adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0));
+   con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 
24) ? true : false;
+   return true;
+   } else {
+   return false;
+   }
+}
+
 static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
.wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
@@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = {
.update_spirom = psp_v13_0_update_spirom,
.vbflash_stat = psp_v13_0_vbflash_status,
.fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk,
+   .get_ras_capability = psp_v13_0_get_ras_capability,
 };
 
 void psp_v13_0_set_psp_funcs(struct psp_context *psp)
-- 
2.17.1



[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-02 Thread Hawking Zhang
Move ras capability check to amdgpu_ras_check_supported.
Driver will query ras capability through psp interface, or
vbios interface, or specific ip callbacks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a901b00d4949..2ee82baaf7d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -39,6 +39,7 @@
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_psp.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include 
@@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
+/* Query ras capablity via atomfirmware interface */
+static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device 
*adev)
+{
+   /* mem_ecc cap */
+   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+   dev_info(adev->dev, "MEM ECC is active.\n");
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
+1 << AMDGPU_RAS_BLOCK__DF);
+   } else {
+   dev_info(adev->dev, "MEM ECC is not presented.\n");
+   }
+
+   /* sram_ecc cap */
+   if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+   dev_info(adev->dev, "SRAM ECC is active.\n");
+   if (!amdgpu_sriov_vf(adev))
+   adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+   else
+   adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__PCIE_BIF |
+1 << AMDGPU_RAS_BLOCK__SDMA |
+1 << AMDGPU_RAS_BLOCK__GFX);
+
+   /*
+* VCN/JPEG RAS can be supported on both bare metal and
+* SRIOV environment
+*/
+   if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+1 << AMDGPU_RAS_BLOCK__JPEG);
+   else
+   adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+
+   /*
+* XGMI RAS is not supported if xgmi num physical nodes
+* is zero
+*/
+   if (!adev->gmc.xgmi.num_physical_nodes)
+   adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__XGMI_WAFL);
+   } else {
+   dev_info(adev->dev, "SRAM ECC is not presented.\n");
+   }
+}
+
+/* Query poison mode from umc/df IP callbacks */
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   bool df_poison, umc_poison;
+
+   /* poison setting is useless on SRIOV guest */
+   if (amdgpu_sriov_vf(adev) || !con)
+   return;
+
+   /* Init poison supported flag, the default value is false */
+   if (adev->gmc.xgmi.connected_to_cpu ||
+   adev->gmc.is_app_apu) {
+   /* enabled by default when GPU is connected to CPU */
+   con->poison_supported = true;
+   } else if (adev->df.funcs &&
+   adev->df.funcs->query_ras_poison_mode &&
+   adev->umc.ras &&
+   adev->umc.ras->query_ras_poison_mode) {
+   df_poison =
+   adev->df.funcs->query_ras_poison_mode(adev);
+   umc_poison =
+   adev->umc.ras->query_ras_poison_mode(adev);
+
+   /* Only poison is set in both DF and UMC, we can support it */
+   if (df_poison && umc_poison)
+   con->poison_supported = true;
+   else if (df_poison != umc_poison)
+   dev_warn(adev->dev,
+   "Poison setting is inconsistent in 
DF/UMC(%d:%d)!\n",
+   df_poison, umc_poison);
+   }
+}
+
 /*
  * check hardware's ras ability which will be saved in hw_supported.
  * if hardware does not support ras, we can skip some ras initializtion and
@@ -2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
if (!amdgpu_ras_asic_supported(adev))
return;
 
-   if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
-   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
-   

[PATCH 1/3] drm/amdgpu: Align ras block enum with firmware

2024-01-02 Thread Hawking Zhang
Driver and firmware share the same ras block enum.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 5785b705c692..8b053602c5ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -70,6 +70,8 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__MCA,
AMDGPU_RAS_BLOCK__VCN,
AMDGPU_RAS_BLOCK__JPEG,
+   AMDGPU_RAS_BLOCK__IH,
+   AMDGPU_RAS_BLOCK__MPIO,
 
AMDGPU_RAS_BLOCK__LAST
 };
-- 
2.17.1



[PATCH] drm/amd/display: dcn35_hwseq: use common comment to prevent kernel-doc warnings

2024-01-02 Thread Randy Dunlap
Change non-kernel-doc comments to use "/*" to prevent warnings from
scripts/kernel-doc.

dcn35_hwseq.c:1124: warning: This comment starts with '/**', but isn't a 
kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
 * power down sequence
dcn35_hwseq.c:1124: warning: missing initial short description on line:
 * power down sequence
dcn35_hwseq.c:1176: warning: This comment starts with '/**', but isn't a 
kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
 * power up sequence
dcn35_hwseq.c:1176: warning: missing initial short description on line:
 * power up sequence

Signed-off-by: Randy Dunlap 
Cc: Harry Wentland 
Cc: Leo Li 
Cc: Rodrigo Siqueira 
Cc: Alex Deucher 
Cc: Christian König 
Cc: "Pan, Xinhui" 
Cc: amd-gfx@lists.freedesktop.org
Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: Thomas Zimmermann 
---
I thought that I sent this patch but it's not in the lore archive
so I'm sending it again. Apologies if you get it twice.

 drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff -- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c 
b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
@@ -1120,7 +1120,7 @@ void dcn35_calc_blocks_to_ungate(struct
update_state->pg_res_update[PG_HPO] = true;
 
 }
-/**
+/*
 * power down sequence
 * ONO Region 3, DCPG 25: hpo - SKIPPED
 * ONO Region 4, DCPG 0: dchubp0, dpp0
@@ -1172,7 +1172,7 @@ void dcn35_hw_block_power_down(struct dc
//domain22, 23, 25 currently always on.
 
 }
-/**
+/*
 * power up sequence
 * ONO Region 0, DCPG 22: dccg dio dcio - SKIPPED
 * ONO Region 2, DCPG 24: mpc opp optc dwb


[PATCH] drm/amdkfd: fixes for HMM mem allocation

2024-01-02 Thread Dafna Hirschfeld
Few fixes to amdkfd and the doc of
devm_request_free_mem_region.

Signed-off-by: Dafna Hirschfeld 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 6 +++---
 kernel/resource.c| 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 6c25dab051d5..b8680e0753ca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -1021,7 +1021,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
} else {
res = devm_request_free_mem_region(adev->dev, _resource, 
size);
if (IS_ERR(res))
-   return -ENOMEM;
+   return PTR_ERR(res);
pgmap->range.start = res->start;
pgmap->range.end = res->end;
pgmap->type = MEMORY_DEVICE_PRIVATE;
@@ -1037,10 +1037,10 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
-   /* Disable SVM support capability */
-   pgmap->type = 0;
if (pgmap->type == MEMORY_DEVICE_PRIVATE)
devm_release_mem_region(adev->dev, res->start, 
resource_size(res));
+   /* Disable SVM support capability */
+   pgmap->type = 0;
return PTR_ERR(r);
}
 
diff --git a/kernel/resource.c b/kernel/resource.c
index 866ef3663a0b..fe890b874606 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1905,8 +1905,8 @@ get_free_mem_region(struct device *dev, struct resource 
*base,
  * devm_request_free_mem_region - find free region for device private memory
  *
  * @dev: device struct to bind the resource to
- * @size: size in bytes of the device memory to add
  * @base: resource tree to look in
+ * @size: size in bytes of the device memory to add
  *
  * This function tries to find an empty range of physical address big enough to
  * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
-- 
2.34.1



[PATCH v2 3/5] drm/amdgpu: Add ras helper to query boot errors v2

2024-01-02 Thread Hawking Zhang
Add ras helper function to query boot time gpu
errors.
v2: use aqua_vanjaram smn addressing pattern

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 616b6c911767..cd91533d641c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1328,6 +1328,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
 
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fc42fb6ee191..a901b00d4949 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3763,3 +3763,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
 
return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_920x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32 boot_error)
+{
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+u32 instance, u32 *boot_error)
+{
+   u32 reg_addr;
+   u32 reg_data;
+   int retry_loop;
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+   *boot_error = reg_data;
+   return 0;
+   }
+   msleep(1);
+   }
+
+   *boot_error = 

[PATCH 5/5] drm/amdgpu: Query boot status if boot failed

2024-01-02 Thread Hawking Zhang
Check and report firmware boot status if it doesn't
reach steady status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 6fad451a85be..676bec2cc157 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -187,11 +187,18 @@ static int psp_v13_0_wait_for_bootloader(struct 
psp_context *psp)
 static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
+   int ret;
 
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
-   psp_v13_0_wait_for_vmbx_ready(psp);
+   ret = psp_v13_0_wait_for_vmbx_ready(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
+
+   ret = psp_v13_0_wait_for_bootloader(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
 
-   return psp_v13_0_wait_for_bootloader(psp);
+   return ret;
}
 
return 0;
-- 
2.17.1



[PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-02 Thread Hawking Zhang
Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_330x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1



[PATCH 2/5] drm/amdgpu: Init pcie_index/data address as fallback

2024-01-02 Thread Hawking Zhang
To allow using this helper for indirect access when
nbio funcs is not available. For instance, in ip
discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 001a35fa0f19..873419a5b9aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -781,12 +781,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;
 
-   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
-   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
-   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
-   else
+   if (unlikely(!adev->nbio.funcs)) {
+   pcie_index = (0x38 >> 2);
+   pcie_data = (0x3C >> 2);
+   } else {
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   }
+
+   if (reg_addr >> 32) {
+   if (unlikely(!adev->nbio.funcs))
+   pcie_index_hi = (0x44 >> 2);
+   else
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+   } else {
pcie_index_hi = 0;
+   }
 
spin_lock_irqsave(>pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.17.1



[PATCH 1/5] drm/amdgpu: drop psp v13 query_boot_status implementation

2024-01-02 Thread Hawking Zhang
Will replace it with new implementation to cover
boot fails in ip discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 15 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  4 --
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 78 --
 4 files changed, 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4b694696930e..001a35fa0f19 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1218,8 +1218,6 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
-   /* TODO: check the return val and stop device initialization if 
boot fails */
-   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 51bfe3757c89..0dc8686e54f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2128,21 +2128,6 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
-{
-   struct psp_context *psp = >psp;
-   int ret = 0;
-
-   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
-   return 0;
-
-   if (psp->funcs &&
-   psp->funcs->query_boot_status)
-   ret = psp->funcs->query_boot_status(psp);
-
-   return ret;
-}
-
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index c4d9cbde55b9..09d1f8f72a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,7 +134,6 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
-   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -538,7 +537,4 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
-
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
-
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index df1844d0800f..6fad451a85be 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -763,83 +763,6 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
-
-static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
-  uint32_t inst,
-  uint32_t boot_error)
-{
-   uint32_t socket_id;
-   uint32_t aid_id;
-   uint32_t hbm_id;
-   uint32_t reg_data;
-
-   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
-   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
-   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
-
-   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-socket_id, aid_id, reg_data);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-socket_id, aid_id, hbm_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_USR_CP_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
-   

[PATCH] drm/amdgpu: change vm->task_info handling

2024-01-02 Thread Shashank Sharma
drm/amdgpu: change vm->task_info handling

This patch changes the handling and lifecycle of vm->task_info object.
The major changes are:
- vm->task_info is a dynamically allocated ptr now, and its uasge is
  reference counted.
- introducing two new helper funcs for task_info lifecycle management
- amdgpu_vm_get_task_info: reference counts up task_info before
  returning this info
- amdgpu_vm_put_task_info: reference counts down task_info
- last put to task_info() frees task_info from the vm.

This patch also does logistical changes required for existing usage
of vm->task_info.

Cc: Christian Koenig 
Cc: Alex Deucher 
Cc: Felix Kuehling 
Signed-off-by: Shashank Sharma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |   7 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  15 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  17 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 142 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  24 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   |   2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  27 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  |  28 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  26 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  28 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c  |  20 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c|  19 +--
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  17 +--
 13 files changed, 259 insertions(+), 113 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index a4faea4fa0b5..111f8afb03a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1763,9 +1763,12 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file 
*m, void *unused)
list_for_each_entry(file, >filelist, lhead) {
struct amdgpu_fpriv *fpriv = file->driver_priv;
struct amdgpu_vm *vm = >vm;
+   struct amdgpu_task_info *ti;
+
+   ti = amdgpu_vm_get_task_info_vm(vm);
+   seq_printf(m, "pid:%d\tProcess:%s --\n", ti->pid, 
ti->process_name);
+   amdgpu_vm_put_task_info_vm(ti, vm);
 
-   seq_printf(m, "pid:%d\tProcess:%s --\n",
-   vm->task_info.pid, vm->task_info.process_name);
r = amdgpu_bo_reserve(vm->root.bo, true);
if (r)
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2b8356699f23..00516fa178b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4952,10 +4952,17 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
tmp_adev->reset_vram_lost = vram_lost;
memset(_adev->reset_task_info, 0,

sizeof(tmp_adev->reset_task_info));
-   if (reset_context->job && 
reset_context->job->vm)
-   tmp_adev->reset_task_info =
-   
reset_context->job->vm->task_info;
-   amdgpu_reset_capture_coredumpm(tmp_adev);
+   if (reset_context->job && 
reset_context->job->vm) {
+   struct amdgpu_task_info *ti;
+   struct amdgpu_vm *vm = 
reset_context->job->vm;
+
+   ti = amdgpu_vm_get_task_info_vm(vm);
+   if (ti) {
+   tmp_adev->reset_task_info = *ti;
+   
amdgpu_reset_capture_coredumpm(tmp_adev);
+   amdgpu_vm_put_task_info_vm(ti, 
vm);
+   }
+   }
 #endif
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU 
reset!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 78476bc75b4e..b89ee6ab7db9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
 {
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
-   struct amdgpu_task_info ti;
+   struct amdgpu_task_info *ti;
struct amdgpu_device *adev = ring->adev;
int idx;
int r;
@@ -58,12 +58,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
drm_sched_job *s_job)
goto exit;
}
 
-   amdgpu_vm_get_task_info(ring->adev, job->pasid, 

RE: [PATCH 2/3] drm/amdgpu: Query ras capability from psp

2024-01-02 Thread Zhang, Hawking
[AMD Official Use Only - General]

Oh, that's good point. I guess Kevin mentioned the same thing. Yes. Returning 
directly is reasonable to me.

Regards,
Hawking

-Original Message-
From: Zhou1, Tao 
Sent: Tuesday, January 2, 2024 16:49
To: Zhang, Hawking ; Wang, Yang(Kevin) 
; amd-gfx@lists.freedesktop.org; Yang, Stanley 
; Chai, Thomas ; Li, Candice 

Cc: Deucher, Alexander ; Lazar, Lijo 
; Ma, Le 
Subject: RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp

[AMD Official Use Only - General]

> -Original Message-
> From: Zhang, Hawking 
> Sent: Tuesday, January 2, 2024 1:38 PM
> To: Wang, Yang(Kevin) ; amd-
> g...@lists.freedesktop.org; Zhou1, Tao ; Yang,
> Stanley ; Chai, Thomas ;
> Li, Candice 
> Cc: Deucher, Alexander ; Lazar, Lijo
> ; Ma, Le 
> Subject: RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp
>
> [AMD Official Use Only - General]
>
> The ret gives us a chance to fallback to legacy query approach (from vbios).
>
>
> You might want to see patch #3 of the series for more details, go to
> the following lines in patch #3
>
> +   /* query ras capability from psp */
> +   if (amdgpu_psp_get_ras_capability(&adev->psp))
> +   goto init_ras_enabled_flag;
>
>
> Regards,
> Hawking
>
> -Original Message-
> From: Wang, Yang(Kevin) 
> Sent: Tuesday, January 2, 2024 13:19
> To: Zhang, Hawking ;
> amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang,
> Stanley ; Chai, Thomas ;
> Li, Candice 
> Cc: Zhang, Hawking ; Deucher, Alexander
> ; Lazar, Lijo ; Ma, Le
> 
> Subject: RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp
>
> [AMD Official Use Only - General]
>
> -Original Message-
> From: Hawking Zhang 
> Sent: Tuesday, January 2, 2024 11:45 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ;
> Yang, Stanley ; Wang, Yang(Kevin)
> ; Chai, Thomas ; Li,
> Candice 
> Cc: Zhang, Hawking ; Deucher, Alexander
> ; Lazar, Lijo ; Ma, Le
> 
> Subject: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp
>
> Instead of traditional atomfirmware interfaces for RAS capability,
> host driver can query ras capability from psp starting from psp v13_0_6.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++
> drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 26 +
>  3 files changed, 41 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 94b536e3cada..8a3847d3041f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -2125,6 +2125,19 @@ int amdgpu_psp_wait_for_bootloader(struct
> amdgpu_device *adev)
> return ret;
>  }
>
> +bool amdgpu_psp_get_ras_capability(struct psp_context *psp) {
> +   bool ret;
> +
> +   if (psp->funcs &&
> +   psp->funcs->get_ras_capability) {
> +   ret = psp->funcs->get_ras_capability(psp);
> +   return ret;

[Tao] I think the code can be simplified as:

return psp->funcs->get_ras_capability(psp);

and drop the ret variable.

> [kevin]:
> This variable 'ret' seems to have no other purpose, can we remove it
> and return directly ?
>
> Best Regards,
> Kevin
> +   } else {
> +   return false;
> +   }
> +}
> +
>  static int psp_hw_start(struct psp_context *psp)  {
> struct amdgpu_device *adev = psp->adev; diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> index 09d1f8f72a9c..652b0a01854a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> @@ -134,6 +134,7 @@ struct psp_funcs {
> int (*update_spirom)(struct psp_context *psp, uint64_t 
> fw_pri_mc_addr);
> int (*vbflash_stat)(struct psp_context *psp);
> int (*fatal_error_recovery_quirk)(struct psp_context *psp);
> +   bool (*get_ras_capability)(struct psp_context *psp);
>  };
>
>  struct ta_funcs {
> @@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp,
> int mode);  int is_psp_fw_valid(struct psp_bin_desc bin);
>
>  int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
> +bool amdgpu_psp_get_ras_capability(struct psp_context *psp);
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> index 676bec2cc157..722b6066ce07 100644
> --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> @@ -27,6 +27,7 @@
>  #include "amdgpu_ucode.h"
>  #include "soc15_common.h"
>  #include "psp_v13_0.h"
> +#include "amdgpu_ras.h"
>
>  #include "mp/mp_13_0_2_offset.h"
>  #include "mp/mp_13_0_2_sh_mask.h"
> @@ -770,6 +771,30 @@ static int
> psp_v13_0_fatal_error_recovery_quirk(struct
> psp_context *psp)
> return 0;
>  }
>
> +static bool psp_v13_0_get_ras_capability(struct psp_context *psp) {
> +   struct amdgpu_device *adev = psp->adev;
> +   struct 

RE: [PATCH 3/3] drm/amdgpu: Replace DRM_* with dev_* in amdgpu_psp.c

2024-01-02 Thread Zhou1, Tao
[AMD Official Use Only - General]

The series is:

Reviewed-by: Tao Zhou 

> -Original Message-
> From: amd-gfx  On Behalf Of Hawking
> Zhang
> Sent: Tuesday, January 2, 2024 11:45 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang,
> Stanley ; Wang, Yang(Kevin)
> ; Chai, Thomas ; Li,
> Candice 
> Cc: Deucher, Alexander ; Ma, Le
> ; Lazar, Lijo ; Zhang, Hawking
> 
> Subject: [PATCH 3/3] drm/amdgpu: Replace DRM_* with dev_* in amdgpu_psp.c
>
> So kernel message has the device pcie bdf information, which helps issue
> debugging especially in multiple GPU system.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 144 
>  1 file changed, 75 insertions(+), 69 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 8a3847d3041f..0d871479ff34 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -291,21 +291,22 @@ static int psp_memory_training_init(struct psp_context
> *psp)
>   struct psp_memory_training_context *ctx = >mem_train_ctx;
>
>   if (ctx->init != PSP_MEM_TRAIN_RESERVE_SUCCESS) {
> - DRM_DEBUG("memory training is not supported!\n");
> + dev_dbg(psp->adev->dev, "memory training is not
> supported!\n");
>   return 0;
>   }
>
>   ctx->sys_cache = kzalloc(ctx->train_data_size, GFP_KERNEL);
>   if (ctx->sys_cache == NULL) {
> - DRM_ERROR("alloc mem_train_ctx.sys_cache failed!\n");
> + dev_err(psp->adev->dev, "alloc mem_train_ctx.sys_cache
> failed!\n");
>   ret = -ENOMEM;
>   goto Err_out;
>   }
>
> -
>   DRM_DEBUG("train_data_size:%llx,p2c_train_data_offset:%llx,c2p_train
> _data_offset:%llx.\n",
> -   ctx->train_data_size,
> -   ctx->p2c_train_data_offset,
> -   ctx->c2p_train_data_offset);
> + dev_dbg(psp->adev->dev,
> +
>   "train_data_size:%llx,p2c_train_data_offset:%llx,c2p_train_data_offset:
> %llx.\n",
> + ctx->train_data_size,
> + ctx->p2c_train_data_offset,
> + ctx->c2p_train_data_offset);
>   ctx->init = PSP_MEM_TRAIN_INIT_SUCCESS;
>   return 0;
>
> @@ -407,7 +408,7 @@ static int psp_sw_init(void *handle)
>
>   psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
>   if (!psp->cmd) {
> - DRM_ERROR("Failed to allocate memory to command
> buffer!\n");
> + dev_err(adev->dev, "Failed to allocate memory to command
> buffer!\n");
>   ret = -ENOMEM;
>   }
>
> @@ -454,13 +455,13 @@ static int psp_sw_init(void *handle)
>   if (mem_training_ctx->enable_mem_training) {
>   ret = psp_memory_training_init(psp);
>   if (ret) {
> - DRM_ERROR("Failed to initialize memory training!\n");
> + dev_err(adev->dev, "Failed to initialize memory
> training!\n");
>   return ret;
>   }
>
>   ret = psp_mem_training(psp, PSP_MEM_TRAIN_COLD_BOOT);
>   if (ret) {
> - DRM_ERROR("Failed to process memory training!\n");
> + dev_err(adev->dev, "Failed to process memory
> training!\n");
>   return ret;
>   }
>   }
> @@ -675,9 +676,11 @@ psp_cmd_submit_buf(struct psp_context *psp,
>*/
>   if (!skip_unsupport && (psp->cmd_buf_mem->resp.status || !timeout)
> && !ras_intr) {
>   if (ucode)
> - DRM_WARN("failed to load ucode %s(0x%X) ",
> -   amdgpu_ucode_name(ucode->ucode_id),
> ucode->ucode_id);
> - DRM_WARN("psp gfx command %s(0x%X) failed and response
> status is (0x%X)\n",
> + dev_warn(psp->adev->dev,
> +  "failed to load ucode %s(0x%X) ",
> +  amdgpu_ucode_name(ucode->ucode_id),
> ucode->ucode_id);
> + dev_warn(psp->adev->dev,
> +  "psp gfx command %s(0x%X) failed and response status
> is (0x%X)\n",
>psp_gfx_cmd_name(psp->cmd_buf_mem->cmd_id),
> psp->cmd_buf_mem->cmd_id,
>psp->cmd_buf_mem->resp.status);
>   /* If any firmware (including CAP) load fails under SRIOV, it
> should @@ -807,7 +810,7 @@ static int psp_tmr_init(struct psp_context *psp)
>   psp->fw_pri_buf) {
>   ret = psp_load_toc(psp, _size);
>   if (ret) {
> - DRM_ERROR("Failed to load toc\n");
> + dev_err(psp->adev->dev, "Failed to load toc\n");
>   return ret;
>   }
>   }
> @@ -855,7 +858,7 @@ static int psp_tmr_load(struct psp_context *psp)
>
>   psp_prep_tmr_cmd_buf(psp, cmd, psp->tmr_mc_addr, psp->tmr_bo);
>   if (psp->tmr_bo)
> - DRM_INFO("reserve 

RE: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-02 Thread Zhou1, Tao
[AMD Official Use Only - General]

The series is:

Reviewed-by: Tao Zhou 

> -Original Message-
> From: amd-gfx  On Behalf Of Hawking
> Zhang
> Sent: Tuesday, January 2, 2024 11:45 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang,
> Stanley ; Wang, Yang(Kevin)
> ; Chai, Thomas ; Li,
> Candice 
> Cc: Deucher, Alexander ; Ma, Le
> ; Lazar, Lijo ; Zhang, Hawking
> 
> Subject: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to
> amdgpu_ras_check_supported
>
> Move ras capability check to amdgpu_ras_check_supported.
> Driver will query ras capability through psp interface, or vbios interface, or 
> specific
> ip callbacks.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
>  1 file changed, 93 insertions(+), 77 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5f302b7693b3..72b6e41329b0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -39,6 +39,7 @@
>  #include "nbio_v7_9.h"
>  #include "atom.h"
>  #include "amdgpu_reset.h"
> +#include "amdgpu_psp.h"
>
>  #ifdef CONFIG_X86_MCE_AMD
>  #include 
> @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct
> amdgpu_device *adev)
>   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);  }
>
> +/* Query ras capablity via atomfirmware interface */ static void
> +amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) {
> + /* mem_ecc cap */
> + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
> + dev_info(adev->dev, "MEM ECC is active.\n");
> + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
> +  1 << AMDGPU_RAS_BLOCK__DF);
> + } else {
> + dev_info(adev->dev, "MEM ECC is not presented.\n");
> + }
> +
> + /* sram_ecc cap */
> + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
> + dev_info(adev->dev, "SRAM ECC is active.\n");
> + if (!amdgpu_sriov_vf(adev))
> + adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> +   1 <<
> AMDGPU_RAS_BLOCK__DF);
> + else
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__PCIE_BIF |
> +  1 <<
> AMDGPU_RAS_BLOCK__SDMA |
> +  1 <<
> AMDGPU_RAS_BLOCK__GFX);
> +
> + /*
> +  * VCN/JPEG RAS can be supported on both bare metal and
> +  * SRIOV environment
> +  */
> + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6,
> 0) ||
> + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 0) ||
> + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 3))
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +  1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> + else
> + adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +   1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +
> + /*
> +  * XGMI RAS is not supported if xgmi num physical nodes
> +  * is zero
> +  */
> + if (!adev->gmc.xgmi.num_physical_nodes)
> + adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__XGMI_WAFL);
> + } else {
> + dev_info(adev->dev, "SRAM ECC is not presented.\n");
> + }
> +}
> +
> +/* Query poison mode from umc/df IP callbacks */ static void
> +amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + bool df_poison, umc_poison;
> +
> + /* poison setting is useless on SRIOV guest */
> + if (amdgpu_sriov_vf(adev) || !con)
> + return;
> +
> + /* Init poison supported flag, the default value is false */
> + if (adev->gmc.xgmi.connected_to_cpu ||
> + adev->gmc.is_app_apu) {
> + /* enabled by default when GPU is connected to CPU */
> + con->poison_supported = true;
> + } else if (adev->df.funcs &&
> + adev->df.funcs->query_ras_poison_mode &&
> + adev->umc.ras &&
> + adev->umc.ras->query_ras_poison_mode) {
> + df_poison =
> + adev->df.funcs->query_ras_poison_mode(adev);
> + umc_poison =
> + adev->umc.ras->query_ras_poison_mode(adev);
> +
> + /* Only poison is set in both DF and UMC, we can support it */
> + if (df_poison && umc_poison)
> + con->poison_supported = true;
> + else if (df_poison != umc_poison)
> + dev_warn(adev->dev,
> + "Poison setting is inconsistent in
> 

RE: [PATCH 2/3] drm/amdgpu: Query ras capability from psp

2024-01-02 Thread Zhou1, Tao
[AMD Official Use Only - General]

> -Original Message-
> From: Zhang, Hawking 
> Sent: Tuesday, January 2, 2024 1:38 PM
> To: Wang, Yang(Kevin) ; amd-
> g...@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley
> ; Chai, Thomas ; Li, Candice
> 
> Cc: Deucher, Alexander ; Lazar, Lijo
> ; Ma, Le 
> Subject: RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp
>
> [AMD Official Use Only - General]
>
> The ret gives us a chance to fallback to legacy query approach (from vbios).
>
>
> You might want to see patch #3 of the series for more details, go to the 
> following
> lines in patch #3
>
> +   /* query ras capability from psp */
> +   if (amdgpu_psp_get_ras_capability(&adev->psp))
> +   goto init_ras_enabled_flag;
>
>
> Regards,
> Hawking
>
> -Original Message-
> From: Wang, Yang(Kevin) 
> Sent: Tuesday, January 2, 2024 13:19
> To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org;
> Zhou1, Tao ; Yang, Stanley ;
> Chai, Thomas ; Li, Candice 
> Cc: Zhang, Hawking ; Deucher, Alexander
> ; Lazar, Lijo ; Ma, Le
> 
> Subject: RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp
>
> [AMD Official Use Only - General]
>
> -Original Message-
> From: Hawking Zhang 
> Sent: Tuesday, January 2, 2024 11:45 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang,
> Stanley ; Wang, Yang(Kevin)
> ; Chai, Thomas ; Li,
> Candice 
> Cc: Zhang, Hawking ; Deucher, Alexander
> ; Lazar, Lijo ; Ma, Le
> 
> Subject: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp
>
> Instead of traditional atomfirmware interfaces for RAS capability, host 
> driver can
> query ras capability from psp starting from psp v13_0_6.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++
> drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 26 +
>  3 files changed, 41 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 94b536e3cada..8a3847d3041f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -2125,6 +2125,19 @@ int amdgpu_psp_wait_for_bootloader(struct
> amdgpu_device *adev)
> return ret;
>  }
>
> +bool amdgpu_psp_get_ras_capability(struct psp_context *psp) {
> +   bool ret;
> +
> +   if (psp->funcs &&
> +   psp->funcs->get_ras_capability) {
> +   ret = psp->funcs->get_ras_capability(psp);
> +   return ret;

[Tao] I think the code can be simplified as:

return psp->funcs->get_ras_capability(psp);

and drop the ret variable.

> [kevin]:
> This variable 'ret' seems to have no other purpose, can we remove it and 
> return
> directly ?
>
> Best Regards,
> Kevin
> +   } else {
> +   return false;
> +   }
> +}
> +
>  static int psp_hw_start(struct psp_context *psp)  {
> struct amdgpu_device *adev = psp->adev; diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> index 09d1f8f72a9c..652b0a01854a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> @@ -134,6 +134,7 @@ struct psp_funcs {
> int (*update_spirom)(struct psp_context *psp, uint64_t 
> fw_pri_mc_addr);
> int (*vbflash_stat)(struct psp_context *psp);
> int (*fatal_error_recovery_quirk)(struct psp_context *psp);
> +   bool (*get_ras_capability)(struct psp_context *psp);
>  };
>
>  struct ta_funcs {
> @@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int
> mode);  int is_psp_fw_valid(struct psp_bin_desc bin);
>
>  int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
> +bool amdgpu_psp_get_ras_capability(struct psp_context *psp);
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> index 676bec2cc157..722b6066ce07 100644
> --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> @@ -27,6 +27,7 @@
>  #include "amdgpu_ucode.h"
>  #include "soc15_common.h"
>  #include "psp_v13_0.h"
> +#include "amdgpu_ras.h"
>
>  #include "mp/mp_13_0_2_offset.h"
>  #include "mp/mp_13_0_2_sh_mask.h"
> @@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct
> psp_context *psp)
> return 0;
>  }
>
> +static bool psp_v13_0_get_ras_capability(struct psp_context *psp) {
> +   struct amdgpu_device *adev = psp->adev;
> +   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +   u32 reg_data;
> +
> +   /* query ras cap should be done from host side */
> +   if (amdgpu_sriov_vf(adev))
> +   return false;
> +
> +   if (!con)
> +   return false;
> +
> +   if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) &&
> +   (!(adev->flags & AMD_IS_APU))) {
> +   reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127);
> +