from:"Luo, Zhigang"

RE: [PATCH v2 1/2] Revert "drm/amdgpu: Extend KIQ reg polling wait for VF"

2024-08-08 Thread Luo, Zhigang

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Zhigang Luo 

-Original Message-
From: amd-gfx  On Behalf Of Victor 
Skvortsov
Sent: Tuesday, August 6, 2024 8:41 AM
To: Deucher, Alexander ; 
amd-gfx@lists.freedesktop.org
Cc: Skvortsov, Victor 
Subject: [PATCH v2 1/2] Revert "drm/amdgpu: Extend KIQ reg polling wait for VF"

KIQ timeouts no longer seen.

This reverts commit b4d12cc00ad69e8a0dea2ece7202bacfd8b894fb.

Signed-off-by: Victor Skvortsov 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 137a88b8de45..206360503136 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -347,9 +347,9 @@ enum amdgpu_kiq_irq {
AMDGPU_CP_KIQ_IRQ_DRIVER0 = 0,
AMDGPU_CP_KIQ_IRQ_LAST
 };
-#define SRIOV_USEC_TIMEOUT 120 /* wait 12 * 100ms for SRIOV */
-#define MAX_KIQ_REG_WAIT (amdgpu_sriov_vf(adev) ? 5 : 5000) /* in usecs, extend for VF */
-#define MAX_KIQ_REG_BAILOUT_INTERVAL 5 /* in msecs, 5ms */
+#define SRIOV_USEC_TIMEOUT  120 /* wait 12 * 100ms for SRIOV */
+#define MAX_KIQ_REG_WAIT   5000 /* in usecs, 5ms */
+#define MAX_KIQ_REG_BAILOUT_INTERVAL   5 /* in msecs, 5ms */
 #define MAX_KIQ_REG_TRY 1000

 int amdgpu_device_ip_set_clockgating_state(void *dev,
--
2.34.1

RE: [PATCH 1/2] drm/amdgpu: Use dev_ prints for virtualization as it supports multi adapter

2024-06-18 Thread Luo, Zhigang

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Zhigang Luo 

-Original Message-
From: Chander, Vignesh 
Sent: Monday, June 17, 2024 10:55 AM
To: amd-gfx@lists.freedesktop.org
Cc: Chan, Hing Pong ; Luo, Zhigang ; 
Chander, Vignesh ; Chander, Vignesh 

Subject: [PATCH 1/2] drm/amdgpu: Use dev_ prints for virtualization as it 
supports multi adapter

Signed-off-by: Vignesh Chander 
Change-Id: Ifead637951c00e5b4e97c766d172323dcac4da08
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 19 +++  
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 23 +++
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 6b71ee85ee6556..65656afc6ed1c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -93,7 +93,7 @@ static int xgpu_ai_poll_ack(struct amdgpu_device *adev)
timeout -= 5;
} while (timeout > 1);

-   pr_err("Doesn't get TRN_MSG_ACK from pf in %d msec\n", 
AI_MAILBOX_POLL_ACK_TIMEDOUT);
+   dev_err(adev->dev, "Doesn't get TRN_MSG_ACK from pf in %d msec\n",
+AI_MAILBOX_POLL_ACK_TIMEDOUT);

return -ETIME;
 }
@@ -111,7 +111,7 @@ static int xgpu_ai_poll_msg(struct amdgpu_device *adev, 
enum idh_event event)
timeout -= 10;
} while (timeout > 1);

-   pr_err("Doesn't get msg:%d from pf, error=%d\n", event, r);
+   dev_err(adev->dev, "Doesn't get msg:%d from pf, error=%d\n", event,
+r);

return -ETIME;
 }
@@ -132,7 +132,7 @@ static void xgpu_ai_mailbox_trans_msg (struct amdgpu_device 
*adev,
xgpu_ai_mailbox_set_valid(adev, false);
trn = xgpu_ai_peek_ack(adev);
if (trn) {
-   pr_err("trn=%x ACK should not assert! wait again !\n", 
trn);
+   dev_err_ratelimited(adev->dev, "trn=%x ACK should not 
assert! wait
+again !\n", trn);
msleep(1);
}
} while(trn);
@@ -155,7 +155,7 @@ static void xgpu_ai_mailbox_trans_msg (struct amdgpu_device 
*adev,
/* start to poll ack */
r = xgpu_ai_poll_ack(adev);
if (r)
-   pr_err("Doesn't get ack from pf, continue\n");
+   dev_err(adev->dev, "Doesn't get ack from pf, continue\n");

xgpu_ai_mailbox_set_valid(adev, false);  } @@ -173,7 +173,7 @@ static 
int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
req == IDH_REQ_GPU_RESET_ACCESS) {
r = xgpu_ai_poll_msg(adev, IDH_READY_TO_ACCESS_GPU);
if (r) {
-   pr_err("Doesn't get READY_TO_ACCESS_GPU from pf, give 
up\n");
+   dev_err(adev->dev, "Doesn't get READY_TO_ACCESS_GPU 
from pf, give
+up\n");
return r;
}
/* Retrieve checksum from mailbox2 */ @@ -231,7 +231,7 @@ 
static int xgpu_ai_mailbox_ack_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry)
 {
-   DRM_DEBUG("get ack intr and do nothing.\n");
+   dev_dbg(adev->dev, "get ack intr and do nothing.\n");
return 0;
 }

@@ -258,12 +258,15 @@ static int xgpu_ai_wait_reset(struct amdgpu_device *adev) 
 {
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
do {
-   if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
+   if (xgpu_ai_mailbox_peek_msg(adev) == 
IDH_FLR_NOTIFICATION_CMPL) {
+   dev_dbg(adev->dev, "Got AI IDH_FLR_NOTIFICATION_CMPL 
after %d ms\n",
+AI_MAILBOX_POLL_FLR_TIMEDOUT - timeout);
return 0;
+   }
msleep(10);
timeout -= 10;
} while (timeout > 1);
-   dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
+
+   dev_dbg(adev->dev, "waiting AI IDH_FLR_NOTIFICATION_CMPL timeout\n");
return -ETIME;
 }

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 22af30a15a5fd7..17e1e8cc243752 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -91,7 +91,7 @@ static int xgpu_nv_poll_ack(struct amdgpu_device *adev)
timeout -= 5;
} while (timeout > 1);

-   pr_err("Doesn't get TRN_MSG_ACK from pf in %d msec\n", 
NV_MAILBOX_POLL_ACK_TIMEDOUT);
+   dev_err(adev->dev, "Doesn't get TRN_MSG_ACK from pf in %d msec \n",
+NV_MAILBOX_POLL_ACK_TIMEDOUT);

return -ETIME;
 }
@@ -106,13 +106,16 @@ static int xgpu_nv_p

RE: [PATCH] drm/amdgpu: Add lock around VF RLCG interface

2024-05-28 Thread Luo, Zhigang

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Zhigang Luo 

-Original Message-
From: Skvortsov, Victor 
Sent: Monday, May 27, 2024 4:19 PM
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Lazar, Lijo ; Luo, Zhigang 
Cc: Skvortsov, Victor 
Subject: [PATCH] drm/amdgpu: Add lock around VF RLCG interface

flush_gpu_tlb may be called from another thread while device_gpu_recover is 
running.

Both of these threads access registers through the VF RLCG interface during VF 
Full Access. Add a lock around this interface to prevent race conditions 
between these threads.

Signed-off-by: Victor Skvortsov 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   | 2 ++
 3 files changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f5168b4c3b03..6711836054f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4049,6 +4049,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->grbm_idx_mutex);
mutex_init(&adev->mn_lock);
mutex_init(&adev->virt.vf_errors.lock);
+   mutex_init(&adev->virt.rlcg_reg_lock);
hash_init(adev->mn_hash);
mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 3d5f58e76f2d..a72683f83390 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -982,6 +982,9 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 
offset, u32 v, u32 f
scratch_reg1 = (void __iomem *)adev->rmmio + 4 * 
reg_access_ctrl->scratch_reg1;
scratch_reg2 = (void __iomem *)adev->rmmio + 4 * 
reg_access_ctrl->scratch_reg2;
scratch_reg3 = (void __iomem *)adev->rmmio + 4 * 
reg_access_ctrl->scratch_reg3;
+
+   mutex_lock(&adev->virt.rlcg_reg_lock);
+
if (reg_access_ctrl->spare_int)
spare_int = (void __iomem *)adev->rmmio + 4 * 
reg_access_ctrl->spare_int;

@@ -1038,6 +1041,9 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, 
u32 offset, u32 v, u32 f
}

ret = readl(scratch_reg0);
+
+   mutex_unlock(&adev->virt.rlcg_reg_lock);
+
return ret;
 }

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 642f1fd287d8..0ec246c74570 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -272,6 +272,8 @@ struct amdgpu_virt {

/* the ucode id to signal the autoload */
uint32_t autoload_ucode_id;
+
+   struct mutex rlcg_reg_lock;
 };

 struct amdgpu_video_codec_info;
--
2.34.1

RE: [PATCH 2/2] drm/amdgpu: Queue KFD reset workitem in VF FED

2024-05-19 Thread Luo, Zhigang

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Zhigang Luo 

-Original Message-
From: amd-gfx  On Behalf Of Victor 
Skvortsov
Sent: Sunday, May 19, 2024 10:52 AM
To: amd-gfx@lists.freedesktop.org
Cc: Skvortsov, Victor 
Subject: [PATCH 2/2] drm/amdgpu: Queue KFD reset workitem in VF FED

The guest recovery sequence is buggy in Fatal Error when both FLR & KFD reset 
workitems are queued at the same time. In addition, the FLR guest recovery 
sequence is out of order when PF/VF communication breaks due to a GPU fatal error.

As a temporary workaround, perform a KFD style reset (Initiate reset request 
from the guest) inside the pf2vf thread on FED.

Signed-off-by: Victor Skvortsov 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index d98d619fba97..3d5f58e76f2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -602,7 +602,7 @@ static void amdgpu_virt_update_vf2pf_work_item(struct 
work_struct *work)
amdgpu_sriov_runtime(adev)) {
amdgpu_ras_set_fed(adev, true);
if (amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->virt.flr_work))
+ 
&adev->kfd.reset_work))
return;
else
dev_err(adev->dev, "Failed to queue work! at 
%s", __func__);
--
2.34.1

RE: [PATCH 1/2] drm/amdgpu: Extend KIQ reg polling wait for VF

2024-05-19 Thread Luo, Zhigang

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Zhigang Luo 

-Original Message-
From: amd-gfx  On Behalf Of Victor 
Skvortsov
Sent: Sunday, May 19, 2024 10:52 AM
To: amd-gfx@lists.freedesktop.org
Cc: Skvortsov, Victor 
Subject: [PATCH 1/2] drm/amdgpu: Extend KIQ reg polling wait for VF

Runtime KIQ interface to read/write registers in VF may take longer than 
expected for BM environment. Extend the timeout.

Signed-off-by: Victor Skvortsov 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d749c6abdc5e..e8980b6009c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -348,9 +348,9 @@ enum amdgpu_kiq_irq {
AMDGPU_CP_KIQ_IRQ_DRIVER0 = 0,
AMDGPU_CP_KIQ_IRQ_LAST
 };
-#define SRIOV_USEC_TIMEOUT  120 /* wait 12 * 100ms for SRIOV */
-#define MAX_KIQ_REG_WAIT   5000 /* in usecs, 5ms */
-#define MAX_KIQ_REG_BAILOUT_INTERVAL   5 /* in msecs, 5ms */
+#define SRIOV_USEC_TIMEOUT 120 /* wait 12 * 100ms for SRIOV */
+#define MAX_KIQ_REG_WAIT amdgpu_sriov_vf(adev) ? 5 : 5000 /* in usecs, extend for VF */
+#define MAX_KIQ_REG_BAILOUT_INTERVAL 5 /* in msecs, 5ms */
 #define MAX_KIQ_REG_TRY 1000

 int amdgpu_device_ip_set_clockgating_state(void *dev,
--
2.34.1

RE: [PATCH 4/4] drm/amdgpu: Move ras resume into SRIOV function

2024-04-30 Thread Luo, Zhigang

[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo 

-Original Message-
From: amd-gfx  On Behalf Of Yunxiang Li
Sent: Thursday, April 25, 2024 11:58 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Koenig, Christian 
; Lazar, Lijo ; Kuehling, Felix 
; Deng, Emily ; Li, Yunxiang 
(Teddy) 
Subject: [PATCH 4/4] drm/amdgpu: Move ras resume into SRIOV function

This is part of the reset, move it into the reset function.

Signed-off-by: Yunxiang Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3c4755f3c116..8f2c1f71ed9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5119,6 +5119,11 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,
amdgpu_amdkfd_post_reset(adev);
amdgpu_virt_release_full_gpu(adev, true);

+   /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras 
during reset */
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
+   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
+   amdgpu_ras_resume(adev);
return 0;
 }

@@ -5823,13 +5828,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
goto retry;
if (r)
adev->asic_reset_res = r;
-
-   /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need 
resume ras during reset */
-   if (amdgpu_ip_version(adev, GC_HWIP, 0) ==
-   IP_VERSION(9, 4, 2) ||
-   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) 
||
-   amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
-   amdgpu_ras_resume(adev);
} else {
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
if (r && r == -EAGAIN)
--
2.34.1

RE: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR

2024-04-30 Thread Luo, Zhigang

[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo 

-Original Message-
From: amd-gfx  On Behalf Of Yunxiang Li
Sent: Friday, April 26, 2024 2:27 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Koenig, Christian 
; Lazar, Lijo ; Kuehling, Felix 
; Deng, Emily ; Li, Yunxiang 
(Teddy) 
Subject: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR

There are other reset sources that pass NULL as the job pointer, such as 
amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if the FLR 
comes from the host does not work.

Add a flag in reset_context to explicitly mark host triggered reset, and set 
this flag when we receive host reset notification.

Signed-off-by: Yunxiang Li 
---
v2: fix typo
v3: pass reset_context directly
v4: clear the flag in case we retry

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 -  
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  1 +
 5 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8befd10bf007..33c889c027a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct 
amdgpu_device *adev)
  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
  *
  * @adev: amdgpu_device pointer
- * @from_hypervisor: request from hypervisor
+ * @reset_context: amdgpu reset context pointer
  *
  * do VF FLR and reinitialize Asic
  * return 0 means succeeded otherwise failed
  */
 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
-bool from_hypervisor)
+struct amdgpu_reset_context *reset_context)
 {
int r;
struct amdgpu_hive_info *hive = NULL;
@@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,
 retry:
amdgpu_amdkfd_pre_reset(adev);

-   if (from_hypervisor)
+   if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
+   clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
r = amdgpu_virt_request_full_gpu(adev, true);
-   else
+   } else {
r = amdgpu_virt_reset_gpu(adev);
+   }
if (r)
return r;
+
amdgpu_ras_set_fed(adev, false);
amdgpu_irq_gpu_reset_resume_helper(adev);

@@ -5826,7 +5829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* Actual ASIC resets if needed.*/
/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
-   r = amdgpu_device_reset_sriov(adev, job ? false : true);
+   r = amdgpu_device_reset_sriov(adev, reset_context);
if (r)
adev->asic_reset_res = r;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index b11d190ece53..5a9cc043b858 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS {
AMDGPU_NEED_FULL_RESET = 0,
AMDGPU_SKIP_HW_RESET = 1,
AMDGPU_SKIP_COREDUMP = 2,
+   AMDGPU_HOST_FLR = 3,
 };

 struct amdgpu_reset_context {
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index c5ba9c4757a8..f4c47492e0cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+   set_bit(AMDGPU_HOST_FLR, &reset_context.flags);

amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index fa9d1b02f391..14cc7910e5cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+   set_bit(AMDGPU_HOST_FLR, &reset_context.flags);

amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 14a065516ae4..78cd07744ebe 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -529,6

RE: [PATCH v2 3/4] drm/amdgpu: Fix amdgpu_device_reset_sriov retry logic

2024-04-30 Thread Luo, Zhigang

[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo 

-Original Message-
From: amd-gfx  On Behalf Of Yunxiang Li
Sent: Friday, April 26, 2024 2:29 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Koenig, Christian 
; Lazar, Lijo ; Kuehling, Felix 
; Deng, Emily ; Li, Yunxiang 
(Teddy) 
Subject: [PATCH v2 3/4] drm/amdgpu: Fix amdgpu_device_reset_sriov retry logic

The retry loop for SRIOV reset has refcount and memory leak issues.
Depending on which function call fails, it can potentially call 
amdgpu_amdkfd_pre/post_reset a different number of times, causing the kfd_locked 
count to be wrong. This will block all future attempts at opening /dev/kfd. The 
retry loop also leaks resources by calling amdgpu_virt_init_data_exchange 
multiple times without calling the corresponding fini function.

Align with the bare-metal reset path which doesn't have these issues.
This means taking the amdgpu_amdkfd_pre/post_reset functions out of the reset 
loop and calling amdgpu_device_pre_asic_reset each retry, which properly frees 
the resources from the previous try by calling amdgpu_virt_fini_data_exchange.

Signed-off-by: Yunxiang Li 
---
v2: put back release full access and the missed return

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 33c889c027a5..b23645f23a2e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5065,10 +5065,6 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,  {
int r;
struct amdgpu_hive_info *hive = NULL;
-   int retry_limit = 0;
-
-retry:
-   amdgpu_amdkfd_pre_reset(adev);

if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); @@ -5088,7 
+5084,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
if (r)
-   goto error;
+   return r;

amdgpu_virt_init_data_exchange(adev);

@@ -5099,38 +5095,35 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,
/* now we are okay to resume SMC/CP/SDMA */
r = amdgpu_device_ip_reinit_late_sriov(adev);
if (r)
-   goto error;
+   return r;

hive = amdgpu_get_xgmi_hive(adev);
/* Update PSP FW topology after reset */
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
r = amdgpu_xgmi_update_topology(hive, adev);
-
if (hive)
amdgpu_put_xgmi_hive(hive);
+   if (r)
+   return r;

-   if (!r) {
-   r = amdgpu_ib_ring_tests(adev);
-
-   amdgpu_amdkfd_post_reset(adev);
-   }
+   r = amdgpu_ib_ring_tests(adev);
+   if (r)
+   return r;

-error:
-   if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
+   if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
amdgpu_inc_vram_lost(adev);
r = amdgpu_device_recover_vram(adev);
}
-   amdgpu_virt_release_full_gpu(adev, true);
+   if (r)
+   return r;

-   if (AMDGPU_RETRY_SRIOV_RESET(r)) {
-   if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
-   retry_limit++;
-   goto retry;
-   } else
-   DRM_ERROR("GPU reset retry is beyond the retry 
limit\n");
-   }
+   /* need to be called during full access so we can't do it later like
+* bare-metal does.
+*/
+   amdgpu_amdkfd_post_reset(adev);
+   amdgpu_virt_release_full_gpu(adev, true);

-   return r;
+   return 0;
 }

 /**
@@ -5689,6 +5682,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool need_emergency_restart = false;
bool audio_suspended = false;
+   int retry_limit = AMDGPU_MAX_RETRY_LIMIT;

/*
 * Special case: RAS triggered and full reset isn't supported @@ 
-5770,8 +5764,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

-   if (!amdgpu_sriov_vf(tmp_adev))
-   amdgpu_amdkfd_pre_reset(tmp_adev);
+   amdgpu_amdkfd_pre_reset(tmp_adev);

/*
 * Mark these ASICs to be reseted as untracked first @@ -5830,6 
+5823,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
r = amdgpu_device_reset_sriov(adev, reset_context);
+   if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
+

RE: [PATCH 1/3] amd/amdkfd: add a function to wait no process running in kfd

2024-03-25 Thread Luo, Zhigang

[AMD Official Use Only - General]

Hi Mukul,

The purpose of adding this function is to wait for user applications to be 
killed after calling amdgpu_amdkfd_pre_reset().
If I understand correctly, kfd_locked will be set after calling 
amdgpu_amdkfd_pre_reset(). So, I added this new function to only check if 
kfd_processes_table is empty.

Thanks,
Zhigang

-Original Message-
From: Joshi, Mukul 
Sent: Monday, March 25, 2024 12:08 PM
To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Saye, Sashank 
; Chan, Hing Pong ; Yang, Philip 
; Lazar, Lijo ; Luo, Zhigang 

Subject: RE: [PATCH 1/3] amd/amdkfd: add a function to wait no process running 
in kfd

[AMD Official Use Only - General]

> -Original Message-
> From: amd-gfx  On Behalf Of
> Zhigang Luo
> Sent: Monday, March 25, 2024 11:18 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Saye, Sashank
> ; Chan, Hing Pong ; Yang,
> Philip ; Lazar, Lijo ; Luo,
> Zhigang 
> Subject: [PATCH 1/3] amd/amdkfd: add a function to wait no process
> running in kfd
>
> Caution: This message originated from an External Source. Use proper
> caution when opening attachments, clicking links, or responding.
>
>
> Signed-off-by: Zhigang Luo 
> Change-Id: I2a98d513c26107ac76ecf20e951c188afbc7ede6
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 20
> 
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c| 11 +++
>  3 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index d5fde8adf19b..e02bfcec608b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -261,6 +261,26 @@ int amdgpu_amdkfd_resume(struct amdgpu_device
> *adev, bool run_pm)
> return r;
>  }
>
> +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device
> *adev) {
> +   unsigned long end_jiffies;
> +
> +   if (!adev->kfd.dev)
> +   return 0;
> +
> +   end_jiffies =
> msecs_to_jiffies(AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS) + jiffies;
> +   while (!kgd2kfd_is_processes_table_empty(adev->kfd.dev)) {
> +   if (time_after(jiffies, end_jiffies)) {
> +   dev_err(adev->dev, "wait no process running
> + timeout\n");
> +
> +   return -ETIME;
> +   }
> +   schedule();
> +   }
> +
> +   return 0;
> +}
> +
>  int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)  {
> int r = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index caee36e52a09..d46dccc5bbf7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -38,6 +38,8 @@
>  #include "amdgpu_vm.h"
>  #include "amdgpu_xcp.h"
>
> +#define AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS 1
> +
>  extern uint64_t amdgpu_amdkfd_total_mem_size;
>
>  enum TLB_FLUSH_TYPE {
> @@ -169,7 +171,7 @@ void amdgpu_amdkfd_set_compute_idle(struct
> amdgpu_device *adev, bool idle);  bool
> amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
>
>  bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
> -
> +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device
> *adev);
>  int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
>
>  int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev); @@ -411,6
> +413,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,  void
> kgd2kfd_device_exit(struct kfd_dev *kfd);  void kgd2kfd_suspend(struct
> kfd_dev *kfd, bool run_pm);  int kgd2kfd_resume(struct kfd_dev *kfd,
> bool run_pm);
> +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd);
>  int kgd2kfd_pre_reset(struct kfd_dev *kfd);  int
> kgd2kfd_post_reset(struct kfd_dev *kfd);  void
> kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); @@
> -454,6 +457,11 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool 
> run_pm)
> return 0;
>  }
>
> +static inline bool kgd2kfd_is_processes_table_empty(struct kfd_dev
> +*kfd) {
> +   return true;
> +}
> +
>  static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)  {
> return 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 041ec3de55e7..2bec79e0c721 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -894,6 +894,17 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>

RE: [PATCH] drm/amdgpu: Skip virt_exchange_init on SDMA poison consumption

2024-03-14 Thread Luo, Zhigang

[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo 

-Original Message-
From: Skvortsov, Victor 
Sent: Tuesday, March 12, 2024 10:09 PM
To: Skvortsov, Victor ; Chai, Thomas 
; Zhang, Hawking ; Luo, Zhigang 
; amd-gfx@lists.freedesktop.org
Cc: Skvortsov, Victor 
Subject: [PATCH] drm/amdgpu: Skip virt_exchange_init on SDMA poison consumption

From: Victor Skvortsov 

Host will initiate an FLR in SDMA poison consumption scenario.
Guest should wait for FLR message to re-init data exchange.

Signed-off-by: Victor Skvortsov 
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 77f5b55decf9..a1bad772d932 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -444,7 +444,8 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device 
*adev,
amdgpu_virt_fini_data_exchange(adev);
xgpu_nv_send_access_requests_with_param(adev,
IDH_RAS_POISON, block, 0, 0);
-   amdgpu_virt_init_data_exchange(adev);
+   if (block != AMDGPU_RAS_BLOCK__SDMA)
+   amdgpu_virt_init_data_exchange(adev);
}
 }

--
2.25.1

RE: [PATCH] drm/amdgpu/: Remove bo_create_kernel_at path from virt page

2024-03-14 Thread Luo, Zhigang

[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo 

-Original Message-
From: Skvortsov, Victor 
Sent: Tuesday, March 12, 2024 1:51 PM
To: Skvortsov, Victor ; Luo, Zhigang 
; amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian 
Subject: [PATCH] drm/amdgpu/: Remove bo_create_kernel_at path from virt page

Use amdgpu_vram_mgr to reserve bad page ranges.
Reserved ranges will be freed by amdgpu_vram_mgr_fini().
Delete bo_create path as it is redundant.

Suggested-by: Christian König 
Signed-off-by: Victor Skvortsov 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 55 ++--  
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 -
 2 files changed, 3 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 7a4eae36778a..2a20714b9c16 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -244,7 +244,6 @@ static int amdgpu_virt_init_ras_err_handler_data(struct 
amdgpu_device *adev)
 */
unsigned int align_space = 512;
void *bps = NULL;
-   struct amdgpu_bo **bps_bo = NULL;

*data = kmalloc(sizeof(struct amdgpu_virt_ras_err_handler_data), 
GFP_KERNEL);
if (!*data)
@@ -254,12 +253,7 @@ static int amdgpu_virt_init_ras_err_handler_data(struct 
amdgpu_device *adev)
if (!bps)
goto bps_failure;

-   bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), 
GFP_KERNEL);
-   if (!bps_bo)
-   goto bps_bo_failure;
-
(*data)->bps = bps;
-   (*data)->bps_bo = bps_bo;
(*data)->count = 0;
(*data)->last_reserved = 0;

@@ -267,34 +261,12 @@ static int amdgpu_virt_init_ras_err_handler_data(struct 
amdgpu_device *adev)

return 0;

-bps_bo_failure:
-   kfree(bps);
 bps_failure:
kfree(*data);
 data_failure:
return -ENOMEM;
 }

-static void amdgpu_virt_ras_release_bp(struct amdgpu_device *adev) -{
-   struct amdgpu_virt *virt = &adev->virt;
-   struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
-   struct amdgpu_bo *bo;
-   int i;
-
-   if (!data)
-   return;
-
-   for (i = data->last_reserved - 1; i >= 0; i--) {
-   bo = data->bps_bo[i];
-   if (bo) {
-   amdgpu_bo_free_kernel(&bo, NULL, NULL);
-   data->bps_bo[i] = bo;
-   }
-   data->last_reserved = i;
-   }
-}
-
 void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev)  {
struct amdgpu_virt *virt = &adev->virt; @@ -305,10 +277,7 @@ void 
amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev)
if (!data)
return;

-   amdgpu_virt_ras_release_bp(adev);
-
kfree(data->bps);
-   kfree(data->bps_bo);
kfree(data);
virt->virt_eh_data = NULL;
 }
@@ -330,9 +299,6 @@ static void amdgpu_virt_ras_reserve_bps(struct 
amdgpu_device *adev)  {
struct amdgpu_virt *virt = &adev->virt;
struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
-   struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
-   struct ttm_resource_manager *man = &mgr->manager;
-   struct amdgpu_bo *bo = NULL;
uint64_t bp;
int i;

@@ -341,26 +307,11 @@ static void amdgpu_virt_ras_reserve_bps(struct 
amdgpu_device *adev)

for (i = data->last_reserved; i < data->count; i++) {
bp = data->bps[i].retired_page;
+   if (amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
+   bp << AMDGPU_GPU_PAGE_SHIFT, AMDGPU_GPU_PAGE_SIZE))
+   DRM_DEBUG("RAS WARN: reserve vram for retired page %llx 
fail\n",
+bp);

-   /* There are two cases of reserve error should be ignored:
-* 1) a ras bad page has been allocated (used by someone);
-* 2) a ras bad page has been reserved (duplicate error 
injection
-*for one page);
-*/
-   if  (ttm_resource_manager_used(man)) {
-   amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
-   bp << AMDGPU_GPU_PAGE_SHIFT,
-   AMDGPU_GPU_PAGE_SIZE);
-   data->bps_bo[i] = NULL;
-   } else {
-   if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
-   AMDGPU_GPU_PAGE_SIZE,
-   &bo, NULL))
-   DRM_DEBUG("RAS WARN: reserve vram for retired 
page %llx fail\n", bp);
-   data->bps_bo[i] = bo;
-   }
data->

RE: [PATCH 2/4] drm/amdgpu: Do not program SQ_TIMEOUT_CONFIG in SRIOV

2024-02-23 Thread Luo, Zhigang

[AMD Official Use Only - General]

Reviewed By Zhigang Luo mailto:zhigang@amd.com>>


From: Lu, Victor Cheng Chi (Victor) 
Sent: Friday, February 16, 2024 1:50 PM
To: Luo, Zhigang 
Subject: Fw: [PATCH 2/4] drm/amdgpu: Do not program SQ_TIMEOUT_CONFIG in SRIOV


[AMD Official Use Only - General]



From: Lu, Victor Cheng Chi (Victor) 
mailto:victorchengchi...@amd.com>>
Sent: Tuesday, January 2, 2024 12:30 PM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
mailto:amd-gfx@lists.freedesktop.org>>
Cc: Chander, Vignesh mailto:vignesh.chan...@amd.com>>; 
Lu, Victor Cheng Chi (Victor) 
mailto:victorchengchi...@amd.com>>
Subject: [PATCH 2/4] drm/amdgpu: Do not program SQ_TIMEOUT_CONFIG in SRIOV

VF should not program this register.

Signed-off-by: Victor Lu 
mailto:victorchengchi...@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 00b21ece081f..30cc155f20d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3888,6 +3888,9 @@ static void gfx_v9_4_3_inst_enable_watchdog_timer(struct 
amdgpu_device *adev,
 uint32_t i;
 uint32_t data;

+   if (amdgpu_sriov_vf(adev))
+   return;
+
 data = RREG32_SOC15(GC, GET_INST(GC, 0), regSQ_TIMEOUT_CONFIG);
 data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
  amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : 
0);
--
2.34.1

RE: [PATCH 3/4] drm/amdgpu: Use correct SRIOV macro for gmc_v9_0_vm_fault_interrupt_state

2024-02-16 Thread Luo, Zhigang

[AMD Official Use Only - General]

Reviewed By Zhigang Luo 

From: Lu, Victor Cheng Chi (Victor) 
Sent: Friday, February 16, 2024 1:50 PM
To: Luo, Zhigang 
Subject: Fw: [PATCH 3/4] drm/amdgpu: Use correct SRIOV macro for 
gmc_v9_0_vm_fault_interrupt_state


[AMD Official Use Only - General]



From: Lu, Victor Cheng Chi (Victor) 
mailto:victorchengchi...@amd.com>>
Sent: Tuesday, January 2, 2024 12:30 PM
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> 
mailto:amd-gfx@lists.freedesktop.org>>
Cc: Chander, Vignesh mailto:vignesh.chan...@amd.com>>; 
Lu, Victor Cheng Chi (Victor) 
mailto:victorchengchi...@amd.com>>
Subject: [PATCH 3/4] drm/amdgpu: Use correct SRIOV macro for 
gmc_v9_0_vm_fault_interrupt_state

Under SRIOV, programming to VM_CONTEXT*_CNTL regs failed because the
current macro does not pass through the correct xcc instance.
Use the *REG32_XCC macro in this case.

The behaviour without SRIOV is the same.

Signed-off-by: Victor Lu 
mailto:victorchengchi...@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 473a774294ce..e2e14d40109c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -496,14 +496,14 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct 
amdgpu_device *adev,
 if (j >= AMDGPU_MMHUB0(0))
 tmp = RREG32_SOC15_IP(MMHUB, reg);
 else
-   tmp = RREG32_SOC15_IP(GC, reg);
+   tmp = RREG32_XCC(reg, j);

 tmp &= ~bits;

 if (j >= AMDGPU_MMHUB0(0))
 WREG32_SOC15_IP(MMHUB, reg, tmp);
 else
-   WREG32_SOC15_IP(GC, reg, tmp);
+   WREG32_XCC(reg, tmp, j);
 }
 }
 break;
@@ -524,14 +524,14 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct 
amdgpu_device *adev,
 if (j >= AMDGPU_MMHUB0(0))
 tmp = RREG32_SOC15_IP(MMHUB, reg);
 else
-   tmp = RREG32_SOC15_IP(GC, reg);
+   tmp = RREG32_XCC(reg, j);

 tmp |= bits;

 if (j >= AMDGPU_MMHUB0(0))
 WREG32_SOC15_IP(MMHUB, reg, tmp);
 else
-   WREG32_SOC15_IP(GC, reg, tmp);
+   WREG32_XCC(reg, tmp, j);
 }
 }
 break;
--
2.34.1

RE: [PATCH] drm/amdgpu: xgmi_fill_topology_info

2023-12-08 Thread Luo, Zhigang

[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo 

-Original Message-
From: Chander, Vignesh 
Sent: Thursday, December 7, 2023 7:42 PM
To: amd-gfx@lists.freedesktop.org
Cc: Lazar, Lijo ; Luo, Zhigang ; Kim, 
Jonathan ; Chander, Vignesh 
Subject: [PATCH] drm/amdgpu: xgmi_fill_topology_info

1. Use the mirrored topology info to fill links for VF.
The new solution is required to simplify and optimize host driver logic.
Only use the new solution for VFs that support full duplex and 
extended_peer_link_info otherwise the info would be incomplete.

2. Avoid calling extended_link_info on VF as it's not supported

Signed-off-by: Vignesh Chander 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  |  4 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 58 
 2 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a21045d018f2..1bf975b8d083 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1433,8 +1433,8 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
 get_extended_data) ||
amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
IP_VERSION(13, 0, 6);
-   bool ta_port_num_support = psp->xgmi_context.xgmi_ta_caps &
-   EXTEND_PEER_LINK_INFO_CMD_FLAG;
+   bool ta_port_num_support = amdgpu_sriov_vf(psp->adev) ? 0 :
+   psp->xgmi_context.xgmi_ta_caps & 
EXTEND_PEER_LINK_INFO_CMD_FLAG;

/* popluate the shared output buffer rather than the cmd input 
buffer
 * with node_ids as the input for GET_PEER_LINKS command 
execution.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 44d8c1a11e1b..dd82d73daed6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -823,6 +823,28 @@ static int 
amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_inf
return 0;
 }

+void amdgpu_xgmi_fill_topology_info(struct amdgpu_device *adev,
+   struct amdgpu_device *peer_adev)
+{
+   struct psp_xgmi_topology_info *top_info = 
&adev->psp.xgmi_context.top_info;
+   struct psp_xgmi_topology_info *peer_info =
+&peer_adev->psp.xgmi_context.top_info;
+
+   for (int i = 0; i < peer_info->num_nodes; i++) {
+   if (peer_info->nodes[i].node_id == adev->gmc.xgmi.node_id) {
+   for (int j = 0; j < top_info->num_nodes; j++) {
+   if (top_info->nodes[j].node_id == 
peer_adev->gmc.xgmi.node_id) {
+   peer_info->nodes[i].num_hops = 
top_info->nodes[j].num_hops;
+   peer_info->nodes[i].is_sharing_enabled =
+   
top_info->nodes[j].is_sharing_enabled;
+   peer_info->nodes[i].num_links =
+   
top_info->nodes[j].num_links;
+   return;
+   }
+   }
+   }
+   }
+}
+
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev)  {
struct psp_xgmi_topology_info *top_info; @@ -897,18 +919,38 @@ int 
amdgpu_xgmi_add_device(struct amdgpu_device *adev)
goto exit_unlock;
}

-   /* get latest topology info for each device from psp */
-   list_for_each_entry(tmp_adev, &hive->device_list, 
gmc.xgmi.head) {
-   ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
-   &tmp_adev->psp.xgmi_context.top_info, 
false);
+   if (amdgpu_sriov_vf(adev) &&
+   psp->xgmi_context.xgmi_ta_caps & 
EXTEND_PEER_LINK_INFO_CMD_FLAG) {
+   /* only get topology for VF being init if it can 
support full duplex */
+   ret = psp_xgmi_get_topology_info(&adev->psp, count,
+   
&adev->psp.xgmi_context.top_info, false);
if (ret) {
-   dev_err(tmp_adev->dev,
+   dev_err(adev->dev,
"XGMI: Get topology failure on device 
%llx, hive %llx, ret %d",
-   tmp_adev->gmc.xgmi.node_id,
-   tmp_adev->gmc.xgmi.hive_id, ret);
-   /* To do : continue with some node failed or 
disable the whole hive */
+

RE: [PATCH] drm/amdgpu/jpeg - skip change of power-gating state for sriov

2023-08-17 Thread Luo, Zhigang

[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo 

-Original Message-
From: Dhume, Samir 
Sent: Wednesday, August 16, 2023 9:20 PM
To: amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang ; Liu, Leo ; Deucher, 
Alexander ; Dhume, Samir 
Subject: [PATCH] drm/amdgpu/jpeg - skip change of power-gating state for sriov

Signed-off-by: Samir Dhume 
---
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 15612915bb6c..1de79d660285 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -360,8 +360,10 @@ static int jpeg_v4_0_3_hw_fini(void *handle)

cancel_delayed_work_sync(&adev->jpeg.idle_work);

-   if (adev->jpeg.cur_state != AMD_PG_STATE_GATE)
-   ret = jpeg_v4_0_3_set_powergating_state(adev, 
AMD_PG_STATE_GATE);
+   if (!amdgpu_sriov_vf(adev)) {
+   if (adev->jpeg.cur_state != AMD_PG_STATE_GATE)
+   ret = jpeg_v4_0_3_set_powergating_state(adev, 
AMD_PG_STATE_GATE);
+   }

return ret;
 }
--
2.34.1

RE: [PATCH v3] drm/amdgpu: Add RLCG interface driver implementation for gfx v9.4.3 (v3)

2023-07-11 Thread Luo, Zhigang

[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo 

-Original Message-
From: Lu, Victor Cheng Chi (Victor) 
Sent: Tuesday, July 11, 2023 10:18 AM
To: amd-gfx@lists.freedesktop.org
Cc: Skvortsov, Victor ; Ming, Davis 
; Luo, Zhigang ; Lu, Victor Cheng Chi 
(Victor) 
Subject: [PATCH v3] drm/amdgpu: Add RLCG interface driver implementation for 
gfx v9.4.3 (v3)

Add RLCG interface support for gfx v9.4.3 and multiple XCCs.
Do not enable it yet.

v2: Fix amdgpu_rlcg_reg_access_ctrl init, add support for multiple XCCs
in amdgpu_mm_wreg_mmio_rlc

v3: Use GET_INST() when indexing amdgpu_rlcg_reg_access_ctrl

Signed-off-by: Victor Lu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c| 17 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h|  4 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 22 +++
 drivers/gpu/drm/amd/amdgpu/soc15_common.h   | 66 ++---
 11 files changed, 81 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c578e07fd90a..a6647a1d13e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1182,7 +1182,7 @@ void amdgpu_device_indirect_wreg_ext(struct amdgpu_device 
*adev,
 u32 pcie_index, u32 pcie_index_hi,
 u32 pcie_data, u64 reg_addr, u32 reg_data);  
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
-uint32_t reg, uint32_t v);
+uint32_t reg, uint32_t v, uint32_t xcc_id);
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t 
value);  uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 241d8c5da273..fc77bea72db7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -173,7 +173,7 @@ static int  amdgpu_debugfs_process_reg_op(bool read, struct 
file *f,
} else {
r = get_user(value, (uint32_t *)buf);
if (!r)
-   amdgpu_mm_wreg_mmio_rlc(adev, *pos >> 2, value);
+   amdgpu_mm_wreg_mmio_rlc(adev, *pos >> 2, value, 
0);
}
if (r) {
result = r;
@@ -301,7 +301,7 @@ static ssize_t amdgpu_debugfs_regs2_op(struct file *f, char 
__user *buf, u32 off
} else {
r = get_user(value, (uint32_t *)buf);
if (!r)
-   amdgpu_mm_wreg_mmio_rlc(adev, offset >> 2, 
value);
+   amdgpu_mm_wreg_mmio_rlc(adev, offset >> 2, 
value, rd->id.xcc_id);
}
if (r) {
result = r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 45b335c766fd..abc56085b136 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -586,7 +586,8 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
  * this function is invoked only for the debugfs register access
  */
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
-uint32_t reg, uint32_t v)
+uint32_t reg, uint32_t v,
+uint32_t xcc_id)
 {
if (amdgpu_device_skip_hw_access(adev))
return;
@@ -595,7 +596,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
adev->gfx.rlc.funcs &&
adev->gfx.rlc.funcs->is_rlcg_access_range) {
if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
-   return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
+   return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
} else if ((reg * 4) >= adev->rmmio_size) {
adev->pcie_wreg(adev, reg * 4, v);
} else {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h
index 80b263646966..b591d33af264 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h
@@ -26,6 +26,8 @@

 #include "clearstate_defs.h"

+#define AMDGPU_MAX_RLC_INSTANCES   8
+
 /* firmware ID used in rlc toc */
 typedef enum _FIRMWARE_ID_ {
FIRMWARE_ID_INVALID

RE: [PATCH 1/4] drm/amdgpu: skip reset other device in the same hive if it's SRIOV VF

2021-12-07 Thread Luo, Zhigang

[AMD Official Use Only]

Shaoyun, please see my comments inline.

Thanks,
Zhigang

-Original Message-
From: Liu, Shaoyun  
Sent: December 7, 2021 2:15 PM
To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: RE: [PATCH 1/4] drm/amdgpu: skip reset other device in the same hive 
if it's SRIOV VF

[AMD Official Use Only]

This patch looks OK to me.
Patch 2 actually adds the PSP XGMI init, not the whole XGMI init; can you
change the description accordingly?
[Zhigang] Ok. Will change it.
Patch 3: you take the hive lock inside the reset sriov function, but the
hive lock has already been taken in the gpu_recovery function before this
function is called, so is it really necessary to get the hive inside the reset
sriov function? Can you try removing the code that checks the hive? Or maybe
pass the hive as a parameter into this function if the hive is needed?
[Zhigang] In patch 1, we changed gpu_recovery to skip getting the xgmi hive
if it's an sriov vf, as we don't want to reset other VFs in the same hive.
Patch 4 looks OK to me, but an SRDC engineer may need to confirm that it won't
have side effects on other AI ASICs.

Regards
Shaoyun.liu

-Original Message-
From: amd-gfx  On Behalf Of Zhigang Luo
Sent: Tuesday, December 7, 2021 11:57 AM
To: amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: [PATCH 1/4] drm/amdgpu: skip reset other device in the same hive if 
it's SRIOV VF

On SRIOV, the host driver can support FLR (function level reset) on an
individual VF within the hive, which might bring the individual device back to
normal without the need to execute a hive reset. If the FLR fails, the host
driver will trigger the hive reset, and each guest VF will get a reset
notification before the real hive reset is executed. The VF device can handle
the reset request individually in its reset work handler.

This change updates the gpu recovery sequence to skip resetting other devices
in the same hive for an SRIOV VF.

Signed-off-by: Zhigang Luo 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3c5afa45173c..474f8ea58aa5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4746,7 +4746,7 @@ static int amdgpu_device_lock_hive_adev(struct 
amdgpu_device *adev, struct amdgp  {
struct amdgpu_device *tmp_adev = NULL;
 
-   if (adev->gmc.xgmi.num_physical_nodes > 1) {
+   if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) 
+{
if (!hive) {
dev_err(adev->dev, "Hive is NULL while device has 
multiple xgmi nodes");
return -ENODEV;
@@ -4958,7 +4958,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 * We always reset all schedulers for device and all devices for XGMI
 * hive so that should take care of them too.
 */
-   hive = amdgpu_get_xgmi_hive(adev);
+   if (!amdgpu_sriov_vf(adev))
+   hive = amdgpu_get_xgmi_hive(adev);
if (hive) {
if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as 
another already in progress", @@ -4999,7 +5000,7 @@ int 
amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 * to put adev in the 1st position.
 */
INIT_LIST_HEAD(&device_list);
-   if (adev->gmc.xgmi.num_physical_nodes > 1) {
+   if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) 
+{
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
list_add_tail(&tmp_adev->reset_list, &device_list);
if (!list_is_first(&adev->reset_list, &device_list))
--
2.17.1

RE: [PATCH] drm/amdgpu: correct MMSCH version

2021-08-16 Thread Luo, Zhigang

[AMD Official Use Only]

Yes, the information is from MMSCH firmware team.
They are saying the version info is not used in the old MMSCH firmware, so it 
won't break the ASICs using old MMSCH firmware.
Ok, I will change the commit message to specify mmsch 1.0.

Thanks,
Zhigang

-Original Message-
From: Liu, Shaoyun  
Sent: August 16, 2021 10:49 AM
To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu: correct MMSCH version

[AMD Official Use Only]

Is that information from the MM team?
Please make sure it won't break the ASICs that use the same code path. Also, if
this is true for all of mmsch_v1.0, you need to specify that this is MMSCH v1.0,
since other MMSCH versions will still use this major and minor.

Shaoyun.liu


-Original Message-
From: amd-gfx  On Behalf Of Zhigang Luo
Sent: Thursday, August 12, 2021 11:07 AM
To: amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: [PATCH] drm/amdgpu: correct MMSCH version

MMSCH doesn't have major/minor version, only version.

Signed-off-by: Zhigang Luo 
---
 drivers/gpu/drm/amd/amdgpu/mmsch_v1_0.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mmsch_v1_0.h 
b/drivers/gpu/drm/amd/amdgpu/mmsch_v1_0.h
index 20958639b601..2cdab8062c86 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmsch_v1_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/mmsch_v1_0.h
@@ -24,9 +24,7 @@
 #ifndef __MMSCH_V1_0_H__
 #define __MMSCH_V1_0_H__
 
-#define MMSCH_VERSION_MAJOR1
-#define MMSCH_VERSION_MINOR0
-#define MMSCH_VERSION  (MMSCH_VERSION_MAJOR << 16 | MMSCH_VERSION_MINOR)
+#define MMSCH_VERSION  0x1
 
 enum mmsch_v1_0_command_type {
MMSCH_COMMAND__DIRECT_REG_WRITE = 0,
-- 
2.17.1

RE: [PATCH 4/5] drm/amdgpu: add psp microcode init for arcturus and aldebaran sriov vf

2021-06-07 Thread Luo, Zhigang

[Public]

Okay. I will update the change as you suggested.

Thanks,
Zhigang

-Original Message-
From: Zhang, Hawking 
Sent: June 7, 2021 9:52 AM
To: Luo, Zhigang ; Liu, Shaoyun ; 
amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 4/5] drm/amdgpu: add psp microcode init for arcturus and 
aldebaran sriov vf

[AMD Official Use Only]

You can call psp_init_ta_microcode directly in sriov vf case so you don't need 
to initialize unnecessary psp firmware structures.

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Luo, Zhigang
Sent: Thursday, June 3, 2021 23:32
To: Liu, Shaoyun ; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 4/5] drm/amdgpu: add psp microcode init for arcturus and 
aldebaran sriov vf

Yeah, it will also init psp sos and asd microcode. But I think it's harmless.

Thanks,
Zhigang

-Original Message-
From: Liu, Shaoyun 
Sent: June 3, 2021 11:13 AM
To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: RE: [PATCH 4/5] drm/amdgpu: add psp microcode init for arcturus and 
aldebaran sriov vf

[AMD Official Use Only]

This one doesn't look like it applies to the XGMI TA only; it's for the whole
PSP init. Can you double-check it?


Shaoyun.liu

-Original Message-
From: amd-gfx  On Behalf Of Zhigang Luo
Sent: Thursday, June 3, 2021 10:13 AM
To: amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: [PATCH 4/5] drm/amdgpu: add psp microcode init for arcturus and 
aldebaran sriov vf

need to load xgmi ta for arcturus and aldebaran sriov vf.

Signed-off-by: Zhigang Luo 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 55378c6b9722..6bd7e39c3e75 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -170,7 +170,8 @@ static int psp_sw_init(void *handle)
struct psp_context *psp = &adev->psp;
int ret;

-   if (!amdgpu_sriov_vf(adev)) {
+   if ((adev->asic_type == CHIP_ARCTURUS) ||
+   (adev->asic_type == CHIP_ALDEBARAN) || (!amdgpu_sriov_vf(adev))) {
ret = psp_init_microcode(psp);
if (ret) {
DRM_ERROR("Failed to load psp firmware!\n");
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7Chawking.zhang%40amd.com%7C36ad44d9dbca4d9ca2d408d926a4b1f1%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637583311075104399%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=tcFmUpgqBn0YW7B99DfjQl4cP4k7%2FHQGplQU0Zjj%2FGw%3D&reserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7Chawking.zhang%40amd.com%7C36ad44d9dbca4d9ca2d408d926a4b1f1%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637583311075104399%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=tcFmUpgqBn0YW7B99DfjQl4cP4k7%2FHQGplQU0Zjj%2FGw%3D&reserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from getting fb location

2021-06-04 Thread Luo, Zhigang

[AMD Official Use Only]

The policy is defined by our virtualization team to guarantee end user 
experience and reduce maintenance work.

Added David, virtualization team architect.

David, could you help to add more comments?

Thanks,
Zhigang

-Original Message-
From: Christian König 
Sent: June 4, 2021 9:46 AM
To: Luo, Zhigang ; Deng, Emily ; Liu, 
Shaoyun ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from getting fb 
location

Well, but are you the one defining the compatibility policy?

See usually Linux kernel code compatibility policy is that existing stuff needs 
to work forever.

We could argue a bit that the hypervisor components are not open source nor 
uAPI, but that argument is rather thin.

Christian.

Am 04.06.21 um 15:23 schrieb Luo, Zhigang:
> [AMD Official Use Only]
>
> Here is our hypervisor driver compatibility policy:
>  - Host.y supports Guest.y-1, Guest.y, Guest.y+1
>  - Guest.y supported by Host.y-1, Host.y,Host.y+1
>
> Host driver had the feature for gfx9 2 years ago. So, this change meets our
> compatibility policy.
>
> Thanks,
> Zhigang
>
> -Original Message-
> From: Christian König 
> Sent: June 4, 2021 7:14 AM
> To: Deng, Emily ; Liu, Shaoyun
> ; Luo, Zhigang ;
> amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from
> getting fb location
>
> I was just about to question the same thing.
>
> It looks really good to me to have that cleaned up, but if this breaks with 
> older versions of the hypervisor then it is a bit questionable change.
>
> Regards,
> Christian.
>
> Am 04.06.21 um 03:13 schrieb Deng, Emily:
>> Do we need to consider backward compatibility?
>>
>>
>> Best wishes
>> Emily Deng
>>
>>
>>> -Original Message-
>>> From: amd-gfx  On Behalf Of
>>> Liu, Shaoyun
>>> Sent: Thursday, June 3, 2021 11:10 PM
>>> To: Luo, Zhigang ;
>>> amd-gfx@lists.freedesktop.org
>>> Cc: Luo, Zhigang 
>>> Subject: RE: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from
>>> getting fb location
>>>
>>> [AMD Official Use Only]
>>>
>>> Looks ok to me .
>>>
>>> Reviewed-By : Shaoyun.liu 
>>>
>>> -Original Message-
>>> From: amd-gfx  On Behalf Of
>>> Zhigang Luo
>>> Sent: Thursday, June 3, 2021 10:13 AM
>>> To: amd-gfx@lists.freedesktop.org
>>> Cc: Luo, Zhigang 
>>> Subject: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from
>>> getting fb location
>>>
>>> host driver programmed fb location registers for vf, no need to check 
>>> anymore.
>>>
>>> Signed-off-by: Zhigang Luo 
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 +
>>> 1 file changed, 1 insertion(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index ceb3968d8326..1c2d9fde9021 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -1292,10 +1292,7 @@ static int gmc_v9_0_late_init(void *handle)
>>> static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev,
>>>   struct amdgpu_gmc *mc) {
>>> -u64 base = 0;
>>> -
>>> -if (!amdgpu_sriov_vf(adev))
>>> -base = adev->mmhub.funcs->get_fb_location(adev);
>>> +u64 base = adev->mmhub.funcs->get_fb_location(adev);
>>>
>>>   /* add the xgmi offset of the physical node */
>>>   base += adev->gmc.xgmi.physical_node_id * adev-
>>>> gmc.xgmi.node_segment_size;
>>> --
>>> 2.17.1
>>>
>>> ___
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fli
>>> s
>>> ts.fre
>>> edesktop.org%2Fmailman%2Flistinfo%2Famd-
>>> gfx&data=04%7C01%7CEmily.Deng%40amd.com%7Cd41e78b1a3af4f08ff
>>> d108d926a1a2d8%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C63
>>> 7583297946242271%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAi
>>> LCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=Nsz
>>> ZyRZHCxj%2FIJ1hYoSrkv3LpTmF9FbchpNMtQ2GE5M%3D&reserved=0
>>> ___
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam1

RE: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from getting fb location

2021-06-04 Thread Luo, Zhigang

[AMD Official Use Only]

Here is our hypervisor driver compatibility policy:
- Host.y supports Guest.y-1, Guest.y, Guest.y+1
- Guest.y supported by Host.y-1, Host.y,Host.y+1

Host driver had the feature for gfx9 2 years ago. So, this change meets our
compatibility policy.

Thanks,
Zhigang

-Original Message-
From: Christian König 
Sent: June 4, 2021 7:14 AM
To: Deng, Emily ; Liu, Shaoyun ; Luo, 
Zhigang ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from getting fb 
location

I was just about to question the same thing.

It looks really good to me to have that cleaned up, but if this breaks with 
older versions of the hypervisor then it is a bit questionable change.

Regards,
Christian.

Am 04.06.21 um 03:13 schrieb Deng, Emily:
> Do we need to consider backward compatibility?
>
>
> Best wishes
> Emily Deng
>
>
>> -Original Message-
>> From: amd-gfx  On Behalf Of
>> Liu, Shaoyun
>> Sent: Thursday, June 3, 2021 11:10 PM
>> To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
>> Cc: Luo, Zhigang 
>> Subject: RE: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from
>> getting fb location
>>
>> [AMD Official Use Only]
>>
>> Looks ok to me .
>>
>> Reviewed-By : Shaoyun.liu 
>>
>> -Original Message-
>> From: amd-gfx  On Behalf Of
>> Zhigang Luo
>> Sent: Thursday, June 3, 2021 10:13 AM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Luo, Zhigang 
>> Subject: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from
>> getting fb location
>>
>> host driver programmed fb location registers for vf, no need to check 
>> anymore.
>>
>> Signed-off-by: Zhigang Luo 
>> ---
>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 +
>> 1 file changed, 1 insertion(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index ceb3968d8326..1c2d9fde9021 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -1292,10 +1292,7 @@ static int gmc_v9_0_late_init(void *handle)
>> static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev,
>>  struct amdgpu_gmc *mc)
>> {
>> -u64 base = 0;
>> -
>> -if (!amdgpu_sriov_vf(adev))
>> -base = adev->mmhub.funcs->get_fb_location(adev);
>> +u64 base = adev->mmhub.funcs->get_fb_location(adev);
>>
>>  /* add the xgmi offset of the physical node */
>>  base += adev->gmc.xgmi.physical_node_id * adev-
>>> gmc.xgmi.node_segment_size;
>> --
>> 2.17.1
>>
>> ___
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>> ts.fre
>> edesktop.org%2Fmailman%2Flistinfo%2Famd-
>> gfx&data=04%7C01%7CEmily.Deng%40amd.com%7Cd41e78b1a3af4f08ff
>> d108d926a1a2d8%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C63
>> 7583297946242271%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAi
>> LCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=Nsz
>> ZyRZHCxj%2FIJ1hYoSrkv3LpTmF9FbchpNMtQ2GE5M%3D&reserved=0
>> ___
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>> ts.fre
>> edesktop.org%2Fmailman%2Flistinfo%2Famd-
>> gfx&data=04%7C01%7CEmily.Deng%40amd.com%7Cd41e78b1a3af4f08ff
>> d108d926a1a2d8%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C63
>> 7583297946242271%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAi
>> LCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=Nsz
>> ZyRZHCxj%2FIJ1hYoSrkv3LpTmF9FbchpNMtQ2GE5M%3D&reserved=0
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7CZh
> igang.Luo%40amd.com%7C9b87dfa195ed4e84a8c808d92749e416%7C3dd8961fe4884
> e608e11a82d994e183d%7C0%7C0%7C637584020581460118%7CUnknown%7CTWFpbGZsb
> 3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%
> 7C1000&sdata=3S6ZEnCCMHHlfc%2B6xmLDz7Bgn91Is7EIpMS7WRxq0Jo%3D&
> reserved=0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 5/5] drm/amdgpu: allocate psp fw private buffer from VRAM for sriov vf

2021-06-03 Thread Luo, Zhigang

All new PSP releases will have this feature, and it will not cause any failure
even if the PSP doesn't have this feature yet.

Thanks,
Zhigang

-Original Message-
From: Liu, Shaoyun  
Sent: June 3, 2021 11:15 AM
To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: RE: [PATCH 5/5] drm/amdgpu: allocate psp fw private buffer from VRAM 
for sriov vf

[AMD Official Use Only]

Please double-check whether this feature applies to all ASICs that PSP supports,
since this change does not apply only to ARCTURUS and ALDEBARAN.

Shaoyun.liu

-Original Message-
From: amd-gfx  On Behalf Of Zhigang Luo
Sent: Thursday, June 3, 2021 10:13 AM
To: amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: [PATCH 5/5] drm/amdgpu: allocate psp fw private buffer from VRAM for 
sriov vf

PSP added a new feature to check the fw buffer address for sriov vf. The address
range must be in the vf fb.

Signed-off-by: Zhigang Luo 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 6bd7e39c3e75..7c0f1017a46b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2320,11 +2320,20 @@ static int psp_load_fw(struct amdgpu_device *adev)
if (!psp->cmd)
return -ENOMEM;
 
-   ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
-   AMDGPU_GEM_DOMAIN_GTT,
-   &psp->fw_pri_bo,
-   &psp->fw_pri_mc_addr,
-   &psp->fw_pri_buf);
+   if (amdgpu_sriov_vf(adev)) {
+   ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
+   AMDGPU_GEM_DOMAIN_VRAM,
+   &psp->fw_pri_bo,
+   &psp->fw_pri_mc_addr,
+   &psp->fw_pri_buf);
+   } else {
+   ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
+   AMDGPU_GEM_DOMAIN_GTT,
+   &psp->fw_pri_bo,
+   &psp->fw_pri_mc_addr,
+   &psp->fw_pri_buf);
+   }
+
if (ret)
goto failed;
 
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7CShaoyun.Liu%40amd.com%7C3f624a72d2574d5c10a808d92699c9a8%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637583264223318916%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=4EfyfR26TENFq1%2BXlSufuOYocdCmNcdEZHyEPzAQPcc%3D&reserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 4/5] drm/amdgpu: add psp microcode init for arcturus and aldebaran sriov vf

2021-06-03 Thread Luo, Zhigang

Yeah, it will also init psp sos and asd microcode. But I think it's harmless.

Thanks,
Zhigang

-Original Message-
From: Liu, Shaoyun  
Sent: June 3, 2021 11:13 AM
To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: RE: [PATCH 4/5] drm/amdgpu: add psp microcode init for arcturus and 
aldebaran sriov vf

[AMD Official Use Only]

This one doesn't look like it applies to the XGMI TA only; it's for the whole PSP init. Can 
you double check it?


Shaoyun.liu

-Original Message-
From: amd-gfx  On Behalf Of Zhigang Luo
Sent: Thursday, June 3, 2021 10:13 AM
To: amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: [PATCH 4/5] drm/amdgpu: add psp microcode init for arcturus and 
aldebaran sriov vf

need to load xgmi ta for arcturus and aldebaran sriov vf.

Signed-off-by: Zhigang Luo 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 55378c6b9722..6bd7e39c3e75 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -170,7 +170,8 @@ static int psp_sw_init(void *handle)
struct psp_context *psp = &adev->psp;
int ret;
 
-   if (!amdgpu_sriov_vf(adev)) {
+   if ((adev->asic_type == CHIP_ARCTURUS) ||
+   (adev->asic_type == CHIP_ALDEBARAN) || (!amdgpu_sriov_vf(adev))) {
ret = psp_init_microcode(psp);
if (ret) {
DRM_ERROR("Failed to load psp firmware!\n");
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7CShaoyun.Liu%40amd.com%7C7568bce040b840a5a20508d92699c7ee%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637583264190861368%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=ZL8oC97Rnltg0gbqc8AUqnZS%2BEuUSq8%2FDFngzjjFtbI%3D&reserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 2/5] drm/amdgpu: remove sriov vf gfxhub fb location programming

2021-06-03 Thread Luo, Zhigang

Yes, I double checked all gfx9 ASICs with sriov supported.

-Original Message-
From: Liu, Shaoyun  
Sent: June 3, 2021 11:11 AM
To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: RE: [PATCH 2/5] drm/amdgpu: remove sriov vf gfxhub fb location 
programming

[AMD Official Use Only]

This looks like it will affect other ASICs. Can you double check that?

-Original Message-
From: amd-gfx  On Behalf Of Zhigang Luo
Sent: Thursday, June 3, 2021 10:13 AM
To: amd-gfx@lists.freedesktop.org
Cc: Luo, Zhigang 
Subject: [PATCH 2/5] drm/amdgpu: remove sriov vf gfxhub fb location programming

host driver programmed the gfxhub fb location for vf, no need to program in 
guest side.

Signed-off-by: Zhigang Luo 
---
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
index 063e48df0b2d..f51fd0688eca 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
@@ -320,18 +320,6 @@ static void gfxhub_v1_0_program_invalidation(struct 
amdgpu_device *adev)
 
 static int gfxhub_v1_0_gart_enable(struct amdgpu_device *adev)  {
-   if (amdgpu_sriov_vf(adev) && adev->asic_type != CHIP_ARCTURUS) {
-   /*
-* MC_VM_FB_LOCATION_BASE/TOP is NULL for VF, becuase they are
-* VF copy registers so vbios post doesn't program them, for
-* SRIOV driver need to program them
-*/
-   WREG32_SOC15_RLC(GC, 0, mmMC_VM_FB_LOCATION_BASE,
-adev->gmc.vram_start >> 24);
-   WREG32_SOC15_RLC(GC, 0, mmMC_VM_FB_LOCATION_TOP,
-adev->gmc.vram_end >> 24);
-   }
-
/* GART Enable. */
gfxhub_v1_0_init_gart_aperture_regs(adev);
gfxhub_v1_0_init_system_aperture_regs(adev);
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7CShaoyun.Liu%40amd.com%7C5e79adbeb3bb46b1cf7e08d92699c8c9%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637583264238382812%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=%2FHQnWwOdUVoyXVRBwx03aJqif3bVRKkKfDT82lr3ZJ8%3D&reserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 1/1] drm/amdgpu: Add a new device ID for Aldebaran

2021-04-30 Thread Luo, Zhigang

[AMD Official Use Only - Internal Distribution Only]

Thanks, Felix. The change looks good for me.

--Zhigang

-Original Message-
From: Kuehling, Felix 
Sent: April 30, 2021 1:45 PM
To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 1/1] drm/amdgpu: Add a new device ID for Aldebaran

It just took me two minutes to do this myself. The commit is up for review on 
Gerrit:
http://gerrit-git.amd.com/c/compute/ec/libhsakmt/+/518573. Please review and 
test this.

Regards,
  Felix


Am 2021-04-29 um 2:46 p.m. schrieb Luo, Zhigang:
> [AMD Official Use Only - Internal Distribution Only]
>
> Thanks, Felix.
> Yes, I will send out several patches for Aldebaran virtualization support 
> soon.
>
> Thanks,
> Zhigang
>
> -Original Message-
> From: Kuehling, Felix 
> Sent: April 29, 2021 2:40 PM
> To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 1/1] drm/amdgpu: Add a new device ID for Aldebaran
>
> Am 2021-04-29 um 2:33 p.m. schrieb Zhigang Luo:
>> It is Aldebaran VF device ID, for virtualization support.
>>
>> Signed-off-by: Zhigang Luo 
> This patch looks good to me.
>
> Acked-by: Felix Kuehling 
>
> I believe you'll also need to add an entry for the VF in 
> kfd_supported_devices in kfd_device.c.
>
> Regards,
>   Felix
>
>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 0369d3532bf0..310e1249e64e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -1186,6 +1186,7 @@ static const struct pci_device_id pciidlist[] =
>> {  {0x1002, 0x7408, PCI_ANY_ID, PCI_ANY_ID, 0, 0,
>> CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
>>  {0x1002, 0x740C, PCI_ANY_ID, PCI_ANY_ID, 0, 0,
>> CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
>>  {0x1002, 0x740F, PCI_ANY_ID, PCI_ANY_ID, 0, 0,
>> CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
>> +{0x1002, 0x7410, PCI_ANY_ID, PCI_ANY_ID, 0, 0,
>> +CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
>>
>>  {0, 0, 0}
>>  };
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 2/2] drm/amdgpu: Add Aldebaran virtualization support

2021-04-29 Thread Luo, Zhigang

[AMD Official Use Only - Internal Distribution Only]

Thanks Alex.

 I will add it.

--Zhigang

-Original Message-
From: Alex Deucher 
Sent: April 29, 2021 4:17 PM
To: Luo, Zhigang 
Cc: amd-gfx list 
Subject: Re: [PATCH 2/2] drm/amdgpu: Add Aldebaran virtualization support

On Thu, Apr 29, 2021 at 4:13 PM Zhigang Luo  wrote:
>
> 1. add Aldebaran in virtualization detection list.
> 2. disable Aldebaran virtual display support as there is no GFX
>engine in Aldebaran.
> 3. skip TMR loading if Aldebaran is in virtualizatin mode as it
>shares the one host loaded.
>
> Signed-off-by: Zhigang Luo 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 9 ++---
>  2 files changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 623044414bb5..17b728d2c1f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -422,6 +422,7 @@ static bool psp_skip_tmr(struct psp_context *psp)
> switch (psp->adev->asic_type) {
> case CHIP_NAVI12:
> case CHIP_SIENNA_CICHLID:
> +   case CHIP_ALDEBARAN:
> return true;
> default:
> return false;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 0c9c5255aa42..691066e9c1f3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -50,9 +50,11 @@ void amdgpu_virt_init_setting(struct amdgpu_device *adev)
> struct drm_device *ddev = adev_to_drm(adev);
>
> /* enable virtual display */
> -   if (adev->mode_info.num_crtc == 0)
> -   adev->mode_info.num_crtc = 1;
> -   adev->enable_virtual_display = true;
> +   if (adev->asic_type != CHIP_ALDEBARAN) {

We should check for ARCTURUS here as well.

Alex

> +   if (adev->mode_info.num_crtc == 0)
> +   adev->mode_info.num_crtc = 1;
> +   adev->enable_virtual_display = true;
> +   }
> ddev->driver_features &= ~DRIVER_ATOMIC;
> adev->cg_flags = 0;
> adev->pg_flags = 0;
> @@ -679,6 +681,7 @@ void amdgpu_detect_virtualization(struct amdgpu_device 
> *adev)
> case CHIP_VEGA10:
> case CHIP_VEGA20:
> case CHIP_ARCTURUS:
> +   case CHIP_ALDEBARAN:
> soc15_set_virt_ops(adev);
> break;
> case CHIP_NAVI10:
> --
> 2.17.1
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7Czh
> igang.luo%40amd.com%7C6379911fec0b46ae127e08d90b4bc186%7C3dd8961fe4884
> e608e11a82d994e183d%7C0%7C0%7C637553242264820358%7CUnknown%7CTWFpbGZsb
> 3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%
> 7C1000&sdata=sDmmizo0SMOhi8dTIVLRVpkLEocssobJCCQEyASwbGk%3D&re
> served=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 1/1] drm/amdgpu: Add a new device ID for Aldebaran

2021-04-29 Thread Luo, Zhigang

[AMD Official Use Only - Internal Distribution Only]

Thanks, Felix.
Yes, I will send out several patches for Aldebaran virtualization support soon.

Thanks,
Zhigang

-Original Message-
From: Kuehling, Felix 
Sent: April 29, 2021 2:40 PM
To: Luo, Zhigang ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 1/1] drm/amdgpu: Add a new device ID for Aldebaran

Am 2021-04-29 um 2:33 p.m. schrieb Zhigang Luo:
> It is Aldebaran VF device ID, for virtualization support.
>
> Signed-off-by: Zhigang Luo 

This patch looks good to me.

Acked-by: Felix Kuehling 

I believe you'll also need to add an entry for the VF in kfd_supported_devices 
in kfd_device.c.

Regards,
  Felix


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 0369d3532bf0..310e1249e64e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -1186,6 +1186,7 @@ static const struct pci_device_id pciidlist[] = {
>  {0x1002, 0x7408, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 
> CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
>  {0x1002, 0x740C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 
> CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
>  {0x1002, 0x740F, PCI_ANY_ID, PCI_ANY_ID, 0, 0,
> CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
> +{0x1002, 0x7410, PCI_ANY_ID, PCI_ANY_ID, 0, 0,
> +CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
>
>  {0, 0, 0}
>  };
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

2019-12-06 Thread Luo, Zhigang

[AMD Official Use Only - Internal Distribution Only]

Thanks, Hawking.

--Zhigang

-Original Message-
From: Zhang, Hawking  
Sent: December 6, 2019 12:35 PM
To: Luo, Zhigang ; Alex Deucher 
Cc: Quan, Evan ; Yuan, Xiaojie ; 
amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

It was used to indicate whether the BIOS or the driver handles the display mode for 
various display types. It also has some fields that indicate other display status, such as 
docking/undocking, lid open/close, etc.

Check atombios_encoder.c for its major usage

Regards,
Hawking

-Original Message-
From: Luo, Zhigang 
Sent: 2019年12月6日 23:37
To: Zhang, Hawking ; Alex Deucher 
Cc: Quan, Evan ; Yuan, Xiaojie ; 
amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

[AMD Official Use Only - Internal Distribution Only]

Can someone tell me what's BIOS_SCRATCH_6 used for? I know BIOS_SCRATCH_7 is 
used for asic init.

Thanks,
Zhigang

-Original Message-
From: Zhang, Hawking 
Sent: December 6, 2019 9:22 AM
To: Alex Deucher 
Cc: Quan, Evan ; Yuan, Xiaojie ; Luo, 
Zhigang ; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

Ah yes, I made a logical mistake. This should work.

Regards,
Hawking
-Original Message-
From: Alex Deucher 
Sent: 2019年12月6日 22:01
To: Zhang, Hawking 
Cc: Quan, Evan ; Yuan, Xiaojie ; Luo, 
Zhigang ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

On Fri, Dec 6, 2019 at 3:14 AM Zhang, Hawking  wrote:
>
> Correct my typo
>
> This is in high risk to break gpu resume and reset just because you clear the 
> ATOM_S7_ASIC_INIT_COMPLETE_MASK field in scratch register 7. And the 
> atom_bios init will be skipped.
>

I think we should be ok.  If ATOM_S7_ASIC_INIT_COMPLETE_MASK is cleared, we'll 
assume the card needs to be posted, which it probably should after BACO.  This 
may even be what makes this patch a fix.

Alex

> Regards,
> Hawking
> -Original Message-
> From: amd-gfx  On Behalf Of 
> Zhang, Hawking
> Sent: 2019年12月6日 16:07
> To: Quan, Evan ; Yuan, Xiaojie 
> ; Luo, Zhigang 
> Cc: amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco 
> exit
>
> This is in high risk to break secondary gpu resume and reset just because you 
> clear the ATOM_S7_ASIC_INIT_COMPLETE_MASK field in scratch register 7. And 
> the atom_bios init will be skipped.
>
> We shall understand any libgv fixes very well before "copy" it to bare-metal. 
> Libgv don't need to take care S3.
>
> Regards,
> Hawking
> -Original Message-
> From: amd-gfx  On Behalf Of 
> Quan, Evan
> Sent: 2019年12月6日 13:27
> To: Yuan, Xiaojie ; Luo, Zhigang 
> 
> Cc: amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco 
> exit
>
> Hi Xiaojie,
>
> This was reported by Zhigang team.  Under their special use case, scratch 
> register 7 has be to 0 to perform asic init(@Luo, Zhigang right?).
> And considering old asics(vega20/10) also applied this change.
> So, I think it's reasonable(and recommend by SMU fw team) to apply this on 
> the new ASICs also.
>
> Regards,
> Evan
> > -Original Message-
> > From: Yuan, Xiaojie 
> > Sent: Friday, December 6, 2019 12:20 PM
> > To: Quan, Evan 
> > Cc: amd-gfx@lists.freedesktop.org; Luo, Zhigang 
> > 
> > Subject: Re: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco 
> > exit
> >
> > Hi Evan,
> >
> > Just out of curiosity, may I know what issue are you trying to fix?
> > I used to see vbios post failure (hangs in atombios init table) 
> > after baco exit occasionally on navi.
> >
> > BR,
> > Xiaojie
> >
> > > On Dec 6, 2019, at 11:37 AM, Evan Quan  wrote:
> > >
> > > This is needed for coming asic init on performing gpu reset.
> > >
> > > Change-Id: If3671a24d239e3d288665fadaa2c40c87d5da40b
> > > Signed-off-by: Evan Quan 
> > > ---
> > > drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 6 ++
> > > 1 file changed, 6 insertions(+)
> > >
> > > diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> > b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> > > index 39ec06aee809..ab809df7bc35 100644
> > > --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> > > +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> > > @@ -1659,6 +1659,12 @@ int smu_v11_0_baco_set_state(struct
> > smu_context *smu, enum smu_baco_state state)
> > >}
> > >} else {
> >

RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

2019-12-06 Thread Luo, Zhigang

[AMD Official Use Only - Internal Distribution Only]

Thanks, Alex.

> > This was reported by Zhigang team.  Under their special use case, scratch 
> > register 7 has be to 0 to perform asic init(@Luo, Zhigang right?).
Yes, the sriov driver uses this register to detect whether asic init has been performed or not.

--Zhigang

-Original Message-
From: Alex Deucher  
Sent: December 6, 2019 10:42 AM
To: Luo, Zhigang 
Cc: Zhang, Hawking ; Quan, Evan ; 
Yuan, Xiaojie ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

On Fri, Dec 6, 2019 at 10:37 AM Luo, Zhigang  wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> Can someone tell me what's BIOS_SCRATCH_6 used for? I know BIOS_SCRATCH_7 is 
> used for asic init.

ATOM_S6_CRITICAL_STATE and ATOM_S6_ACC_MODE.  See the other S6 defines in 
atombios.h.  I think it's mainly for display.

Alex

>
> Thanks,
> Zhigang
>
> -Original Message-
> From: Zhang, Hawking 
> Sent: December 6, 2019 9:22 AM
> To: Alex Deucher 
> Cc: Quan, Evan ; Yuan, Xiaojie 
> ; Luo, Zhigang ; 
> amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco 
> exit
>
> Ah yes, I made a logical mistake. This should work.
>
> Regards,
> Hawking
> -Original Message-
> From: Alex Deucher 
> Sent: 2019年12月6日 22:01
> To: Zhang, Hawking 
> Cc: Quan, Evan ; Yuan, Xiaojie 
> ; Luo, Zhigang ; 
> amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco 
> exit
>
> On Fri, Dec 6, 2019 at 3:14 AM Zhang, Hawking  wrote:
> >
> > Correct my typo
> >
> > This is in high risk to break gpu resume and reset just because you clear 
> > the ATOM_S7_ASIC_INIT_COMPLETE_MASK field in scratch register 7. And the 
> > atom_bios init will be skipped.
> >
>
> I think we should be ok.  If ATOM_S7_ASIC_INIT_COMPLETE_MASK is cleared, 
> we'll assume the card needs to be posted, which it probably should after 
> BACO.  This may even be what makes this patch a fix.
>
> Alex
>
> > Regards,
> > Hawking
> > -Original Message-
> > From: amd-gfx  On Behalf Of 
> > Zhang, Hawking
> > Sent: 2019年12月6日 16:07
> > To: Quan, Evan ; Yuan, Xiaojie 
> > ; Luo, Zhigang 
> > Cc: amd-gfx@lists.freedesktop.org
> > Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco 
> > exit
> >
> > This is in high risk to break secondary gpu resume and reset just because 
> > you clear the ATOM_S7_ASIC_INIT_COMPLETE_MASK field in scratch register 7. 
> > And the atom_bios init will be skipped.
> >
> > We shall understand any libgv fixes very well before "copy" it to 
> > bare-metal. Libgv don't need to take care S3.
> >
> > Regards,
> > Hawking
> > -Original Message-
> > From: amd-gfx  On Behalf Of 
> > Quan, Evan
> > Sent: 2019年12月6日 13:27
> > To: Yuan, Xiaojie ; Luo, Zhigang 
> > 
> > Cc: amd-gfx@lists.freedesktop.org
> > Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco 
> > exit
> >
> > Hi Xiaojie,
> >
> > This was reported by Zhigang team.  Under their special use case, scratch 
> > register 7 has be to 0 to perform asic init(@Luo, Zhigang right?).
> > And considering old asics(vega20/10) also applied this change.
> > So, I think it's reasonable(and recommend by SMU fw team) to apply this on 
> > the new ASICs also.
> >
> > Regards,
> > Evan
> > > -Original Message-
> > > From: Yuan, Xiaojie 
> > > Sent: Friday, December 6, 2019 12:20 PM
> > > To: Quan, Evan 
> > > Cc: amd-gfx@lists.freedesktop.org; Luo, Zhigang 
> > > 
> > > Subject: Re: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on 
> > > baco exit
> > >
> > > Hi Evan,
> > >
> > > Just out of curiosity, may I know what issue are you trying to fix?
> > > I used to see vbios post failure (hangs in atombios init table) 
> > > after baco exit occasionally on navi.
> > >
> > > BR,
> > > Xiaojie
> > >
> > > > On Dec 6, 2019, at 11:37 AM, Evan Quan  wrote:
> > > >
> > > > This is needed for coming asic init on performing gpu reset.
> > > >
> > > > Change-Id: If3671a24d239e3d288665fadaa2c40c87d5da40b
> > > > Signed-off-by: Evan Quan 
> > > > ---
> > > > drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 6 ++
> > > > 1 file changed, 6 insertions(+)
> > > >
>

RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

2019-12-06 Thread Luo, Zhigang

[AMD Official Use Only - Internal Distribution Only]

Can someone tell me what's BIOS_SCRATCH_6 used for? I know BIOS_SCRATCH_7 is 
used for asic init.

Thanks,
Zhigang

-Original Message-
From: Zhang, Hawking  
Sent: December 6, 2019 9:22 AM
To: Alex Deucher 
Cc: Quan, Evan ; Yuan, Xiaojie ; Luo, 
Zhigang ; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

Ah yes, I made a logical mistake. This should work.

Regards,
Hawking
-Original Message-
From: Alex Deucher 
Sent: 2019年12月6日 22:01
To: Zhang, Hawking 
Cc: Quan, Evan ; Yuan, Xiaojie ; Luo, 
Zhigang ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

On Fri, Dec 6, 2019 at 3:14 AM Zhang, Hawking  wrote:
>
> Correct my typo
>
> This is in high risk to break gpu resume and reset just because you clear the 
> ATOM_S7_ASIC_INIT_COMPLETE_MASK field in scratch register 7. And the 
> atom_bios init will be skipped.
>

I think we should be ok.  If ATOM_S7_ASIC_INIT_COMPLETE_MASK is cleared, we'll 
assume the card needs to be posted, which it probably should after BACO.  This 
may even be what makes this patch a fix.

Alex

> Regards,
> Hawking
> -Original Message-
> From: amd-gfx  On Behalf Of 
> Zhang, Hawking
> Sent: 2019年12月6日 16:07
> To: Quan, Evan ; Yuan, Xiaojie 
> ; Luo, Zhigang 
> Cc: amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco 
> exit
>
> This is in high risk to break secondary gpu resume and reset just because you 
> clear the ATOM_S7_ASIC_INIT_COMPLETE_MASK field in scratch register 7. And 
> the atom_bios init will be skipped.
>
> We shall understand any libgv fixes very well before "copy" it to bare-metal. 
> Libgv don't need to take care S3.
>
> Regards,
> Hawking
> -Original Message-
> From: amd-gfx  On Behalf Of 
> Quan, Evan
> Sent: 2019年12月6日 13:27
> To: Yuan, Xiaojie ; Luo, Zhigang 
> 
> Cc: amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco 
> exit
>
> Hi Xiaojie,
>
> This was reported by Zhigang team.  Under their special use case, scratch 
> register 7 has be to 0 to perform asic init(@Luo, Zhigang right?).
> And considering old asics(vega20/10) also applied this change.
> So, I think it's reasonable(and recommend by SMU fw team) to apply this on 
> the new ASICs also.
>
> Regards,
> Evan
> > -Original Message-
> > From: Yuan, Xiaojie 
> > Sent: Friday, December 6, 2019 12:20 PM
> > To: Quan, Evan 
> > Cc: amd-gfx@lists.freedesktop.org; Luo, Zhigang 
> > 
> > Subject: Re: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco 
> > exit
> >
> > Hi Evan,
> >
> > Just out of curiosity, may I know what issue are you trying to fix?
> > I used to see vbios post failure (hangs in atombios init table) 
> > after baco exit occasionally on navi.
> >
> > BR,
> > Xiaojie
> >
> > > On Dec 6, 2019, at 11:37 AM, Evan Quan  wrote:
> > >
> > > This is needed for coming asic init on performing gpu reset.
> > >
> > > Change-Id: If3671a24d239e3d288665fadaa2c40c87d5da40b
> > > Signed-off-by: Evan Quan 
> > > ---
> > > drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 6 ++
> > > 1 file changed, 6 insertions(+)
> > >
> > > diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> > b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> > > index 39ec06aee809..ab809df7bc35 100644
> > > --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> > > +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> > > @@ -1659,6 +1659,12 @@ int smu_v11_0_baco_set_state(struct
> > smu_context *smu, enum smu_baco_state state)
> > >}
> > >} else {
> > >ret = smu_send_smc_msg(smu, SMU_MSG_ExitBaco);
> > > +if (ret)
> > > +goto out;
> > > +
> > > +WREG32_SOC15(NBIO, 0, mmBIOS_SCRATCH_6, 0);
> > > +WREG32_SOC15(NBIO, 0, mmBIOS_SCRATCH_7, 0);
> > > +
> > >bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
> > >BIF_DOORBELL_INT_CNTL,
> > >DOORBELL_INTERRUPT_DISABLE, 0);
> > > --
> > > 2.24.0
> > >
> > > ___
> > > amd-gfx mailing list
> > > amd-gfx@lists.freedesktop.org
> > >
> > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fli
> > st
> > s.free
>

RE: [PATCH v2 1/2] Revert "drm/amdgpu: Extend KIQ reg polling wait for VF"

RE: [PATCH 1/2] drm/amdgpu: Use dev_ prints for virtualization as it supports multi adapter

RE: [PATCH] drm/amdgpu: Add lock around VF RLCG interface

RE: [PATCH 2/2] drm/amdgpu: Queue KFD reset workitem in VF FED

RE: [PATCH 1/2] drm/amdgpu: Extend KIQ reg polling wait for VF

RE: [PATCH 4/4] drm/amdgpu: Move ras resume into SRIOV function

RE: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR

RE: [PATCH v2 3/4] drm/amdgpu: Fix amdgpu_device_reset_sriov retry logic

RE: [PATCH 1/3] amd/amdkfd: add a function to wait no process running in kfd

RE: [PATCH] drm/amdgpu: Skip virt_exchange_init on SDMA poison consumption

RE: [PATCH] drm/amdgpu/: Remove bo_create_kernel_at path from virt page

RE: [PATCH 2/4] drm/amdgpu: Do not program SQ_TIMEOUT_CONFIG in SRIOV

RE: [PATCH 3/4] drm/amdgpu: Use correct SRIOV macro for gmc_v9_0_vm_fault_interrupt_state

RE: [PATCH] drm/amdgpu: xgmi_fill_topology_info

RE: [PATCH] drm/amdgpu/jpeg - skip change of power-gating state for sriov

RE: [PATCH v3] drm/amdgpu: Add RLCG interface driver implementation for gfx v9.4.3 (v3)

RE: [PATCH 1/4] drm/amdgpu: skip reset other device in the same hive if it's SRIOV VF

RE: [PATCH] drm/amdgpu: correct MMSCH version

RE: [PATCH 4/5] drm/amdgpu: add psp microcode init for arcturus and aldebaran sriov vf

RE: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from getting fb location

RE: [PATCH 1/5] drm/amdgpu: remove sriov vf checking from getting fb location

RE: [PATCH 5/5] drm/amdgpu: allocate psp fw private buffer from VRAM for sriov vf

RE: [PATCH 4/5] drm/amdgpu: add psp microcode init for arcturus and aldebaran sriov vf

RE: [PATCH 2/5] drm/amdgpu: remove sriov vf gfxhub fb location programming

RE: [PATCH 1/1] drm/amdgpu: Add a new device ID for Aldebaran

RE: [PATCH 2/2] drm/amdgpu: Add Aldebaran virtualization support

RE: [PATCH 1/1] drm/amdgpu: Add a new device ID for Aldebaran

RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

RE: [PATCH] drm/amd/powerplay: clear VBIOS scratchs on baco exit

30 matches

Site Navigation

Mail list logo

Footer information