[PATCH] drm/amdgpu: enable unmapped doorbell handling basic mode on mes 12

2024-05-08 Thread shaoyunl
This reverts commit 9606c08e178f953d22e50b05c64b4b1a48051f3e.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c| 14 ++
 drivers/gpu/drm/amd/include/mes_v12_api_def.h |  3 ++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index 76db85157bf9..4f123d88aa3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -525,7 +525,14 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes 
*mes)
mes_set_hw_res_pkt.disable_mes_log = 1;
mes_set_hw_res_pkt.use_different_vmid_compute = 1;
mes_set_hw_res_pkt.enable_reg_active_poll = 1;
+
+   /*
+    * Keep the oversubscription timer for SDMA. Once unmapped doorbell
+    * handling is supported, other queues will not use the oversubscription
+    * timer. Handling mode - 0: disabled; 1: basic version; 2: basic+ version
+    */
mes_set_hw_res_pkt.oversubscription_timer = 50;
+   mes_set_hw_res_pkt.unmapped_doorbell_handling = 1;
 
mes_set_hw_res_pkt.enable_mes_event_int_logging = 0;
mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr;
@@ -972,6 +979,13 @@ static int mes_v12_0_mqd_init(struct amdgpu_ring *ring)
mqd->cp_hqd_iq_timer = regCP_HQD_IQ_TIMER_DEFAULT;
mqd->cp_hqd_quantum = regCP_HQD_QUANTUM_DEFAULT;
 
+   /*
+    * Set CP_HQD_GFX_CONTROL.DB_UPDATED_MSG_EN[15] to enable unmapped
+    * doorbell handling. This is a reserved CP internal register that
+    * cannot be accessed by others.
+    */
+   mqd->reserved_184 = BIT(15);
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/include/mes_v12_api_def.h 
b/drivers/gpu/drm/amd/include/mes_v12_api_def.h
index e3211daa9c2e..ffd67c6ed9b3 100644
--- a/drivers/gpu/drm/amd/include/mes_v12_api_def.h
+++ b/drivers/gpu/drm/amd/include/mes_v12_api_def.h
@@ -239,7 +239,8 @@ union MESAPI_SET_HW_RESOURCES {
uint32_t send_write_data : 1;
uint32_t os_tdr_timeout_override : 1;
uint32_t use_rs64mem_for_proc_gang_ctx : 1;
-   uint32_t reserved : 17;
+   uint32_t unmapped_doorbell_handling: 2;
+   uint32_t reserved : 15;
};
uint32_t uint32_all;
};
-- 
2.34.1
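
As a side note on the new API field, below is a minimal, standalone C sketch of how a 2-bit field carved out of the former 17-bit reserved word encodes the three handling modes described in the comment above. The enum names are this sketch's assumptions based on the in-diff comment (0: disabled; 1: basic; 2: basic+), not MES firmware API names.

#include <assert.h>
#include <stdint.h>

/* Hypothetical names for the three modes named in the patch comment. */
enum unmapped_doorbell_mode {
	UNMAPPED_DB_DISABLED   = 0,
	UNMAPPED_DB_BASIC      = 1,
	UNMAPPED_DB_BASIC_PLUS = 2,
};

/* Mirrors the layout change: 2 bits taken from reserved, the rest remain. */
struct hw_res_flags {
	uint32_t unmapped_doorbell_handling : 2;
	uint32_t reserved : 30; /* simplified; the real union has more fields */
};

int main(void)
{
	struct hw_res_flags f = {0};

	f.unmapped_doorbell_handling = UNMAPPED_DB_BASIC; /* what the patch selects */
	assert(sizeof(f) == sizeof(uint32_t)); /* flags still pack into one dword */
	return 0;
}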



[PATCH] drm/amdgpu: Add mes_log_enable to control mes log feature

2024-03-22 Thread shaoyunl
The MES log might slow down performance due to the extra step of logging the
data. Disable it by default and introduce a parameter that can enable it when
necessary.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |  5 -
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  7 +--
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9c62552bec34..b3b84647207e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -210,6 +210,7 @@ extern int amdgpu_async_gfx_ring;
 extern int amdgpu_mcbp;
 extern int amdgpu_discovery;
 extern int amdgpu_mes;
+extern int amdgpu_mes_log_enable;
 extern int amdgpu_mes_kiq;
 extern int amdgpu_noretry;
 extern int amdgpu_force_asic_type;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 80b9642f2bc4..e4277298cf1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -195,6 +195,7 @@ int amdgpu_async_gfx_ring = 1;
 int amdgpu_mcbp = -1;
 int amdgpu_discovery = -1;
 int amdgpu_mes;
+int amdgpu_mes_log_enable = 0;
 int amdgpu_mes_kiq;
 int amdgpu_noretry = -1;
 int amdgpu_force_asic_type = -1;
@@ -667,6 +668,15 @@ MODULE_PARM_DESC(mes,
"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");
 module_param_named(mes, amdgpu_mes, int, 0444);
 
+/**
+ * DOC: mes_log_enable (int)
+ * Enable Micro Engine Scheduler log. This is used to enable/disable MES internal log.
+ * (0 = disabled (default), 1 = enabled)
+ */
+MODULE_PARM_DESC(mes_log_enable,
+   "Enable Micro Engine Scheduler log (0 = disabled (default), 1 = 
enabled)");
+module_param_named(mes_log_enable, amdgpu_mes_log_enable, int, 0444);
+
 /**
  * DOC: mes_kiq (int)
  * Enable Micro Engine Scheduler KIQ. This is a new engine pipe for kiq.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 78dfd027dc99..9ace848e174c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -100,6 +100,9 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device 
*adev)
 {
int r;
 
+   if (!amdgpu_mes_log_enable)
+   return 0;
+
r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
AMDGPU_GEM_DOMAIN_GTT,
&adev->mes.event_log_gpu_obj,
@@ -1561,7 +1564,7 @@ void amdgpu_debugfs_mes_event_log_init(struct 
amdgpu_device *adev)
 #if defined(CONFIG_DEBUG_FS)
struct drm_minor *minor = adev_to_drm(adev)->primary;
struct dentry *root = minor->debugfs_root;
-   if (adev->enable_mes)
+   if (adev->enable_mes && amdgpu_mes_log_enable)
debugfs_create_file("amdgpu_mes_event_log", 0444, root,
adev, &amdgpu_debugfs_mes_event_log_fops);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 072c478665ad..63f281a9984d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -411,8 +411,11 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes 
*mes)
mes_set_hw_res_pkt.enable_reg_active_poll = 1;
mes_set_hw_res_pkt.enable_level_process_quantum_check = 1;
mes_set_hw_res_pkt.oversubscription_timer = 50;
-   mes_set_hw_res_pkt.enable_mes_event_int_logging = 1;
-   mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr;
+   if (amdgpu_mes_log_enable) {
+   mes_set_hw_res_pkt.enable_mes_event_int_logging = 1;
+   mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr =
+   mes->event_log_gpu_addr;
+   }
 
return mes_v11_0_submit_pkt_and_poll_completion(mes,
&mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
-- 
2.34.1



[PATCH] drm/amdgpu: Increase the mes log buffer size as per new MES FW version

2024-03-22 Thread shaoyunl
From MES version 0x54, the log entries increased and require the log buffer
size to be increased. 16k is the maximum size agreed upon.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 9ace848e174c..78e4f88f5134 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -103,7 +103,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device 
*adev)
if (!amdgpu_mes_log_enable)
return 0;
 
-   r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
+   r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE, PAGE_SIZE,
AMDGPU_GEM_DOMAIN_GTT,
&adev->mes.event_log_gpu_obj,
&adev->mes.event_log_gpu_addr,
@@ -1548,12 +1548,11 @@ static int amdgpu_debugfs_mes_event_log_show(struct 
seq_file *m, void *unused)
uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr);
 
seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4,
-mem, PAGE_SIZE, false);
+mem, AMDGPU_MES_LOG_BUFFER_SIZE, false);
 
return 0;
 }
 
-
 DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 7d4f93fea937..4c8fc3117ef8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -52,6 +52,7 @@ enum amdgpu_mes_priority_level {
 
 #define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */
 #define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */
+#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximum log buffer size for MES */
 
 struct amdgpu_mes_funcs;
 
-- 
2.34.1



[PATCH] drm/amdgpu: Add mes_log_enable to control mes log feature

2024-03-22 Thread shaoyunl
The MES log might slow down performance due to the extra step of logging the
data. Disable it by default and introduce a parameter that can enable it when
necessary.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |  5 -
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  7 +--
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 80b9642f2bc4..7584f1ea469e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -195,6 +195,7 @@ int amdgpu_async_gfx_ring = 1;
 int amdgpu_mcbp = -1;
 int amdgpu_discovery = -1;
 int amdgpu_mes;
+int amdgpu_mes_log_enable = 0;
 int amdgpu_mes_kiq;
 int amdgpu_noretry = -1;
 int amdgpu_force_asic_type = -1;
@@ -667,6 +668,15 @@ MODULE_PARM_DESC(mes,
"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");
 module_param_named(mes, amdgpu_mes, int, 0444);
 
+/**
+ * DOC: mes_log_enable (int)
+ * Enable Micro Engine Scheduler log. This is used to enable/disable MES internal log.
+ * (0 = disabled (default), 1 = enabled)
+ */
+MODULE_PARM_DESC(mes_log_enable,
+   "Enable Micro Engine Scheduler log (0 = disabled (default), 1 = 
enabled)");
+module_param_named(mes, amdgpu_mes_log_enable, int, 0444);
+
 /**
  * DOC: mes_kiq (int)
  * Enable Micro Engine Scheduler KIQ. This is a new engine pipe for kiq.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 78dfd027dc99..9ace848e174c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -100,6 +100,9 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device 
*adev)
 {
int r;
 
+   if (!amdgpu_mes_log_enable)
+   return 0;
+
r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
AMDGPU_GEM_DOMAIN_GTT,
&adev->mes.event_log_gpu_obj,
@@ -1561,7 +1564,7 @@ void amdgpu_debugfs_mes_event_log_init(struct 
amdgpu_device *adev)
 #if defined(CONFIG_DEBUG_FS)
struct drm_minor *minor = adev_to_drm(adev)->primary;
struct dentry *root = minor->debugfs_root;
-   if (adev->enable_mes)
+   if (adev->enable_mes && amdgpu_mes_log_enable)
debugfs_create_file("amdgpu_mes_event_log", 0444, root,
adev, &amdgpu_debugfs_mes_event_log_fops);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 072c478665ad..63f281a9984d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -411,8 +411,11 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes 
*mes)
mes_set_hw_res_pkt.enable_reg_active_poll = 1;
mes_set_hw_res_pkt.enable_level_process_quantum_check = 1;
mes_set_hw_res_pkt.oversubscription_timer = 50;
-   mes_set_hw_res_pkt.enable_mes_event_int_logging = 1;
-   mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr;
+   if (amdgpu_mes_log_enable) {
+   mes_set_hw_res_pkt.enable_mes_event_int_logging = 1;
+   mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr =
+   mes->event_log_gpu_addr;
+   }
 
return mes_v11_0_submit_pkt_and_poll_completion(mes,
&mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
-- 
2.34.1



[PATCH] drm/amdgpu: Only create mes event log debugfs when mes is enabled

2024-01-31 Thread shaoyunl
Skip the debugfs file creation for the mes event log if the GPU
doesn't use MES. This prevents a potential kernel oops when a user
tries to read the event log in debugfs on a GPU without MES.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 0626ac0192a8..dd2b8f3fa2f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -1565,9 +1565,9 @@ void amdgpu_debugfs_mes_event_log_init(struct 
amdgpu_device *adev)
 #if defined(CONFIG_DEBUG_FS)
struct drm_minor *minor = adev_to_drm(adev)->primary;
struct dentry *root = minor->debugfs_root;
-
-   debugfs_create_file("amdgpu_mes_event_log", 0444, root,
-   adev, &amdgpu_debugfs_mes_event_log_fops);
+   if (adev->enable_mes)
+   debugfs_create_file("amdgpu_mes_event_log", 0444, root,
+   adev, &amdgpu_debugfs_mes_event_log_fops);
 
 #endif
 }
-- 
2.34.1



[PATCH] drm/amdgpu: Enable event log on MES 11

2023-11-23 Thread shaoyunl
Enable event log through the HW specific FW API

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c| 2 ++
 drivers/gpu/drm/amd/include/mes_v11_api_def.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 4dfec56e1b7f..26d71a22395d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -408,6 +408,8 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes 
*mes)
mes_set_hw_res_pkt.enable_reg_active_poll = 1;
mes_set_hw_res_pkt.enable_level_process_quantum_check = 1;
mes_set_hw_res_pkt.oversubscription_timer = 50;
+   mes_set_hw_res_pkt.enable_mes_event_int_logging = 1;
+   mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr;
 
return mes_v11_0_submit_pkt_and_poll_completion(mes,
&mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h 
b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
index b1db2b190187..1fbfd1aa987e 100644
--- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
+++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
@@ -232,6 +232,7 @@ union MESAPI_SET_HW_RESOURCES {
};
uint32_t	oversubscription_timer;
uint64_t	doorbell_info;
+   uint64_t	event_intr_history_gpu_mc_ptr;
};
 
uint32_t	max_dwords_in_api[API_FRAME_SIZE_IN_DWORDS];
-- 
2.34.1



[PATCH] drm/amdgpu: SW part of MES event log enablement

2023-11-23 Thread shaoyunl
This is the generic SW part: prepare the event log buffer and dump it through
debugfs.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 61 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  5 ++
 4 files changed, 70 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index a53f436fa9f1..8b2cbeae99ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -2140,6 +2140,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
amdgpu_debugfs_firmware_init(adev);
amdgpu_ta_if_debugfs_init(adev);
 
+   amdgpu_debugfs_mes_event_log_init(adev);
+
 #if defined(CONFIG_DRM_AMD_DC)
if (adev->dc_enabled)
dtn_debugfs_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index 371a6f0deb29..0425432d8659 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -32,3 +32,5 @@ void amdgpu_debugfs_fini(struct amdgpu_device *adev);
 void amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+void amdgpu_debugfs_mes_event_log_init(struct amdgpu_device *adev);
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 45280fb0e00c..b4ba556dc733 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -97,6 +97,26 @@ static int amdgpu_mes_doorbell_init(struct amdgpu_device 
*adev)
return 0;
 }
 
+static int amdgpu_mes_event_log_init(struct amdgpu_device *adev)
+{
+   int r;
+
+   r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
+   AMDGPU_GEM_DOMAIN_GTT,
+   &adev->mes.event_log_gpu_obj,
+   &adev->mes.event_log_gpu_addr,
+   &adev->mes.event_log_cpu_addr);
+   if (r) {
+   dev_warn(adev->dev, "failed to create MES event log buffer (%d)", r);
+   return r;
+   }
+
+   memset(adev->mes.event_log_cpu_addr, 0, PAGE_SIZE);
+
+   return  0;
+
+}
+
 static void amdgpu_mes_doorbell_free(struct amdgpu_device *adev)
 {
bitmap_free(adev->mes.doorbell_bitmap);
@@ -181,8 +201,14 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
if (r)
goto error;
 
+   r = amdgpu_mes_event_log_init(adev);
+   if (r)
+   goto error_doorbell;
+
return 0;
 
+error_doorbell:
+   amdgpu_mes_doorbell_free(adev);
 error:
amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs);
amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs);
@@ -198,6 +224,10 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
 
 void amdgpu_mes_fini(struct amdgpu_device *adev)
 {
+   amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
+         &adev->mes.event_log_gpu_addr,
+         &adev->mes.event_log_cpu_addr);
+
amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs);
amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs);
amdgpu_device_wb_free(adev, adev->mes.read_val_offs);
@@ -1483,3 +1513,34 @@ int amdgpu_mes_init_microcode(struct amdgpu_device 
*adev, int pipe)
amdgpu_ucode_release(&adev->mes.fw[pipe]);
return r;
 }
+
+#if defined(CONFIG_DEBUG_FS)
+
+static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused)
+{
+   struct amdgpu_device *adev = m->private;
+   uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr);
+
+   seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4,
+mem, PAGE_SIZE, false);
+
+   return 0;
+}
+
+
+DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log);
+
+#endif
+
+void amdgpu_debugfs_mes_event_log_init(struct amdgpu_device *adev)
+{
+
+#if defined(CONFIG_DEBUG_FS)
+   struct drm_minor *minor = adev_to_drm(adev)->primary;
+   struct dentry *root = minor->debugfs_root;
+
+   debugfs_create_file("amdgpu_mes_event_log", 0444, root,
+   adev, &amdgpu_debugfs_mes_event_log_fops);
+
+#endif
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index a27b424ffe00..894b9b133000 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -133,6 +133,11 @@ struct amdgpu_mes {
uint32_t	num_mes_dbs;
unsigned long   *doorbell_bitmap;
 
+   /* MES event log buffer */
+   struct amdgpu_bo	*event_log_gpu_obj;
+   uint64_t		event_log_gpu_addr;
+   void			*event_log_cpu_addr;

[PATCH] drm/amdgpu: SW part of MES event log enablement

2023-11-23 Thread shaoyunl
This is the generic SW part: prepare the event log buffer and dump it through
debugfs.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 61 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  5 ++
 4 files changed, 70 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index a53f436fa9f1..8b2cbeae99ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -2140,6 +2140,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
amdgpu_debugfs_firmware_init(adev);
amdgpu_ta_if_debugfs_init(adev);
 
+   amdgpu_debugfs_mes_event_log_init(adev);
+
 #if defined(CONFIG_DRM_AMD_DC)
if (adev->dc_enabled)
dtn_debugfs_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index 371a6f0deb29..0425432d8659 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -32,3 +32,5 @@ void amdgpu_debugfs_fini(struct amdgpu_device *adev);
 void amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+void amdgpu_debugfs_mes_event_log_init(struct amdgpu_device *adev);
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 45280fb0e00c..b7af24d7db0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -97,6 +97,26 @@ static int amdgpu_mes_doorbell_init(struct amdgpu_device 
*adev)
return 0;
 }
 
+static int amdgpu_mes_event_log_init(struct amdgpu_device *adev)
+{
+   int r;
+
+   r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
+   AMDGPU_GEM_DOMAIN_GTT,
+   &adev->mes.event_log_gpu_obj,
+   &adev->mes.event_log_gpu_addr,
+   &adev->mes.event_log_cpu_addr);
+   if (r) {
+   dev_warn(adev->dev, "failed to create MES event log buffer (%d)", r);
+   return r;
+   }
+
+   memset(adev->mes.event_log_cpu_addr, 0, PAGE_SIZE);
+
+   return  0;
+
+}
+
 static void amdgpu_mes_doorbell_free(struct amdgpu_device *adev)
 {
bitmap_free(adev->mes.doorbell_bitmap);
@@ -181,6 +201,12 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
if (r)
goto error;
 
+   r = amdgpu_mes_event_log_init(adev);
+   if (r) {
+   amdgpu_mes_doorbell_free(adev);
+   goto error;
+   }
+
return 0;
 
 error:
@@ -198,6 +224,10 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
 
 void amdgpu_mes_fini(struct amdgpu_device *adev)
 {
+   amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
+         &adev->mes.event_log_gpu_addr,
+         &adev->mes.event_log_cpu_addr);
+
amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs);
amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs);
amdgpu_device_wb_free(adev, adev->mes.read_val_offs);
@@ -1483,3 +1513,34 @@ int amdgpu_mes_init_microcode(struct amdgpu_device 
*adev, int pipe)
amdgpu_ucode_release(&adev->mes.fw[pipe]);
return r;
 }
+
+#if defined(CONFIG_DEBUG_FS)
+
+static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused)
+{
+   struct amdgpu_device *adev = m->private;
+   uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr);
+
+   seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4,
+mem, PAGE_SIZE, false);
+
+   return 0;
+}
+
+
+DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log);
+
+#endif
+
+void amdgpu_debugfs_mes_event_log_init(struct amdgpu_device *adev)
+{
+
+#if defined(CONFIG_DEBUG_FS)
+   struct drm_minor *minor = adev_to_drm(adev)->primary;
+   struct dentry *root = minor->debugfs_root;
+
+   debugfs_create_file("amdgpu_mes_event_log", 0444, root,
+   adev, &amdgpu_debugfs_mes_event_log_fops);
+
+#endif
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index a27b424ffe00..894b9b133000 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -133,6 +133,11 @@ struct amdgpu_mes {
uint32_t	num_mes_dbs;
unsigned long   *doorbell_bitmap;
 
+   /* MES event log buffer */
+   struct amdgpu_bo	*event_log_gpu_obj;
+   uint64_t		event_log_gpu_addr;
+   void			*event_log_cpu_addr;

[PATCH] drm/amdgpu: Enable MES to handle doorbell ring on unmapped queue

2023-11-02 Thread shaoyunl
On navi4x and newer, the HW can monitor up to 2048 doorbells that are not
currently mapped and trigger an interrupt to MES when one of these unmapped
doorbells is rung.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index ac41c649caa0..eac34ed1a504 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -455,6 +455,27 @@ static void mes_v12_0_init_aggregated_doorbell(struct 
amdgpu_mes *mes)
WREG32_SOC15(GC, 0, regCP_HQD_GFX_CONTROL, data);
 }
 
+
+static void mes_v12_0_enable_unmapped_doorbell_handling(
+   struct amdgpu_mes *mes, bool enable)
+{
+   struct amdgpu_device *adev = mes->adev;
+   uint32_t data = RREG32_SOC15(GC, 0, regCP_UNMAPPED_DOORBELL);
+
+   /*
+    * The default PROC_LSB setting is 0xc, which means doorbell
+    * addr[16:12] gives the doorbell page number. For kfd, each
+    * process uses 2 pages of doorbells, so we need to change the
+    * setting to 0xd.
+    */
+   data &= ~CP_UNMAPPED_DOORBELL__PROC_LSB_MASK;
+   data |= 0xd <<  CP_UNMAPPED_DOORBELL__PROC_LSB__SHIFT;
+
+   data |= (enable ? 1 : 0) << CP_UNMAPPED_DOORBELL__ENABLE__SHIFT;
+
+   WREG32_SOC15(GC, 0, regCP_UNMAPPED_DOORBELL, data);
+}
+
 static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
.add_hw_queue = mes_v12_0_add_hw_queue,
.remove_hw_queue = mes_v12_0_remove_hw_queue,
@@ -1235,6 +1256,9 @@ static int mes_v12_0_hw_init(void *handle)
 
mes_v12_0_init_aggregated_doorbell(&adev->mes);

+   /* Enable the MES to handle doorbell rings on unmapped queues */
+   mes_v12_0_enable_unmapped_doorbell_handling(&adev->mes, true);

r = mes_v12_0_query_sched_status(&adev->mes);
if (r) {
DRM_ERROR("MES is busy\n");
-- 
2.34.1
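
To make the PROC_LSB comment in mes_v12_0_enable_unmapped_doorbell_handling() above concrete, here is a minimal, standalone sketch of the page-slot math. The 5-bit field width is carried over from the addr[16:12] comment and kept fixed for simplicity; the exact width after moving PROC_LSB to 0xd is an assumption of this sketch.

#include <stdint.h>
#include <stdio.h>

/* With PROC_LSB = 0xc, bits [16:12] of the doorbell byte offset give the
 * doorbell page number, i.e. ownership is tracked per 4 KiB page. Raising
 * PROC_LSB to 0xd makes two adjacent 4 KiB pages share one slot, matching
 * KFD's two doorbell pages per process. */
static uint32_t doorbell_slot(uint32_t db_byte_offset, unsigned int proc_lsb)
{
	return (db_byte_offset >> proc_lsb) & 0x1f; /* field width assumed */
}

int main(void)
{
	uint32_t page0 = 0x2000, page1 = 0x3000; /* two pages of one KFD process */

	/* default 0xc: the two pages land in different slots */
	printf("PROC_LSB=0xc: %u vs %u\n",
	       doorbell_slot(page0, 0xc), doorbell_slot(page1, 0xc));
	/* patched 0xd: both pages map to the same process slot */
	printf("PROC_LSB=0xd: %u vs %u\n",
	       doorbell_slot(page0, 0xd), doorbell_slot(page1, 0xd));
	return 0;
}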



[PATCH] drm/amdgpu: Use per device reset_domain for XGMI on sriov configuration

2022-09-07 Thread shaoyunl
For SRIOV configuration, the host driver controls the reset method (either
FLR or the heavier chain reset). The host will notify the guest individually
with an FLR message if an individual GPU within the hive needs to be reset.
So on the guest side, there is no need to use hive->reset_domain to replace
the original per-device reset_domain.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 +-
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62b26f0e37b0..a5533e0d9d6c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2453,17 +2453,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (amdgpu_xgmi_add_device(adev) == 0) {
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
-   if (!hive->reset_domain ||
-   !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
-   r = -ENOENT;
+   if(!amdgpu_sriov_vf(adev)) {
+   if (!hive->reset_domain ||
+   !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+   r = -ENOENT;
+   amdgpu_put_xgmi_hive(hive);
+   goto init_failed;
+   }
+
+   /* Drop the early temporary reset domain we created for device */
+   amdgpu_reset_put_reset_domain(adev->reset_domain);
+   adev->reset_domain = hive->reset_domain;
amdgpu_put_xgmi_hive(hive);
-   goto init_failed;
}
-
-   /* Drop the early temporary reset domain we created for device */
-   amdgpu_reset_put_reset_domain(adev->reset_domain);
-   adev->reset_domain = hive->reset_domain;
-   amdgpu_put_xgmi_hive(hive);
}
}

 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index d3b483aa81f8..a78b589e4f4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -391,24 +391,32 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev)
goto pro_end;
}
 
+   /**
+    * Only init hive->reset_domain for non-SRIOV configurations. For SRIOV,
+    * the host driver decides how to reset the GPU, either through FLR or
+    * chain reset. The guest side will get individual notifications from
+    * the host for the FLR if necessary.
+    */
+   if (!amdgpu_sriov_vf(adev)) {
/**
 * Avoid recreating reset domain when hive is reconstructed for the case
-* of reset the devices in the XGMI hive during probe for SRIOV
+* of reset the devices in the XGMI hive during probe for passthrough GPU
 * See https://www.spinics.net/lists/amd-gfx/msg58836.html
 */
-   if (adev->reset_domain->type != XGMI_HIVE) {
-   hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
-   if (!hive->reset_domain) {
-   dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
-   ret = -ENOMEM;
-   kobject_put(&hive->kobj);
-   kfree(hive);
-   hive = NULL;
-   goto pro_end;
-   }
-   } else {
-   amdgpu_reset_get_reset_domain(adev->reset_domain);
-   hive->reset_domain = adev->reset_domain;
+   if (adev->reset_domain->type != XGMI_HIVE) {
+   hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
+   if (!hive->reset_domain) {
+   dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
+   ret = -ENOMEM;
+   kobject_put(&hive->kobj);
+   kfree(hive);
+   hive = NULL;
+   goto pro_end;
+   }
+   } else {
+   amdgpu_reset_get_reset_domain(adev->reset_domain);
+   hive->reset_domain = adev->reset_domain;
+   }
}
 
hive->hive_id = adev->gmc.xgmi.hive_id;
-- 
2.17.1



[PATCH] drm/amdgpu: Remove the additional kfd pre reset call for sriov

2022-08-18 Thread shaoyunl
The additional call was introduced by a merge conflict.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4cd87dbb108c..d7eb23b8d692 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4417,8 +4417,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
 retry:
amdgpu_amdkfd_pre_reset(adev);
 
-   amdgpu_amdkfd_pre_reset(adev);
-
if (from_hypervisor)
r = amdgpu_virt_request_full_gpu(adev, true);
else
-- 
2.17.1



[PATCH] drm/amdgpu: use sjt mec fw on aldebaran for sriov

2022-08-05 Thread shaoyunl
The second-level jump table is required for live migration or multiple-VF
configurations on Aldebaran. With this implemented, the first-level jump
table (used by HW) stays the same, and the MEC FW internally uses the
second-level jump table to jump to the real function implementations, so
different VFs can load different versions of the MEC FW as long as they
support sjt.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index c6e0f9313a7f..7f187558220e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -126,6 +126,8 @@ MODULE_FIRMWARE("amdgpu/green_sardine_rlc.bin");
 MODULE_FIRMWARE("amdgpu/aldebaran_mec.bin");
 MODULE_FIRMWARE("amdgpu/aldebaran_mec2.bin");
 MODULE_FIRMWARE("amdgpu/aldebaran_rlc.bin");
+MODULE_FIRMWARE("amdgpu/aldebaran_sjt_mec.bin");
+MODULE_FIRMWARE("amdgpu/aldebaran_sjt_mec2.bin");
 
 #define mmTCP_CHAN_STEER_0_ARCT
0x0b03
 #define mmTCP_CHAN_STEER_0_ARCT_BASE_IDX   
0
@@ -1496,7 +1498,11 @@ static int gfx_v9_0_init_cp_compute_microcode(struct 
amdgpu_device *adev,
const struct common_firmware_header *header = NULL;
const struct gfx_firmware_header_v1_0 *cp_hdr;
 
-   snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec.bin", chip_name);
+   if (amdgpu_sriov_vf(adev) && (adev->asic_type == CHIP_ALDEBARAN))
+   snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_sjt_mec.bin", chip_name);
+   else
+   snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec.bin", chip_name);
+
err = request_firmware(&adev->gfx.mec_fw, fw_name, adev->dev);
if (err)
goto out;
@@ -1509,7 +1515,11 @@ static int gfx_v9_0_init_cp_compute_microcode(struct 
amdgpu_device *adev,
 
 
if (gfx_v9_0_load_mec2_fw_bin_support(adev)) {
-   snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec2.bin", chip_name);
+   if (amdgpu_sriov_vf(adev) && (adev->asic_type == CHIP_ALDEBARAN))
+   snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_sjt_mec2.bin", chip_name);
+   else
+   snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec2.bin", chip_name);
+
err = request_firmware(&adev->gfx.mec2_fw, fw_name, adev->dev);
if (!err) {
err = amdgpu_ucode_validate(adev->gfx.mec2_fw);
-- 
2.17.1



[PATCH] drm/amdgpu: Disable FRU EEPROM access for SRIOV

2022-01-20 Thread shaoyunl
VF access to the EEPROM is blocked by security policy; we might need another
way to get SKU info for VFs.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c
index 2a786e788627..0548e279cc9f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c
@@ -40,6 +40,12 @@ static bool is_fru_eeprom_supported(struct amdgpu_device 
*adev)
 */
struct atom_context *atom_ctx = adev->mode_info.atom_context;
 
+   /* The i2c access is blocked on VF
+    * TODO: Need another way to get the info
+    */
+   if (amdgpu_sriov_vf(adev))
+   return false;
+
/* VBIOS is of the format ###-DXXXYY-##. For SKU identification,
 * we can use just the "DXXX" portion. If there were more models, we
 * could convert the 3 characters to a hex integer and use a switch
-- 
2.17.1



[PATCH] drm/amdgpu: adjust the kfd reset sequence in reset sriov function

2021-11-29 Thread shaoyunl
This change reverts previous commits:
7079e7d5c6bf: drm/amd/amdgpu: fix the kfd pre_reset sequence in sriov
cd547b93c62a: drm/amdgpu: move kfd post_reset out of reset_sriov function

Some register accesses (GRBM_GFX_CNTL) are only allowed in full access
mode. Move kfd_pre_reset and kfd_post_reset back inside the reset_sriov
function.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1989f9e9379e..3c5afa45173c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4285,6 +4285,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
 {
int r;
 
+   amdgpu_amdkfd_pre_reset(adev);
+
if (from_hypervisor)
r = amdgpu_virt_request_full_gpu(adev, true);
else
@@ -4312,6 +4314,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
 
amdgpu_irq_gpu_reset_resume_helper(adev);
r = amdgpu_ib_ring_tests(adev);
+   amdgpu_amdkfd_post_reset(adev);
 
 error:
if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
@@ -5026,7 +5029,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
 
-   amdgpu_amdkfd_pre_reset(tmp_adev);
+   if (!amdgpu_sriov_vf(tmp_adev))
+   amdgpu_amdkfd_pre_reset(tmp_adev);
 
/*
 * Mark these ASICs to be reseted as untracked first
@@ -5144,9 +5148,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 skip_sched_resume:
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-   /* unlock kfd */
-   if (!need_emergency_restart)
-   amdgpu_amdkfd_post_reset(tmp_adev);
+   /* unlock kfd: SRIOV would do it separately */
+   if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
+   amdgpu_amdkfd_post_reset(tmp_adev);
 
/* kfd_post_reset will do nothing if kfd device is not 
initialized,
 * need to bring up kfd here if it's not be initialized before
-- 
2.17.1



[PATCH] drm/amd/amdgpu: move kfd post_reset out of reset_sriov function

2021-11-18 Thread shaoyunl
For the SRIOV XGMI configuration, the host driver will handle the hive
reset, so on the guest side, reset_sriov is only called once on one device.
This leaves kfd post_reset unbalanced with kfd pre_reset, since kfd
pre_reset has already been moved out of the reset_sriov function. Move kfd
post_reset out of the reset_sriov function to make them balanced.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 10c8008d1da0..9a9d5493c676 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4308,7 +4308,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
 
amdgpu_irq_gpu_reset_resume_helper(adev);
r = amdgpu_ib_ring_tests(adev);
-   amdgpu_amdkfd_post_reset(adev);
 
 error:
if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
@@ -5081,7 +5080,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
/* Actual ASIC resets if needed.*/
-   /* TODO Implement XGMI hive reset logic for SRIOV */
+   /* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
r = amdgpu_device_reset_sriov(adev, job ? false : true);
if (r)
@@ -5141,8 +5140,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 skip_sched_resume:
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-   /* unlock kfd: SRIOV would do it separately */
-   if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
+   /* unlock kfd */
+   if (!need_emergency_restart)
amdgpu_amdkfd_post_reset(tmp_adev);
 
/* kfd_post_reset will do nothing if kfd device is not 
initialized,
-- 
2.17.1



[PATCH] drm/amd/amdkfd: Fix kernel panic when reset failed and been triggered again

2021-11-15 Thread shaoyunl
In SRIOV configuration, the reset may fail to bring the asic back to normal,
but stop_cpsch has already been called; start_cpsch will not be called since
there is no resume in this case. When a reset is triggered again, the driver
should avoid doing the uninitialization again.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 42b2cc999434..62fe28244a80 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1225,6 +1225,11 @@ static int stop_cpsch(struct device_queue_manager *dqm)
bool hanging;
 
dqm_lock(dqm);
+   if (!dqm->sched_running) {
+   dqm_unlock(dqm);
+   return 0;
+   }
+
if (!dqm->is_hws_hang)
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
hanging = dqm->is_hws_hang || dqm->is_resetting;
-- 
2.17.1
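
The guard added to stop_cpsch() above is a general idempotent-teardown pattern. Below is a minimal, standalone sketch of the same idea under a plain pthread mutex; the names are illustrative, not the KFD API.

#include <pthread.h>
#include <stdbool.h>

/* Mimic stop_cpsch()'s early-out so that a second stop after a failed
 * reset never tears the same state down twice. */
struct sched_state {
	pthread_mutex_t lock;
	bool running;
};

static int sched_stop(struct sched_state *s)
{
	pthread_mutex_lock(&s->lock);
	if (!s->running) {		/* already stopped by an earlier reset */
		pthread_mutex_unlock(&s->lock);
		return 0;		/* nothing to uninitialize again */
	}
	s->running = false;
	/* ... unmap queues and release resources exactly once ... */
	pthread_mutex_unlock(&s->lock);
	return 0;
}

int main(void)
{
	struct sched_state s = { PTHREAD_MUTEX_INITIALIZER, true };

	sched_stop(&s);	/* first reset attempt */
	sched_stop(&s);	/* retried reset: safe no-op */
	return 0;
}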



[PATCH] drm/amd/amdkfd: Fix kernel panic when reset failed and been triggered again

2021-11-14 Thread shaoyunl
In SRIOV configuration, the reset may fail to bring the asic back to normal,
but stop_cpsch has already been called; start_cpsch will not be called since
there is no resume in this case. When a reset is triggered again, the driver
should avoid doing the uninitialization again.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 42b2cc999434..bcc8980d77e0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1228,12 +1228,14 @@ static int stop_cpsch(struct device_queue_manager *dqm)
if (!dqm->is_hws_hang)
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
hanging = dqm->is_hws_hang || dqm->is_resetting;
-   dqm->sched_running = false;
 
-   pm_release_ib(&dqm->packet_mgr);
+   if (dqm->sched_running) {
+   dqm->sched_running = false;
+   pm_release_ib(&dqm->packet_mgr);
+   kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
+   pm_uninit(&dqm->packet_mgr, hanging);
+   }

-   kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
-   pm_uninit(&dqm->packet_mgr, hanging);
dqm_unlock(dqm);
 
return 0;
-- 
2.17.1



[PATCH] drm/amd/amdgpu: fix the kfd pre_reset sequence in sriov

2021-11-05 Thread shaoyunl
The KFD pre_reset should be called before the reset is executed. It will
hold the lock to prevent other ROCm processes from sending packages to the
HIQ while the host executes the real reset on the HW.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 95fec36e385e..d7c9dce17cad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4278,8 +4278,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
if (r)
return r;
 
-   amdgpu_amdkfd_pre_reset(adev);
-
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
if (r)
@@ -5015,8 +5013,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
 
-   if (!amdgpu_sriov_vf(tmp_adev))
-   amdgpu_amdkfd_pre_reset(tmp_adev);
+   amdgpu_amdkfd_pre_reset(tmp_adev);
 
/*
 * Mark these ASICs to be reseted as untracked first
-- 
2.17.1



[PATCH] drm/amd/amdkfd: Don't send command to HWS on kfd reset

2021-11-04 Thread shaoyunl
When kfd needs to be reset, sending commands to HWS might cause a hang and an
unnecessary timeout. This change tries not to touch the HW in pre_reset and
keeps the queues in the evicted state when the reset is done, so they are not
put back on the runlist. These queues will be destroyed on process
termination.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 6 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index e9601d4dfb77..0a60317509c8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1430,7 +1430,7 @@ static int unmap_queues_cpsch(struct device_queue_manager 
*dqm,
 
if (!dqm->sched_running)
return 0;
-   if (dqm->is_hws_hang)
+   if (dqm->is_hws_hang || dqm->is_resetting)
return -EIO;
if (!dqm->active_runlist)
return retval;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index f8a8fdb95832..f29b3932e3dc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1715,7 +1715,11 @@ int kfd_process_evict_queues(struct kfd_process *p)
 
r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
&pdd->qpd);
-   if (r) {
+   /* evict returns -EIO if HWS is hung or the asic is resetting. In
+    * this case we would like to keep all the queues in the evicted
+    * state to prevent them from being added back, since they are
+    * actually not saved right now.
+    */
+   if (r && r != -EIO) {
pr_err("Failed to evict process queues\n");
goto fail;
}
-- 
2.17.1



[PATCH] drm/amd/amdkfd: Don't send command to HWS on kfd reset

2021-11-03 Thread shaoyunl
When kfd needs to be reset, sending commands to HWS might cause a hang and an
unnecessary timeout. This change tries not to touch the HW in pre_reset and
keeps the queues in the evicted state when the reset is done, so they are not
put back on the runlist. These queues will be destroyed on process
termination.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   | 6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 6 +-
 4 files changed, 13 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_device.c
 mode change 100644 => 100755 
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
 mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_priv.h
 mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_process.c

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
old mode 100644
new mode 100755
index c8aade17efef..536ef766d09e
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -1100,6 +1100,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
if (!kfd->init_complete)
return 0;
 
+   kfd->is_resetting = true;
+
kfd_smi_event_update_gpu_reset(kfd, false);
 
kfd->dqm->ops.pre_reset(kfd->dqm);
@@ -1132,6 +1134,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 
kfd_smi_event_update_gpu_reset(kfd, true);
 
+   kfd->is_resetting = false;
+
return 0;
 }
 
@@ -1168,7 +1172,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
return ret;
 
/* for runtime resume, skip unlocking kfd */
-   if (!run_pm) {
+   if (!run_pm && !kfd->is_resetting) {
count = atomic_dec_return(&kfd_locked);
WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
if (count == 0)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
old mode 100644
new mode 100755
index e9601d4dfb77..0a60317509c8
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1430,7 +1430,7 @@ static int unmap_queues_cpsch(struct device_queue_manager 
*dqm,
 
if (!dqm->sched_running)
return 0;
-   if (dqm->is_hws_hang)
+   if (dqm->is_hws_hang || dqm->is_resetting)
return -EIO;
if (!dqm->active_runlist)
return retval;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
old mode 100644
new mode 100755
index bfe7bacccb73..e4bcc2a09ca8
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -275,6 +275,8 @@ struct kfd_dev {
struct device_queue_manager *dqm;
 
bool init_complete;
+   bool is_resetting;
+
/*
 * Interrupts of interest to KFD are copied
 * from the HW ring into a SW ring.
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
old mode 100644
new mode 100755
index f8a8fdb95832..f29b3932e3dc
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1715,7 +1715,11 @@ int kfd_process_evict_queues(struct kfd_process *p)
 
r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
&pdd->qpd);
-   if (r) {
+   /* evict returns -EIO if HWS is hung or the asic is resetting. In
+    * this case we would like to keep all the queues in the evicted
+    * state to prevent them from being added back, since they are
+    * actually not saved right now.
+    */
+   if (r && r != -EIO) {
pr_err("Failed to evict process queues\n");
goto fail;
}
-- 
2.17.1



[PATCH] drm/amdgpu: Get atomicOps info from Host for sriov setup

2021-09-10 Thread shaoyunl
The AtomicOp Requester Enable bit is reserved in VFs, and the PF value
applies to all associated VFs, so the guest driver cannot directly enable
atomicOps for a VF; it depends on the PF to enable them. In the current
design, the amdgpu driver will get the enabled atomicOps bits through
private pf2vf data.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 24 +++--
 drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h |  4 +++-
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 653bd8fdaa33..3ae1721ca859 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3529,17 +3529,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
 
-   /* enable PCIE atomic ops */
-   r = pci_enable_atomic_ops_to_root(adev->pdev,
- PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
- PCI_EXP_DEVCAP2_ATOMIC_COMP64);
-   if (r) {
-   adev->have_atomics_support = false;
-   DRM_INFO("PCIE atomic ops is not supported\n");
-   } else {
-   adev->have_atomics_support = true;
-   }
-
amdgpu_device_get_pcie_info(adev);
 
if (amdgpu_mcbp)
@@ -3562,6 +3551,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (r)
return r;
 
+   /* enable PCIE atomic ops */
+   if (amdgpu_sriov_vf(adev))
+   adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
+   adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_enabled_flags ==
+   (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
+   else
+   adev->have_atomics_support =
+   !pci_enable_atomic_ops_to_root(adev->pdev,
+   PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+   PCI_EXP_DEVCAP2_ATOMIC_COMP64);
+   if (!adev->have_atomics_support)
+   dev_info(adev->dev, "PCIE atomic ops is not supported\n");
+
/* doorbell bar mapping and doorbell index init*/
amdgpu_device_doorbell_init(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
index a434c71fde8e..995899191288 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -204,8 +204,10 @@ struct amd_sriov_msg_pf2vf_info {
} mm_bw_management[AMD_SRIOV_MSG_RESERVE_VCN_INST];
/* UUID info */
struct amd_sriov_msg_uuid_info uuid_info;
+   /* pcie atomic Ops info */
+   uint32_t pcie_atomic_ops_enabled_flags;
/* reserved */
-   uint32_t reserved[256 - 47];
+   uint32_t reserved[256 - 48];
 };
 
 struct amd_sriov_msg_vf2pf_info_header {
-- 
2.17.1



[PATCH] drm/amdgpu: Get atomicOps info from Host for sriov setup

2021-09-10 Thread shaoyunl
The AtomicOp Requester Enable bit is reserved in VFs, and the PF value
applies to all associated VFs, so the guest driver cannot directly enable
atomicOps for a VF; it depends on the PF to enable them. In the current
design, the amdgpu driver will get the enabled atomicOps bits through
private pf2vf data.

Signed-off-by: shaoyunl 
Change-Id: Ifdbcb4396d64e3f3cbf6bcbf7ab9c7b2cb061052
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 25 -
 drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h |  4 +++-
 2 files changed, 17 insertions(+), 12 deletions(-)
 mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
old mode 100644
new mode 100755
index 653bd8fdaa33..fc6a6491c1b6
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3529,17 +3529,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
 
-   /* enable PCIE atomic ops */
-   r = pci_enable_atomic_ops_to_root(adev->pdev,
- PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
- PCI_EXP_DEVCAP2_ATOMIC_COMP64);
-   if (r) {
-   adev->have_atomics_support = false;
-   DRM_INFO("PCIE atomic ops is not supported\n");
-   } else {
-   adev->have_atomics_support = true;
-   }
-
amdgpu_device_get_pcie_info(adev);
 
if (amdgpu_mcbp)
@@ -3562,6 +3551,20 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (r)
return r;
 
+   /* enable PCIE atomic ops */
+   if (amdgpu_sriov_bios(adev))
+   adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
+   adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_enabled_flags ==
+   (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
+   else
+   adev->have_atomics_support =
+   !pci_enable_atomic_ops_to_root(adev->pdev,
+   PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+   PCI_EXP_DEVCAP2_ATOMIC_COMP64);
+   if (!adev->have_atomics_support)
+   dev_info(adev->dev, "PCIE atomic ops is not supported\n");
+
+
/* doorbell bar mapping and doorbell index init*/
amdgpu_device_doorbell_init(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
old mode 100644
new mode 100755
index a434c71fde8e..995899191288
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -204,8 +204,10 @@ struct amd_sriov_msg_pf2vf_info {
} mm_bw_management[AMD_SRIOV_MSG_RESERVE_VCN_INST];
/* UUID info */
struct amd_sriov_msg_uuid_info uuid_info;
+   /* pcie atomic Ops info */
+   uint32_t pcie_atomic_ops_enabled_flags;
/* reserved */
-   uint32_t reserved[256 - 47];
+   uint32_t reserved[256 - 48];
 };
 
 struct amd_sriov_msg_vf2pf_info_header {
-- 
2.17.1
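
For reference, a standalone sketch of the guest-side check above, with the pf2vf struct trimmed to the one field the sketch needs. The stub struct name is hypothetical; the real layout lives in amdgv_sriovmsg.h, and the capability mask values match include/uapi/linux/pci_regs.h.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PCI_EXP_DEVCAP2_ATOMIC_COMP32 0x0080 /* values from pci_regs.h */
#define PCI_EXP_DEVCAP2_ATOMIC_COMP64 0x0100

/* Trimmed stand-in for amd_sriov_msg_pf2vf_info: only the flags field. */
struct pf2vf_info_stub {
	uint32_t pcie_atomic_ops_enabled_flags;
};

static bool vf_atomics_supported(const struct pf2vf_info_stub *p)
{
	/* Same test as the patch: both 32- and 64-bit completer support
	 * must have been enabled by the PF. */
	return p->pcie_atomic_ops_enabled_flags ==
	       (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
}

int main(void)
{
	struct pf2vf_info_stub info = { PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
					PCI_EXP_DEVCAP2_ATOMIC_COMP64 };

	printf("atomics supported: %d\n", vf_atomics_supported(&info));
	return 0;
}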



[PATCH] drm/amdgpu: Get atomicOps info from Host for sriov setup

2021-09-09 Thread shaoyunl
The AtomicOp Requester Enable bit is reserved in VFs, and the PF value
applies to all associated VFs, so the guest driver cannot directly enable
atomicOps for a VF; it depends on the PF to enable them. In the current
design, the amdgpu driver will get the enabled atomicOps bits through
private pf2vf data.

Signed-off-by: shaoyunl 
Change-Id: Ifdbcb4396d64e3f3cbf6bcbf7ab9c7b2cb061052
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 20 ++--
 drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h |  4 +++-
 2 files changed, 21 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
old mode 100644
new mode 100755
index 653bd8fdaa33..a0d2b9eb84fc
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2167,8 +2167,6 @@ static int amdgpu_device_ip_early_init(struct 
amdgpu_device *adev)
return -EINVAL;
}
 
-   amdgpu_amdkfd_device_probe(adev);
-
adev->pm.pp_feature = amdgpu_pp_feature_mask;
if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
@@ -3562,6 +3560,24 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (r)
return r;
 
+   /* enable PCIE atomic ops */
+   if (amdgpu_sriov_bios(adev))
+   adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
+   adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_enabled_flags ==
+   (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
+   else
+   adev->have_atomics_support =
+   !pci_enable_atomic_ops_to_root(adev->pdev,
+ PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+ PCI_EXP_DEVCAP2_ATOMIC_COMP64);
+   if (!adev->have_atomics_support)
+   DRM_INFO("PCIE atomic ops is not supported\n");
+
+   amdgpu_amdkfd_device_probe(adev);
+
+
/* doorbell bar mapping and doorbell index init*/
amdgpu_device_doorbell_init(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
old mode 100644
new mode 100755
index a434c71fde8e..995899191288
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -204,8 +204,10 @@ struct amd_sriov_msg_pf2vf_info {
} mm_bw_management[AMD_SRIOV_MSG_RESERVE_VCN_INST];
/* UUID info */
struct amd_sriov_msg_uuid_info uuid_info;
+   /* pcie atomic Ops info */
+   uint32_t pcie_atomic_ops_enabled_flags;
/* reserved */
-   uint32_t reserved[256 - 47];
+   uint32_t reserved[256 - 48];
 };
 
 struct amd_sriov_msg_vf2pf_info_header {
-- 
2.17.1



[PATCH] drm/amdgpu: soc15 register access through RLC should only apply to sriov runtime

2021-06-01 Thread shaoyunl
On SRIOV, the driver should only access registers through the RLC while in runtime mode.
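
As a function-style rendering of the macro change below (illustrative only;
the in-tree code stays a macro), the read path becomes:

    /* amdgpu_sriov_vf() is true for a VF at all times, while
     * amdgpu_sriov_runtime() is true only while the VF does not hold
     * exclusive register access and therefore must go through the RLC.
     * The patch switches the dispatch to the latter.
     */
    static u32 example_rreg_soc15_rlc(struct amdgpu_device *adev,
                                      u32 reg, u32 flag, u32 hwip)
    {
        if (amdgpu_sriov_runtime(adev) && adev->gfx.rlc.funcs->rlcg_rreg)
            return adev->gfx.rlc.funcs->rlcg_rreg(adev, reg, flag, hwip);
        return RREG32(reg); /* direct MMIO when full access is held */
    }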

Signed-off-by: shaoyunl 
Change-Id: Iecaa52436a2985a18ede9c86cb00cc197a717bd6
---
 drivers/gpu/drm/amd/amdgpu/soc15_common.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h 
b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
index c781808e4dc3..f6cf70e69cce 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
@@ -28,12 +28,12 @@
 #define SOC15_REG_OFFSET(ip, inst, reg)
(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg)
 
 #define __WREG32_SOC15_RLC__(reg, value, flag, hwip) \
-   ((amdgpu_sriov_vf(adev) && adev->gfx.rlc.funcs->rlcg_wreg) ? \
+   ((amdgpu_sriov_runtime(adev) && adev->gfx.rlc.funcs->rlcg_wreg) ? \
 adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, value, flag, hwip) : \
 WREG32(reg, value))
 
 #define __RREG32_SOC15_RLC__(reg, flag, hwip) \
-   ((amdgpu_sriov_vf(adev) && adev->gfx.rlc.funcs->rlcg_rreg) ? \
+   ((amdgpu_sriov_runtime(adev) && adev->gfx.rlc.funcs->rlcg_rreg) ? \
 adev->gfx.rlc.funcs->rlcg_rreg(adev, reg, flag, hwip) : \
 RREG32(reg))
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/2] drm/amd/pm: Use BACO reset arg 0 on XGMI configuration

2021-03-15 Thread shaoyunl
With an arg 1 BACO reset, the SMU will try to reload the SMU FW after the
reset. This might fail if the driver is already in a pending reset status
during the probe period. An arg 0 reset will bring the ASIC back to a clean
state, and the driver will re-init everything, including the SMU FW.
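
Condensed from the function the hunk below touches (a sketch, not the full
smu_v11_0_baco_set_state()), the arg 0 / arg 1 split on BACO entry looks
roughly like this:

    /* arg 1 asks the SMU to reload its own FW after the reset; arg 0 is
     * a plain BACO that leaves all re-init to the driver. The patch
     * forces arg 0 whenever a hive reset is already pending at probe.
     */
    if (!ras || !ras->supported || adev->gmc.xgmi.pending_reset)
        ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_EnterBaco, 0, NULL);
    else
        ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_EnterBaco, 1, NULL);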

Signed-off-by: shaoyunl 
Change-Id: I6df90d963e259dcf1df8fe3716cdf52681202162
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index 635bd5da2133..0d137af1a78a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -1530,7 +1530,7 @@ int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state)
  NULL);
break;
default:
-   if (!ras || !ras->supported) {
+   if (!ras || !ras->supported || adev->gmc.xgmi.pending_reset) {
if (adev->asic_type == CHIP_ARCTURUS) {
data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL_ARCT);
data |= 0x8000;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/2] drm/amdgpu: Keep pending_reset valid while the SMU resets the ASIC

2021-03-15 Thread shaoyunl
SMU internals might need to check this pending_reset setting to decide the
reset method.
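
In other words, the fix is purely an ordering change; a sketch of the
corrected sequence:

    /* xgmi_reset_work may still be running, and the SMU path it drives
     * can consult pending_reset to pick the reset method, so wait first */
    flush_work(&adev->xgmi_reset_work);   /* let the in-flight reset finish */
    adev->gmc.xgmi.pending_reset = false; /* only then clear the flag */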

Signed-off-by: shaoyunl 
Change-Id: I8d88abf56d481e7443ac31baa2929826aec9e576
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index e0c4f70f96a4..7b3d9384b307 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1359,8 +1359,8 @@ static void amdgpu_drv_delayed_reset_work_handler(struct 
work_struct *work)
}
for (i = 0; i < mgpu_info.num_dgpu; i++) {
adev = mgpu_info.gpu_ins[i].adev;
-   adev->gmc.xgmi.pending_reset = false;
flush_work(&adev->xgmi_reset_work);
+   adev->gmc.xgmi.pending_reset = false;
}
 
/* reset function will rebuild the xgmi hive info , clear it now */
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: Enable light SBR in XGMI+passthrough configuration

2021-03-12 Thread shaoyunl
This is to fix commit dda9bbb26c7, which only enabled light SBR on normal
device init. This feature actually needs to be enabled after the ASIC has
been reset as well.

Signed-off-by: shaoyunl 
Change-Id: Ie7ee02cd3ccdab3522aad9a02f681963e211ed44
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index cada3e77c7d5..34472c9d73f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2513,6 +2513,11 @@ static int amdgpu_device_ip_late_init(struct 
amdgpu_device *adev)
if (r)
DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
 
+   /* For XGMI + passthrough configuration on arcturus, enable light SBR */
+   if (adev->asic_type == CHIP_ARCTURUS &&
+   amdgpu_passthrough(adev) &&
+   adev->gmc.xgmi.num_physical_nodes > 1)
+   smu_set_light_sbr(&adev->smu, true);
 
if (adev->gmc.xgmi.num_physical_nodes > 1) {
mutex_lock(&mgpu_info.mutex);
@@ -3615,10 +3620,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (amdgpu_device_cache_pci_state(adev->pdev))
pci_restore_state(pdev);
 
-   /* Enable lightSBR on SMU in passthrough + xgmi configuration */
-   if (amdgpu_passthrough(adev) && adev->gmc.xgmi.num_physical_nodes > 1)
-   smu_set_light_sbr(&adev->smu, true);
-
if (adev->gmc.xgmi.pending_reset)
queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
   msecs_to_jiffies(AMDGPU_RESUME_MS));
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: Enable light SBR in XGMI+passthrough configuration

2021-03-11 Thread shaoyunl
This is to fix commit dda9bbb26c7, which only enabled light SBR on normal
device init. This feature actually needs to be enabled after the ASIC has
been reset as well.

Signed-off-by: shaoyunl 
Change-Id: Ie7ee02cd3ccdab3522aad9a02f681963e211ed44
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index cada3e77c7d5..fb775a9c0db1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2513,6 +2513,9 @@ static int amdgpu_device_ip_late_init(struct 
amdgpu_device *adev)
if (r)
DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
 
+   /* For XGMI + passthrough configuration, enable light SBR */
+   if (amdgpu_passthrough(adev) && adev->gmc.xgmi.num_physical_nodes > 1)
+   smu_set_light_sbr(&adev->smu, true);
 
if (adev->gmc.xgmi.num_physical_nodes > 1) {
mutex_lock(&mgpu_info.mutex);
@@ -3615,10 +3618,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (amdgpu_device_cache_pci_state(adev->pdev))
pci_restore_state(pdev);
 
-   /* Enable lightSBR on SMU in passthrough + xgmi configuration */
-   if (amdgpu_passthrough(adev) && adev->gmc.xgmi.num_physical_nodes > 1)
-   smu_set_light_sbr(&adev->smu, true);
-
if (adev->gmc.xgmi.pending_reset)
queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
   msecs_to_jiffies(AMDGPU_RESUME_MS));
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/2] drm/amd/pm: Add LightSBR SMU MSG support

2021-03-10 Thread shaoyunl
This new MSG provides the interface for the driver to enable/disable Light
Secondary Bus Reset support in the SMU. When enabled, the SMU will only do a
minimum NBIO response to the SBR request and leave the real HW reset to be
handled by the driver later. When disabled (the default state), the SMU will
pass the request on to the PSP for a HW reset.
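
The smu_v11_0 implementation is cut off in the diff below; it most likely
reduces to a single message send (a sketch assuming the smu_cmn helper used
elsewhere in this file):

    int smu_v11_0_set_light_sbr(struct smu_context *smu, bool enable)
    {
        /* parameter 1 = light SBR (NBIO ack only, driver does the reset),
         * parameter 0 = default behavior (SMU forwards the SBR to PSP) */
        return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_LightSBR,
                                               enable ? 1 : 0, NULL);
    }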

Signed-off-by: shaoyunl 
Change-Id: I5f0e48730d2b4b48fed8137aa57c683d5b3d1b9f
---
 drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h   |  7 +++
 drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h   |  7 +++
 drivers/gpu/drm/amd/pm/inc/smu_types.h|  1 +
 drivers/gpu/drm/amd/pm/inc/smu_v11_0.h|  2 ++
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 13 +
 drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c |  2 ++
 drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c| 10 ++
 7 files changed, 42 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index c02ffbd1df76..5ac683fd0749 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -1153,6 +1153,11 @@ struct pptable_funcs {
 *  parameters to defaults.
 */
int (*set_fine_grain_gfx_freq_parameters)(struct smu_context *smu);
+
+   /**
+* @set_light_sbr:  Set light sbr mode for the SMU.
+*/
+   int (*set_light_sbr)(struct smu_context *smu, bool enable);
 };
 
 typedef enum {
@@ -1356,5 +1361,7 @@ ssize_t smu_sys_get_gpu_metrics(void *handle, void **table);
 int smu_enable_mgpu_fan_boost(void *handle);
 int smu_gfx_state_change_set(struct smu_context *smu, uint32_t state);
 
+int smu_set_light_sbr(struct smu_context *smu, bool enable);
+
 #endif
 #endif
diff --git a/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h 
b/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h
index 79afb132164e..45f5d29bc705 100644
--- a/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h
@@ -120,6 +120,13 @@
 #define PPSMC_MSG_ReadSerialNumTop320x40
 #define PPSMC_MSG_ReadSerialNumBottom32 0x41
 
+/* parameter for MSG_LightSBR
+ * 1 -- Enable light secondary bus reset, only do nbio respond without further handling,
+ *  leave driver to handle the real reset
+ * 0 -- Disable LightSBR, default behavior, SMU will pass the reset to PSP
+ */
+#define PPSMC_MSG_LightSBR  0x42
+
 typedef uint32_t PPSMC_Result;
 typedef uint32_t PPSMC_Msg;
 #pragma pack(pop)
diff --git a/drivers/gpu/drm/amd/pm/inc/smu_types.h 
b/drivers/gpu/drm/amd/pm/inc/smu_types.h
index e9a0bda98fd7..5bfb60f41dd4 100644
--- a/drivers/gpu/drm/amd/pm/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/inc/smu_types.h
@@ -224,6 +224,7 @@
__SMU_DUMMY_MAP(EnableDeterminism), \
__SMU_DUMMY_MAP(DisableDeterminism),\
__SMU_DUMMY_MAP(SetUclkDpmMode),\
+   __SMU_DUMMY_MAP(LightSBR),  \
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)  SMU_MSG_##type
diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h 
b/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h
index bf570a7af6a7..907e0967a9e8 100644
--- a/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h
+++ b/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h
@@ -295,5 +295,7 @@ int smu_v11_0_deep_sleep_control(struct smu_context *smu,
 
 void smu_v11_0_interrupt_work(struct smu_context *smu);
 
+int smu_v11_0_set_light_sbr(struct smu_context *smu, bool enable);
+
 #endif
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 6a6fafc11588..1202b9e7d0f9 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2972,6 +2972,19 @@ int smu_gfx_state_change_set(struct smu_context *smu, uint32_t state)
return ret;
 }
 
+int smu_set_light_sbr(struct smu_context *smu, bool enable)
+{
+   int ret = 0;
+
+   mutex_lock(&smu->mutex);
+   if (smu->ppt_funcs->set_light_sbr)
+   ret = smu->ppt_funcs->set_light_sbr(smu, enable);
+   mutex_unlock(&smu->mutex);
+
+   return ret;
+}
+
+
 static const struct amd_pm_funcs swsmu_pm_funcs = {
/* export for sysfs */
.set_fan_control_mode= smu_pp_set_fan_control_mode,
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
index f76d1b8aeecc..f82dd8a5c773 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
@@ -142,6 +142,7 @@ static const struct cmn2asic_msg_mapping arcturus_message_map[SMU_MSG_MAX_COUNT]
MSG_MAP(GmiPwrDnControl, PPSMC_MSG_GmiPwrDnControl, 0),
MSG_MAP(ReadSerialNumTop32,  PPSMC_MSG_ReadSerialNumTop32,  1),
MSG_MAP(ReadSerialNumBottom32,   PPSMC_MSG_Rea

[PATCH 2/2] drm/amdgpu: Enable light SBR for SMU on passthrough and XGMI configuration

2021-03-10 Thread shaoyunl
SMU introduce the new interface to enable light Secondary Bus Reset mode, driver
enable it on passthrough + XGMI configuration

Signed-off-by: shaoyunl 
Change-Id: I59aef0559aba418b764e7cf716b0d98aca14fec5
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4640cafae619..e142b4080976 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3630,6 +3630,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (amdgpu_device_cache_pci_state(adev->pdev))
pci_restore_state(pdev);
 
+   /* Enable lightSBR on SMU in passthrough + xgmi configuration */
+   if (amdgpu_passthrough(adev) && adev->gmc.xgmi.num_physical_nodes > 1)
+   smu_set_light_sbr(&adev->smu, true);
+
if (adev->gmc.xgmi.pending_reset)
queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
   msecs_to_jiffies(AMDGPU_RESUME_MS));
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/2] drm/amd/pm: Add LightSBR SMU MSG support

2021-03-10 Thread shaoyunl
This new MSG provides the interface for the driver to enable/disable Light
Secondary Bus Reset support in the SMU. When enabled, the SMU will only do a
minimum NBIO response to the SBR request and leave the real HW reset to be
handled by the driver later. When disabled (the default state), the SMU will
pass the request on to the PSP for a HW reset.

Signed-off-by: shaoyunl 
Change-Id: I5f0e48730d2b4b48fed8137aa57c683d5b3d1b9f
---
 drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h   |  7 +++
 drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h   |  7 +++
 drivers/gpu/drm/amd/pm/inc/smu_types.h|  1 +
 drivers/gpu/drm/amd/pm/inc/smu_v11_0.h|  2 ++
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 12 
 drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c |  2 ++
 drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c| 10 ++
 7 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index 00186a3b29be..ba015816d771 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -1149,6 +1149,11 @@ struct pptable_funcs {
 *  parameters to defaults.
 */
int (*set_fine_grain_gfx_freq_parameters)(struct smu_context *smu);
+
+   /**
+* @set_light_sbr:  Set light sbr mode for the SMU.
+*/
+   int (*set_light_sbr)(struct smu_context *smu, bool enable);
 };
 
 typedef enum {
@@ -1350,5 +1355,7 @@ ssize_t smu_sys_get_gpu_metrics(void *handle, void **table);
 int smu_enable_mgpu_fan_boost(void *handle);
 int smu_gfx_state_change_set(struct smu_context *smu, uint32_t state);
 
+int smu_set_light_sbr(struct smu_context *smu, bool enable);
+
 #endif
 #endif
diff --git a/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h 
b/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h
index 79afb132164e..45f5d29bc705 100644
--- a/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h
@@ -120,6 +120,13 @@
 #define PPSMC_MSG_ReadSerialNumTop320x40
 #define PPSMC_MSG_ReadSerialNumBottom32 0x41
 
+/* parameter for MSG_LightSBR
+ * 1 -- Enable light secondary bus reset, only do nbio respond without further handling,
+ *  leave driver to handle the real reset
+ * 0 -- Disable LightSBR, default behavior, SMU will pass the reset to PSP
+ */
+#define PPSMC_MSG_LightSBR  0x42
+
 typedef uint32_t PPSMC_Result;
 typedef uint32_t PPSMC_Msg;
 #pragma pack(pop)
diff --git a/drivers/gpu/drm/amd/pm/inc/smu_types.h 
b/drivers/gpu/drm/amd/pm/inc/smu_types.h
index aa4822202587..92f72d770a99 100644
--- a/drivers/gpu/drm/amd/pm/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/inc/smu_types.h
@@ -214,6 +214,7 @@
__SMU_DUMMY_MAP(SetSlowPPTLimit),\
__SMU_DUMMY_MAP(GetFastPPTLimit),\
__SMU_DUMMY_MAP(GetSlowPPTLimit),\
+   __SMU_DUMMY_MAP(LightSBR),   \
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)  SMU_MSG_##type
diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h 
b/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h
index bf570a7af6a7..907e0967a9e8 100644
--- a/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h
+++ b/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h
@@ -295,5 +295,7 @@ int smu_v11_0_deep_sleep_control(struct smu_context *smu,
 
 void smu_v11_0_interrupt_work(struct smu_context *smu);
 
+int smu_v11_0_set_light_sbr(struct smu_context *smu, bool enable);
+
 #endif
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 502e1b926a06..d9baf97ce1c9 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2934,6 +2934,18 @@ int smu_gfx_state_change_set(struct smu_context *smu, uint32_t state)
return ret;
 }
 
+int smu_set_light_sbr(struct smu_context *smu, bool enable)
+{
+   int ret = 0;
+
+   mutex_lock(&smu->mutex);
+   ret = smu->ppt_funcs->set_light_sbr(smu, enable);
+   mutex_unlock(&smu->mutex);
+
+   return ret;
+}
+
+
 static const struct amd_pm_funcs swsmu_pm_funcs = {
/* export for sysfs */
.set_fan_control_mode= smu_pp_set_fan_control_mode,
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
index f76d1b8aeecc..f82dd8a5c773 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
@@ -142,6 +142,7 @@ static const struct cmn2asic_msg_mapping arcturus_message_map[SMU_MSG_MAX_COUNT]
MSG_MAP(GmiPwrDnControl, PPSMC_MSG_GmiPwrDnControl, 0),
MSG_MAP(ReadSerialNumTop32,  PPSMC_MSG_ReadSerialNumTop32,  1),
MSG_MAP(ReadSerialNumBottom32,   PPSMC_MSG_ReadSerialNumBottom32,   1),
+   MSG_

[PATCH 2/2] drm/amdgpu: Enable lightSBR for SMU on passthrough and XGMI configuration

2021-03-10 Thread shaoyunl
The SMU introduces a new interface to enable lightSBR mode; the driver
enables it in a passthrough + XGMI configuration.

Signed-off-by: shaoyunl 
Change-Id: I59aef0559aba418b764e7cf716b0d98aca14fec5
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4640cafae619..31a1783f211a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3630,6 +3630,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (amdgpu_device_cache_pci_state(adev->pdev))
pci_restore_state(pdev);
 
+   /* Enable lightSBR on SMU in passthrough + xgmi configuration */
+   if (amdgpu_passthrough(adev) && adev->gmc.xgmi.num_physical_nodes > 1)
+   smu_set_lightSBR(&adev->smu, true);
+
if (adev->gmc.xgmi.pending_reset)
queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
   msecs_to_jiffies(AMDGPU_RESUME_MS));
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/2] drm/amd/pm: Add LightSBR SMU MSG support

2021-03-10 Thread shaoyunl
This new MSG provides the interface for the driver to enable/disable Light
Secondary Bus Reset support in the SMU. When enabled, the SMU will only do a
minimum NBIO response to the SBR request and leave the real HW reset to be
handled by the driver later. When disabled (the default state), the SMU will
pass the request on to the PSP for a HW reset.

Signed-off-by: shaoyunl 
Change-Id: I5f0e48730d2b4b48fed8137aa57c683d5b3d1b9f
---
 drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h   |  7 +++
 drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h   |  7 +++
 drivers/gpu/drm/amd/pm/inc/smu_types.h|  1 +
 drivers/gpu/drm/amd/pm/inc/smu_v11_0.h|  2 ++
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 12 
 drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c |  2 ++
 drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c| 10 ++
 7 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index 00186a3b29be..369f0267b1f2 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -1149,6 +1149,11 @@ struct pptable_funcs {
 *  parameters to defaults.
 */
int (*set_fine_grain_gfx_freq_parameters)(struct smu_context *smu);
+
+   /**
+* @set_lightSBR: Set lightSBR mode for the SMU.
+*/
+   int (*set_lightSBR)(struct smu_context *smu, bool enable);
 };
 
 typedef enum {
@@ -1350,5 +1355,7 @@ ssize_t smu_sys_get_gpu_metrics(void *handle, void **table);
 int smu_enable_mgpu_fan_boost(void *handle);
 int smu_gfx_state_change_set(struct smu_context *smu, uint32_t state);
 
+int smu_set_lightSBR(struct smu_context *smu, bool enable);
+
 #endif
 #endif
diff --git a/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h 
b/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h
index 79afb132164e..45f5d29bc705 100644
--- a/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/inc/arcturus_ppsmc.h
@@ -120,6 +120,13 @@
 #define PPSMC_MSG_ReadSerialNumTop320x40
 #define PPSMC_MSG_ReadSerialNumBottom32 0x41
 
+/* parameter for MSG_LightSBR
+ * 1 -- Enable light secondary bus reset, only do nbio respond without further handling,
+ *  leave driver to handle the real reset
+ * 0 -- Disable LightSBR, default behavior, SMU will pass the reset to PSP
+ */
+#define PPSMC_MSG_LightSBR  0x42
+
 typedef uint32_t PPSMC_Result;
 typedef uint32_t PPSMC_Msg;
 #pragma pack(pop)
diff --git a/drivers/gpu/drm/amd/pm/inc/smu_types.h 
b/drivers/gpu/drm/amd/pm/inc/smu_types.h
index aa4822202587..92f72d770a99 100644
--- a/drivers/gpu/drm/amd/pm/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/inc/smu_types.h
@@ -214,6 +214,7 @@
__SMU_DUMMY_MAP(SetSlowPPTLimit),\
__SMU_DUMMY_MAP(GetFastPPTLimit),\
__SMU_DUMMY_MAP(GetSlowPPTLimit),\
+   __SMU_DUMMY_MAP(LightSBR),   \
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)  SMU_MSG_##type
diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h 
b/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h
index bf570a7af6a7..7c7b149e7a83 100644
--- a/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h
+++ b/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h
@@ -295,5 +295,7 @@ int smu_v11_0_deep_sleep_control(struct smu_context *smu,
 
 void smu_v11_0_interrupt_work(struct smu_context *smu);
 
+int smu_v11_0_set_lightSBR(struct smu_context *smu, bool enable);
+
 #endif
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 502e1b926a06..58f508c36084 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2934,6 +2934,18 @@ int smu_gfx_state_change_set(struct smu_context *smu, uint32_t state)
return ret;
 }
 
+int smu_set_lightSBR(struct smu_context *smu, bool enable)
+{
+   int ret = 0;
+
+   mutex_lock(&smu->mutex);
+   ret = smu->ppt_funcs->set_lightSBR(smu, enable);
+   mutex_unlock(&smu->mutex);
+
+   return ret;
+}
+
+
 static const struct amd_pm_funcs swsmu_pm_funcs = {
/* export for sysfs */
.set_fan_control_mode= smu_pp_set_fan_control_mode,
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
index f76d1b8aeecc..73a30208aa71 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
@@ -142,6 +142,7 @@ static const struct cmn2asic_msg_mapping arcturus_message_map[SMU_MSG_MAX_COUNT]
MSG_MAP(GmiPwrDnControl, PPSMC_MSG_GmiPwrDnControl, 0),
MSG_MAP(ReadSerialNumTop32,  PPSMC_MSG_ReadSerialNumTop32,  1),
MSG_MAP(ReadSerialNumBottom32,   PPSMC_MSG_ReadSerialNumBottom32,   1),
+   MSG_

[PATCH] drm/amdgpu: skip read eeprom for device that pending on XGMI reset

2021-03-09 Thread shaoyunl
Reading the EEPROM through the SMU doesn't work reliably across XGMI reset
during testing; skip it for now.

Signed-off-by: shaoyunl 
Change-Id: Id864b96a9da5b0d4dd5ffef9858997dd9f52de25
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c669435ccc74..a2ab8ee251f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1822,6 +1822,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
goto out;
}
 
+   /* Todo: During test the SMU might fail to read the eeprom through I2C
+* when the GPU is pending on XGMI reset during probe time
+* (Mostly after second bus reset), skip it now
+*/
+   if (adev->gmc.xgmi.pending_reset)
+   return 0;
ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
/*
 * This calling fails when exc_err_limit is true or
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: Fix asic reset regression issue introduced by 3f61aa92b88c

2021-03-09 Thread shaoyunl
This recent change introduced SDMA interrupt info printing via an
irq->process function. These interrupt sources do not provide a set function
to enable/disable the irq, so the reset-resume helper must skip them.
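
For context, amdgpu_irq_update() (called from the helper patched below) ends
in an unconditional ->set() call, roughly (a sketch, not the verbatim
function):

    static int example_irq_update(struct amdgpu_device *adev,
                                  struct amdgpu_irq_src *src, unsigned type)
    {
        enum amdgpu_interrupt_state state;

        /* sources registered with only a ->process callback (such as the
         * new SDMA info printing) have no ->set; calling it would oops,
         * hence the added !src->funcs->set guard in the hunk below */
        state = amdgpu_irq_enabled(adev, src, type) ?
                AMDGPU_IRQ_STATE_ENABLE : AMDGPU_IRQ_STATE_DISABLE;
        return src->funcs->set(adev, src, type, state);
    }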

Signed-off-by: shaoyunl 
Change-Id: I595998b107f48865f47820ba2e7f758cc263dc64
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 9ab8d7db19f9..af026109421a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -570,7 +570,7 @@ void amdgpu_irq_gpu_reset_resume_helper(struct amdgpu_device *adev)
for (j = 0; j < AMDGPU_MAX_IRQ_SRC_ID; ++j) {
struct amdgpu_irq_src *src = adev->irq.client[i].sources[j];
 
-   if (!src)
+   if (!src || !src->funcs || !src->funcs->set)
continue;
for (k = 0; k < src->num_types; k++)
amdgpu_irq_update(adev, src, k);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe

2021-03-06 Thread shaoyunl
In a passthrough configuration, the hypervisor will trigger the SBR
(Secondary Bus Reset) on the devices without syncing them with each other.
This can cause a device hang, since in an XGMI configuration all the devices
within the hive need to be reset within a limited time slot. This series of
patches tries to solve the issue by cooperating with the new SMU, which will
only do minimum housekeeping to respond to the SBR request but won't do the
real reset job, leaving that to the driver. The driver needs to do the whole
SW init and a minimum HW init to bring up the SMU, then trigger the reset
(possibly BACO) on all the ASICs at the same time.
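
Pulled together from the hunks below, the probe-time gating amounts to the
following (an illustrative condensation, not the complete logic):

    /* during init, instead of resetting this device immediately ... */
    if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev) &&
        adev->gmc.xgmi.num_physical_nodes) {
        dev_info(adev->dev, "Pending hive reset.\n");
        adev->gmc.xgmi.pending_reset = true;
        /* ... bring up only COMMON/GMC/PSP/IH/SMC blocks - just enough
         * for the SMU to take part in the reset - and defer the real
         * reset until every device in the hive has reached this point */
        queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
                           msecs_to_jiffies(AMDGPU_RESUME_MS));
    }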

Signed-off-by: shaoyunl 
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  13 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  69 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h|   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
 5 files changed, 163 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d46d3794699e..5602c6edee97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
uint32_tnum_gpu;
uint32_tnum_dgpu;
uint32_tnum_apu;
+
+   /* delayed reset_func for XGMI configuration if necessary */
+   struct delayed_work delayed_reset_work;
+   boolpending_reset;
 };
 
 #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH256
@@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device 
*adev,
 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+ struct amdgpu_job *job,
+ bool *need_full_reset_arg);
+
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+ struct list_head *device_list_handle,
+ bool *need_full_reset_arg,
+ bool skip_hw_reset);
+
 int emu_soc_asic_init(struct amdgpu_device *adev);
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3c35b0c1e710..5b520f70e660 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
}
}
 
+   /* Don't post if we need to reset whole hive on init */
+   if (adev->gmc.xgmi.pending_reset)
+   return false;
+
if (adev->has_hw_reset) {
adev->has_hw_reset = false;
return true;
@@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
continue;
 
+   if (!adev->ip_blocks[i].status.sw)
+   continue;
+
/* no need to do the fw loading again if already done*/
if (adev->ip_blocks[i].status.hw == true)
break;
@@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
-   amdgpu_amdkfd_device_init(adev);
+
+   /* Don't init kfd if whole hive need to be reset during init */
+   if (!adev->gmc.xgmi.pending_reset)
+   amdgpu_amdkfd_device_init(adev);
 
amdgpu_fru_get_product_info(adev);
 
@@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
continue;
}
+
+   /* skip unnecessary suspend if we do not initialize them yet */
+   if (adev->gmc.xgmi.pending_reset &&
+   !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+   adev->ip_blocks[i].status.hw = false;
+   continue;
+   }
/* XXX handle errors */
r = adev->ip_blocks[i].version->funcs->suspend(adev);
/* XXX handle errors */
@@ -3407,10 +3427,28 @@ int amdgpu_device_in

[PATCH 4/5] drm/amdgpu: Add reset_list for device list used for reset

2021-03-06 Thread shaoyunl
The gmc.xgmi.head list was originally designed as the device list of the
XGMI hive. Mixed use of it for reset purposes would prevent the reset
function from adjusting the XGMI device list, which is required in the next
change.
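
The underlying constraint: a struct list_head can only be on one list at a
time, so reusing gmc.xgmi.head as the transient reset list would unlink
devices from the hive while a reset is being assembled. Hence the dedicated
node (abbreviated, illustrative layout):

    struct amdgpu_device {
        /* ... */
        struct work_struct xgmi_reset_work;
        struct list_head reset_list; /* transient: rebuilt for each reset;
                                      * gmc.xgmi.head remains reserved for
                                      * long-lived hive membership */
    };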

Signed-off-by: shaoyunl 
Change-Id: Ibbdf75c02836151adf5bb44186e6ced97dbf8c1d
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 33 --
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f01b75ec6c60..d46d3794699e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1040,6 +1040,7 @@ struct amdgpu_device {
 
int asic_reset_res;
struct work_struct  xgmi_reset_work;
+   struct list_headreset_list;
 
longgfx_timeout;
longsdma_timeout;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62d7ce621457..3c35b0c1e710 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3290,6 +3290,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
INIT_LIST_HEAD(>shadow_list);
mutex_init(>shadow_list_lock);
 
+   INIT_LIST_HEAD(>reset_list);
+
INIT_DELAYED_WORK(>delayed_init_work,
  amdgpu_device_delayed_init_work_handler);
INIT_DELAYED_WORK(>gfx.gfx_off_delay_work,
@@ -4301,11 +4303,11 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
int r = 0;
 
/*
-* ASIC reset has to be done on all HGMI hive nodes ASAP
+* ASIC reset has to be done on all XGMI hive nodes ASAP
 * to allow proper links negotiation in FW (within 1 sec)
 */
if (!skip_hw_reset && need_full_reset) {
-   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
/* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
@@ -4322,8 +4324,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 
/* For XGMI wait for all resets to complete before proceed */
if (!r) {
-   list_for_each_entry(tmp_adev, device_list_handle,
-   gmc.xgmi.head) {
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
flush_work(&tmp_adev->xgmi_reset_work);
r = tmp_adev->asic_reset_res;
@@ -4335,7 +4336,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
}
 
if (!r && amdgpu_ras_intr_triggered()) {
-   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (tmp_adev->mmhub.funcs &&
tmp_adev->mmhub.funcs->reset_ras_error_count)
tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
@@ -4344,7 +4345,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
amdgpu_ras_intr_cleared();
}
 
-   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
if (amdgpu_device_asic_init(tmp_adev))
@@ -4655,16 +4656,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 */
INIT_LIST_HEAD(&device_list);
if (adev->gmc.xgmi.num_physical_nodes > 1) {
-   if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
-   list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
-   device_list_handle = &hive->device_list;
+   list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+   list_add_tail(&tmp_adev->reset_list, &device_list);
+   if (!list_is_first(&adev->reset_list, &device_list))
+   list_rotate_to_front(&adev->reset_list, &device_list);
+   device_list_handle = &device_list;
} else {
-   list_add_tail(&adev->gmc.xgmi.head, &device_list);
+   list_add_tail(&adev->reset_list, &device_list);
device_list_handle = &device_list;
}
 
/* block all schedulers and reset given job's ring 

[PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe

2021-03-05 Thread shaoyunl
In a passthrough configuration, the hypervisor will trigger the SBR
(Secondary Bus Reset) on the devices without syncing them with each other.
This can cause a device hang, since in an XGMI configuration all the devices
within the hive need to be reset within a limited time slot. This series of
patches tries to solve the issue by cooperating with the new SMU, which will
only do minimum housekeeping to respond to the SBR request but won't do the
real reset job, leaving that to the driver. The driver needs to do the whole
SW init and a minimum HW init to bring up the SMU, then trigger the reset
(possibly BACO) on all the ASICs at the same time.

Signed-off-by: shaoyunl 
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  13 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  71 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h|   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
 5 files changed, 165 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d46d3794699e..5602c6edee97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
uint32_tnum_gpu;
uint32_tnum_dgpu;
uint32_tnum_apu;
+
+   /* delayed reset_func for XGMI configuration if necessary */
+   struct delayed_work delayed_reset_work;
+   boolpending_reset;
 };
 
 #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH256
@@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device 
*adev,
 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+ struct amdgpu_job *job,
+ bool *need_full_reset_arg);
+
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+ struct list_head *device_list_handle,
+ bool *need_full_reset_arg,
+ bool skip_hw_reset);
+
 int emu_soc_asic_init(struct amdgpu_device *adev);
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3c35b0c1e710..5b520f70e660 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
}
}
 
+   /* Don't post if we need to reset whole hive on init */
+   if (adev->gmc.xgmi.pending_reset)
+   return false;
+
if (adev->has_hw_reset) {
adev->has_hw_reset = false;
return true;
@@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
continue;
 
+   if (!adev->ip_blocks[i].status.sw)
+   continue;
+
/* no need to do the fw loading again if already done*/
if (adev->ip_blocks[i].status.hw == true)
break;
@@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
-   amdgpu_amdkfd_device_init(adev);
+
+   /* Don't init kfd if whole hive need to be reset during init */
+   if (!adev->gmc.xgmi.pending_reset)
+   amdgpu_amdkfd_device_init(adev);
 
amdgpu_fru_get_product_info(adev);
 
@@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
continue;
}
+
+   /* skip unnecessary suspend if we do not initialize them yet */
+   if (adev->gmc.xgmi.pending_reset &&
+   !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+   adev->ip_blocks[i].status.hw = false;
+   continue;
+   }
/* XXX handle errors */
r = adev->ip_blocks[i].version->funcs->suspend(adev);
/* XXX handle errors */
@@ -3407,10 +3427,28 @@ int amdgpu_device_in

[PATCH 4/5] drm/amdgpu: Add reset_list for device list used for reset

2021-03-05 Thread shaoyunl
The gmc.xgmi.head list was originally designed as the device list of the
XGMI hive. Mixed use of it for reset purposes would prevent the reset
function from adjusting the XGMI device list, which is required in the next
change.

Signed-off-by: shaoyunl 
Change-Id: Ibbdf75c02836151adf5bb44186e6ced97dbf8c1d
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 33 --
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f01b75ec6c60..d46d3794699e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1040,6 +1040,7 @@ struct amdgpu_device {
 
int asic_reset_res;
struct work_struct  xgmi_reset_work;
+   struct list_headreset_list;
 
longgfx_timeout;
longsdma_timeout;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62d7ce621457..3c35b0c1e710 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3290,6 +3290,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
INIT_LIST_HEAD(>shadow_list);
mutex_init(>shadow_list_lock);
 
+   INIT_LIST_HEAD(>reset_list);
+
INIT_DELAYED_WORK(>delayed_init_work,
  amdgpu_device_delayed_init_work_handler);
INIT_DELAYED_WORK(>gfx.gfx_off_delay_work,
@@ -4301,11 +4303,11 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
int r = 0;
 
/*
-* ASIC reset has to be done on all HGMI hive nodes ASAP
+* ASIC reset has to be done on all XGMI hive nodes ASAP
 * to allow proper links negotiation in FW (within 1 sec)
 */
if (!skip_hw_reset && need_full_reset) {
-   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
/* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
@@ -4322,8 +4324,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 
/* For XGMI wait for all resets to complete before proceed */
if (!r) {
-   list_for_each_entry(tmp_adev, device_list_handle,
-   gmc.xgmi.head) {
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
flush_work(&tmp_adev->xgmi_reset_work);
r = tmp_adev->asic_reset_res;
@@ -4335,7 +4336,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
}
 
if (!r && amdgpu_ras_intr_triggered()) {
-   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (tmp_adev->mmhub.funcs &&
tmp_adev->mmhub.funcs->reset_ras_error_count)
tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
@@ -4344,7 +4345,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
amdgpu_ras_intr_cleared();
}
 
-   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
if (amdgpu_device_asic_init(tmp_adev))
@@ -4655,16 +4656,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 */
INIT_LIST_HEAD(&device_list);
if (adev->gmc.xgmi.num_physical_nodes > 1) {
-   if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
-   list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
-   device_list_handle = &hive->device_list;
+   list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+   list_add_tail(&tmp_adev->reset_list, &device_list);
+   if (!list_is_first(&adev->reset_list, &device_list))
+   list_rotate_to_front(&adev->reset_list, &device_list);
+   device_list_handle = &device_list;
} else {
-   list_add_tail(&adev->gmc.xgmi.head, &device_list);
+   list_add_tail(&adev->reset_list, &device_list);
device_list_handle = &device_list;
}
 
/* block all schedulers and reset given job's ring 

[PATCH 3/5] drm/amdgpu: Init the CP MQD if it was not initialized before

2021-03-05 Thread shaoyunl
The MQD might not be initialized during the first init period if the device
needs to be reset during probe. The driver needs to properly init it in the
GPU recovery period.
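
cp_hqd_pq_control works as an "initialized" sentinel because mqd_init always
programs a non-zero value into it (ring-size and enable bits), so a zero in
the backup MQD means init never ran. The check the hunks below add, in
isolation (a sketch):

    struct v9_mqd *tmp_mqd =
        (struct v9_mqd *)adev->gfx.mec.mqd_backup[mqd_idx];

    if (!tmp_mqd->cp_hqd_pq_control) {
        /* never initialized (probe-time reset path): build from scratch */
    } else if (amdgpu_in_reset(adev)) {
        /* ordinary reset: restore the previously saved MQD */
    }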

Signed-off-by: shaoyunl 
Acked-by: Alex Deucher 
Change-Id: Iad58a050939af2afa46d1c74a90866c47ba9efd2
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 300a07227597..7ee4bbb1d325 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3707,11 +3707,18 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring)
struct amdgpu_device *adev = ring->adev;
struct v9_mqd *mqd = ring->mqd_ptr;
int mqd_idx = AMDGPU_MAX_COMPUTE_RINGS;
+   struct v9_mqd *tmp_mqd;
 
gfx_v9_0_kiq_setting(ring);
 
-   if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
-   /* reset MQD to a clean status */
+   /* GPU could be in a bad state during probe; the driver triggers the
+* reset after loading the SMU. In this case the MQD has not been
+* initialized, so the driver needs to re-init it. Check
+* mqd->cp_hqd_pq_control, since this value should not be 0.
+*/
+   tmp_mqd = (struct v9_mqd *)adev->gfx.mec.mqd_backup[mqd_idx];
+   if (amdgpu_in_reset(adev) && tmp_mqd->cp_hqd_pq_control) {
+   /* for GPU_RESET case, reset MQD to a clean status */
+   if (adev->gfx.mec.mqd_backup[mqd_idx])
+   memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
 
@@ -3747,8 +3754,15 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
struct amdgpu_device *adev = ring->adev;
struct v9_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0];
+   struct v9_mqd *tmp_mqd;
 
-   if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
+   /* Same as above kiq init: the driver needs to re-init the mqd if
+* mqd->cp_hqd_pq_control was not initialized before
+*/
+   tmp_mqd = (struct v9_mqd *)adev->gfx.mec.mqd_backup[mqd_idx];
+
+   if (!tmp_mqd->cp_hqd_pq_control ||
+   (!amdgpu_in_reset(adev) && !adev->in_suspend)) {
memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation));
((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0x;
((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0x;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/5] drm/amdgpu: Add kfd init_complete flag to check from amdgpu side

2021-03-05 Thread shaoyunl
The amdgpu driver may be in a reset state during init, in which case it will
not initialize the KFD; the driver needs to initialize the KFD after the
reset by checking the flag.

Signed-off-by: shaoyunl 
Acked-by: Alex Deucher 
Change-Id: Ic1684b55b27e0afd42bee8b9b431c4fb0afcec15
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 -
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index c5343a5eecbe..a876dc3af017 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -165,7 +165,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
adev->doorbell_index.last_non_cp;
}
 
-   kgd2kfd_device_init(adev->kfd.dev, adev_to_drm(adev), &gpu_resources);
+   adev->kfd.init_complete = kgd2kfd_device_init(adev->kfd.dev,
+   adev_to_drm(adev), &gpu_resources);
}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4687ff2961e1..3182dd97840e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -80,6 +80,7 @@ struct amdgpu_amdkfd_fence {
 struct amdgpu_kfd_dev {
struct kfd_dev *dev;
uint64_t vram_used;
+   bool init_complete;
 };
 
 enum kgd_engine_type {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a11760ec3924..62d7ce621457 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4788,9 +4788,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 skip_sched_resume:
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-   /*unlock kfd: SRIOV would do it separately */
+   /* unlock kfd: SRIOV would do it separately */
if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
amdgpu_amdkfd_post_reset(tmp_adev);
+
+   /* kfd_post_reset will do nothing if the kfd device is not initialized;
+* need to bring up kfd here if it was not initialized before
+*/
+   if (!adev->kfd.init_complete)
+   amdgpu_amdkfd_device_init(adev);
+
if (audio_suspended)
amdgpu_device_resume_display_audio(tmp_adev);
amdgpu_device_unlock_adev(tmp_adev);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/5] drm/amdgpu: get xgmi info at early_init

2021-03-05 Thread shaoyunl
The driver needs to get the XGMI info earlier, before ip_init, since it
needs to check the XGMI setting to determine how to perform the reset during
init.

Signed-off-by: shaoyunl 
Acked-by: Alex Deucher 
Change-Id: Ic37276bbb6640bb4e9360220fed99494cedd3ef5
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 3686e777c76c..3e6bfab5b855 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1151,6 +1151,10 @@ static int gmc_v9_0_early_init(void *handle)
adev->gmc.private_aperture_end =
adev->gmc.private_aperture_start + (4ULL << 30) - 1;
 
+   /* Need to get xgmi info earlier to decide the reset behavior*/
+   if (adev->gmc.xgmi.supported)
+   adev->gfxhub.funcs->get_xgmi_info(adev);
+
return 0;
 }
 
@@ -1416,12 +1420,6 @@ static int gmc_v9_0_sw_init(void *handle)
}
adev->need_swiotlb = drm_need_swiotlb(44);
 
-   if (adev->gmc.xgmi.supported) {
-   r = adev->gfxhub.funcs->get_xgmi_info(adev);
-   if (r)
-   return r;
-   }
-
r = gmc_v9_0_mc_init(adev);
if (r)
return r;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 4/4] drm/amdgpu: Reset the devices in the XGMI hive during probe

2021-02-23 Thread shaoyunl
In a passthrough configuration, the hypervisor will trigger the SBR
(Secondary Bus Reset) on the devices without syncing them with each other.
This can cause a device hang, since in an XGMI configuration all the devices
within the hive need to be reset within a limited time slot. This series of
patches tries to solve the issue by cooperating with the new SMU, which will
only do minimum housekeeping to respond to the SBR request but won't do the
real reset job, leaving that to the driver. The driver needs to do the whole
SW init and a minimum HW init to bring up the SMU, then trigger the reset
(possibly BACO) on all the ASICs at the same time using the existing
gpu_recovery routine.

Signed-off-by: shaoyunl 
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 96 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c|  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  6 +-
 4 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 420ef08a51b5..ae8be6d813a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
}
}
 
+   /* Don't post if we need to reset whole hive on init */
+   if (adev->gmc.xgmi.pending_reset)
+   return false;
+
if (adev->has_hw_reset) {
adev->has_hw_reset = false;
return true;
@@ -2147,6 +2151,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
continue;
 
+   if (!adev->ip_blocks[i].status.sw)
+   continue;
+
/* no need to do the fw loading again if already done*/
if (adev->ip_blocks[i].status.hw == true)
break;
@@ -2287,7 +2294,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
-   amdgpu_amdkfd_device_init(adev);
+
+   /* Don't init kfd if whole hive need to be reset during init */
+   if (!adev->gmc.xgmi.pending_reset)
+   amdgpu_amdkfd_device_init(adev);
 
amdgpu_fru_get_product_info(adev);
 
@@ -2731,6 +2741,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
continue;
}
+
+   /* skip unnecessary suspend if we do not initialize them yet */
+   if (adev->gmc.xgmi.pending_reset &&
+   !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+   adev->ip_blocks[i].status.hw = false;
+   continue;
+   }
/* XXX handle errors */
r = adev->ip_blocks[i].version->funcs->suspend(adev);
/* XXX handle errors */
@@ -3402,10 +3422,29 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 *  E.g., driver was not cleanly unloaded previously, etc.
 */
if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-   r = amdgpu_asic_reset(adev);
-   if (r) {
-   dev_err(adev->dev, "asic reset on init failed\n");
-   goto failed;
+   if (adev->gmc.xgmi.num_physical_nodes) {
+   dev_info(adev->dev, "Pending hive reset.\n");
+   adev->gmc.xgmi.pending_reset = true;
+   /* Only need to init necessary block for SMU to handle the reset */
+   for (i = 0; i < adev->num_ip_blocks; i++) {
+   if (!adev->ip_blocks[i].status.valid)
+   continue;
+   if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+

[PATCH 3/4] drm/amdgpu: Init the CP MQD if it was not initialized before

2021-02-23 Thread shaoyunl
The MQD might not be initialized during the first init period if the device
needs to be reset during probe. The driver needs to properly init it in the
GPU recovery period.

Signed-off-by: shaoyunl 
Acked-by: Alex Deucher 
Change-Id: Iad58a050939af2afa46d1c74a90866c47ba9efd2
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 65db88bb6cbc..bfe1aaa0418f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3696,11 +3696,18 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring)
struct amdgpu_device *adev = ring->adev;
struct v9_mqd *mqd = ring->mqd_ptr;
int mqd_idx = AMDGPU_MAX_COMPUTE_RINGS;
+   struct v9_mqd *tmp_mqd;
 
gfx_v9_0_kiq_setting(ring);
 
-   if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
-   /* reset MQD to a clean status */
+   /* GPU could be in a bad state during probe; the driver triggers the
+* reset after loading the SMU. In this case the MQD has not been
+* initialized, so the driver needs to re-init it. Check
+* mqd->cp_hqd_pq_control, since this value should not be 0.
+*/
+   tmp_mqd = (struct v9_mqd *)adev->gfx.mec.mqd_backup[mqd_idx];
+   if (amdgpu_in_reset(adev) && tmp_mqd->cp_hqd_pq_control) {
+   /* for GPU_RESET case, reset MQD to a clean status */
+   if (adev->gfx.mec.mqd_backup[mqd_idx])
+   memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
 
@@ -3736,8 +3743,15 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
struct amdgpu_device *adev = ring->adev;
struct v9_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0];
+   struct v9_mqd *tmp_mqd;
 
-   if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
+   /* Same as above kiq init: the driver needs to re-init the mqd if
+* mqd->cp_hqd_pq_control was not initialized before
+*/
+   tmp_mqd = (struct v9_mqd *)adev->gfx.mec.mqd_backup[mqd_idx];
+
+   if (!tmp_mqd->cp_hqd_pq_control ||
+   (!amdgpu_in_reset(adev) && !adev->in_suspend)) {
memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation));
((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0x;
((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0x;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/4] drm/amdgpu: Add kfd init_complete flag to check from amdgpu side

2021-02-23 Thread shaoyunl
The amdgpu driver may be in a reset state during init, in which case it will
not initialize the KFD; the driver needs to initialize the KFD after the
reset by checking the flag.

Signed-off-by: shaoyunl 
Acked-by: Alex Deucher 
Change-Id: Ic1684b55b27e0afd42bee8b9b431c4fb0afcec15
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 -
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index c5343a5eecbe..a876dc3af017 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -165,7 +165,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
adev->doorbell_index.last_non_cp;
}
 
-   kgd2kfd_device_init(adev->kfd.dev, adev_to_drm(adev), &gpu_resources);
+   adev->kfd.init_complete = kgd2kfd_device_init(adev->kfd.dev,
+   adev_to_drm(adev), &gpu_resources);
}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4687ff2961e1..3182dd97840e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -80,6 +80,7 @@ struct amdgpu_amdkfd_fence {
 struct amdgpu_kfd_dev {
struct kfd_dev *dev;
uint64_t vram_used;
+   bool init_complete;
 };
 
 enum kgd_engine_type {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2f9ad7ed82be..420ef08a51b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4784,9 +4784,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 skip_sched_resume:
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-   /*unlock kfd: SRIOV would do it separately */
+   /* unlock kfd: SRIOV would do it separately */
if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
amdgpu_amdkfd_post_reset(tmp_adev);
+
+   /* kfd_post_reset will do nothing if the kfd device is not
+* initialized; bring up the kfd here if it has not been
+* initialized before.
+*/
+   if (!adev->kfd.init_complete)
+   amdgpu_amdkfd_device_init(adev);
+
if (audio_suspended)
amdgpu_device_resume_display_audio(tmp_adev);
amdgpu_device_unlock_adev(tmp_adev);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
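
The two sides of this flag pair up with patch 1/4 of the series; roughly
(illustrative sketch only, names from the diffs, error handling omitted):

    /* probe (patch 1/4): skip KFD init while a hive reset is pending */
    if (!adev->gmc.xgmi.pending_reset)
            amdgpu_amdkfd_device_init(adev);   /* records kfd.init_complete */

    /* recovery (this patch): bring KFD up if it was skipped at probe */
    if (!adev->kfd.init_complete)
            amdgpu_amdkfd_device_init(adev);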


[PATCH 1/4] drm/amdgpu: get xgmi info at early_init

2021-02-23 Thread shaoyunl
The driver needs to get the XGMI info earlier, before ip_init, since it needs
to check the XGMI setting to determine how to perform the reset during init.

Signed-off-by: shaoyunl 
Acked-by: Alex Deucher 
Change-Id: Ic37276bbb6640bb4e9360220fed99494cedd3ef5
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 3686e777c76c..3e6bfab5b855 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1151,6 +1151,10 @@ static int gmc_v9_0_early_init(void *handle)
adev->gmc.private_aperture_end =
adev->gmc.private_aperture_start + (4ULL << 30) - 1;
 
+   /* Need to get xgmi info earlier to decide the reset behavior */
+   if (adev->gmc.xgmi.supported)
+   adev->gfxhub.funcs->get_xgmi_info(adev);
+
return 0;
 }
 
@@ -1416,12 +1420,6 @@ static int gmc_v9_0_sw_init(void *handle)
}
adev->need_swiotlb = drm_need_swiotlb(44);
 
-   if (adev->gmc.xgmi.supported) {
-   r = adev->gfxhub.funcs->get_xgmi_info(adev);
-   if (r)
-   return r;
-   }
-
r = gmc_v9_0_mc_init(adev);
if (r)
return r;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
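
One detail worth noting: the removed sw_init code checked the return value of
get_xgmi_info(), while the new early_init call does not. A variant that keeps
the error check would look like this (sketch only, not part of the patch):

    if (adev->gmc.xgmi.supported) {
            int r = adev->gfxhub.funcs->get_xgmi_info(adev);

            if (r)
                    return r;
    }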


[PATCH 4/4] drm/amdgpu: Init the cp MQD if it has not been initialized before

2021-02-18 Thread shaoyunl
The MQD might not be initialized during the first init period if the device needs
to be reset during probe. The driver needs to properly initialize it during the
GPU recovery period.

Signed-off-by: shaoyunl 
Change-Id: Iad58a050939af2afa46d1c74a90866c47ba9efd2
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 65db88bb6cbc..8fc2fd518a1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3696,11 +3696,18 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring)
struct amdgpu_device *adev = ring->adev;
struct v9_mqd *mqd = ring->mqd_ptr;
int mqd_idx = AMDGPU_MAX_COMPUTE_RINGS;
+   struct v9_mqd *tmp_mqd;
 
gfx_v9_0_kiq_setting(ring);
 
-   if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
-   /* reset MQD to a clean status */
+   /* The GPU could be in a bad state during probe; the driver triggers
+* the reset after loading the SMU, in which case the MQD has not
+* been initialized. The driver needs to re-init the MQD in this case.
+* Check mqd->cp_hqd_pq_control since this value should never be 0.
+*/
+   tmp_mqd = (struct v9_mqd *)adev->gfx.mec.mqd_backup[mqd_idx];
+   if (amdgpu_in_reset(adev) && tmp_mqd->cp_hqd_pq_control) {
+   /* for the GPU_RESET case, reset the MQD to a clean state */
if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
 
@@ -3736,8 +3743,15 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
struct amdgpu_device *adev = ring->adev;
struct v9_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0];
+   struct v9_mqd *tmp_mqd;
 
-   if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
+   /* Same as the kiq init above: the driver needs to re-init the mqd
+* if mqd->cp_hqd_pq_control has not been initialized before.
+*/
+   tmp_mqd = (struct v9_mqd *)adev->gfx.mec.mqd_backup[mqd_idx];
+
+   if (!tmp_mqd->cp_hqd_pq_control ||
+   (!amdgpu_in_reset(adev) && !adev->in_suspend)) {
memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation));
((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 3/4] drm/amdgpu: Add kfd init_complete flag to check from amdgpu side

2021-02-18 Thread shaoyunl
The amdgpu driver may be in a reset state during init, in which case it will not
initialize the KFD. The driver needs to initialize the KFD after the reset by
checking the flag.

Signed-off-by: shaoyunl 
Change-Id: Ic1684b55b27e0afd42bee8b9b431c4fb0afcec15
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index c5343a5eecbe..a876dc3af017 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -165,7 +165,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
adev->doorbell_index.last_non_cp;
}
 
-   kgd2kfd_device_init(adev->kfd.dev, adev_to_drm(adev), &gpu_resources);
+   adev->kfd.init_complete = kgd2kfd_device_init(adev->kfd.dev,
+   adev_to_drm(adev), &gpu_resources);
}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4687ff2961e1..3182dd97840e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -80,6 +80,7 @@ struct amdgpu_amdkfd_fence {
 struct amdgpu_kfd_dev {
struct kfd_dev *dev;
uint64_t vram_used;
+   bool init_complete;
 };
 
 enum kgd_engine_type {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9f574fd151bc..e898fce96f75 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4841,6 +4841,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/*unlock kfd: SRIOV would do it separately */
if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
amdgpu_amdkfd_post_reset(tmp_adev);
+
+   /* kfd_post_reset will do nothing if the kfd device is not
+* initialized; bring up the kfd here if it has not been
+* initialized before.
+*/
+   if (!adev->kfd.init_complete)
+   amdgpu_amdkfd_device_init(adev);
+
if (audio_suspended)
amdgpu_device_resume_display_audio(tmp_adev);
amdgpu_device_unlock_adev(tmp_adev);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/4] drm/amdgpu: get xgmi info at early_init

2021-02-18 Thread shaoyunl
The driver needs to get the XGMI info earlier, before ip_init, since it needs
to check the XGMI setting to determine how to perform the reset during init.

Signed-off-by: shaoyunl 
Change-Id: Ic37276bbb6640bb4e9360220fed99494cedd3ef5
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 3686e777c76c..3e6bfab5b855 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1151,6 +1151,10 @@ static int gmc_v9_0_early_init(void *handle)
adev->gmc.private_aperture_end =
adev->gmc.private_aperture_start + (4ULL << 30) - 1;
 
+   /* Need to get xgmi info earlier to decide the reset behavior */
+   if (adev->gmc.xgmi.supported)
+   adev->gfxhub.funcs->get_xgmi_info(adev);
+
return 0;
 }
 
@@ -1416,12 +1420,6 @@ static int gmc_v9_0_sw_init(void *handle)
}
adev->need_swiotlb = drm_need_swiotlb(44);
 
-   if (adev->gmc.xgmi.supported) {
-   r = adev->gfxhub.funcs->get_xgmi_info(adev);
-   if (r)
-   return r;
-   }
-
r = gmc_v9_0_mc_init(adev);
if (r)
return r;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/4] drm/amdgpu: Reset the devices in the XGMI hive during probe

2021-02-18 Thread shaoyunl
In a passthrough configuration, the hypervisor will trigger the SBR (Secondary
Bus Reset) on the devices without syncing them with each other. This could cause
a device hang, since in an XGMI configuration all the devices within the hive
need to be reset within a limited time slot. This series of patches tries to
solve the issue by cooperating with a new SMU that only does minimal
housekeeping to respond to the SBR request, does not perform the real reset
job, and leaves that to the driver. The driver needs to do the whole SW init
and a minimal HW init to bring up the SMU, and then trigger the reset (possibly
BACO) on all the ASICs at the same time with the existing gpu_recovery routine.

Signed-off-by: shaoyunl 
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 96 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c|  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  6 +-
 4 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2f9ad7ed82be..9f574fd151bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
}
}
 
+   /* Don't post if we need to reset whole hive on init */
+   if (adev->gmc.xgmi.pending_reset)
+   return false;
+
if (adev->has_hw_reset) {
adev->has_hw_reset = false;
return true;
@@ -2147,6 +2151,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
continue;
 
+   if (!adev->ip_blocks[i].status.sw)
+   continue;
+
/* no need to do the fw loading again if already done*/
if (adev->ip_blocks[i].status.hw == true)
break;
@@ -2287,7 +2294,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
-   amdgpu_amdkfd_device_init(adev);
+
+   /* Don't init kfd if the whole hive needs to be reset during init */
+   if (!adev->gmc.xgmi.pending_reset)
+   amdgpu_amdkfd_device_init(adev);
 
amdgpu_fru_get_product_info(adev);
 
@@ -2731,6 +2741,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
continue;
}
+
+   /* skip unnecessary suspend if we did not initialize them yet */
+   if (adev->gmc.xgmi.pending_reset &&
+   !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+   adev->ip_blocks[i].status.hw = false;
+   continue;
+   }
/* XXX handle errors */
r = adev->ip_blocks[i].version->funcs->suspend(adev);
/* XXX handle errors */
@@ -3402,10 +3422,29 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 *  E.g., driver was not cleanly unloaded previously, etc.
 */
if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-   r = amdgpu_asic_reset(adev);
-   if (r) {
-   dev_err(adev->dev, "asic reset on init failed\n");
-   goto failed;
+   if (adev->gmc.xgmi.num_physical_nodes) {
+   dev_info(adev->dev, "Pending hive reset.\n");
+   adev->gmc.xgmi.pending_reset = true;
+   /* Only need to init the necessary blocks for the SMU to handle the reset */
+   for (i = 0; i < adev->num_ip_blocks; i++) {
+   if (!adev->ip_blocks[i].status.valid)
+   continue;
+   if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+   
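
Condensed, the probe-time flow this patch sets up looks roughly like the
following (illustrative sketch; names follow the diff, details elided):

    if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
            if (adev->gmc.xgmi.num_physical_nodes) {
                    /* XGMI hive: defer the reset and init only the blocks
                     * (GMC/PSP/COMMON/IH/...) the SMU needs in order to
                     * service the later hive-wide reset. */
                    adev->gmc.xgmi.pending_reset = true;
            } else {
                    /* single GPU: reset immediately, as before */
                    r = amdgpu_asic_reset(adev);
            }
    }
    /* Later the hive is reset as a group through the existing gpu_recovery
     * routine, and KFD is brought up afterwards for devices where its init
     * was skipped (see the kfd.init_complete patch in this series). */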

[PATCH] drm/amdgpu/dce_virtual: Enable vBlank control for vf

2020-11-23 Thread shaoyunl
This function actually controls vblank on/off. It shouldn't be bypassed
for a VF; otherwise all vblank-based features on the VF will not work.

Signed-off-by: shaoyunl 
Change-Id: I77c6f57bb0af390b61f0049c12bf425b10d70d91
---
 drivers/gpu/drm/amd/amdgpu/dce_virtual.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/dce_virtual.c b/drivers/gpu/drm/amd/amdgpu/dce_virtual.c
index b4d4b76538d2..ffcc64ec6473 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_virtual.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_virtual.c
@@ -139,9 +139,6 @@ static void dce_virtual_crtc_dpms(struct drm_crtc *crtc, int mode)
struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
unsigned type;
 
-   if (amdgpu_sriov_vf(adev))
-   return;
-
switch (mode) {
case DRM_MODE_DPMS_ON:
amdgpu_crtc->enabled = true;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu/dce_virtual: Enable DPM for vf

2020-11-23 Thread shaoyunl
This function actually controls vblank on/off. It shouldn't be bypassed
for a VF; otherwise all vblank-based features on the VF will not work.

Signed-off-by: shaoyunl 
Change-Id: I77c6f57bb0af390b61f0049c12bf425b10d70d91
---
 drivers/gpu/drm/amd/amdgpu/dce_virtual.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/dce_virtual.c b/drivers/gpu/drm/amd/amdgpu/dce_virtual.c
index b4d4b76538d2..ffcc64ec6473 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_virtual.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_virtual.c
@@ -139,9 +139,6 @@ static void dce_virtual_crtc_dpms(struct drm_crtc *crtc, int mode)
struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
unsigned type;
 
-   if (amdgpu_sriov_vf(adev))
-   return;
-
switch (mode) {
case DRM_MODE_DPMS_ON:
amdgpu_crtc->enabled = true;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[PATCH] drm/amdgpu/sriov : Don't resume RLCG for SRIOV guest

2020-03-17 Thread shaoyunl
RLCG is enabled by the host driver; there is no need to enable it in the guest
for the non-PSP load path.

Change-Id: I2f313743bf3d492f06aaef07224da6eda3878a28
Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index d1cdcb4..e134bb2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -1940,6 +1940,11 @@ static int gfx_v10_0_rlc_resume(struct amdgpu_device *adev)
if (!amdgpu_sriov_vf(adev)) /* enable RLC SRM */
gfx_v10_0_rlc_enable_srm(adev);
} else {
+   if (amdgpu_sriov_vf(adev)) {
+   gfx_v10_0_init_csb(adev);
+   return 0;
+   }
+
adev->gfx.rlc.funcs->stop(adev);
 
/* disable CG */
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 3/3] drm/amdgpu: Improve Vega20 XGMI TLB flush workaround

2020-01-20 Thread shaoyunl

I see.  So this change

Reviewed-by: shaoyun liu 


On 2020-01-20 1:40 p.m., Felix Kuehling wrote:

On 2020-01-20 1:28 p.m., shaoyunl wrote:


On 2020-01-20 12:58 p.m., Felix Kuehling wrote:

On 2020-01-20 12:47 p.m., shaoyunl wrote:

comments in line .

On 2020-01-17 8:37 p.m., Felix Kuehling wrote:

Using a heavy-weight TLB flush once is not sufficient. Concurrent
memory accesses in the same TLB cache line can re-populate TLB 
entries

from stale texture cache (TC) entries while the heavy-weight TLB
flush is in progress. To fix this race condition, perform another TLB
flush after the heavy-weight one, when TC is known to be clean.

Move the workaround into the low-level TLB flushing functions. 
This way

they apply to amdgpu as well, and KIQ-based TLB flush only needs to
synchronize once.

CC: shaoyun@amd.com
Signed-off-by: Felix Kuehling 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  6 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 68 
+-

  2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

index 8609287620ea..5325f6b455f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -647,13 +647,9 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct 
kgd_dev *kgd, uint16_t vmid)
  int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, 
uint16_t pasid)

  {
  struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-    uint32_t flush_type = 0;
+    const uint32_t flush_type = 0;
  bool all_hub = false;
  -    if (adev->gmc.xgmi.num_physical_nodes &&
-    adev->asic_type == CHIP_VEGA20)
-    flush_type = 2;
-
  if (adev->family == AMDGPU_FAMILY_AI)
  all_hub = true;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

index 90216abf14a4..e2a5e852bdb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -476,13 +476,26 @@ static void gmc_v9_0_flush_gpu_tlb(struct 
amdgpu_device *adev, uint32_t vmid,

  {
  bool use_semaphore = gmc_v9_0_use_invalidate_semaphore(adev, 
vmhub);

  const unsigned eng = 17;
-    u32 j, inv_req, tmp;
+    u32 j, inv_req, inv_req2, tmp;
  struct amdgpu_vmhub *hub;
    BUG_ON(vmhub >= adev->num_vmhubs);
    hub = &adev->vmhub[vmhub];
-    inv_req = gmc_v9_0_get_invalidate_req(vmid, flush_type);
+    if (adev->gmc.xgmi.num_physical_nodes &&
+    adev->asic_type == CHIP_VEGA20) {
+    /* Vega20+XGMI caches PTEs in TC and TLB. Add a
+ * heavy-weight TLB flush (type 2), which flushes
+ * both. Due to a race condition with concurrent
+ * memory accesses using the same TLB cache line, we
+ * still need a second TLB flush after this.
+ */
+    inv_req = gmc_v9_0_get_invalidate_req(vmid, 2);
+    inv_req2 = gmc_v9_0_get_invalidate_req(vmid, flush_type);


[shaoyunl]  For the second invalidation in this situation, can we use
0 for the flush type directly? I think no matter what the input
flush_type for this function is, heavy-weight + legacy invalidation
should be enough for all of them.


I'm not sure that's true. In the case of the race condition, there 
was some concurrent memory access during the first heavy-weight 
invalidation. If that is now flushed in the second invalidation, and 
a heavy-weight invalidation was requested, we should also flush any 
TC cache lines associated with that access. So hard-coding 
flush_type 0 here is probably not safe for all cases.


Regards,
  Felix

[shaoyunl]  Originally we used the heavy-weight invalidation for XGMI
here because of the HW issue that always uses NC even for remote GPU
memory access (this leads the walker to load the TLB directly from TC
with a stale value). The heavy-weight flush sets the invalidate bit for
both TLB and TC, so it makes the walker load from main memory. Your
change is based on the assumption that after the first heavy-weight
invalidation the TC is already loaded with the correct contents, which
seems like it should be true, so in this situation I think a
light-weight or even legacy invalidation will be enough, since they
will load from TC into the TLB directly.


With this change, if you request a legacy invalidation (currently we 
always do), you'll get a heavy-weight followed by a legacy invalidation.


I'm working on other changes that will require a heavy-weight TLB 
flush even without this workaround. In this case I believe the second 
flush will need to be heavy-weight as well.


Regards,
  Felix




Regards

shaoyun.liu







+    } else {
+    inv_req = gmc_v9_0_get_invalidate_req(vmid, flush_type);
+    inv_req2 = 0;
+    }
    /* This is necessary for a HW workaround under SRIOV as well
   * as GFXOFF under bare metal
@@ -521,21 +534,27 @@ static void gmc_v9_0_flush_gpu_tlb(st

Re: [PATCH 3/3] drm/amdgpu: Improve Vega20 XGMI TLB flush workaround

2020-01-20 Thread shaoyunl


On 2020-01-20 12:58 p.m., Felix Kuehling wrote:

On 2020-01-20 12:47 p.m., shaoyunl wrote:

comments in line .

On 2020-01-17 8:37 p.m., Felix Kuehling wrote:

Using a heavy-weight TLB flush once is not sufficient. Concurrent
memory accesses in the same TLB cache line can re-populate TLB entries
from stale texture cache (TC) entries while the heavy-weight TLB
flush is in progress. To fix this race condition, perform another TLB
flush after the heavy-weight one, when TC is known to be clean.

Move the workaround into the low-level TLB flushing functions. This way
they apply to amdgpu as well, and KIQ-based TLB flush only needs to
synchronize once.

CC: shaoyun@amd.com
Signed-off-by: Felix Kuehling 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  6 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 68 
+-

  2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

index 8609287620ea..5325f6b455f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -647,13 +647,9 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct 
kgd_dev *kgd, uint16_t vmid)
  int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, 
uint16_t pasid)

  {
  struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-    uint32_t flush_type = 0;
+    const uint32_t flush_type = 0;
  bool all_hub = false;
  -    if (adev->gmc.xgmi.num_physical_nodes &&
-    adev->asic_type == CHIP_VEGA20)
-    flush_type = 2;
-
  if (adev->family == AMDGPU_FAMILY_AI)
  all_hub = true;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

index 90216abf14a4..e2a5e852bdb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -476,13 +476,26 @@ static void gmc_v9_0_flush_gpu_tlb(struct 
amdgpu_device *adev, uint32_t vmid,

  {
  bool use_semaphore = gmc_v9_0_use_invalidate_semaphore(adev, 
vmhub);

  const unsigned eng = 17;
-    u32 j, inv_req, tmp;
+    u32 j, inv_req, inv_req2, tmp;
  struct amdgpu_vmhub *hub;
    BUG_ON(vmhub >= adev->num_vmhubs);
    hub = &adev->vmhub[vmhub];
-    inv_req = gmc_v9_0_get_invalidate_req(vmid, flush_type);
+    if (adev->gmc.xgmi.num_physical_nodes &&
+    adev->asic_type == CHIP_VEGA20) {
+    /* Vega20+XGMI caches PTEs in TC and TLB. Add a
+ * heavy-weight TLB flush (type 2), which flushes
+ * both. Due to a race condition with concurrent
+ * memory accesses using the same TLB cache line, we
+ * still need a second TLB flush after this.
+ */
+    inv_req = gmc_v9_0_get_invalidate_req(vmid, 2);
+    inv_req2 = gmc_v9_0_get_invalidate_req(vmid, flush_type);


[shaoyunl]  For the second invalidation in this situation, can we use
0 for the flush type directly? I think no matter what the input
flush_type for this function is, heavy-weight + legacy invalidation
should be enough for all of them.


I'm not sure that's true. In the case of the race condition, there was 
some concurrent memory access during the first heavy-weight 
invalidation. If that is now flushed in the second invalidation, and a 
heavy-weight invalidation was requested, we should also flush any TC 
cache lines associated with that access. So hard-coding flush_type 0 
here is probably not safe for all cases.


Regards,
  Felix

[shaoyunl]  Originally we used the heavy-weight invalidation for XGMI
here because of the HW issue that always uses NC even for remote GPU
memory access (this leads the walker to load the TLB directly from TC
with a stale value). The heavy-weight flush sets the invalidate bit for
both TLB and TC, so it makes the walker load from main memory. Your
change is based on the assumption that after the first heavy-weight
invalidation the TC is already loaded with the correct contents, which
seems like it should be true, so in this situation I think a
light-weight or even legacy invalidation will be enough, since they
will load from TC into the TLB directly.


Regards

shaoyun.liu







+    } else {
+    inv_req = gmc_v9_0_get_invalidate_req(vmid, flush_type);
+    inv_req2 = 0;
+    }
    /* This is necessary for a HW workaround under SRIOV as well
   * as GFXOFF under bare metal
@@ -521,21 +534,27 @@ static void gmc_v9_0_flush_gpu_tlb(struct 
amdgpu_device *adev, uint32_t vmid,
  DRM_ERROR("Timeout waiting for sem acquire in VM 
flush!\n");

  }
  -    WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, inv_req);
+    do {
+    WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, inv_req);
  -    /*
- * Issue a dummy read to wait for the ACK register to be cleared
- * to avoid a false ACK due to the new fast GRBM interface.
- */
-    if (vmhub == AMDGPU_GFXHUB_0)
-    RREG32_NO_KIQ(hub->vm_inv_eng0_req

Re: [PATCH 3/3] drm/amdgpu: Improve Vega20 XGMI TLB flush workaround

2020-01-20 Thread shaoyunl

comments in line .

On 2020-01-17 8:37 p.m., Felix Kuehling wrote:

Using a heavy-weight TLB flush once is not sufficient. Concurrent
memory accesses in the same TLB cache line can re-populate TLB entries
from stale texture cache (TC) entries while the heavy-weight TLB
flush is in progress. To fix this race condition, perform another TLB
flush after the heavy-weight one, when TC is known to be clean.

Move the workaround into the low-level TLB flushing functions. This way
they apply to amdgpu as well, and KIQ-based TLB flush only needs to
synchronize once.

CC: shaoyun@amd.com
Signed-off-by: Felix Kuehling 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  6 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 68 +-
  2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 8609287620ea..5325f6b455f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -647,13 +647,9 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
  int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
  {
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-   uint32_t flush_type = 0;
+   const uint32_t flush_type = 0;
bool all_hub = false;
  
-	if (adev->gmc.xgmi.num_physical_nodes &&
-	    adev->asic_type == CHIP_VEGA20)
-		flush_type = 2;
-
if (adev->family == AMDGPU_FAMILY_AI)
all_hub = true;
  
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

index 90216abf14a4..e2a5e852bdb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -476,13 +476,26 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
  {
bool use_semaphore = gmc_v9_0_use_invalidate_semaphore(adev, vmhub);
const unsigned eng = 17;
-   u32 j, inv_req, tmp;
+   u32 j, inv_req, inv_req2, tmp;
struct amdgpu_vmhub *hub;
  
	BUG_ON(vmhub >= adev->num_vmhubs);

	hub = &adev->vmhub[vmhub];

-   inv_req = gmc_v9_0_get_invalidate_req(vmid, flush_type);
+   if (adev->gmc.xgmi.num_physical_nodes &&
+   adev->asic_type == CHIP_VEGA20) {
+   /* Vega20+XGMI caches PTEs in TC and TLB. Add a
+* heavy-weight TLB flush (type 2), which flushes
+* both. Due to a race condition with concurrent
+* memory accesses using the same TLB cache line, we
+* still need a second TLB flush after this.
+*/
+   inv_req = gmc_v9_0_get_invalidate_req(vmid, 2);
+   inv_req2 = gmc_v9_0_get_invalidate_req(vmid, flush_type);


[shaoyunl]  For the second invalidation in this situation, can we use 0
for the flush type directly? I think no matter what the input
flush_type for this function is, heavy-weight + legacy invalidation
should be enough for all of them.



+   } else {
+   inv_req = gmc_v9_0_get_invalidate_req(vmid, flush_type);
+   inv_req2 = 0;
+   }
  
  	/* This is necessary for a HW workaround under SRIOV as well

 * as GFXOFF under bare metal
@@ -521,21 +534,27 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
}
  
-	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, inv_req);

+   do {
+   WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, inv_req);
  
-	/*

-* Issue a dummy read to wait for the ACK register to be cleared
-* to avoid a false ACK due to the new fast GRBM interface.
-*/
-   if (vmhub == AMDGPU_GFXHUB_0)
-   RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng);
+   /*
+* Issue a dummy read to wait for the ACK register to
+* be cleared to avoid a false ACK due to the new fast
+* GRBM interface.
+*/
+   if (vmhub == AMDGPU_GFXHUB_0)
+   RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng);
  
-	for (j = 0; j < adev->usec_timeout; j++) {

-   tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
-   if (tmp & (1 << vmid))
-   break;
-   udelay(1);
-   }
+   for (j = 0; j < adev->usec_timeout; j++) {
+   tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
+   if (tmp & (1 << vmid))
+   break;
+   udelay(1);
+   }
+
+   inv_req = inv_req2;
+   inv_req2 = 0;
+   } while (inv_req);
  
  	/* TODO: It needs to continue working on debuggi
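
Condensed, the flushing sequence under review issues up to two invalidation
requests back to back. In this sketch, issue_and_wait() is a stand-in for the
WREG32/dummy-read/ACK-poll sequence in the hunk above (illustrative only):

    u32 inv_req, inv_req2;

    if (adev->gmc.xgmi.num_physical_nodes && adev->asic_type == CHIP_VEGA20) {
            /* heavy-weight flush first, then re-flush once TC is clean */
            inv_req  = gmc_v9_0_get_invalidate_req(vmid, 2);
            inv_req2 = gmc_v9_0_get_invalidate_req(vmid, flush_type);
    } else {
            inv_req  = gmc_v9_0_get_invalidate_req(vmid, flush_type);
            inv_req2 = 0;
    }
    do {
            issue_and_wait(hub, eng, vmid, inv_req);
            inv_req  = inv_req2;
            inv_req2 = 0;
    } while (inv_req);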

Re: [PATCH] drm/amdgpu: check rlc_g firmware pointer is valid before using it

2020-01-13 Thread shaoyunl

ping.


On 2020-01-10 1:33 p.m., shaoyunl wrote:

In SRIOV, the rlc_g firmware is loaded by the host; the guest driver won't load
it, which causes the rlc_fw pointer to be null.

Change-Id: Id16f65171dd427d623af4c5bc75f674019e63dec
Signed-off-by: shaoyunl 
---
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +
  1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 379e46c..07e22f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -808,10 +808,11 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev)
	info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_G];
info->ucode_id = AMDGPU_UCODE_ID_RLC_G;
info->fw = adev->gfx.rlc_fw;
-   header = (const struct common_firmware_header *)info->fw->data;
-   adev->firmware.fw_size +=
-   ALIGN(le32_to_cpu(header->ucode_size_bytes), PAGE_SIZE);
-
+   if (info->fw) {
+   header = (const struct common_firmware_header *)info->fw->data;
+   adev->firmware.fw_size +=
+   ALIGN(le32_to_cpu(header->ucode_size_bytes), PAGE_SIZE);
+   }
if (adev->gfx.rlc.is_rlc_v2_1 &&
adev->gfx.rlc.save_restore_list_cntl_size_bytes &&
adev->gfx.rlc.save_restore_list_gpm_size_bytes &&

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: check rlc_g firmware pointer is valid before using it

2020-01-10 Thread shaoyunl
In SRIOV, the rlc_g firmware is loaded by the host; the guest driver won't load
it, which causes the rlc_fw pointer to be null.

Change-Id: Id16f65171dd427d623af4c5bc75f674019e63dec
Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 379e46c..07e22f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -808,10 +808,11 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev)
	info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_G];
info->ucode_id = AMDGPU_UCODE_ID_RLC_G;
info->fw = adev->gfx.rlc_fw;
-   header = (const struct common_firmware_header *)info->fw->data;
-   adev->firmware.fw_size +=
-   ALIGN(le32_to_cpu(header->ucode_size_bytes), PAGE_SIZE);
-
+   if (info->fw) {
+   header = (const struct common_firmware_header *)info->fw->data;
+   adev->firmware.fw_size +=
+   ALIGN(le32_to_cpu(header->ucode_size_bytes), PAGE_SIZE);
+   }
if (adev->gfx.rlc.is_rlc_v2_1 &&
adev->gfx.rlc.save_restore_list_cntl_size_bytes &&
adev->gfx.rlc.save_restore_list_gpm_size_bytes &&
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch

2019-12-20 Thread shaoyunl
I agree this patch is a big improvement. I think we need this patch so
SRIOV can put amdkfd_pre_reset in the right place, as in bare-metal mode.
Further improvement can be done in a separate change.


This series is reviewed by shaoyun.liu < shaoyun@amd.com>


Regards

shaoyun.liu


On 2019-12-20 2:46 p.m., Felix Kuehling wrote:

On 2019-12-20 14:31, shaoyunl wrote:
Can we use the  dqm_lock when we try to get the dqm->is_hw_hang and 
dqm->is_resetting inside function kq_uninitialize ?


Spreading the DQM lock around is probably not a good idea. Then I'd 
rather do more refactoring to move hqd_load and hqd_destroy out of the 
kq_init/kq_uninit functions.





I think the closer to hqd_destroy we check the status, the more
accurate it will be. This logic does look better if the status changes
after the dqm unmap_queue call and before we call hqd_destroy.


Another comment in line.

Regards

shaoyun.liu




On 2019-12-20 11:33 a.m., Felix Kuehling wrote:
dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs 
outside that lock protection. Therefore I opted to pass in the 
hanging flag as a parameter. It also keeps the logic that decides 
all of that inside the device queue manager, which I think is cleaner.


I was trying to clean this up further by moving the 
pm_init/pm_uninit out of the start_cpsch/stop_cpsch sequence, but 
gave up on that idea when I found out that I can't create the kernel 
queue in the DQM initialize function because dev->dqm isn't 
initialized at that time yet.


Regards,
  Felix

On 2019-12-20 10:56, shaoyunl wrote:

Looks like patch 2 is not related to this series, but anyway.

Patch 1,2,3 are reviewed by shaoyunl 

For patch 4, is it possible to directly check dqm->is_hws_hang ||
dqm->is_resetting inside the kq_uninitialize function, so we don't
need the other interface change?


I think even inside that kq_uninitialize function, we can still
get the dqm as kq->dev->dqm.



shaoyun.liu


On 2019-12-20 3:30 a.m., Felix Kuehling wrote:

Don't use the HWS if it's known to be hanging. In a reset also
don't try to destroy the HIQ because that may hang on SRIOV if the
KIQ is unresponsive.

Signed-off-by: Felix Kuehling 
---
  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 


  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c    | 8 
  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c  | 4 ++--
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 4 ++--
  .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   | 2 +-
  5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

index a7e9ec1b3ce3..d7eb6ac37f62 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -946,7 +946,7 @@ static int start_nocpsch(struct 
device_queue_manager *dqm)

  static int stop_nocpsch(struct device_queue_manager *dqm)
  {
  if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
-    pm_uninit(&dqm->packets);
+    pm_uninit(&dqm->packets, false);
  dqm->sched_running = false;
    return 0;
@@ -1114,20 +1114,24 @@ static int start_cpsch(struct 
device_queue_manager *dqm)

  return 0;
  fail_allocate_vidmem:
  fail_set_sched_resources:
-    pm_uninit(&dqm->packets);
+    pm_uninit(&dqm->packets, false);
  fail_packet_manager_init:
  return retval;
  }
    static int stop_cpsch(struct device_queue_manager *dqm)
  {
+    bool hanging;
+

  dqm_lock(dqm);
-    unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+    if (!dqm->is_hws_hang)
[shaoyunl]  Should we check is_resetting here as well, so we skip the
unmap call even when the HWS has not detected the hang yet but we know
we are currently in the resetting procedure?


GPU reset can be done when the HWS is not hanging. In that case 
unmapping queues is perfectly safe. In the worst case it'll time out 
and dqm->is_hws_hang will be set as a result. I'm planning to add more 
checks later so that we can optionally wait in unmap_queues until a 
reset is done. We'll need that to do preemptions reliably while a GPU 
reset is in progress. So I need to either unmap the queues or be sure 
that HWS is hanging.


With yours and Oak's comments I realize, this is far from complete and 
more work is needed. But I still think this is an improvement.


Regards,
  Felix


+    unmap_queues_cpsch(dqm, 
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);

+    hanging = dqm->is_hws_hang || dqm->is_resetting;
  dqm->sched_running = false;
  dqm_unlock(dqm);
    kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
-    pm_uninit(&dqm->packets);
+    pm_uninit(&dqm->packets, hanging);
    return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c

index 2d56dc534459..bae70646

Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch

2019-12-20 Thread shaoyunl
Can we use the dqm_lock when we try to get dqm->is_hws_hang and
dqm->is_resetting inside the kq_uninitialize function?


I think the closer to hqd_destroy we check the status, the more
accurate it will be. This logic does look better if the status changes
after the dqm unmap_queue call and before we call hqd_destroy.


Another comment in line.

Regards

shaoyun.liu




On 2019-12-20 11:33 a.m., Felix Kuehling wrote:
dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs 
outside that lock protection. Therefore I opted to pass in the hanging 
flag as a parameter. It also keeps the logic that decides all of that 
inside the device queue manager, which I think is cleaner.


I was trying to clean this up further by moving the pm_init/pm_uninit 
out of the start_cpsch/stop_cpsch sequence, but gave up on that idea 
when I found out that I can't create the kernel queue in the DQM 
initialize function because dev->dqm isn't initialized at that time yet.


Regards,
  Felix

On 2019-12-20 10:56, shaoyunl wrote:

Looks like patch 2 is not related to this series, but anyway.

Patch 1,2,3 are reviewed by shaoyunl 

For patch 4, is it possible to directly check dqm->is_hws_hang ||
dqm->is_resetting inside the kq_uninitialize function, so we don't need
the other interface change?


I think even inside that kq_uninitialize function, we can still get
the dqm as kq->dev->dqm.



shaoyun.liu


On 2019-12-20 3:30 a.m., Felix Kuehling wrote:

Don't use the HWS if it's known to be hanging. In a reset also
don't try to destroy the HIQ because that may hang on SRIOV if the
KIQ is unresponsive.

Signed-off-by: Felix Kuehling 
---
  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 


  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c    |  8 
  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c  |  4 ++--
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  4 ++--
  .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
  5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

index a7e9ec1b3ce3..d7eb6ac37f62 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -946,7 +946,7 @@ static int start_nocpsch(struct 
device_queue_manager *dqm)

  static int stop_nocpsch(struct device_queue_manager *dqm)
  {
  if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
-    pm_uninit(&dqm->packets);
+    pm_uninit(&dqm->packets, false);
  dqm->sched_running = false;
    return 0;
@@ -1114,20 +1114,24 @@ static int start_cpsch(struct 
device_queue_manager *dqm)

  return 0;
  fail_allocate_vidmem:
  fail_set_sched_resources:
-    pm_uninit(&dqm->packets);
+    pm_uninit(&dqm->packets, false);
  fail_packet_manager_init:
  return retval;
  }
    static int stop_cpsch(struct device_queue_manager *dqm)
  {
+    bool hanging;
+

  dqm_lock(dqm);
-    unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+    if (!dqm->is_hws_hang)
[shaoyunl]  Should we check is_resetting here as well, so we skip the
unmap call even when the HWS has not detected the hang yet but we know
we are currently in the resetting procedure?
+    unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 
0);

+    hanging = dqm->is_hws_hang || dqm->is_resetting;
  dqm->sched_running = false;
  dqm_unlock(dqm);
    kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
-    pm_uninit(&dqm->packets);
+    pm_uninit(&dqm->packets, hanging);
    return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c

index 2d56dc534459..bae706462f96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue 
*kq, struct kfd_dev *dev,

  }
    /* Uninitialize a kernel queue and free all its memory usages. */
-static void kq_uninitialize(struct kernel_queue *kq)
+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
  {
-    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
  kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
  kq->queue->mqd,
  KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct 
kfd_dev *dev,

  return NULL;
  }
  -void kernel_queue_uninit(struct kernel_queue *kq)
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
  {
-    kq_uninitialize(kq);
+    kq_uninitialize(kq, hanging);
  kfree(kq);
  }
  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c

index 6c

Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch

2019-12-20 Thread shaoyunl

Looks like patch 2 is not related to this series, but anyway.

Patch 1,2,3 are reviewed by shaoyunl  

For patch 4, is it possible to directly check dqm->is_hws_hang ||
dqm->is_resetting inside the kq_uninitialize function, so we don't
need the other interface change?


I think even inside that kq_uninitialize function, we can still get the dqm
as kq->dev->dqm.



shaoyun.liu


On 2019-12-20 3:30 a.m., Felix Kuehling wrote:

Don't use the HWS if it's known to be hanging. In a reset also
don't try to destroy the HIQ because that may hang on SRIOV if the
KIQ is unresponsive.

Signed-off-by: Felix Kuehling 
---
  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c| 12 
  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c|  8 
  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c  |  4 ++--
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  4 ++--
  .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
  5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a7e9ec1b3ce3..d7eb6ac37f62 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)
  static int stop_nocpsch(struct device_queue_manager *dqm)
  {
if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
-	pm_uninit(&dqm->packets);
+   pm_uninit(&dqm->packets, false);
dqm->sched_running = false;
  
  	return 0;

@@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
return 0;
  fail_allocate_vidmem:
  fail_set_sched_resources:
-   pm_uninit(&dqm->packets);
+   pm_uninit(&dqm->packets, false);
  fail_packet_manager_init:
return retval;
  }
  
  static int stop_cpsch(struct device_queue_manager *dqm)

  {
+   bool hanging;
+

dqm_lock(dqm);
-   unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+   if (!dqm->is_hws_hang)
+   unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+   hanging = dqm->is_hws_hang || dqm->is_resetting;
dqm->sched_running = false;
dqm_unlock(dqm);
  
  	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);

-   pm_uninit(&dqm->packets);
+   pm_uninit(&dqm->packets, hanging);
  
  	return 0;

  }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 2d56dc534459..bae706462f96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct 
kfd_dev *dev,
  }
  
  /* Uninitialize a kernel queue and free all its memory usages. */

-static void kq_uninitialize(struct kernel_queue *kq)
+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
  {
-   if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+   if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
kq->queue->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
return NULL;
  }
  
-void kernel_queue_uninit(struct kernel_queue *kq)

+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
  {
-   kq_uninitialize(kq);
+   kq_uninitialize(kq, hanging);
kfree(kq);
  }
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c

index 6cabed06ef5d..dc406e6dee23 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct 
device_queue_manager *dqm)
return 0;
  }
  
-void pm_uninit(struct packet_manager *pm)

+void pm_uninit(struct packet_manager *pm, bool hanging)
  {
	mutex_destroy(&pm->lock);
-   kernel_queue_uninit(pm->priv_queue);
+   kernel_queue_uninit(pm->priv_queue, hanging);
  }
  
  int pm_send_set_resources(struct packet_manager *pm,

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 087e96838997..8ac680dc90f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
  void device_queue_manager_uninit(struct device_queue_manager *dqm);
  struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
enum kfd_queue_type type);
-void kernel_queue_
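
The threading of the hanging flag being discussed can be summarized as follows
(sketch; signatures follow the diff, and the flag is sampled under the DQM lock
as Felix describes above):

    dqm_lock(dqm);
    if (!dqm->is_hws_hang)
            unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
    hanging = dqm->is_hws_hang || dqm->is_resetting; /* sampled under lock */
    dqm->sched_running = false;
    dqm_unlock(dqm);

    /* The flag then flows pm_uninit() -> kernel_queue_uninit() ->
     * kq_uninitialize(), which skips destroy_mqd() for the HIQ when
     * hanging is true, so a dead HWS is never touched. */
    pm_uninit(&dqm->packets, hanging);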

Re: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV

2019-12-19 Thread shaoyunl
How we prevent the  user queue from submitting on the  following FLR  if 
we didn't unmap the  user queues . It's possible that CP still not hang 
when other part HW get hang and  need a reset .


Om, but probably it's ok since after FLR , all the hqd will be reset to 
unmapped by default by HW  and existing user queue need to be re-created 
anyway ...


I think i'm ok with your proposal . Can KFD team prepare the change ?


Regards

shaoyun.liu



On 2019-12-19 5:44 p.m., Felix Kuehling wrote:
I'm thinking, if we know we're preparing for a GPU reset, maybe we 
shouldn't even try to suspend processes and stop the HIQ. 
kfd_suspend_all_processes, stop_cpsch and other functions up that call 
chain up to kgd2kfd_suspend could have a parameter (bool pre_reset) 
that would update the driver state but not touch the hardware. That 
avoids unnecessary timeouts on things that aren't expected to complete 
anyway.


Regards,
  Felix


On 2019-12-19 11:59 a.m., shaoyunl wrote:


After checking the code: on the KFD side it should be simple to just add
the check in the stop_cpsch code. For the kiq there is no return value
from WREG32, so there is no easy way to check for failure. Maybe we can
add a kiq_status field in struct amdgpu_kiq to indicate whether the kiq
is hung or not; in the hqd_destroy function, check this kiq_status after
acquire_queue and finish the destroy function early if the kiq is hung,
for SRIOV only.


Any comments ?


shaoyun.liu


On 2019-12-19 9:51 a.m., Liu, Shaoyun wrote:


I see, thanks for the detailed information.
Normally when the CP hangs, the hiq access to unmap the queue will fail
before the driver calls hqd_destroy. I think the driver should add code
to check the return value and directly finish the pre_reset in this
case. If the hiq does not hang but the kiq hangs, we can use the same
logic in the hqd_destroy function and return on the first access
failure instead of going further. With this change we can probably
move the pre_reset function back to normal.

Felix, do you have any concerns or comments for the change.

Regards
Shaoyun.liu
 


*From:* Liu, Monk 
*Sent:* December 19, 2019 1:13:24 AM
*To:* Liu, Shaoyun ; 
amd-gfx@lists.freedesktop.org 
*Subject:* RE: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR 
of SRIOV


>>> I would like to check why we need a special sequences for sriov 
on this pre_reset. If possible, make it the same as bare metal mode 
sounds better solution.


Because calling the function before the VF FLR would lead to register
access through the KIQ, which will not complete because the KIQ/GFX
already hangs by that time.


>>> I don't remember any register access by amdkfd_pre_reset call,   
please let me know if this assumption is wrong .


Please check “void pm_uninit(struct packet_manager *pm)” which is 
invoked inside of amdkfd_pre_reset() :


It will call uninitialized() in kfd_kernel_queue.c file

And then go to the path of “kq->mqd_mgr->destroy_mqd(…)”

And finally it calls “static int kgd_hqd_destroy(…)” in 
amdgpu_amdkfd_gfx_v10.c


539 {

540 struct amdgpu_device *adev = get_amdgpu_device(kgd);

541 enum hqd_dequeue_request_type type;

542 unsigned long end_jiffies;

543 uint32_t temp;

544 struct v10_compute_mqd *m = get_mqd(mqd);

545

546 #if 0

547 unsigned long flags;

548 int retry;

549 #endif

550

551 acquire_queue(kgd, pipe_id, queue_id); //this introduce 
register access via KIQ


552

553 if (m->cp_hqd_vmid == 0)

554 WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); 
//this introduce register access via KIQ


555

556 switch (reset_type) {

557 case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:

558 type = DRAIN_PIPE;

559 break;

560 case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:

561 type = RESET_WAVES;

562 break;

563 default:

564 type = DRAIN_PIPE;

565 break;

566 }

624 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), 
type); //this introduce register access via KIQ


625

626 end_jiffies = (utimeout * HZ / 1000) + jiffies;

627 while (true) {

628 temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); 
//this introduce register access via KIQ


629 if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))

630 break;

631 if (time_after(jiffies, end_jiffies)) {

632 pr_err("cp queue preemption time out.\n");

633 release_queue(kgd);

634 return -ETIME;

635 }

636 usleep_range(500, 1000);

637 }

638

639 release_queue(kgd);

640 return 0;

If we use the sequence from bare-metal, all above highlighted 
register access will not work because KIQ/GFX already died by that 
time which means the amdkfd_pre_reset() is actually  not working as 
expected.


_

Monk Liu|GPU Virtualization Team |AMD


*From:* Liu, Shaoyun 
*Sent:* Thursday, D
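
The register-access path Monk describes, where plain WREG32/RREG32 are routed
through the KIQ when the VF is not in exclusive mode, can be pictured with
this simplified sketch (illustrative; the real dispatch lives in
amdgpu_mm_wreg() and is more involved):

    static void wreg32_sketch(struct amdgpu_device *adev, u32 reg, u32 v)
    {
            if (amdgpu_sriov_runtime(adev))
                    /* VF without exclusive access: the write becomes a KIQ
                     * ring submission, which stalls if the KIQ is dead. */
                    amdgpu_virt_kiq_wreg(adev, reg, v);
            else
                    writel(v, adev->rmmio + (reg << 2)); /* direct MMIO */
    }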

Re: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV

2019-12-19 Thread shaoyunl
*Subject:* RE: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR 
of SRIOV


Oh, by the way

>>> Do we know the root cause why this function would ruin MEC ?

Only calling this function right after the VF FLR will ruin the MEC and
lead to the following KIQ ring test failure; on bare metal it is called
before the GPU reset, so that's why bare metal doesn't have this issue.


But the reason we cannot call it before VF FLR on SRIOV case was 
already stated in this thread


Thanks
_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-----
From: Liu, Monk
Sent: Thursday, December 19, 2019 11:49 AM
To: shaoyunl <shaoyun@amd.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of 
SRIOV


Hi Shaoyun

>>> Do we know the root cause why this function would ruin MEC ? From 
the logic, I think this function should be called before FLR since we 
need to disable the user queue submission first.
Right now I don't know which detailed step leads to the KIQ ring test
failure. I totally agree with you that this function should be called
before the VF FLR, but we cannot do it, and the reason is described in
the comment:


> if we do pre_reset() before VF FLR, it would go KIQ way to do register
> access and stuck there, because KIQ probably won't work by that time
> (e.g. you already made GFX hang)


>>> I remembered the function should use hiq to communicate with HW , 
shouldn't use kiq to access HW registerm,  has this been changed ?
This function uses WREG32/RREG32 to do register access, like all other
functions in the KMD, and WREG32/RREG32 will let the KIQ do the register
access if we are under dynamic SRIOV mode (meaning we are an SRIOV VF
and not under full exclusive mode).


You see that if you call this function before EVENT_5 (event 5 triggers
the VF FLR), then it will run under dynamic mode and the KIQ will handle
the register access, which is not an option since the ME/MEC probably
already hang (if we are testing a quark on the gfx/compute rings).


Do you have a good suggestion ?

thanks
_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of shaoyunl

Sent: Tuesday, December 17, 2019 11:38 PM
To: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of 
SRIOV


I think the amdkfd side depends on this call to stop the user queues;
without this call, a user queue can submit to the HW during the reset,
which could cause a hang again ...
Do we know the root cause of why this function would ruin the MEC? From
the logic, I think this function should be called before the FLR, since
we need to disable user queue submission first.
I remember the function should use the hiq to communicate with the HW
and shouldn't use the kiq to access HW registers; has this been changed?



Regards
shaoyun.liu


On 2019-12-17 5:19 a.m., Monk Liu wrote:
> issues:
> MEC is ruined by the amdkfd_pre_reset after VF FLR done
>
> fix:
> amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF
> FLR, the correct sequence is do amdkfd_pre_reset before VF FLR but
> there is a limitation to block this sequence:
> if we do pre_reset() before VF FLR, it would go KIQ way to do register
> access and stuck there, because KIQ probably won't work by that time
> (e.g. you already made GFX hang)
>
> so the best way right now is to simply remove it.
>
> Signed-off-by: Monk Liu mailto:monk@amd.com>>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 --
>   1 file changed, 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 605cef6..ae962b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,

>    if (r)
>    return r;
>
> - amdgpu_amdkfd_pre_reset(adev);
> -
>    /* Resume IP prior to SMC */
>    r = amdgpu_device_ip_reinit_early_sriov(adev);
>    if (r)
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV

2019-12-17 Thread shaoyunl

I think the amdkfd side depends on this call to stop the user queues; without
this call, the user queues can submit to HW during the reset, which could
cause a hang again ...
Do we know the root cause why this function would ruin MEC? From the logic, I
think this function should be called before FLR since we need to disable
user queue submission first.
I remember the function should use HIQ to communicate with HW, and shouldn't
use KIQ to access HW registers. Has this been changed?


Regards
shaoyun.liu


On 2019-12-17 5:19 a.m., Monk Liu wrote:

issues:
MEC is ruined by amdkfd_pre_reset() after VF FLR is done

fix:
amdkfd_pre_reset() would ruin MEC after the hypervisor finished the VF FLR.
The correct sequence is to do amdkfd_pre_reset() before VF FLR, but there is
a limitation blocking this sequence:
if we do pre_reset() before VF FLR, it would go the KIQ way to do register
access and get stuck there, because KIQ probably won't work by that time
(e.g. you already made GFX hang)

so the best way right now is to simply remove it.

Signed-off-by: Monk Liu 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 --
  1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 605cef6..ae962b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
if (r)
return r;
  
-	amdgpu_amdkfd_pre_reset(adev);
-
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
if (r)

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: Init correct fb region for non-XGMI configuration

2018-09-10 Thread shaoyunl
Fixes: 5c777a5 ("Adjust GART and AGP location with xgmi offset")

Change-Id: I2d78024fbe44a37f46a35d34c1e64dbd3937fdf1
Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index cf97c1c..ae44671 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -121,6 +121,11 @@ void amdgpu_gmc_vram_location(struct amdgpu_device *adev, 
struct amdgpu_gmc *mc,
mc->vram_end = mc->vram_start + mc->mc_vram_size - 1;
if (limit && limit < mc->real_vram_size)
mc->real_vram_size = limit;
+
+   if (mc->xgmi.num_physical_nodes == 0) {
+   mc->fb_start = mc->vram_start;
+   mc->fb_end = mc->vram_end;
+   }
dev_info(adev->dev, "VRAM: %lluM 0x%016llX - 0x%016llX (%lluM used)\n",
mc->mc_vram_size >> 20, mc->vram_start,
mc->vram_end, mc->real_vram_size >> 20);
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: Init correct fb region for non-XGMI configuration

2018-09-10 Thread shaoyunl
Change-Id: I2d78024fbe44a37f46a35d34c1e64dbd3937fdf1
Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index cf97c1c..ae44671 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -121,6 +121,11 @@ void amdgpu_gmc_vram_location(struct amdgpu_device *adev, 
struct amdgpu_gmc *mc,
mc->vram_end = mc->vram_start + mc->mc_vram_size - 1;
if (limit && limit < mc->real_vram_size)
mc->real_vram_size = limit;
+
+   if (mc->xgmi.num_physical_nodes == 0) {
+   mc->fb_start = mc->vram_start;
+   mc->fb_end = mc->vram_end;
+   }
dev_info(adev->dev, "VRAM: %lluM 0x%016llX - 0x%016llX (%lluM used)\n",
mc->mc_vram_size >> 20, mc->vram_start,
mc->vram_end, mc->real_vram_size >> 20);
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdkfd: Only add bi-directional iolink on GPU with XGMI or largebar

2018-09-07 Thread shaoyunl
Change-Id: Ibb6a89ed878fffccb9a8bb4032b07a10ee298a99
Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 15 +--
 drivers/gpu/drm/amd/amdkfd/kfd_crat.h |  3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  1 +
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 130db4d..d4560f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -353,8 +353,8 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
id_from = iolink->proximity_domain_from;
id_to = iolink->proximity_domain_to;
 
-   pr_debug("Found IO link entry in CRAT table with id_from=%d\n",
-   id_from);
+   pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to 
%d\n",
+   id_from, id_to);
list_for_each_entry(dev, device_list, list) {
if (id_from == dev->proximity_domain) {
props = kfd_alloc_struct(props);
@@ -391,12 +391,12 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
/* CPU topology is created before GPUs are detected, so CPU->GPU
 * links are not built at that time. If a PCIe type is discovered, it
 * means a GPU is detected and we are adding GPU->CPU to the topology.
-* At this time, also add the corresponded CPU->GPU link.
+* At this time, also add the corresponded CPU->GPU link if GPU
+* is large bar.
 * For xGMI, we only added the link with one direction in the crat
 * table, add corresponded reversed direction link now.
 */
-   if (props && (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS ||
- props->iolink_type == CRAT_IOLINK_TYPE_XGMI)) {
+   if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
to_dev = kfd_topology_device_by_proximity_domain(id_to);
if (!to_dev)
return -ENODEV;
@@ -1057,6 +1057,8 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int 
*avail_size,
sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+   if (kfd_dev_is_large_bar(kdev))
+   sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
 
/* Fill in IOLINK subtype.
 * TODO: Fill-in other fields of iolink subtype
@@ -1088,7 +1090,8 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
 
sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
-   sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+   sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |
+  CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
 
sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
sub_type_hdr->proximity_domain_from = proximity_domain_from;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
index 7a93aeb..7c3f192 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@@ -232,7 +232,8 @@ struct crat_subtype_ccompute {
 #define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT	(1 << 2)
 #define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT	(1 << 3)
 #define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA	(1 << 4)
-#define CRAT_IOLINK_FLAGS_RESERVED_MASK	0xffffffe0
+#define CRAT_IOLINK_FLAGS_BI_DIRECTIONAL	(1 << 31)
+#define CRAT_IOLINK_FLAGS_RESERVED_MASK	0x7fffffe0
 
 /*
  * IO interface types
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6a5418f..05283c9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -696,6 +696,7 @@ struct amdkfd_ioctl_desc {
unsigned int cmd_drv;
const char *name;
 };
+bool kfd_dev_is_large_bar(struct kfd_dev *dev);
 
 int kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
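
For context, "large bar" above means the PCI BAR exposes all of VRAM to
the CPU. A rough sketch of the kind of condition kfd_dev_is_large_bar()
tests (illustrative only; the real helper lives in amdkfd and its exact
criteria may differ):

static bool dev_is_large_bar_sketch(struct amdgpu_device *adev)
{
	/* Large BAR: the CPU-visible aperture covers all of VRAM, so any
	 * VRAM page can be mapped directly (illustrative condition). */
	return adev->gmc.aper_size >= adev->gmc.real_vram_size;
}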


[PATCH 06/12] drm/amdgpu: Add place holder functions for xgmi topology interface with psp

2018-09-07 Thread shaoyunl
From: Shaoyun Liu 

Add dummy function for xgmi function interface with psp

Change-Id: I01f35baf5a4b96e9654d448c9892be3cd72c05b7
Signed-off-by: Shaoyun Liu 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
index b70cfa3..a0c2d54 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
@@ -548,6 +548,32 @@ static int psp_v11_0_mode1_reset(struct psp_context *psp)
return 0;
 }
 
+/* TODO: Fill in the following functions once the PSP firmware interface for
XGMI is ready.
+ * For now, return success and hack the hive_id so high level code can
+ * start testing */
+static int psp_v11_0_xgmi_get_topology_info(struct psp_context *psp,
+   int number_devices, struct psp_xgmi_topology_info *topology)
+{
+   return 0;
+}
+
+static int psp_v11_0_xgmi_set_topology_info(struct psp_context *psp,
+   int number_devices, struct psp_xgmi_topology_info *topology)
+{
+   return 0;
+}
+
+static u64 psp_v11_0_xgmi_get_hive_id(struct psp_context *psp)
+{
+   u64 hive_id = 0;
+
+   /* Remove me when we can get correct hive_id through PSP */
+   if (psp->adev->gmc.xgmi.num_physical_nodes)
+   hive_id = 0x123456789abcdef;
+
+   return hive_id;
+}
+
 static const struct psp_funcs psp_v11_0_funcs = {
.init_microcode = psp_v11_0_init_microcode,
.bootloader_load_sysdrv = psp_v11_0_bootloader_load_sysdrv,
@@ -560,6 +586,9 @@ static const struct psp_funcs psp_v11_0_funcs = {
.cmd_submit = psp_v11_0_cmd_submit,
.compare_sram_data = psp_v11_0_compare_sram_data,
.mode1_reset = psp_v11_0_mode1_reset,
+   .xgmi_get_topology_info = psp_v11_0_xgmi_get_topology_info,
+   .xgmi_set_topology_info = psp_v11_0_xgmi_set_topology_info,
+   .xgmi_get_hive_id = psp_v11_0_xgmi_get_hive_id,
 };
 
 void psp_v11_0_set_psp_funcs(struct psp_context *psp)
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 04/12] drm/amdgpu/gmc9: Adjust GART and AGP location with xgmi offset

2018-09-07 Thread shaoyunl
From: Alex Deucher 

On hives with xgmi enabled, the fb_location aperture defines the
total framebuffer size of all nodes in the
hive.  Each GPU in the hive has the same view via the fb_location
aperture.  GPU0 starts at offset (0 * segment size),
GPU1 starts at offset (1 * segment size), etc.

For access to local vram on each GPU, we need to take this offset into
account. This includes setting up the GPUVM page table and the GART table.
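
As a small illustration of the addressing rule above (the helper and
the 16 GB figure are made up for the example, they are not part of the
patch):

/* Each node sees the whole hive FB aperture; its own vram begins at
 * node_id * segment_size within it. */
static u64 xgmi_node_vram_base(u64 fb_start, u64 node_segment_size,
			       unsigned int physical_node_id)
{
	return fb_start + (u64)physical_node_id * node_segment_size;
}

/* With 16 GB segments: GPU0 base = fb_start, GPU1 base = fb_start +
 * 16 GB, GPU2 base = fb_start + 32 GB, and so on. */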

Change-Id: I9efd510bed68fdb9afdfbdc76e1046792471ee78
Acked-by: Huang Rui 
Acked-by: Slava Abramov 
Signed-off-by: Shaoyun Liu 
Signed-off-by: Alex Deucher 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 20 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  |  7 +++
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c |  3 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c|  6 ++
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c  |  7 +++
 5 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 6acdeeb..cf97c1c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -147,8 +147,8 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev, 
struct amdgpu_gmc *mc)
/* VCE doesn't like it when BOs cross a 4GB segment, so align
 * the GART base on a 4GB boundary as well.
 */
-   size_bf = mc->vram_start;
-   size_af = adev->gmc.mc_mask + 1 - ALIGN(mc->vram_end + 1, four_gb);
+   size_bf = mc->fb_start;
+   size_af = adev->gmc.mc_mask + 1 - ALIGN(mc->fb_end + 1, four_gb);
 
if (mc->gart_size > max(size_bf, size_af)) {
dev_warn(adev->dev, "limiting GART\n");
@@ -184,23 +184,23 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, 
struct amdgpu_gmc *mc)
const uint64_t sixteen_gb_mask = ~(sixteen_gb - 1);
u64 size_af, size_bf;
 
-   if (mc->vram_start > mc->gart_start) {
-   size_bf = (mc->vram_start & sixteen_gb_mask) -
+   if (mc->fb_start > mc->gart_start) {
+   size_bf = (mc->fb_start & sixteen_gb_mask) -
ALIGN(mc->gart_end + 1, sixteen_gb);
-   size_af = mc->mc_mask + 1 - ALIGN(mc->vram_end + 1, sixteen_gb);
+   size_af = mc->mc_mask + 1 - ALIGN(mc->fb_end + 1, sixteen_gb);
} else {
-   size_bf = mc->vram_start & sixteen_gb_mask;
+   size_bf = mc->fb_start & sixteen_gb_mask;
size_af = (mc->gart_start & sixteen_gb_mask) -
-   ALIGN(mc->vram_end + 1, sixteen_gb);
+   ALIGN(mc->fb_end + 1, sixteen_gb);
}
 
if (size_bf > size_af) {
-   mc->agp_start = mc->vram_start > mc->gart_start ?
+   mc->agp_start = mc->fb_start > mc->gart_start ?
mc->gart_end + 1 : 0;
mc->agp_size = size_bf;
} else {
-   mc->agp_start = (mc->vram_start > mc->gart_start ?
-   mc->vram_end : mc->gart_end) + 1,
+   mc->agp_start = (mc->fb_start > mc->gart_start ?
+   mc->fb_end : mc->gart_end) + 1,
mc->agp_size = size_af;
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index a929a55..df96dfe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -114,6 +114,13 @@ struct amdgpu_gmc {
u64 gart_end;
u64 vram_start;
u64 vram_end;
+   /* FB region: same as the local vram region on a single GPU. In an XGMI
+* configuration this region covers all GPUs in the same hive, and
+* each GPU in the hive has the same view of this FB region.
+* GPU0's vram starts at offset (0 * segment size),
+* GPU1 starts at offset (1 * segment size), etc. */
+   u64 fb_start;
+   u64 fb_end;
unsignedvram_width;
u64 real_vram_size;
int vram_mtrr;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c
index d4170cb..5e9ab8e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c
@@ -44,6 +44,9 @@ int gfxhub_v1_1_get_xgmi_info(struct amdgpu_device *adev)
REG_GET_FIELD(xgmi_lfb_cntl, MC_VM_XGMI_LFB_CNTL, 
PF_LFB_REGION);
if (adev->gmc.xgmi.physical_node_id > 3)
return -EINVAL;
+   adev->gmc.xgmi.node_segment_size = REG_GET_FIELD(
+   RREG32_SOC15(GC, 0, mmMC_VM_XGMI_LFB_SIZE),
+   MC_VM_XGMI_LFB_SIZE, PF_LFB_SIZE) << 24;
}
 
return 0;
diff --git 

[PATCH 13/13] drm/amdkfd: Generate xGMI direct iolink

2018-09-05 Thread shaoyunl
From: Shaoyun Liu 

Generate xGMI iolink for upper level usage

Change-Id: I37bc29fee45cb10d1da849956055c59d823f6f5d
Signed-off-by: Shaoyun Liu 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 78 ++-
 1 file changed, 68 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index ee49960..130db4d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -346,7 +346,7 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
struct list_head *device_list)
 {
struct kfd_iolink_properties *props = NULL, *props2;
-   struct kfd_topology_device *dev, *cpu_dev;
+   struct kfd_topology_device *dev, *to_dev;
uint32_t id_from;
uint32_t id_to;
 
@@ -369,6 +369,8 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
 
if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
props->weight = 20;
+   else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
+   props->weight = 15;
else
props->weight = node_distance(id_from, id_to);
 
@@ -390,19 +392,22 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
 * links are not built at that time. If a PCIe type is discovered, it
 * means a GPU is detected and we are adding GPU->CPU to the topology.
 * At this time, also add the corresponded CPU->GPU link.
+* For xGMI, we only added the link with one direction in the crat
+* table, add corresponded reversed direction link now.
 */
-   if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) {
-   cpu_dev = kfd_topology_device_by_proximity_domain(id_to);
-   if (!cpu_dev)
+   if (props && (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS ||
+ props->iolink_type == CRAT_IOLINK_TYPE_XGMI)) {
+   to_dev = kfd_topology_device_by_proximity_domain(id_to);
+   if (!to_dev)
return -ENODEV;
/* same everything but the other direction */
props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
props2->node_from = id_to;
props2->node_to = id_from;
props2->kobj = NULL;
-   cpu_dev->io_link_count++;
-   cpu_dev->node_props.io_links_count++;
-   list_add_tail(&props2->list, &cpu_dev->io_link_props);
+   to_dev->io_link_count++;
+   to_dev->node_props.io_links_count++;
+   list_add_tail(&props2->list, &to_dev->io_link_props);
}
 
return 0;
@@ -1037,7 +1042,7 @@ static int kfd_fill_gpu_memory_affinity(int *avail_size,
  *
  * Return 0 if successful else return -ve value
  */
-static int kfd_fill_gpu_direct_io_link(int *avail_size,
+static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
struct kfd_dev *kdev,
struct crat_subtype_iolink *sub_type_hdr,
uint32_t proximity_domain)
@@ -1069,6 +1074,28 @@ static int kfd_fill_gpu_direct_io_link(int *avail_size,
return 0;
 }
 
+static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
+   struct kfd_dev *kdev,
+   struct crat_subtype_iolink *sub_type_hdr,
+   uint32_t proximity_domain_from,
+   uint32_t proximity_domain_to)
+{
+   *avail_size -= sizeof(struct crat_subtype_iolink);
+   if (*avail_size < 0)
+   return -ENOMEM;
+
+   memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
+
+   sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
+   sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
+   sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+
+   sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
+   sub_type_hdr->proximity_domain_from = proximity_domain_from;
+   sub_type_hdr->proximity_domain_to = proximity_domain_to;
+   return 0;
+}
+
 /* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU
  *
  * @pcrat_image: Fill in VCRAT for GPU
@@ -1081,14 +1108,16 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 {
struct crat_header *crat_table = (struct crat_header *)pcrat_image;
struct crat_subtype_generic *sub_type_hdr;
+   struct kfd_local_mem_info local_mem_info;
+   struct kfd_topology_device *peer_dev;
struct crat_subtype_computeunit *cu;
struct kfd_cu_info cu_info;
int avail_size = *size;
uint32_t total_num_of_cu;
int num_of_cache_entries = 0;
int cache_mem_filled = 0;
+   uint32_t nid = 0;
int ret = 0;
- 

[PATCH 12/13] drm/amdkfd: Add new iolink type defines

2018-09-05 Thread shaoyunl
From: Shaoyun Liu 

Update the iolink type defines according to the new thunk spec

Change-Id: Ie155641b6bfbe005ae0e12c5c31c68157247ea26
Signed-off-by: Shaoyun Liu 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
index b5cd182..7a93aeb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@@ -248,7 +248,12 @@ struct crat_subtype_ccompute {
 #define CRAT_IOLINK_TYPE_RAPID_IO  8
 #define CRAT_IOLINK_TYPE_INFINIBAND9
 #define CRAT_IOLINK_TYPE_RESERVED3 10
-#define CRAT_IOLINK_TYPE_OTHER 11
+#define CRAT_IOLINK_TYPE_XGMI  11
+#define CRAT_IOLINK_TYPE_XGOP  12
+#define CRAT_IOLINK_TYPE_GZ13
+#define CRAT_IOLINK_TYPE_ETHERNET_RDMA 14
+#define CRAT_IOLINK_TYPE_RDMA_OTHER15
+#define CRAT_IOLINK_TYPE_OTHER 16
 #define CRAT_IOLINK_TYPE_MAX   255
 
 #define CRAT_IOLINK_RESERVED_LENGTH24
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 11/13] drm/amdkfd: kfd expose the hive_id of the device through its node properties

2018-09-05 Thread shaoyunl
From: Shaoyun Liu 

Thunk will generate the XGMI topology information when necessary, using
the hive_id of each specified device.

Change-Id: I3bbc37bd2af4295e24357ce82f2c760162aff9ca
Signed-off-by: Shaoyun Liu 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   | 3 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +++
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 4 
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 1 +
 4 files changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 1b04871..b4d9e6b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -476,6 +476,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
goto kfd_doorbell_error;
}
 
+   if (kfd->kfd2kgd->get_hive_id)
+   kfd->hive_id = kfd->kfd2kgd->get_hive_id(kfd->kgd);
+
if (kfd_topology_add_device(kfd)) {
dev_err(kfd_device, "Error adding device to topology\n");
goto kfd_topology_add_device_error;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 355f79d..6a5418f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -254,6 +254,9 @@ struct kfd_dev {
bool cwsr_enabled;
const void *cwsr_isa;
unsigned int cwsr_isa_size;
+
+   /* xGMI */
+   uint64_t hive_id;
 };
 
 /* KGD2KFD callbacks */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index bc95d4df..19ecc82 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -443,6 +443,8 @@ static ssize_t node_show(struct kobject *kobj, struct 
attribute *attr,
dev->node_props.location_id);
sysfs_show_32bit_prop(buffer, "drm_render_minor",
dev->node_props.drm_render_minor);
+   sysfs_show_64bit_prop(buffer, "hive_id",
+   dev->node_props.hive_id);
 
if (dev->gpu) {
log_max_watch_addr =
@@ -1219,6 +1221,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
dev->node_props.drm_render_minor =
gpu->shared_resources.drm_render_minor;
 
+   dev->node_props.hive_id = gpu->hive_id;
+
kfd_fill_mem_clk_max_info(dev);
kfd_fill_iolink_non_crat_info(dev);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index 7d9c3f9..92a19be 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -49,6 +49,7 @@
 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP   0x4000
 
 struct kfd_node_properties {
+   uint64_t hive_id;
uint32_t cpu_cores_count;
uint32_t simd_count;
uint32_t mem_banks_count;
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
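
With this in place, userspace such as the Thunk can group devices by
hive. A rough userspace sketch; the sysfs path below is the usual KFD
topology location and is an assumption here, not something quoted from
the patch:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Scan one KFD topology node's properties file for its hive_id.
 * Returns 0 on success, -1 on failure. */
static int read_hive_id(int node, uint64_t *hive_id)
{
	char path[128], name[64];
	unsigned long long val;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/class/kfd/kfd/topology/nodes/%d/properties", node);
	f = fopen(path, "r");
	if (!f)
		return -1;
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		if (!strcmp(name, "hive_id")) {
			*hive_id = val;
			fclose(f);
			return 0;
		}
	}
	fclose(f);
	return -1;
}

Devices reporting the same nonzero hive_id belong to the same XGMI hive.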


[PATCH 10/13] drm/amdgpu: get_hive_id from amdgpu side

2018-09-05 Thread shaoyunl
From: Shaoyun Liu 

Retrieve hive_id from amdgpu device

Change-Id: I9bb4d87870edf638b477a9088f14bc84b70e71e2
Signed-off-by: Shaoyun Liu 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1 +
 3 files changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1a0824e..5661c34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -411,6 +411,13 @@ uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
return amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
 }
 
+uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+
+   return adev->gmc.xgmi.hive_id;
+}
+
 int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
uint32_t vmid, uint64_t gpu_addr,
uint32_t *ib_cmd, uint32_t ib_len)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 2a1da3f..41e7dfc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -145,6 +145,7 @@ uint64_t get_gpu_clock_counter(struct kgd_dev *kgd);
 uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd);
 void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info);
 uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
+uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
 
 #define read_user_wptr(mmptr, wptr, dst)   \
({  \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 3dc987c..94d39a3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -216,6 +216,7 @@ static const struct kfd2kgd_calls kfd2kgd = {
.submit_ib = amdgpu_amdkfd_submit_ib,
.gpu_recover = amdgpu_amdkfd_gpu_reset,
-   .set_compute_idle = amdgpu_amdkfd_set_compute_idle
+   .set_compute_idle = amdgpu_amdkfd_set_compute_idle,
+   .get_hive_id = amdgpu_amdkfd_get_hive_id,
 };
 
 struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void)
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 09/13] drm/amd/include: Add get_hive_id interface in kfd2kgd

2018-09-05 Thread shaoyunl
From: Shaoyun Liu 

KFD needs to get the hive id from amdgpu to build up the XGMI topology

Change-Id: If68ea8fd7fb17b7ffb581f45d8406925578d96b8
Signed-off-by: Shaoyun Liu 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h 
b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 31c52c1..cb4deb2 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -313,6 +313,8 @@ struct tile_config {
  * @set_compute_idle: Indicates that compute is idle on a device. This
  * can be used to change power profiles depending on compute activity.
  *
+ * @get_hive_id: Returns the hive id of the current device, 0 if xgmi is not enabled
+ *
  * This structure contains function pointers to services that the kgd driver
  * provides to amdkfd driver.
  *
@@ -438,6 +440,9 @@ struct kfd2kgd_calls {
void (*gpu_recover)(struct kgd_dev *kgd);
 
void (*set_compute_idle)(struct kgd_dev *kgd, bool idle);
+
+   uint64_t (*get_hive_id)(struct kgd_dev *kgd);
+
 };
 
 /**
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 08/13] drm/amdgpu : Generate XGMI topology info from driver level

2018-09-05 Thread shaoyunl
From: Shaoyun Liu 

The driver will save an array of XGMI hive info; each hive will have a
list of devices that share the same hive ID.

Change-Id: Ia2934d5b624cffa3283bc0a37679eddbd387cbdd
Signed-off-by: Shaoyun Liu 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/Makefile|   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|   6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h|   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 119 +
 5 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index e83ba7b..138cb78 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -53,7 +53,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
amdgpu_vf_error.o amdgpu_sched.o amdgpu_debugfs.o amdgpu_ids.o \
-   amdgpu_gmc.o
+   amdgpu_gmc.o amdgpu_xgmi.o
 
 # add asic specific block
 amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e992e0f..27382767 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1219,6 +1219,12 @@ void amdgpu_disable_vblank_kms(struct drm_device *dev, 
unsigned int pipe);
 long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd,
 unsigned long arg);
 
+
+/*
+ * functions used by amdgpu_xgmi.c
+ */
+int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
+
 /*
  * functions used by amdgpu_encoder.c
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 93476b8..e24a171 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1586,6 +1586,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
adev->ip_blocks[i].status.hw = true;
}
 
+   amdgpu_xgmi_add_device(adev);
amdgpu_amdkfd_device_init(adev);
 
if (amdgpu_sriov_vf(adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index bfb0a7e..f7c90c2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -88,6 +88,7 @@ struct amdgpu_gmc_funcs {
 };
 
 struct amdgpu_xgmi {
+   struct list_head head; /* gpu list in the same hive */
u64 device_id; /* from psp */
u64 hive_id; /* from psp */
u64 node_segment_size; /* fixed per family */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
new file mode 100644
index 000..897afbb
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ *
+ */
+#include 
+#include "amdgpu.h"
+#include "amdgpu_psp.h"
+
+
+static DEFINE_MUTEX(xgmi_mutex);
+
+#define AMDGPU_MAX_XGMI_HIVE   8
+#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE4
+
+struct amdgpu_hive_info {
+   uint64_t hive_id;
+   struct list_head device_list;
+};
+
+static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
+static unsigned hive_count = 0;
+
+static struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device 
*adev)
+{
+   int i;
+   struct amdgpu_hive_info *tmp;
+
+   if (!adev->gmc.xgmi.hive_id)
+   return NULL;
+   for (i = 0; i < hive_count; ++i) {
+   tmp = &xgmi_hives[i];
+   if (tmp->hive_id == adev->gmc.xgmi.hive_id)
+ 
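
A condensed sketch of the lookup-or-create flow the commit message
describes, using the structures from the diff above; the body is
illustrative, not the committed code:

int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive;

	if (!adev->gmc.xgmi.hive_id)
		return 0;	/* xgmi not enabled on this device */

	mutex_lock(&xgmi_mutex);

	/* Find the hive with a matching hive_id, or create one. */
	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive) {
		if (hive_count == AMDGPU_MAX_XGMI_HIVE) {
			mutex_unlock(&xgmi_mutex);
			return -EINVAL;
		}
		hive = &xgmi_hives[hive_count++];
		hive->hive_id = adev->gmc.xgmi.hive_id;
		INIT_LIST_HEAD(&hive->device_list);
	}

	/* Link this device into the hive's device list. */
	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);

	mutex_unlock(&xgmi_mutex);
	return 0;
}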

[PATCH 07/13] drm/amdgpu: Add place holder functions for xgmi topology interface with psp

2018-09-05 Thread shaoyunl
From: Shaoyun Liu 

Add dummy function for xgmi function interface with psp

Change-Id: I01f35baf5a4b96e9654d448c9892be3cd72c05b7
Signed-off-by: Shaoyun Liu 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
index b70cfa3..b1c0b33 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
@@ -548,6 +548,29 @@ static int psp_v11_0_mode1_reset(struct psp_context *psp)
return 0;
 }
 
+static int psp_v11_0_xgmi_get_topology_info(struct psp_context *psp,
+   int number_devices, struct psp_xgmi_topology_info *topology)
+{
+   return 0;
+}
+
+static int psp_v11_0_xgmi_set_topology_info(struct psp_context *psp,
+   int number_devices, struct psp_xgmi_topology_info *topology)
+{
+   return 0;
+}
+
+static u64 psp_v11_0_xgmi_get_hive_id(struct psp_context *psp)
+{
+   u64 hive_id = 0;
+
+   /* Remove me when normal psp interface is ready */
+   if (psp->adev->gmc.xgmi.num_physical_nodes)
+   hive_id = 0x123456789abcdef;
+
+   return hive_id;
+}
+
 static const struct psp_funcs psp_v11_0_funcs = {
.init_microcode = psp_v11_0_init_microcode,
.bootloader_load_sysdrv = psp_v11_0_bootloader_load_sysdrv,
@@ -560,6 +583,9 @@ static const struct psp_funcs psp_v11_0_funcs = {
.cmd_submit = psp_v11_0_cmd_submit,
.compare_sram_data = psp_v11_0_compare_sram_data,
.mode1_reset = psp_v11_0_mode1_reset,
+   .xgmi_get_topology_info = psp_v11_0_xgmi_get_topology_info,
+   .xgmi_set_topology_info = psp_v11_0_xgmi_set_topology_info,
+   .xgmi_get_hive_id = psp_v11_0_xgmi_get_hive_id,
 };
 
 void psp_v11_0_set_psp_funcs(struct psp_context *psp)
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 06/13] drm/amdgpu : Add psp function interfaces for XGMI support

2018-09-05 Thread shaoyunl
From: Shaoyun Liu 

Place holder for XGMI support

Change-Id: I924fa3693366409de0218009c7f709cb464854cc
Signed-off-by: Shaoyun Liu 
Reviewed-by: Huang Rui 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 34 +
 1 file changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 981887c..8b8720e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -35,6 +35,7 @@
 #define PSP_TMR_SIZE   0x400000
 
 struct psp_context;
+struct psp_xgmi_topology_info;
 
 enum psp_ring_type
 {
@@ -79,6 +80,12 @@ struct psp_funcs
  enum AMDGPU_UCODE_ID ucode_type);
bool (*smu_reload_quirk)(struct psp_context *psp);
int (*mode1_reset)(struct psp_context *psp);
+   uint64_t (*xgmi_get_device_id)(struct psp_context *psp);
+   uint64_t (*xgmi_get_hive_id)(struct psp_context *psp);
+   int (*xgmi_get_topology_info)(struct psp_context *psp, int 
number_devices,
+   struct psp_xgmi_topology_info *topology);
+   int (*xgmi_set_topology_info)(struct psp_context *psp, int 
number_devices,
+   struct psp_xgmi_topology_info *topology);
 };
 
 struct psp_context
@@ -134,6 +141,23 @@ struct amdgpu_psp_funcs {
enum AMDGPU_UCODE_ID);
 };
 
+struct psp_xgmi_topology_info {
+   /* Generated by PSP to identify the GPU instance within the xgmi
+* connection */
+   uint64_t device_id;
+   /*
+* If all bits are set to 0, the driver indicates it wants to retrieve
+* the xgmi connection vector topology, but not access-enable the
+* connections.
+* If some or all bits are set to 1, the driver indicates it wants to
+* retrieve the current xgmi topology and access-enable the link to
+* GPU[i] associated with the bit position in the vector.
+* On return, the bits indicate which xgmi links are present/active,
+* depending on the value passed in. The relative bit offset for the
+* relative GPU index within the hive is always marked active.
+*/
+   uint32_t connection_mask;
+   uint32_t reserved; /* must be 0 */
+};
+
 #define psp_prep_cmd_buf(ucode, type) (psp)->funcs->prep_cmd_buf((ucode), 
(type))
 #define psp_ring_init(psp, type) (psp)->funcs->ring_init((psp), (type))
 #define psp_ring_create(psp, type) (psp)->funcs->ring_create((psp), (type))
@@ -153,6 +177,16 @@ struct amdgpu_psp_funcs {
((psp)->funcs->smu_reload_quirk ? 
(psp)->funcs->smu_reload_quirk((psp)) : false)
 #define psp_mode1_reset(psp) \
((psp)->funcs->mode1_reset ? (psp)->funcs->mode1_reset((psp)) : 
false)
+#define psp_xgmi_get_device_id(psp) \
+   ((psp)->funcs->xgmi_get_device_id ? 
(psp)->funcs->xgmi_get_device_id((psp)) : 0)
+#define psp_xgmi_get_hive_id(psp) \
+   ((psp)->funcs->xgmi_get_hive_id ? 
(psp)->funcs->xgmi_get_hive_id((psp)) : 0)
+#define psp_xgmi_get_topology_info(psp, num_device, topology) \
+   ((psp)->funcs->xgmi_get_topology_info ? \
+   (psp)->funcs->xgmi_get_topology_info((psp), (num_device), 
(topology)) : -EINVAL)
+#define psp_xgmi_set_topology_info(psp, num_device, topology) \
+   ((psp)->funcs->xgmi_set_topology_info ?  \
+   (psp)->funcs->xgmi_set_topology_info((psp), (num_device), 
(topology)) : -EINVAL)
 
 #define amdgpu_psp_check_fw_loading_status(adev, i) 
(adev)->firmware.funcs->check_fw_loading_status((adev), (i))
 
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
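
How a caller might use the new macros together with the connection_mask
convention documented in the struct; a hypothetical sketch, not code
from this series:

/* Query the xgmi topology without access-enabling any links: per the
 * struct's documentation, an all-zero connection_mask means a
 * read-only query. */
static int query_xgmi_topology(struct psp_context *psp, int num_devices)
{
	struct psp_xgmi_topology_info topology = {
		.connection_mask = 0,	/* 0 = query only, enable nothing */
		.reserved = 0,		/* must be 0 */
	};

	return psp_xgmi_get_topology_info(psp, num_devices, &topology);
}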


[PATCH 05/13] drm/amdgpu/gmc9: populate xgmi info for vega20

2018-09-05 Thread shaoyunl
From: Alex Deucher 

Call the new gfxhub 1.1 function to get the xgmi info.

Acked-by: Huang Rui 
Acked-by: Slava Abramov 
Reviewed-by: Shaoyun Liu 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 0da89ba..b1c8489 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -43,6 +43,7 @@
 
 #include "gfxhub_v1_0.h"
 #include "mmhub_v1_0.h"
+#include "gfxhub_v1_1.h"
 
 #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
 
@@ -991,6 +992,12 @@ static int gmc_v9_0_sw_init(void *handle)
}
adev->need_swiotlb = drm_get_max_iomem() > ((u64)1 << dma_bits);
 
+   if (adev->asic_type == CHIP_VEGA20) {
+   r = gfxhub_v1_1_get_xgmi_info(adev);
+   if (r)
+   return r;
+   }
+
r = gmc_v9_0_mc_init(adev);
if (r)
return r;
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 04/13] drm/amdgpu/gmc9: Adjust xgmi offset

2018-09-05 Thread shaoyunl
From: Alex Deucher 

On hives with xgmi enabled, the fb_location aperture defines the
total framebuffer size of all nodes in the
hive.  Each GPU in the hive has the same view via the fb_location
aperture.  GPU0 starts at offset (0 * segment size),
GPU1 starts at offset (1 * segment size), etc.

For access to local vram on each GPU, we need to take this offset into
account. This includes setting up the GPUVM page table and the GART table.

Acked-by: Huang Rui 
Acked-by: Slava Abramov 
Signed-off-by: Shaoyun Liu 
Signed-off-by: Alex Deucher 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 4 
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c| 6 ++
 3 files changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 6acdeeb..a95b615 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -158,6 +158,10 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev, 
struct amdgpu_gmc *mc)
if ((size_bf >= mc->gart_size && size_bf < size_af) ||
(size_af < mc->gart_size))
mc->gart_start = 0;
+   else if (mc->xgmi.num_physical_nodes)
+   mc->gart_start = mc->vram_start +
+   (mc->xgmi.num_physical_nodes - 
mc->xgmi.physical_node_id)
+   * mc->xgmi.node_segment_size;
else
mc->gart_start = mc->mc_mask - mc->gart_size + 1;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c
index d4170cb..5e9ab8e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c
@@ -44,6 +44,9 @@ int gfxhub_v1_1_get_xgmi_info(struct amdgpu_device *adev)
REG_GET_FIELD(xgmi_lfb_cntl, MC_VM_XGMI_LFB_CNTL, 
PF_LFB_REGION);
if (adev->gmc.xgmi.physical_node_id > 3)
return -EINVAL;
+   adev->gmc.xgmi.node_segment_size = REG_GET_FIELD(
+   RREG32_SOC15(GC, 0, mmMC_VM_XGMI_LFB_SIZE),
+   MC_VM_XGMI_LFB_SIZE, PF_LFB_SIZE) << 24;
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 3529c55..0da89ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -770,12 +770,18 @@ static void gmc_v9_0_vram_gtt_location(struct 
amdgpu_device *adev,
u64 base = 0;
if (!amdgpu_sriov_vf(adev))
base = mmhub_v1_0_get_fb_location(adev);
+   /* add the xgmi offset of the physical node */
+   base += adev->gmc.xgmi.physical_node_id * 
adev->gmc.xgmi.node_segment_size;
amdgpu_gmc_vram_location(adev, >gmc, base);
amdgpu_gmc_gart_location(adev, mc);
if (!amdgpu_sriov_vf(adev))
amdgpu_gmc_agp_location(adev, mc);
/* base offset of vram pages */
adev->vm_manager.vram_base_offset = gfxhub_v1_0_get_mc_fb_offset(adev);
+
+   /* XXX: add the xgmi offset of the physical node? */
+   adev->vm_manager.vram_base_offset +=
+   adev->gmc.xgmi.physical_node_id * 
adev->gmc.xgmi.node_segment_size;
 }
 
 /**
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
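
A worked example of the new gart_start placement (all sizes are made up
for illustration):

static u64 gart_start_example(void)
{
	const u64 seg = 16ULL << 30;		/* 16 GB segment size */
	const u64 num_nodes = 4, node_id = 1;	/* 4-node hive, node 1 */
	u64 vram_start = node_id * seg;		/* 16 GB */

	/* Mirrors the patch: GART begins right after the whole hive's
	 * 64 GB framebuffer window as seen from this node, rather than
	 * at the top of the MC address space:
	 * 16 GB + (4 - 1) * 16 GB = 64 GB. */
	return vram_start + (num_nodes - node_id) * seg;
}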


[PATCH 03/13] drm/amdgpu/gmc9: add a new gfxhub 1.1 helper for xgmi

2018-09-05 Thread shaoyunl
From: Alex Deucher 

Used to populate the xgmi info on vega20.

v2: PF_MAX_REGION is val - 1 (Ray)

Acked-by: Huang Rui 
Acked-by: Slava Abramov 
Reviewed-by: Shaoyun Liu 
Signed-off-by: Alex Deucher 

Change-Id: Ia7b7f112880e69cdbcf73a8abf04cd6ef303940c
---
 drivers/gpu/drm/amd/amdgpu/Makefile  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c | 50 
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.h | 29 ++
 3 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 847536b..e83ba7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -74,7 +74,7 @@ amdgpu-y += \
 amdgpu-y += \
gmc_v7_0.o \
gmc_v8_0.o \
-   gfxhub_v1_0.o mmhub_v1_0.o gmc_v9_0.o
+   gfxhub_v1_0.o mmhub_v1_0.o gmc_v9_0.o gfxhub_v1_1.o
 
 # add IH block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c
new file mode 100644
index 000..d4170cb
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "gfxhub_v1_1.h"
+
+#include "gc/gc_9_2_1_offset.h"
+#include "gc/gc_9_2_1_sh_mask.h"
+
+#include "soc15_common.h"
+
+int gfxhub_v1_1_get_xgmi_info(struct amdgpu_device *adev)
+{
+   u32 xgmi_lfb_cntl = RREG32_SOC15(GC, 0, mmMC_VM_XGMI_LFB_CNTL);
+   u32 max_region =
+   REG_GET_FIELD(xgmi_lfb_cntl, MC_VM_XGMI_LFB_CNTL, 
PF_MAX_REGION);
+
+   /* PF_MAX_REGION=0 means xgmi is disabled */
+   if (max_region) {
+   adev->gmc.xgmi.num_physical_nodes = max_region + 1;
+   if (adev->gmc.xgmi.num_physical_nodes > 4)
+   return -EINVAL;
+
+   adev->gmc.xgmi.physical_node_id =
+   REG_GET_FIELD(xgmi_lfb_cntl, MC_VM_XGMI_LFB_CNTL, 
PF_LFB_REGION);
+   if (adev->gmc.xgmi.physical_node_id > 3)
+   return -EINVAL;
+   }
+
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.h 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.h
new file mode 100644
index 000..d753cf2
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_1.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __GFXHUB_V1_1_H__
+#define __GFXHUB_V1_1_H__
+
+int gfxhub_v1_1_get_xgmi_info(struct amdgpu_device *adev);
+
+#endif
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org

[PATCH 02/13] drm/amdgpu/gmc: add initial xgmi structure to amdgpu_gmc structure

2018-09-05 Thread shaoyunl
From: Alex Deucher 

Initial pass at a structure to store xgmi info.  xgmi is a high
speed cross gpu interconnect.

Acked-by: Huang Rui 
Acked-by: Slava Abramov 
Reviewed-by: Shaoyun Liu 
Signed-off-by: Alex Deucher 

Change-Id: I8b373bd847c857dd7cbefa55d1ede2a8785deb06
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 17ffc35..bfb0a7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -87,6 +87,14 @@ struct amdgpu_gmc_funcs {
   u64 *dst, u64 *flags);
 };
 
+struct amdgpu_xgmi {
+   u64 device_id; /* from psp */
+   u64 hive_id; /* from psp */
+   u64 node_segment_size; /* fixed per family */
+   unsigned physical_node_id; /* physical node (0-3) */
+   unsigned num_physical_nodes; /* number of nodes (0-4) */
+};
+
 struct amdgpu_gmc {
resource_size_t aper_size;
resource_size_t aper_base;
@@ -125,6 +133,8 @@ struct amdgpu_gmc {
atomic_tvm_fault_info_updated;
 
const struct amdgpu_gmc_funcs   *gmc_funcs;
+
+   struct amdgpu_xgmi xgmi;
 };
 
 #define amdgpu_gmc_flush_gpu_tlb(adev, vmid) 
(adev)->gmc.gmc_funcs->flush_gpu_tlb((adev), (vmid))
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 01/13] drm/amd/include: update the bitfield define for PF_MAX_REGION

2018-09-05 Thread shaoyunl
From: Shaoyun Liu 

Correct the definition based on vega20 register spec

Change-Id: Ifde296134d00423cdf1078c8249d044f5b5cf5a5
Signed-off-by: Shaoyun Liu 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_2_1_sh_mask.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_2_1_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_2_1_sh_mask.h
index 6626fc2..76ea902 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_2_1_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_2_1_sh_mask.h
@@ -8241,9 +8241,9 @@
 #define MC_VM_LOCAL_HBM_ADDRESS_LOCK_CNTL__LOCK_MASK                  0x00000001L
 //MC_VM_XGMI_LFB_CNTL
 #define MC_VM_XGMI_LFB_CNTL__PF_LFB_REGION__SHIFT                     0x0
-#define MC_VM_XGMI_LFB_CNTL__PF_MAX_REGION__SHIFT                     0x3
+#define MC_VM_XGMI_LFB_CNTL__PF_MAX_REGION__SHIFT                     0x4
 #define MC_VM_XGMI_LFB_CNTL__PF_LFB_REGION_MASK                       0x00000007L
-#define MC_VM_XGMI_LFB_CNTL__PF_MAX_REGION_MASK                       0x00000038L
+#define MC_VM_XGMI_LFB_CNTL__PF_MAX_REGION_MASK                       0x00000070L
 //MC_VM_XGMI_LFB_SIZE
 #define MC_VM_XGMI_LFB_SIZE__PF_LFB_SIZE__SHIFT                       0x0
 #define MC_VM_XGMI_LFB_SIZE__PF_LFB_SIZE_MASK                         0xFFFFFFFFL
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
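
To see what the corrected shift/mask pair changes, here is a standalone
re-implementation of the REG_GET_FIELD-style decode of PF_MAX_REGION
(for illustration; it is not the kernel macro itself):

#include <stdint.h>

#define PF_MAX_REGION__SHIFT 0x4
#define PF_MAX_REGION_MASK   0x00000070U

/* Mask out the field, then shift it down to bit 0, the same thing
 * REG_GET_FIELD(val, MC_VM_XGMI_LFB_CNTL, PF_MAX_REGION) does. */
static inline uint32_t pf_max_region(uint32_t xgmi_lfb_cntl)
{
	return (xgmi_lfb_cntl & PF_MAX_REGION_MASK) >> PF_MAX_REGION__SHIFT;
}

/* With the old (wrong) 0x3/0x38 pair a register value of 0x30 decoded
 * as 6; with the corrected 0x4/0x70 it decodes as 3, i.e. a 4-node
 * hive (num_physical_nodes = max_region + 1). */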