Re: [PATCH 1/2] drm/amdgpu: update the method to get fb_loc of memory training
[AMD Official Use Only - Internal Distribution Only] From: Tianci Yin Sent: Wednesday, December 18, 2019 10:21 AM To: amd-gfx@lists.freedesktop.org Cc: Tuikov, Luben ; Koenig, Christian ; Deucher, Alexander ; Zhang, Hawking ; Xu, Feifei ; Yuan, Xiaojie ; Long, Gang ; Wang, Kevin(Yang) ; Yin, Tianci (Rico) Subject: [PATCH 1/2] drm/amdgpu: update the method to get fb_loc of memory training From: "Tianci.Yin" The method of getting fb_loc changed from parsing VBIOS to taking certain offset from top of VRAM Change-Id: I053b42fdb1d822722fa7980b2cd9f86b3fdce539 --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +- .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 36 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 7 drivers/gpu/drm/amd/include/atomfirmware.h| 14 5 files changed, 16 insertions(+), 50 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a78a363b1d71..fa2cf8e7bc07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -642,9 +642,8 @@ struct amdgpu_fw_vram_usage { struct amdgpu_bo *reserved_bo; void *va; - /* Offset on the top of VRAM, used as c2p write buffer. + /* GDDR6 training support flag. 
*/ - u64 mem_train_fb_loc; bool mem_train_support; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index ff4eb96bdfb5..009cb0b03d13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -528,13 +528,9 @@ static int gddr6_mem_train_support(struct amdgpu_device *adev) int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) { struct atom_context *ctx = adev->mode_info.atom_context; - unsigned char *bios = ctx->bios; - struct vram_reserve_block *reserved_block; - int index, block_number; + int index; uint8_t frev, crev; uint16_t data_offset, size; - uint32_t start_address_in_kb; - uint64_t offset; int ret; adev->fw_vram_usage.mem_train_support = false; @@ -569,32 +565,6 @@ int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) [kevin]: this function is not return any address after change, i think we'd better to rename this function to another is well. the code can be merge to function gddr6_mem_train_support(). return -EINVAL; } - reserved_block = (struct vram_reserve_block *) - (bios + data_offset + sizeof(struct atom_common_table_header)); - block_number = ((unsigned int)size - sizeof(struct atom_common_table_header)) - / sizeof(struct vram_reserve_block); - reserved_block += (block_number > 0) ? 
block_number-1 : 0; - DRM_DEBUG("block_number:0x%04x, last block: 0x%08xkb sz, %dkb fw, %dkb drv.\n", - block_number, - le32_to_cpu(reserved_block->start_address_in_kb), - le16_to_cpu(reserved_block->used_by_firmware_in_kb), - le16_to_cpu(reserved_block->used_by_driver_in_kb)); - if (reserved_block->used_by_firmware_in_kb > 0) { - start_address_in_kb = le32_to_cpu(reserved_block->start_address_in_kb); - offset = (uint64_t)start_address_in_kb * ONE_KiB; - if ((offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { - offset -= ONE_MiB; - } - - offset &= ~(ONE_MiB - 1); - adev->fw_vram_usage.mem_train_fb_loc = offset; - adev->fw_vram_usage.mem_train_support = true; - DRM_DEBUG("mem_train_fb_loc:0x%09llx.\n", offset); - ret = 0; - } else { - DRM_ERROR("used_by_firmware_in_kb is 0!\n"); - ret = -EINVAL; - } - - return ret; + adev->fw_vram_usage.mem_train_support = true; + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 2ff63d0414c9..ce5cb854bdb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1705,7 +1705,11 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) return 0; } - ctx->c2p_train_data_offset = adev->fw_vram_usage.mem_train_fb_loc; + ctx->c2p_train_data_offset = adev->gmc.mc_vram_size; + if ((ctx->c2p_train_data_offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { + ctx->c2p_train_data_offset -= ONE_MiB; + } + ctx->c2p_train_data_offset &= ~(ONE_MiB - 1); ctx->p2c_train_data_offset = (adev->gmc.mc_vram_size - GDDR6_MEM_TRAINING_OFFSET); ctx->train_data_size = GDDR6_MEM_TRAINING_DATA_SIZE_IN_BYTE
[PATCH] drm/amdgpu: correctly report gpu recover status
Knowing whether gpu recovery was performed successfully or not is important for our BACO development. Change-Id: I0e3ca4dcb65a053eb26bc55ad7431e4a42e160de Signed-off-by: Evan Quan --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index e9efee04ca23..5dff5c0dd882 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -743,9 +743,7 @@ static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data) struct amdgpu_device *adev = dev->dev_private; seq_printf(m, "gpu recover\n"); - amdgpu_device_gpu_recover(adev, NULL); - - return 0; + return amdgpu_device_gpu_recover(adev, NULL); } static const struct drm_info_list amdgpu_debugfs_fence_list[] = { -- 2.24.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: no SMC firmware reloading for non-RAS baco reset
For non-RAS baco reset, there is no need to reset the SMC. Thus the firmware reloading should be avoided. Change-Id: I73f6284541d0ca0e82761380a27e32484fb0061c Signed-off-by: Evan Quan --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 3 ++- drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index c14f2ccd0677..9bf7e92394f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1439,7 +1439,8 @@ static int psp_np_fw_load(struct psp_context *psp) continue; if (ucode->ucode_id == AMDGPU_UCODE_ID_SMC && - (psp_smu_reload_quirk(psp) || psp->autoload_supported)) + ((adev->in_gpu_reset && psp_smu_reload_quirk(psp)) + || psp->autoload_supported)) continue; if (amdgpu_sriov_vf(adev) && diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index c66ca8cc2ebd..ba761e9366e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -676,6 +676,19 @@ static bool psp_v11_0_compare_sram_data(struct psp_context *psp, return true; } +/* + * Check whether SMU is still alive. If that's true + * (e.g. for non-RAS baco reset), we need to skip SMC firmware reloading. + */ +static bool psp_v11_0_smu_reload_quirk(struct psp_context *psp) +{ + struct amdgpu_device *adev = psp->adev; + uint32_t reg; + + reg = RREG32_PCIE(smnMP1_FIRMWARE_FLAGS | 0x03b00000); + return (reg & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) ? 
true : false; +} + static int psp_v11_0_mode1_reset(struct psp_context *psp) { int ret; @@ -1070,6 +1083,7 @@ static const struct psp_funcs psp_v11_0_funcs = { .ring_stop = psp_v11_0_ring_stop, .ring_destroy = psp_v11_0_ring_destroy, .compare_sram_data = psp_v11_0_compare_sram_data, + .smu_reload_quirk = psp_v11_0_smu_reload_quirk, .mode1_reset = psp_v11_0_mode1_reset, .xgmi_get_topology_info = psp_v11_0_xgmi_get_topology_info, .xgmi_set_topology_info = psp_v11_0_xgmi_set_topology_info, -- 2.24.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH v2 1/5] drm/amdgpu/smu: add metrics table lock
[AMD Official Use Only - Internal Distribution Only] The series patches are Reviewed-by: Kevin Wang From: amd-gfx on behalf of Alex Deucher Sent: Wednesday, December 18, 2019 5:45 AM To: amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander Subject: [PATCH v2 1/5] drm/amdgpu/smu: add metrics table lock This table is used for lots of things, add it's own lock. Bug: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fdrm%2Famd%2Fissues%2F900&data=02%7C01%7CKevin1.Wang%40amd.com%7C39da818e513e4cfb04fe08d7833a7fc2%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637122160270078326&sdata=jL5LpNDv7ZX%2FpGPAexqcUDKOE5%2B9kkAxKuIzWO1CE0Y%3D&reserved=0 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 + drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index f76a1717ffbd..936c68298786 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -872,6 +872,7 @@ static int smu_sw_init(void *handle) smu->smu_baco.platform_support = false; mutex_init(&smu->sensor_lock); + mutex_init(&smu->metrics_lock); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index a7d0ad831491..541cfde289ea 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -350,6 +350,7 @@ struct smu_context const struct pptable_funcs *ppt_funcs; struct mutexmutex; struct mutexsensor_lock; + struct mutexmetrics_lock; uint64_t pool_size; struct smu_table_contextsmu_table; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org 
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7CKevin1.Wang%40amd.com%7C39da818e513e4cfb04fe08d7833a7fc2%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637122160270078326&sdata=5Z573z93vZHHifVEOQoXgpkcgKoGvlm%2B5hC6oVQdTec%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/2] drm/amdgpu: update the method to get fb_loc of memory training
From: "Tianci.Yin" The method of getting fb_loc changed from parsing VBIOS to taking certain offset from top of VRAM Change-Id: I053b42fdb1d822722fa7980b2cd9f86b3fdce539 --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +- .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 36 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 7 drivers/gpu/drm/amd/include/atomfirmware.h| 14 5 files changed, 16 insertions(+), 50 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a78a363b1d71..fa2cf8e7bc07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -642,9 +642,8 @@ struct amdgpu_fw_vram_usage { struct amdgpu_bo *reserved_bo; void *va; - /* Offset on the top of VRAM, used as c2p write buffer. + /* GDDR6 training support flag. */ - u64 mem_train_fb_loc; bool mem_train_support; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index ff4eb96bdfb5..009cb0b03d13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -528,13 +528,9 @@ static int gddr6_mem_train_support(struct amdgpu_device *adev) int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) { struct atom_context *ctx = adev->mode_info.atom_context; - unsigned char *bios = ctx->bios; - struct vram_reserve_block *reserved_block; - int index, block_number; + int index; uint8_t frev, crev; uint16_t data_offset, size; - uint32_t start_address_in_kb; - uint64_t offset; int ret; adev->fw_vram_usage.mem_train_support = false; @@ -569,32 +565,6 @@ int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) return -EINVAL; } - reserved_block = (struct vram_reserve_block *) - (bios + data_offset + sizeof(struct atom_common_table_header)); - block_number = ((unsigned int)size - sizeof(struct atom_common_table_header)) - / sizeof(struct 
vram_reserve_block); - reserved_block += (block_number > 0) ? block_number-1 : 0; - DRM_DEBUG("block_number:0x%04x, last block: 0x%08xkb sz, %dkb fw, %dkb drv.\n", - block_number, - le32_to_cpu(reserved_block->start_address_in_kb), - le16_to_cpu(reserved_block->used_by_firmware_in_kb), - le16_to_cpu(reserved_block->used_by_driver_in_kb)); - if (reserved_block->used_by_firmware_in_kb > 0) { - start_address_in_kb = le32_to_cpu(reserved_block->start_address_in_kb); - offset = (uint64_t)start_address_in_kb * ONE_KiB; - if ((offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { - offset -= ONE_MiB; - } - - offset &= ~(ONE_MiB - 1); - adev->fw_vram_usage.mem_train_fb_loc = offset; - adev->fw_vram_usage.mem_train_support = true; - DRM_DEBUG("mem_train_fb_loc:0x%09llx.\n", offset); - ret = 0; - } else { - DRM_ERROR("used_by_firmware_in_kb is 0!\n"); - ret = -EINVAL; - } - - return ret; + adev->fw_vram_usage.mem_train_support = true; + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 2ff63d0414c9..ce5cb854bdb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1705,7 +1705,11 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) return 0; } - ctx->c2p_train_data_offset = adev->fw_vram_usage.mem_train_fb_loc; + ctx->c2p_train_data_offset = adev->gmc.mc_vram_size; + if ((ctx->c2p_train_data_offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { + ctx->c2p_train_data_offset -= ONE_MiB; + } + ctx->c2p_train_data_offset &= ~(ONE_MiB - 1); ctx->p2c_train_data_offset = (adev->gmc.mc_vram_size - GDDR6_MEM_TRAINING_OFFSET); ctx->train_data_size = GDDR6_MEM_TRAINING_DATA_SIZE_IN_BYTES; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index f1ebd424510c..19eb3e8456c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -66,6 +66,13 @@ struct amdgpu_copy_mem { 
unsigned long offset; }; +/* Definitions for constants */ +enum amdgpu_internal_constants +{ + ONE_KiB = 0x400, + ONE_MiB = 0x100000, +}; + extern const struct ttm_mem_type_manager_func amdgpu_gtt_mgr_func; extern const struct ttm_mem_type_manager_func amdgpu_vram_mgr_func; diff --git a/drivers/gpu/drm/amd/incl
[PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation(V2)
From: "Tianci.Yin" IP discovery TMR(occupied the top VRAM with size DISCOVERY_TMR_SIZE) has been reserved, and the p2c buffer is in the range of this TMR, so the p2c buffer reservation is unnecessary. Change-Id: Ib1f2f2b4a1f3869c03ffe22e2836cdbee17ba99f Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 21 ++--- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5f8fd3e3535b..3265487b859f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -202,7 +202,6 @@ struct psp_memory_training_context { /*vram offset of the p2c training data*/ u64 p2c_train_data_offset; - struct amdgpu_bo *p2c_bo; /*vram offset of the c2p training data*/ u64 c2p_train_data_offset; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ce5cb854bdb9..476ea4a4dc03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1681,9 +1681,6 @@ static int amdgpu_ttm_training_reserve_vram_fini(struct amdgpu_device *adev) amdgpu_bo_free_kernel(&ctx->c2p_bo, NULL, NULL); ctx->c2p_bo = NULL; - amdgpu_bo_free_kernel(&ctx->p2c_bo, NULL, NULL); - ctx->p2c_bo = NULL; - return 0; } @@ -1718,17 +1715,6 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) ctx->p2c_train_data_offset, ctx->c2p_train_data_offset); - ret = amdgpu_bo_create_kernel_at(adev, -ctx->p2c_train_data_offset, -ctx->train_data_size, -AMDGPU_GEM_DOMAIN_VRAM, -&ctx->p2c_bo, -NULL); - if (ret) { - DRM_ERROR("alloc p2c_bo failed(%d)!\n", ret); - goto Err_out; - } - ret = amdgpu_bo_create_kernel_at(adev, ctx->c2p_train_data_offset, ctx->train_data_size, @@ -1737,15 +1723,12 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) NULL); if (ret) { DRM_ERROR("alloc c2p_bo failed(%d)!\n", ret); - 
goto Err_out; + amdgpu_ttm_training_reserve_vram_fini(adev); + return ret; } ctx->init = PSP_MEM_TRAIN_RESERVE_SUCCESS; return 0; - -Err_out: - amdgpu_ttm_training_reserve_vram_fini(adev); - return ret; } /** -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation
Hi Guchun, Thanks very much for your suggestion. I will refine it and send it out later. Rico From: Chen, Guchun Sent: Tuesday, December 17, 2019 22:11 To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org Cc: Long, Gang ; Yin, Tianci (Rico) ; Xu, Feifei ; Wang, Kevin(Yang) ; Tuikov, Luben ; Deucher, Alexander ; Zhang, Hawking ; Koenig, Christian ; Yuan, Xiaojie Subject: RE: [PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation [AMD Official Use Only - Internal Distribution Only] -Original Message- From: amd-gfx On Behalf Of Tianci Yin Sent: Tuesday, December 17, 2019 7:23 PM To: amd-gfx@lists.freedesktop.org Cc: Long, Gang ; Yin, Tianci (Rico) ; Xu, Feifei ; Wang, Kevin(Yang) ; Tuikov, Luben ; Deucher, Alexander ; Zhang, Hawking ; Koenig, Christian ; Yuan, Xiaojie Subject: [PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation From: "Tianci.Yin" IP discovery TMR(occupied the top VRAM with size DISCOVERY_TMR_SIZE) has been reserved, and the p2c buffer is in the range of this TMR, so the p2c buffer reservation is unnecessary. 
Change-Id: Ib1f2f2b4a1f3869c03ffe22e2836cdbee17ba99f Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 14 -- 2 files changed, 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5f8fd3e3535b..3265487b859f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -202,7 +202,6 @@ struct psp_memory_training_context { /*vram offset of the p2c training data*/ u64 p2c_train_data_offset; - struct amdgpu_bo *p2c_bo; /*vram offset of the c2p training data*/ u64 c2p_train_data_offset; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ce5cb854bdb9..6f0ad1d1d4d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1681,9 +1681,6 @@ static int amdgpu_ttm_training_reserve_vram_fini(struct amdgpu_device *adev) amdgpu_bo_free_kernel(&ctx->c2p_bo, NULL, NULL); ctx->c2p_bo = NULL; - amdgpu_bo_free_kernel(&ctx->p2c_bo, NULL, NULL); - ctx->p2c_bo = NULL; - return 0; } @@ -1718,17 +1715,6 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) ctx->p2c_train_data_offset, ctx->c2p_train_data_offset); - ret = amdgpu_bo_create_kernel_at(adev, -ctx->p2c_train_data_offset, -ctx->train_data_size, -AMDGPU_GEM_DOMAIN_VRAM, -&ctx->p2c_bo, -NULL); - if (ret) { - DRM_ERROR("alloc p2c_bo failed(%d)!\n", ret); - goto Err_out; - } - ret = amdgpu_bo_create_kernel_at(adev, ctx->c2p_train_data_offset, ctx->train_data_size, [Guchun] If we have to remove such buffer reservation, from coding style's perspective, I suggest removing error handler code by "goto" too in amdgpu_ttm_training_reserve_vram_init. After removing p2c buffer reservation from this function, there is only one buffer reservation case for c2p. So direct error handle and return should be better. 
-- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cguchun.chen%40amd.com%7C888c561716c342aa9ecc08d782e397d0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637121786693411170&sdata=pH1rob4R5ljvEGo8PSjn1te7ctWLG1Wctv30lNCLyx4%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH v2 5/5] drm/amdgpu/smu: add metrics table lock for vega20 (v2)
It's fine with me to check them in as a temporary workaround. Series is reviewed-by: Evan Quan > -Original Message- > From: amd-gfx On Behalf Of Alex > Deucher > Sent: Wednesday, December 18, 2019 5:46 AM > To: amd-gfx@lists.freedesktop.org > Cc: Deucher, Alexander > Subject: [PATCH v2 5/5] drm/amdgpu/smu: add metrics table lock for vega20 > (v2) > > To protect access to the metrics table. > > v2: unlock on error > > Bug: > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.fre > edesktop.org%2Fdrm%2Famd%2Fissues%2F900&data=02%7C01%7Cevan.q > uan%40amd.com%7Cefa0dd86e5a74ead810708d7833a823e%7C3dd8961fe488 > 4e608e11a82d994e183d%7C0%7C0%7C637122159732784832&sdata=X0Z > UV1r90Dy3mvlp8zONFcxKQcSaciwkVt7GJabYH0I%3D&reserved=0 > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/powerplay/vega20_ppt.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c > b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c > index 2b1c3f8a0415..250ff5aa1305 100644 > --- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c > +++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c > @@ -1678,17 +1678,20 @@ static int vega20_get_metrics_table(struct > smu_context *smu, > struct smu_table_context *smu_table= &smu->smu_table; > int ret = 0; > > + mutex_lock(&smu->metrics_lock); > if (!smu_table->metrics_time || time_after(jiffies, smu_table- > >metrics_time + HZ / 1000)) { > ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, > (void *)smu_table->metrics_table, false); > if (ret) { > pr_info("Failed to export SMU metrics table!\n"); > + mutex_unlock(&smu->metrics_lock); > return ret; > } > smu_table->metrics_time = jiffies; > } > > memcpy(metrics_table, smu_table->metrics_table, > sizeof(SmuMetrics_t)); > + mutex_unlock(&smu->metrics_lock); > > return ret; > } > -- > 2.23.0 > > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.free > 
desktop.org%2Fmailman%2Flistinfo%2Famd- > gfx&data=02%7C01%7Cevan.quan%40amd.com%7Cefa0dd86e5a74ead81 > 0708d7833a823e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637 > 122159732784832&sdata=xFCKqTGqv57k9SucgTc7Ur5AGctpMO%2BbPvw > RKz53whI%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdkfd: expose num_cp_queues data field to topology node
See comment inline. Other than that, the series looks good to me. On 2019-12-16 2:02, Huang Rui wrote: Thunk driver would like to know the num_cp_queues data, however this data relied on different asic specific. So it's better to get it from kfd driver. Signed-off-by: Huang Rui --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 3 +++ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index cc01ccd..203c823 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -488,6 +488,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.num_sdma_xgmi_engines); sysfs_show_32bit_prop(buffer, "num_sdma_queues_per_engine", dev->node_props.num_sdma_queues_per_engine); + sysfs_show_32bit_prop(buffer, "num_cp_queues", + dev->node_props.num_cp_queues); if (dev->gpu) { log_max_watch_addr = @@ -1316,6 +1318,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) dev->node_props.num_gws = (hws_gws_support && dev->gpu->dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) ? amdgpu_amdkfd_get_num_gws(dev->gpu->kgd) : 0; + dev->node_props.num_cp_queues = get_queues_num(dev->gpu->dqm); kfd_fill_mem_clk_max_info(dev); kfd_fill_iolink_non_crat_info(dev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index 9346cc1..e447901 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -27,7 +27,7 @@ #include #include "kfd_crat.h" -#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 28 +#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 24 I don't see why you need to change the name size here. I'm not aware of any requirement that the structure size cannot change. This comment applies to patch 1 as well. 
Regards, Felix #define HSA_CAP_HOT_PLUGGABLE 0x0001 #define HSA_CAP_ATS_PRESENT 0x0002 @@ -82,6 +82,7 @@ struct kfd_node_properties { uint32_t num_sdma_engines; uint32_t num_sdma_xgmi_engines; uint32_t num_sdma_queues_per_engine; + uint32_t num_cp_queues; char name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; }; ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdgpu: attempt xgmi perfmon re-arm on failed arm
On 2019-12-17 12:28, Jonathan Kim wrote: The DF routines to arm xGMI performance will attempt to re-arm both on performance monitoring start and read on initial failure to arm. Signed-off-by: Jonathan Kim --- drivers/gpu/drm/amd/amdgpu/df_v3_6.c | 153 --- 1 file changed, 117 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c index 4043ebcea5de..af445054305f 100644 --- a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c +++ b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c @@ -162,25 +162,45 @@ static void df_v3_6_perfmon_rreg(struct amdgpu_device *adev, } /* - * df_v3_6_perfmon_wreg - write to perfmon lo and hi - * - * required to be atomic. no mmio method provided so subsequent reads after - * data writes cannot occur to preserve data fabrics finite state machine. + * retry arming counters every 100 usecs within 1 millisecond interval. + * if retry fails after time out, return error. */ -static void df_v3_6_perfmon_wreg(struct amdgpu_device *adev, uint32_t lo_addr, - uint32_t lo_val, uint32_t hi_addr, uint32_t hi_val) +#define ARM_RETRY_USEC_TIMEOUT 1000 +#define ARM_RETRY_USEC_INTERVAL100 +static int df_v3_6_perfmon_arm_with_retry(struct amdgpu_device *adev, + uint32_t lo_addr, uint32_t lo_val, + uint32_t hi_addr, uint32_t hi_val) { unsigned long flags, address, data; + uint32_t lo_val_rb, hi_val_rb; + int countdown = ARM_RETRY_USEC_TIMEOUT; address = adev->nbio.funcs->get_pcie_index_offset(adev); data = adev->nbio.funcs->get_pcie_data_offset(adev); spin_lock_irqsave(&adev->pcie_idx_lock, flags); - WREG32(address, lo_addr); - WREG32(data, lo_val); - WREG32(address, hi_addr); - WREG32(data, hi_val); + + while (countdown) { + WREG32(address, lo_addr); + WREG32(data, lo_val); + WREG32(address, hi_addr); + WREG32(data, hi_val); + + WREG32(address, lo_addr); + lo_val_rb = RREG32(data); + WREG32(address, hi_addr); + hi_val_rb = RREG32(data); + + if (lo_val == lo_val_rb && hi_val == hi_val_rb) + break; + + countdown -= 
ARM_RETRY_USEC_INTERVAL; + udelay(ARM_RETRY_USEC_INTERVAL); + } + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); I don't think it's a good idea to hold the spin lock for the entire duration of this retry loop. Maybe put that inside the loop and release the lock while waiting in udelay. + + return countdown > 0 ? 0 : -ETIME; } /* get the number of df counters available */ @@ -334,20 +354,20 @@ static void df_v3_6_pmc_get_addr(struct amdgpu_device *adev, switch (target_cntr) { case 0: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo0 : smnPerfMonCtrLo0; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi0 : smnPerfMonCtrHi0; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo4 : smnPerfMonCtrLo4; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi4 : smnPerfMonCtrHi4; break; case 1: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo1 : smnPerfMonCtrLo1; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi1 : smnPerfMonCtrHi1; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo5 : smnPerfMonCtrLo5; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi5 : smnPerfMonCtrHi5; break; case 2: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo2 : smnPerfMonCtrLo2; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi2 : smnPerfMonCtrHi2; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo6 : smnPerfMonCtrLo6; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi6 : smnPerfMonCtrHi6; break; case 3: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo3 : smnPerfMonCtrLo3; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi3 : smnPerfMonCtrHi3; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo7 : smnPerfMonCtrLo7; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi7 : smnPerfMonCtrHi7; break; } @@ -422,6 +442,42 @@ static int df_v3_6_pmc_add_cntr(struct amdgpu_device *adev, return -ENOSPC; } +#define DEFERRED_ARM_MASK (1 << 31) +static int df_v3_6_pmc_defer_cntr(struct amdgpu_device *adev, + uint64_t config, int err) Consider renaming this function. I found its usage confusing because it's used to defer arming as well as clearing the deferred flag. Maybe df_v3_6_pmc_set_deferred. 
The "err" parameter could be named "defer" to better indicate its meaning and maybe make it bool, since that's what's returned by the counterpart df_v3_6_pmc_is_deferred. +{ + int
[PATCH v2 5/5] drm/amdgpu/smu: add metrics table lock for vega20 (v2)
To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/vega20_ppt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c index 2b1c3f8a0415..250ff5aa1305 100644 --- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c @@ -1678,17 +1678,20 @@ static int vega20_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2 2/5] drm/amdgpu/smu: add metrics table lock for arcturus (v2)
To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c index 17eeb546c550..be4ae0aea9a0 100644 --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c @@ -867,18 +867,21 @@ static int arcturus_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2 3/5] drm/amdgpu/smu: add metrics table lock for navi (v2)
To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index 7b42e72dc939..bf87e93b26fc 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -564,17 +564,20 @@ static int navi10_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2 1/5] drm/amdgpu/smu: add metrics table lock
This table is used for lots of things, add its own lock. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 + drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index f76a1717ffbd..936c68298786 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -872,6 +872,7 @@ static int smu_sw_init(void *handle) smu->smu_baco.platform_support = false; mutex_init(&smu->sensor_lock); + mutex_init(&smu->metrics_lock); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index a7d0ad831491..541cfde289ea 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -350,6 +350,7 @@ struct smu_context const struct pptable_funcs *ppt_funcs; struct mutexmutex; struct mutexsensor_lock; + struct mutexmetrics_lock; uint64_t pool_size; struct smu_table_contextsmu_table; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2 4/5] drm/amdgpu/smu: add metrics table lock for renoir (v2)
To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/renoir_ppt.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c index 81520b0fca68..979772dbe6a9 100644 --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c @@ -171,17 +171,20 @@ static int renoir_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } @@ -239,8 +242,7 @@ static int renoir_print_clk_levels(struct smu_context *smu, memset(&metrics, 0, sizeof(metrics)); - ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, - (void *)&metrics, false); + ret = renoir_get_metrics_table(smu, &metrics); if (ret) return ret; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV
I agree. Removing the call to pre-reset probably breaks GPU reset for KFD. We call the KFD suspend function in pre-reset, which uses the HIQ to stop any user mode queues still running. If that is not possible because the HIQ is hanging, it should fail with a timeout. There may be something we can do if we know that the HIQ is hanging, so we only update the KFD-internal queue state without actually sending anything to the HIQ. Regards, Felix On 2019-12-17 10:37, shaoyunl wrote: I think amdkfd side depends on this call to stop the user queue, without this call, the user queue can submit to HW during the reset which could cause hang again ... Do we know the root cause why this function would ruin MEC ? From the logic, I think this function should be called before FLR since we need to disable the user queue submission first. I remembered the function should use hiq to communicate with HW , shouldn't use kiq to access HW register, has this been changed ? Regards shaoyun.liu On 2019-12-17 5:19 a.m., Monk Liu wrote: issues: MEC is ruined by the amdkfd_pre_reset after VF FLR done fix: amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation to block this sequence: if we do pre_reset() before VF FLR, it would go KIQ way to do register access and stuck there, because KIQ probably won't work by that time (e.g. you already made GFX hang) so the best way right now is to simply remove it. 
Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 605cef6..ae962b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cfelix.kuehling%40amd.com%7Cbd097404ba8b4e7f9d9308d7830717fe%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637121938908876710&sdata=bNGTZtFLiQ46UwjCa5u8hXG1KUtK%2Fs98g7rBmBtTaPs%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/2] drm/amdgpu/display: include delay.h
On 2019-12-17 3:47 p.m., Alex Deucher wrote: For udelay. This is needed for some platforms. Signed-off-by: Alex Deucher Reviewed-by: Nicholas Kazlauskas I wonder if it makes more sense to include this in os_types.h to avoid these errors in the future. Nicholas Kazlauskas --- drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c index 110c8620907b..bcbc0b8a9aa0 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c @@ -23,6 +23,8 @@ * */ +#include + #include "hdcp.h" static inline enum mod_hdcp_status check_receiver_id_list_ready(struct mod_hdcp *hdcp) ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/2] drm/amdgpu/display: use msleep rather than udelay for HDCP
ARM has a 2000us limit for udelay. Switch to msleep. This code executes in a worker thread so shouldn't be an atomic context. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c index bcbc0b8a9aa0..f730b94ac3c0 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c @@ -153,7 +153,7 @@ static enum mod_hdcp_status poll_l_prime_available(struct mod_hdcp *hdcp) { enum mod_hdcp_status status; uint8_t size; - uint16_t max_wait = 2; // units of us + uint16_t max_wait = 20; // units of ms uint16_t num_polls = 5; uint16_t wait_time = max_wait / num_polls; @@ -161,7 +161,7 @@ static enum mod_hdcp_status poll_l_prime_available(struct mod_hdcp *hdcp) status = MOD_HDCP_STATUS_INVALID_OPERATION; else for (; num_polls; num_polls--) { - udelay(wait_time); + msleep(wait_time); status = mod_hdcp_read_rxstatus(hdcp); if (status != MOD_HDCP_STATUS_SUCCESS) @@ -474,7 +474,7 @@ static enum mod_hdcp_status locality_check(struct mod_hdcp *hdcp, hdcp, "lc_init_write")) goto out; if (is_dp_hdcp(hdcp)) - udelay(16000); + msleep(16); else if (!mod_hdcp_execute_and_set(poll_l_prime_available, &input->l_prime_available_poll, &status, -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/2] drm/amdgpu/display: include delay.h
For udelay. This is needed for some platforms. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c index 110c8620907b..bcbc0b8a9aa0 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c @@ -23,6 +23,8 @@ * */ +#include + #include "hdcp.h" static inline enum mod_hdcp_status check_receiver_id_list_ready(struct mod_hdcp *hdcp) -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [CI-NOTIFY]: TCWG Bisect tcwg_kernel/llvm-release-aarch64-next-allmodconfig - Build # 48 - Successful!
On Tue, Dec 17, 2019 at 09:19:37AM -0800, 'Nick Desaulniers' via Clang Built Linux wrote: > Bhawanpreet, I suspect you're missing the header to include udelay in > drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c. > Can you please send a fix for this? > arm allyesconfig is also broken at link time, which I reported here previously: https://lists.freedesktop.org/archives/amd-gfx/2019-November/043109.html ld.lld: error: undefined symbol: __bad_udelay >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(locality_check) in >>> archive drivers/built-in.a >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(poll_l_prime_available) >>> in archive drivers/built-in.a >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(poll_l_prime_available) >>> in archive drivers/built-in.a >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(poll_l_prime_available) >>> in archive drivers/built-in.a >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(poll_l_prime_available) >>> in archive drivers/built-in.a >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(poll_l_prime_available) >>> in archive drivers/built-in.a > On Tue, Dec 17, 2019 at 7:07 AM wrote: > > > > Successfully identified regression in *linux* in CI configuration > > tcwg_kernel/llvm-release-aarch64-next-allmodconfig. 
So far, this commit > > has regressed CI configurations: > > - tcwg_kernel/gnu-release-aarch64-next-allmodconfig > > - tcwg_kernel/llvm-master-aarch64-next-allyesconfig > > - tcwg_kernel/llvm-master-arm-next-allmodconfig > > - tcwg_kernel/llvm-release-aarch64-next-allmodconfig > > - tcwg_kernel/llvm-release-arm-next-allmodconfig > > > > Culprit: > > > > commit 51466b3fd2725bfb0de629f71c0854ff276d50ae > > Author: Bhawanpreet Lakha > > > > drm/amd/display: Add execution and transition states for HDCP2.2 > > > > > > First few errors in logs of first_bad: > > 00:03:03 > > drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c:162:4: > > error: implicit declaration of function 'udelay' > > [-Werror,-Wimplicit-function-declaration] > > 00:03:03 > > drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c:472:3: > > error: implicit declaration of function 'udelay' > > [-Werror,-Wimplicit-function-declaration] > > 00:03:03 make[4]: *** > > [drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.o] > > Error 1 > > 00:03:06 make[3]: *** [drivers/gpu/drm/amd/amdgpu] Error 2 > > 00:03:26 make[2]: *** [drivers/gpu/drm] Error 2 > > 00:03:26 make[1]: *** [drivers/gpu] Error 2 > > 00:04:14 make: *** [drivers] Error 2 > > Configuration details: > > rr[llvm_url]="https://github.com/llvm/llvm-project.git"; > > rr[linux_url]="https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git"; > > rr[linux_branch]="32b8acf85223448973ca0bf0ee8149a01410f3a0" > > > > Results regressed to (for first_bad == > > 51466b3fd2725bfb0de629f71c0854ff276d50ae) > > reset_artifacts: > > -10 > > build_llvm: > > -1 > > linux_n_obj: > > 18938 > > > > from (for last_good == eff682f83c9c2030761e7536c5d97e1b20f71c15) > > reset_artifacts: > > -10 > > build_llvm: > > -1 > > linux_n_obj: > > 25911 > > linux build successful: > > all > > > > Artifacts of first_bad build: > > 
https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/build-51466b3fd2725bfb0de629f71c0854ff276d50ae/ > > Artifacts of last_good build: > > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/build-eff682f83c9c2030761e7536c5d97e1b20f71c15/ > > Build top page/logs: > > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/ > > > > Reproduce builds: > > > > mkdir investigate-linux-51466b3fd2725bfb0de629f71c0854ff276d50ae > > cd investigate-linux-51466b3fd2725bfb0de629f71c0854ff276d50ae > > > > git clone https://git.linaro.org/toolchain/jenkins-scripts > > > > mkdir -p artifacts/manifests > > curl -o artifacts/manifests/build-baseline.sh > > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/manifests/build-baseline.sh > > curl -o artifacts/manifests/build-parameters.sh > > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/manifests/build-parameters.sh > > curl -o artifacts/test.sh > > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/test.sh > > chmod +x artifacts/test.sh > > > > # Reproduce the baseline build (build all pre-requisites) > > ./jenkins-scripts/tcwg_kernel-build.sh @@
Re: [PATCH 3/5] drm/amdgpu/smu: add metrics table lock for navi
[AMD Official Use Only - Internal Distribution Only] yeah, they need some fixes. Alex From: Pelloux-prayer, Pierre-eric Sent: Tuesday, December 17, 2019 1:56 PM To: Alex Deucher ; amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander Subject: Re: [PATCH 3/5] drm/amdgpu/smu: add metrics table lock for navi Hi Alex, Isn't this patch missing something like this: pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; to release the lock in case of error? Regards, Pierre-Eric On 17/12/2019 15:55, Alex Deucher wrote: > To protect access to the metrics table. > > Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > index 15403b7979d6..102fddda925b 100644 > --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > @@ -564,6 +564,7 @@ static int navi10_get_metrics_table(struct smu_context > *smu, >struct smu_table_context *smu_table= &smu->smu_table; >int ret = 0; > > + mutex_lock(&smu->metrics_lock); >if (!smu_table->metrics_time || time_after(jiffies, > smu_table->metrics_time + msecs_to_jiffies(100))) { >ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, >(void *)smu_table->metrics_table, false); > @@ -575,6 +576,7 @@ static int navi10_get_metrics_table(struct smu_context > *smu, >} > >memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); > + mutex_unlock(&smu->metrics_lock); > >return ret; > } > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 3/5] drm/amdgpu/smu: add metrics table lock for navi
Hi Alex, Isn't this patch missing something like this: pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; to release the lock in case of error? Regards, Pierre-Eric On 17/12/2019 15:55, Alex Deucher wrote: > To protect access to the metrics table. > > Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > index 15403b7979d6..102fddda925b 100644 > --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > @@ -564,6 +564,7 @@ static int navi10_get_metrics_table(struct smu_context > *smu, > struct smu_table_context *smu_table= &smu->smu_table; > int ret = 0; > > + mutex_lock(&smu->metrics_lock); > if (!smu_table->metrics_time || time_after(jiffies, > smu_table->metrics_time + msecs_to_jiffies(100))) { > ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, > (void *)smu_table->metrics_table, false); > @@ -575,6 +576,7 @@ static int navi10_get_metrics_table(struct smu_context > *smu, > } > > memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); > + mutex_unlock(&smu->metrics_lock); > > return ret; > } > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/5] drm/amdgpu/smu: add metrics table lock
[AMD Official Use Only - Internal Distribution Only] the swSMU should be add metrics lock to protect the maintenance data of the metrics table. The series patches are Reviewed-by: Kevin Wang Best Regards, Kevin From: amd-gfx on behalf of Alex Deucher Sent: Tuesday, December 17, 2019 10:55 PM To: amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander Subject: [PATCH 1/5] drm/amdgpu/smu: add metrics table lock This table is used for lots of things, add it's own lock. Bug: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fdrm%2Famd%2Fissues%2F900&data=02%7C01%7CKevin1.Wang%40amd.com%7C4ea0cd2cfad44f285ffb08d7830121d7%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637121913347060555&sdata=Az1dOiYWPr%2FJIvTgo35a7a9oTnnpCVvtSnA85mgExf8%3D&reserved=0 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 + drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 67818558..6177a6664737 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -872,6 +872,7 @@ static int smu_sw_init(void *handle) smu->smu_baco.platform_support = false; mutex_init(&smu->sensor_lock); + mutex_init(&smu->metrics_lock); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index ca3fdc6777cf..503099f254c1 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -350,6 +350,7 @@ struct smu_context const struct pptable_funcs *ppt_funcs; struct mutexmutex; struct mutexsensor_lock; + struct mutexmetrics_lock; uint64_t pool_size; struct smu_table_contextsmu_table; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org 
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7CKevin1.Wang%40amd.com%7C4ea0cd2cfad44f285ffb08d7830121d7%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637121913347070548&sdata=EwWmrrJWWxG14kfkuXeM4YPA9odQI2gWyq0iT4pOXCQ%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Tue, Dec 17, 2019 at 12:41:06PM -0500, Alex Deucher wrote: > On Tue, Dec 17, 2019 at 11:46 AM Daniel Vetter wrote: > > > > On Tue, Dec 17, 2019 at 09:17:51AM -0500, Alex Deucher wrote: > > > On Tue, Dec 17, 2019 at 8:47 AM Alex Deucher > > > wrote: > > > > > > > > On Tue, Dec 17, 2019 at 7:52 AM Daniel Vetter wrote: > > > > > > > > > > On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > > > > > > Hi Dave, Daniel, > > > > > > > > > > > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. > > > > > > We > > > > > > added a new firmware for display, and this just adds the version > > > > > > query > > > > > > to our existing firmware query interface. UMDs like mesa use this > > > > > > interface to > > > > > > query things like CP or UVD firmware versions to see what features > > > > > > are > > > > > > supported. > > > > > > > > > > I got bored, and a quick serach for what the userspace for > > > > > AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any > > > > > patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't > > > > > come > > > > > with pointers. From the patch series description I have no idea why > > > > > you'd > > > > > even want this in userspace (but then I stopped being surprised by hw > > > > > design long ago). > > > > > > > > > > > > > We expose all the firmwares via the same interface for consistency, > > > > but the only ones user space generally cares about are the versions > > > > for the acceleration engines like gfx and multimedia. I can revert it > > > > if it's a big deal, but I'd prefer to keep it for consistency since > > > > all the others are already available via the same interface. It's not > > > > really a new interace with no user per se. > > > > Imo not the best style adding uapi just in case. We have a lot of that > > which then ended up (in other drivers at least) being for some hacks for > > blobs or vendor trees and stuff like that. 
So personally I'd lean towards > > just taking all the ones out you don't need (but keep the debugfs ofc), > > but meh. > > > > > Also, there are a few minor conflicts. I backmerged drm-next into my > > > drm-next branch if that is helpful. I can also send another PR with > > > the backmerge if you'd prefer. > > > > Looks like you didn't push, and I've thrown in the towel on the wm stuff. > > I honestly wonder how exactly you validate this stuff internally, this is > > almost as bad as i915 :-) > > > > Fixing your scripts to also push your validated integration tree (whatever > > funny kernel version that's based on, at least I'm assuming you're testing > > the merged version somewhere) might be really good here. Or use dim push, > > so that the git rr-cache is shared. > > Here's the relevant backmerge in my drm-next branch: > https://cgit.freedesktop.org/~agd5f/linux/commit/?h=drm-next&id=a759ca47934e83a117a7095a5fd9b91e62a91a0c > > And here's the standalong branch with just the merge on top of my last PR: > https://cgit.freedesktop.org/~agd5f/linux/log/?h=drm-next-5.6-backmerge Oh silly me didn't scroll down enough. btw there's a bunch of other things now in drm/amd from drm-misc, I think you want to redo your backmerge (and double check what I've done). Pulled, thanks for the pile. -Daniel > > Thanks! 
> > Alex > > > > > Thanks, Daniel > > > > > > > > Alex > > > > > > > Alex > > > > > > > > > Otherwise looks all good, no complaints from dim at least :-) > > > > > -Daniel > > > > > > > > > > > > > > > > > The following changes since commit > > > > > > 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > > > > > > > > > > > drm/amdgpu/vcn: finish delay work before release resources > > > > > > (2019-11-13 15:29:42 -0500) > > > > > > > > > > > > are available in the Git repository at: > > > > > > > > > > > > git://people.freedesktop.org/~agd5f/linux > > > > > > tags/drm-next-5.6-2019-12-11 > > > > > > > > > > > > for you to fetch changes up to > > > > > > ad808910be68dcf8da5d837d4511d00ad5d3678a: > > > > > > > > > > > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 > > > > > > 15:22:08 -0500) > > > > > > > > > > > > > > > > > > drm-next-5.6-2019-12-11: > > > > > > > > > > > > amdgpu: > > > > > > - Add MST atomic routines > > > > > > - Add support for DMCUB (new helper microengine for displays) > > > > > > - Add OEM i2c support in DC > > > > > > - Use vstartup for vblank events on DCN > > > > > > - Simplify Kconfig for DC > > > > > > - Renoir fixes for DC > > > > > > - Clean up function pointers in DC > > > > > > - Initial support for HDCP 2.x > > > > > > - Misc code cleanups > > > > > > - GFX10 fixes > > > > > > - Rework JPEG engine handling for VCN > > > > > > - Add clock and power gating support for JPEG > > > > > > - BACO support for Arcturus > > > > > > - Cleanup PSP ring handling > > > > > > - Add framework for using BACO with runtime pm to save power > > > > > > -
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Tue, Dec 17, 2019 at 11:46 AM Daniel Vetter wrote: > > On Tue, Dec 17, 2019 at 09:17:51AM -0500, Alex Deucher wrote: > > On Tue, Dec 17, 2019 at 8:47 AM Alex Deucher wrote: > > > > > > On Tue, Dec 17, 2019 at 7:52 AM Daniel Vetter wrote: > > > > > > > > On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > > > > > Hi Dave, Daniel, > > > > > > > > > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. > > > > > We > > > > > added a new firmware for display, and this just adds the version query > > > > > to our existing firmware query interface. UMDs like mesa use this > > > > > interface to > > > > > query things like CP or UVD firmware versions to see what features are > > > > > supported. > > > > > > > > I got bored, and a quick serach for what the userspace for > > > > AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any > > > > patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't > > > > come > > > > with pointers. From the patch series description I have no idea why > > > > you'd > > > > even want this in userspace (but then I stopped being surprised by hw > > > > design long ago). > > > > > > > > > > We expose all the firmwares via the same interface for consistency, > > > but the only ones user space generally cares about are the versions > > > for the acceleration engines like gfx and multimedia. I can revert it > > > if it's a big deal, but I'd prefer to keep it for consistency since > > > all the others are already available via the same interface. It's not > > > really a new interace with no user per se. > > Imo not the best style adding uapi just in case. We have a lot of that > which then ended up (in other drivers at least) being for some hacks for > blobs or vendor trees and stuff like that. So personally I'd lean towards > just taking all the ones out you don't need (but keep the debugfs ofc), > but meh. > > > Also, there are a few minor conflicts. 
I backmerged drm-next into my > > drm-next branch if that is helpful. I can also send another PR with > > the backmerge if you'd prefer. > > Looks like you didn't push, and I've thrown in the towel on the wm stuff. > I honestly wonder how exactly you validate this stuff internally, this is > almost as bad as i915 :-) > > Fixing your scripts to also push your validated integration tree (whatever > funny kernel version that's based on, at least I'm assuming you're testing > the merged version somewhere) might be really good here. Or use dim push, > so that the git rr-cache is shared. Here's the relevant backmerge in my drm-next branch: https://cgit.freedesktop.org/~agd5f/linux/commit/?h=drm-next&id=a759ca47934e83a117a7095a5fd9b91e62a91a0c And here's the standalong branch with just the merge on top of my last PR: https://cgit.freedesktop.org/~agd5f/linux/log/?h=drm-next-5.6-backmerge Thanks! Alex > > Thanks, Daniel > > > > > Alex > > > > > Alex > > > > > > > Otherwise looks all good, no complaints from dim at least :-) > > > > -Daniel > > > > > > > > > > > > > > The following changes since commit > > > > > 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > > > > > > > > > drm/amdgpu/vcn: finish delay work before release resources > > > > > (2019-11-13 15:29:42 -0500) > > > > > > > > > > are available in the Git repository at: > > > > > > > > > > git://people.freedesktop.org/~agd5f/linux > > > > > tags/drm-next-5.6-2019-12-11 > > > > > > > > > > for you to fetch changes up to > > > > > ad808910be68dcf8da5d837d4511d00ad5d3678a: > > > > > > > > > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 > > > > > 15:22:08 -0500) > > > > > > > > > > > > > > > drm-next-5.6-2019-12-11: > > > > > > > > > > amdgpu: > > > > > - Add MST atomic routines > > > > > - Add support for DMCUB (new helper microengine for displays) > > > > > - Add OEM i2c support in DC > > > > > - Use vstartup for vblank events on DCN > > > > > - Simplify Kconfig for DC > > > > > - Renoir fixes for DC > 
> > > > - Clean up function pointers in DC > > > > > - Initial support for HDCP 2.x > > > > > - Misc code cleanups > > > > > - GFX10 fixes > > > > > - Rework JPEG engine handling for VCN > > > > > - Add clock and power gating support for JPEG > > > > > - BACO support for Arcturus > > > > > - Cleanup PSP ring handling > > > > > - Add framework for using BACO with runtime pm to save power > > > > > - Move core pci state handling out of the driver for pm ops > > > > > - Allow guest power control in 1 VF case with SR-IOV > > > > > - SR-IOV fixes > > > > > - RAS fixes > > > > > - Support for power metrics on renoir > > > > > - Golden settings updates for gfx10 > > > > > - Enable gfxoff on supported navi10 skus > > > > > - Update MAINTAINERS > > > > > > > > > > amdkfd: > > > > > - Clean up generational gfx code > > > > > - Fixes for gfx10 > > > > > - DIQ fixes > > > > > - Share more code with amdgpu > > > > > > > > > > radeon: > > > > >
Re: [CI-NOTIFY]: TCWG Bisect tcwg_kernel/llvm-release-aarch64-next-allmodconfig - Build # 48 - Successful!
Bhawanpreet, I suspect you're missing the header to include udelay in drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c. Can you please send a fix for this? On Tue, Dec 17, 2019 at 7:07 AM wrote: > > Successfully identified regression in *linux* in CI configuration > tcwg_kernel/llvm-release-aarch64-next-allmodconfig. So far, this commit has > regressed CI configurations: > - tcwg_kernel/gnu-release-aarch64-next-allmodconfig > - tcwg_kernel/llvm-master-aarch64-next-allyesconfig > - tcwg_kernel/llvm-master-arm-next-allmodconfig > - tcwg_kernel/llvm-release-aarch64-next-allmodconfig > - tcwg_kernel/llvm-release-arm-next-allmodconfig > > Culprit: > > commit 51466b3fd2725bfb0de629f71c0854ff276d50ae > Author: Bhawanpreet Lakha > > drm/amd/display: Add execution and transition states for HDCP2.2 > > > First few errors in logs of first_bad: > 00:03:03 > drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c:162:4: > error: implicit declaration of function 'udelay' > [-Werror,-Wimplicit-function-declaration] > 00:03:03 > drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c:472:3: > error: implicit declaration of function 'udelay' > [-Werror,-Wimplicit-function-declaration] > 00:03:03 make[4]: *** > [drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.o] Error 1 > 00:03:06 make[3]: *** [drivers/gpu/drm/amd/amdgpu] Error 2 > 00:03:26 make[2]: *** [drivers/gpu/drm] Error 2 > 00:03:26 make[1]: *** [drivers/gpu] Error 2 > 00:04:14 make: *** [drivers] Error 2 > Configuration details: > rr[llvm_url]="https://github.com/llvm/llvm-project.git"; > rr[linux_url]="https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git"; > rr[linux_branch]="32b8acf85223448973ca0bf0ee8149a01410f3a0" > > Results regressed to (for first_bad == > 51466b3fd2725bfb0de629f71c0854ff276d50ae) > reset_artifacts: > -10 > build_llvm: > -1 > linux_n_obj: > 18938 > > from (for last_good == eff682f83c9c2030761e7536c5d97e1b20f71c15) > 
reset_artifacts: > -10 > build_llvm: > -1 > linux_n_obj: > 25911 > linux build successful: > all > > Artifacts of first_bad build: > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/build-51466b3fd2725bfb0de629f71c0854ff276d50ae/ > Artifacts of last_good build: > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/build-eff682f83c9c2030761e7536c5d97e1b20f71c15/ > Build top page/logs: > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/ > > Reproduce builds: > > mkdir investigate-linux-51466b3fd2725bfb0de629f71c0854ff276d50ae > cd investigate-linux-51466b3fd2725bfb0de629f71c0854ff276d50ae > > git clone https://git.linaro.org/toolchain/jenkins-scripts > > mkdir -p artifacts/manifests > curl -o artifacts/manifests/build-baseline.sh > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/manifests/build-baseline.sh > curl -o artifacts/manifests/build-parameters.sh > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/manifests/build-parameters.sh > curl -o artifacts/test.sh > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/test.sh > chmod +x artifacts/test.sh > > # Reproduce the baseline build (build all pre-requisites) > ./jenkins-scripts/tcwg_kernel-build.sh @@ > artifacts/manifests/build-baseline.sh > > cd linux > > # Reproduce first_bad build > git checkout --detach 51466b3fd2725bfb0de629f71c0854ff276d50ae > ../artifacts/test.sh > > # Reproduce last_good build > git checkout --detach eff682f83c9c2030761e7536c5d97e1b20f71c15 > ../artifacts/test.sh > > cd .. 
> > > History of pending regressions and results: > https://git.linaro.org/toolchain/ci/base-artifacts.git/log/?h=linaro-local/ci/tcwg_kernel/llvm-release-aarch64-next-allmodconfig > > Artifacts: > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/ > Build log: > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/consoleText > > Full commit: > > commit 51466b3fd2725bfb0de629f71c0854ff276d50ae > Author: Bhawanpreet Lakha > Date: Wed Sep 18 11:18:15 2019 -0400 > > drm/amd/display: Add execution and transition states for HDCP2.2 > > The module works like a state machine > > +-+ > --> | Execution.c | -- > | +-+ | > | V > ++ ++ +--+ > | DM |->| Hdcp.c | < | Transition.c | > ++<-++ +--+ > > This patch a
[PATCH 1/2] drm/amdgpu: add perfmons accessible during df c-states
During DF C-State, Perfmon counters outside of range 1D700-1D7FF will encounter SLVERR affecting xGMI performance monitoring. PerfmonCtr[7:4] is being added to avoid SLVERR during read since it falls within this range. PerfmonCtl[7:4] is being added in order to arm PerfmonCtr[7:4]. Since PerfmonCtl[7:4] exists outside of range 1D700-1D7FF, DF routines will be enabled to opportunistically re-arm PerfmonCtl[7:4] on retry after SLVERR. Signed-off-by: Jonathan Kim Acked-by: Alex Deucher --- .../drm/amd/include/asic_reg/df/df_3_6_offset.h | 16 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/include/asic_reg/df/df_3_6_offset.h b/drivers/gpu/drm/amd/include/asic_reg/df/df_3_6_offset.h index c2bd25589e84..f301e5fe2109 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/df/df_3_6_offset.h +++ b/drivers/gpu/drm/amd/include/asic_reg/df/df_3_6_offset.h @@ -38,6 +38,14 @@ #define smnPerfMonCtlHi2 0x01d464UL #define smnPerfMonCtlLo3 0x01d470UL #define smnPerfMonCtlHi3 0x01d474UL +#define smnPerfMonCtlLo4 0x01d880UL +#define smnPerfMonCtlHi4 0x01d884UL +#define smnPerfMonCtlLo5 0x01d888UL +#define smnPerfMonCtlHi5 0x01d88cUL +#define smnPerfMonCtlLo6 0x01d890UL +#define smnPerfMonCtlHi6 0x01d894UL +#define smnPerfMonCtlLo7 0x01d898UL +#define smnPerfMonCtlHi7 0x01d89cUL #define smnPerfMonCtrLo0 0x01d448UL #define smnPerfMonCtrHi0 0x01d44cUL @@ -47,6 +55,14 @@ #define smnPerfMonCtrHi2 0x01d46cUL #define smnPerfMonCtrLo3 0x01d478UL #define smnPerfMonCtrHi3 0x01d47cUL +#define smnPerfMonCtrLo4 0x01d790UL +#define smnPerfMonCtrHi4 0x01d794UL +#define smnPerfMonCtrLo5 0x01d798UL +#define smnPerfMonCtrHi5 0x01d79cUL +#define smnPerfMonCtrLo6 0x01d7a0UL +#define smnPerfMonCtrHi6 0x01d7a4UL +#define smnPerfMonCtrLo7 0x01d7a8UL +#define smnPerfMonCtrHi7 0x01d7acUL #define smnDF_PIE_AON_FabricIndirectConfigAccessAddress3 0x1d05cUL #define smnDF_PIE_AON_FabricIndirectConfigAccessDataLo3 0x1d098UL -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org 
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/2] drm/amdgpu: attempt xgmi perfmon re-arm on failed arm
The DF routines to arm xGMI performance will attempt to re-arm both on performance monitoring start and read on initial failure to arm. Signed-off-by: Jonathan Kim --- drivers/gpu/drm/amd/amdgpu/df_v3_6.c | 153 --- 1 file changed, 117 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c index 4043ebcea5de..af445054305f 100644 --- a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c +++ b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c @@ -162,25 +162,45 @@ static void df_v3_6_perfmon_rreg(struct amdgpu_device *adev, } /* - * df_v3_6_perfmon_wreg - write to perfmon lo and hi - * - * required to be atomic. no mmio method provided so subsequent reads after - * data writes cannot occur to preserve data fabrics finite state machine. + * retry arming counters every 100 usecs within 1 millisecond interval. + * if retry fails after time out, return error. */ -static void df_v3_6_perfmon_wreg(struct amdgpu_device *adev, uint32_t lo_addr, - uint32_t lo_val, uint32_t hi_addr, uint32_t hi_val) +#define ARM_RETRY_USEC_TIMEOUT 1000 +#define ARM_RETRY_USEC_INTERVAL100 +static int df_v3_6_perfmon_arm_with_retry(struct amdgpu_device *adev, + uint32_t lo_addr, uint32_t lo_val, + uint32_t hi_addr, uint32_t hi_val) { unsigned long flags, address, data; + uint32_t lo_val_rb, hi_val_rb; + int countdown = ARM_RETRY_USEC_TIMEOUT; address = adev->nbio.funcs->get_pcie_index_offset(adev); data = adev->nbio.funcs->get_pcie_data_offset(adev); spin_lock_irqsave(&adev->pcie_idx_lock, flags); - WREG32(address, lo_addr); - WREG32(data, lo_val); - WREG32(address, hi_addr); - WREG32(data, hi_val); + + while (countdown) { + WREG32(address, lo_addr); + WREG32(data, lo_val); + WREG32(address, hi_addr); + WREG32(data, hi_val); + + WREG32(address, lo_addr); + lo_val_rb = RREG32(data); + WREG32(address, hi_addr); + hi_val_rb = RREG32(data); + + if (lo_val == lo_val_rb && hi_val == hi_val_rb) + break; + + countdown -= ARM_RETRY_USEC_INTERVAL; + 
udelay(ARM_RETRY_USEC_INTERVAL); + } + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); + + return countdown > 0 ? 0 : -ETIME; } /* get the number of df counters available */ @@ -334,20 +354,20 @@ static void df_v3_6_pmc_get_addr(struct amdgpu_device *adev, switch (target_cntr) { case 0: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo0 : smnPerfMonCtrLo0; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi0 : smnPerfMonCtrHi0; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo4 : smnPerfMonCtrLo4; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi4 : smnPerfMonCtrHi4; break; case 1: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo1 : smnPerfMonCtrLo1; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi1 : smnPerfMonCtrHi1; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo5 : smnPerfMonCtrLo5; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi5 : smnPerfMonCtrHi5; break; case 2: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo2 : smnPerfMonCtrLo2; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi2 : smnPerfMonCtrHi2; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo6 : smnPerfMonCtrLo6; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi6 : smnPerfMonCtrHi6; break; case 3: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo3 : smnPerfMonCtrLo3; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi3 : smnPerfMonCtrHi3; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo7 : smnPerfMonCtrLo7; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi7 : smnPerfMonCtrHi7; break; } @@ -422,6 +442,42 @@ static int df_v3_6_pmc_add_cntr(struct amdgpu_device *adev, return -ENOSPC; } +#define DEFERRED_ARM_MASK (1 << 31) +static int df_v3_6_pmc_defer_cntr(struct amdgpu_device *adev, + uint64_t config, int err) +{ + int target_cntr; + + target_cntr = df_v3_6_pmc_config_2_cntr(adev, config); + + if (target_cntr < 0) + return -EINVAL; + + if (err) + adev->df_perfmon_config_assign_mask[target_cntr] |= + DEFERRED_ARM_MASK; + else + adev->df_perfmon_config_assign_mask[target_cntr] &= + ~DEFERRED_ARM_MASK; + + return 0; +} + +static bool df_v3_6_pmc_is_deferred(struct amdgpu_device *adev, +
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Tue, Dec 17, 2019 at 09:17:51AM -0500, Alex Deucher wrote: > On Tue, Dec 17, 2019 at 8:47 AM Alex Deucher wrote: > > > > On Tue, Dec 17, 2019 at 7:52 AM Daniel Vetter wrote: > > > > > > On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > > > > Hi Dave, Daniel, > > > > > > > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. We > > > > added a new firmware for display, and this just adds the version query > > > > to our existing firmware query interface. UMDs like mesa use this > > > > interface to > > > > query things like CP or UVD firmware versions to see what features are > > > > supported. > > > > > > I got bored, and a quick serach for what the userspace for > > > AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any > > > patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't come > > > with pointers. From the patch series description I have no idea why you'd > > > even want this in userspace (but then I stopped being surprised by hw > > > design long ago). > > > > > > > We expose all the firmwares via the same interface for consistency, > > but the only ones user space generally cares about are the versions > > for the acceleration engines like gfx and multimedia. I can revert it > > if it's a big deal, but I'd prefer to keep it for consistency since > > all the others are already available via the same interface. It's not > > really a new interace with no user per se. Imo not the best style adding uapi just in case. We have a lot of that which then ended up (in other drivers at least) being for some hacks for blobs or vendor trees and stuff like that. So personally I'd lean towards just taking all the ones out you don't need (but keep the debugfs ofc), but meh. > Also, there are a few minor conflicts. I backmerged drm-next into my > drm-next branch if that is helpful. I can also send another PR with > the backmerge if you'd prefer. 
Looks like you didn't push, and I've thrown in the towel on the wm stuff. I honestly wonder how exactly you validate this stuff internally, this is almost as bad as i915 :-) Fixing your scripts to also push your validated integration tree (whatever funny kernel version that's based on, at least I'm assuming you're testing the merged version somewhere) might be really good here. Or use dim push, so that the git rr-cache is shared. Thanks, Daniel > > Alex > > > Alex > > > > > Otherwise looks all good, no complaints from dim at least :-) > > > -Daniel > > > > > > > > > > > The following changes since commit > > > > 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > > > > > > > drm/amdgpu/vcn: finish delay work before release resources > > > > (2019-11-13 15:29:42 -0500) > > > > > > > > are available in the Git repository at: > > > > > > > > git://people.freedesktop.org/~agd5f/linux tags/drm-next-5.6-2019-12-11 > > > > > > > > for you to fetch changes up to ad808910be68dcf8da5d837d4511d00ad5d3678a: > > > > > > > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 15:22:08 > > > > -0500) > > > > > > > > > > > > drm-next-5.6-2019-12-11: > > > > > > > > amdgpu: > > > > - Add MST atomic routines > > > > - Add support for DMCUB (new helper microengine for displays) > > > > - Add OEM i2c support in DC > > > > - Use vstartup for vblank events on DCN > > > > - Simplify Kconfig for DC > > > > - Renoir fixes for DC > > > > - Clean up function pointers in DC > > > > - Initial support for HDCP 2.x > > > > - Misc code cleanups > > > > - GFX10 fixes > > > > - Rework JPEG engine handling for VCN > > > > - Add clock and power gating support for JPEG > > > > - BACO support for Arcturus > > > > - Cleanup PSP ring handling > > > > - Add framework for using BACO with runtime pm to save power > > > > - Move core pci state handling out of the driver for pm ops > > > > - Allow guest power control in 1 VF case with SR-IOV > > > > - SR-IOV fixes > > > > - RAS fixes > > > > - Support for 
power metrics on renoir > > > > - Golden settings updates for gfx10 > > > > - Enable gfxoff on supported navi10 skus > > > > - Update MAINTAINERS > > > > > > > > amdkfd: > > > > - Clean up generational gfx code > > > > - Fixes for gfx10 > > > > - DIQ fixes > > > > - Share more code with amdgpu > > > > > > > > radeon: > > > > - PPC DMA fix > > > > - Register checker fixes for r1xx/r2xx > > > > - Misc cleanups > > > > > > > > > > > > Alex Deucher (34): > > > > drm/amdgpu/display: fix the build when CONFIG_DRM_AMD_DC_DCN is > > > > not set > > > > drm/amdgpu/display: fix warning when CONFIG_DRM_AMD_DC_DCN is not > > > > set > > > > drm/amdgpu/soc15: move struct definition around to align with > > > > other soc15 asics > > > > drm/amdgpu/nv: add asic func for fetching vbios from rom directly > > > > drm/amdgpu/powerplay: properly set PP_GFXOFF_MASK (v2) > > > >
Re: [PATCH v3] drm/amd/display: Fix AppleDongle can't be detected
On 2019-12-11 2:33 a.m., Louis Li wrote: > [Why] > External monitor cannot be displayed consistently, if connecting > via this Apple dongle (A1621, USB Type-C to HDMI). > Experiments prove that the dongle needs 200ms at least to be ready > for communication, after it drives HPD signal high, and DPCD cannot > be read correctly during the period, even reading it repeatedly. > In such a case, driver does not perform link training because of no DPCD. > > [How] > When driver is run to the modified point, EDID is read correctly > and dpcd_sink_count of link is not zero. Therefore, link training > should be successfully performed. Which implies parameters should > be updated, e.g. lane count, link rate, etc. Checking parameters, > if values of those parameters are zero, link training is not > performed. So, do link-training to have detection completed. > > With this patch applied, the problem cannot be reproduced. > Testing other dongles, results are PASS. > Patch(v3) is verified PASS by both AMD internal lab and customer. > > > Signed-off-by: Louis Li > --- > drivers/gpu/drm/amd/display/dc/core/dc_link.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c > b/drivers/gpu/drm/amd/display/dc/core/dc_link.c > index 7372dedd2f48..6188edc92d0f 100644 > --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c > +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c > @@ -725,7 +725,9 @@ bool dc_link_detect(struct dc_link *link, enum > dc_detect_reason reason) > > if (link->connector_signal == SIGNAL_TYPE_DISPLAY_PORT && > sink_caps.transaction_type == > DDC_TRANSACTION_TYPE_I2C_OVER_AUX && > - reason != DETECT_REASON_HPDRX) { Do we need to drop this line? This looks like it'll break the previous fix here. It looks like Abdoulaye added this here to fix the 400.1.1 DP compliance test. 
If you can check with him that your solution is fine and make sure to test that you can get a consistent pass of 400.1.1 over 30 runs I'm okay to take the change. Harry > + link->verified_link_cap.lane_count == 0 && > + link->verified_link_cap.link_rate == 0 && > + link->verified_link_cap.link_spread == 0) { > /* >* TODO debug why Dell 2413 doesn't like >* two link trainings > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/3] drm/amdgpu: wait for all rings to drain before runtime suspending
Reviewed-by: Andrey Grodzovsky Andrey On 12/16/19 12:18 PM, Alex Deucher wrote: Add a safety check to runtime suspend to make sure all outstanding fences have signaled before we suspend. Doesn't fix any known issue. We already do this via the fence driver suspend function, but we just force completion rather than bailing. This bails on runtime suspend so we can try again later once the fences are signaled to avoid missing any outstanding work. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index d1e9946ac218..61dc26515c7e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1214,13 +1214,23 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct drm_device *drm_dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_dev->dev_private; - int ret; + int ret, i; if (!adev->runpm) { pm_runtime_forbid(dev); return -EBUSY; } + /* wait for all rings to drain before suspending */ + for (i = 0; i < AMDGPU_MAX_RINGS; i++) { + struct amdgpu_ring *ring = adev->rings[i]; + if (ring && ring->sched.ready) { + ret = amdgpu_fence_wait_empty(ring); + if (ret) + return -EBUSY; + } + } + if (amdgpu_device_supports_boco(drm_dev)) drm_dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; drm_kms_helper_poll_disable(drm_dev); ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV
I think amdkfd side depends on this call to stop the user queue, without this call, the user queue can submit to HW during the reset which could cause hang again ... Do we know the root cause why this function would ruin MEC ? From the logic, I think this function should be called before FLR since we need to disable the user queue submission first. I remembered the function should use hiq to communicate with HW, shouldn't use kiq to access HW register, has this been changed ? Regards shaoyun.liu On 2019-12-17 5:19 a.m., Monk Liu wrote: issues: MEC is ruined by the amdkfd_pre_reset after VF FLR done fix: amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation to block this sequence: if we do pre_reset() before VF FLR, it would go KIQ way to do register access and stuck there, because KIQ probably won't work by that time (e.g. you already made GFX hang) so the best way right now is to simply remove it. Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 605cef6..ae962b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu: move umc offset to one new header file for Arcturus
[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Alex Deucher From: Chen, Guchun Sent: Tuesday, December 17, 2019 4:08 AM To: Clements, John ; Zhang, Hawking ; Deucher, Alexander ; amd-gfx@lists.freedesktop.org Cc: Chen, Guchun Subject: [PATCH] drm/amdgpu: move umc offset to one new header file for Arcturus Fixes: 9686563c4c42 drm/amdgpu: Added RAS UMC error query support for Arcturus Code refactor and no functional change. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 17 +- .../include/asic_reg/umc/umc_6_1_2_offset.h | 32 +++ 2 files changed, 33 insertions(+), 16 deletions(-) create mode 100644 drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c index 515eb50cd0f8..5093965dbc24 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c @@ -28,17 +28,10 @@ #include "rsmu/rsmu_0_0_2_sh_mask.h" #include "umc/umc_6_1_1_offset.h" #include "umc/umc_6_1_1_sh_mask.h" +#include "umc/umc_6_1_2_offset.h" #define smnMCA_UMC0_MCUMC_ADDRT00x50f10 -/* UMC 6_1_2 register offsets */ -#define mmUMCCH0_0_EccErrCntSel_ARCT 0x0360 -#define mmUMCCH0_0_EccErrCntSel_ARCT_BASE_IDX1 -#define mmUMCCH0_0_EccErrCnt_ARCT0x0361 -#define mmUMCCH0_0_EccErrCnt_ARCT_BASE_IDX 1 -#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT 0x03c2 -#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT_BASE_IDX 1 - /* * (addr / 256) * 8192, the higher 26 bits in ErrorAddr * is the index of 8KB block @@ -105,7 +98,6 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT); ecc_err_cnt_addr = @@ -114,7 +106,6 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - 
ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); ecc_err_cnt_addr = @@ -164,12 +155,10 @@ static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); } @@ -211,12 +200,10 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); } @@ -282,14 +269,12 @@ static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT); ecc_err_cnt_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT); } else { /* UMC 6_1_1 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); ecc_err_cnt_addr = diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h new file mode 100644 index ..3e79a8056556 --- /dev/null +++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2019 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall b
[PATCH 3/5] drm/amdgpu/smu: add metrics table lock for navi
To protect access to the metrics table. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index 15403b7979d6..102fddda925b 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -564,6 +564,7 @@ static int navi10_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); @@ -575,6 +576,7 @@ static int navi10_get_metrics_table(struct smu_context *smu, } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/5] drm/amdgpu/smu: add metrics table lock for arcturus
To protect access to the metrics table. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c index 17eeb546c550..bd5bb7040638 100644 --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c @@ -867,6 +867,7 @@ static int arcturus_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, @@ -879,6 +880,7 @@ static int arcturus_get_metrics_table(struct smu_context *smu, } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/5] drm/amdgpu/smu: add metrics table lock
This table is used for lots of things, add it's own lock. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 + drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 67818558..6177a6664737 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -872,6 +872,7 @@ static int smu_sw_init(void *handle) smu->smu_baco.platform_support = false; mutex_init(&smu->sensor_lock); + mutex_init(&smu->metrics_lock); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index ca3fdc6777cf..503099f254c1 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -350,6 +350,7 @@ struct smu_context const struct pptable_funcs *ppt_funcs; struct mutexmutex; struct mutexsensor_lock; + struct mutexmetrics_lock; uint64_t pool_size; struct smu_table_contextsmu_table; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 5/5] drm/amdgpu/smu: add metrics table lock for vega20
To protect access to the metrics table. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/vega20_ppt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c index 12bcc3e3ba99..740cf62e74f3 100644 --- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c @@ -1678,6 +1678,7 @@ static int vega20_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); @@ -1689,6 +1690,7 @@ static int vega20_get_metrics_table(struct smu_context *smu, } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 4/5] drm/amdgpu/smu: add metrics table lock for renoir
To protect access to the metrics table. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/renoir_ppt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c index 81520b0fca68..8e723022be3e 100644 --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c @@ -171,6 +171,7 @@ static int renoir_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); @@ -182,6 +183,7 @@ static int renoir_get_metrics_table(struct smu_context *smu, } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } @@ -239,8 +241,7 @@ static int renoir_print_clk_levels(struct smu_context *smu, memset(&metrics, 0, sizeof(metrics)); - ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, - (void *)&metrics, false); + ret = renoir_get_metrics_table(smu, &metrics); if (ret) return ret; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Tue, Dec 17, 2019 at 8:47 AM Alex Deucher wrote: > > On Tue, Dec 17, 2019 at 7:52 AM Daniel Vetter wrote: > > > > On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > > > Hi Dave, Daniel, > > > > > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. We > > > added a new firmware for display, and this just adds the version query > > > to our existing firmware query interface. UMDs like mesa use this > > > interface to > > > query things like CP or UVD firmware versions to see what features are > > > supported. > > > > I got bored, and a quick serach for what the userspace for > > AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any > > patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't come > > with pointers. From the patch series description I have no idea why you'd > > even want this in userspace (but then I stopped being surprised by hw > > design long ago). > > > > We expose all the firmwares via the same interface for consistency, > but the only ones user space generally cares about are the versions > for the acceleration engines like gfx and multimedia. I can revert it > if it's a big deal, but I'd prefer to keep it for consistency since > all the others are already available via the same interface. It's not > really a new interace with no user per se. > Also, there are a few minor conflicts. I backmerged drm-next into my drm-next branch if that is helpful. I can also send another PR with the backmerge if you'd prefer. 
Alex > Alex > > > Otherwise looks all good, no complaints from dim at least :-) > > -Daniel > > > > > > > > The following changes since commit > > > 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > > > > > drm/amdgpu/vcn: finish delay work before release resources (2019-11-13 > > > 15:29:42 -0500) > > > > > > are available in the Git repository at: > > > > > > git://people.freedesktop.org/~agd5f/linux tags/drm-next-5.6-2019-12-11 > > > > > > for you to fetch changes up to ad808910be68dcf8da5d837d4511d00ad5d3678a: > > > > > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 15:22:08 > > > -0500) > > > > > > > > > drm-next-5.6-2019-12-11: > > > > > > amdgpu: > > > - Add MST atomic routines > > > - Add support for DMCUB (new helper microengine for displays) > > > - Add OEM i2c support in DC > > > - Use vstartup for vblank events on DCN > > > - Simplify Kconfig for DC > > > - Renoir fixes for DC > > > - Clean up function pointers in DC > > > - Initial support for HDCP 2.x > > > - Misc code cleanups > > > - GFX10 fixes > > > - Rework JPEG engine handling for VCN > > > - Add clock and power gating support for JPEG > > > - BACO support for Arcturus > > > - Cleanup PSP ring handling > > > - Add framework for using BACO with runtime pm to save power > > > - Move core pci state handling out of the driver for pm ops > > > - Allow guest power control in 1 VF case with SR-IOV > > > - SR-IOV fixes > > > - RAS fixes > > > - Support for power metrics on renoir > > > - Golden settings updates for gfx10 > > > - Enable gfxoff on supported navi10 skus > > > - Update MAINTAINERS > > > > > > amdkfd: > > > - Clean up generational gfx code > > > - Fixes for gfx10 > > > - DIQ fixes > > > - Share more code with amdgpu > > > > > > radeon: > > > - PPC DMA fix > > > - Register checker fixes for r1xx/r2xx > > > - Misc cleanups > > > > > > > > > Alex Deucher (34): > > > drm/amdgpu/display: fix the build when CONFIG_DRM_AMD_DC_DCN is not > > > set > > > drm/amdgpu/display: fix warning 
when CONFIG_DRM_AMD_DC_DCN is not > > > set > > > drm/amdgpu/soc15: move struct definition around to align with other > > > soc15 asics > > > drm/amdgpu/nv: add asic func for fetching vbios from rom directly > > > drm/amdgpu/powerplay: properly set PP_GFXOFF_MASK (v2) > > > drm/amdgpu: disable gfxoff when using register read interface > > > drm/amdgpu: remove experimental flag for Navi14 > > > drm/amdgpu: disable gfxoff on original raven > > > Revert "drm/amd/display: enable S/G for RAVEN chip" > > > drm/amdgpu: add asic callback for BACO support > > > drm/amdgpu: add supports_baco callback for soc15 asics. (v2) > > > drm/amdgpu: add supports_baco callback for SI asics. > > > drm/amdgpu: add supports_baco callback for CIK asics. > > > drm/amdgpu: add supports_baco callback for VI asics. > > > drm/amdgpu: add supports_baco callback for NV asics. > > > drm/amdgpu: add a amdgpu_device_supports_baco helper > > > drm/amdgpu: rename amdgpu_device_is_px to > > > amdgpu_device_supports_boco (v2) > > > drm/amdgpu: add additional boco checks to runtime suspend/resume > > > (v2) > > > drm/amdgpu: split swSMU baco_reset into enter and exit > > > drm/amdgpu: add helpers for baco entry and exit > > >
RE: [PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation
[AMD Official Use Only - Internal Distribution Only] -Original Message- From: amd-gfx On Behalf Of Tianci Yin Sent: Tuesday, December 17, 2019 7:23 PM To: amd-gfx@lists.freedesktop.org Cc: Long, Gang ; Yin, Tianci (Rico) ; Xu, Feifei ; Wang, Kevin(Yang) ; Tuikov, Luben ; Deucher, Alexander ; Zhang, Hawking ; Koenig, Christian ; Yuan, Xiaojie Subject: [PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation From: "Tianci.Yin" IP discovery TMR(occupied the top VRAM with size DISCOVERY_TMR_SIZE) has been reserved, and the p2c buffer is in the range of this TMR, so the p2c buffer reservation is unnecessary. Change-Id: Ib1f2f2b4a1f3869c03ffe22e2836cdbee17ba99f Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 14 -- 2 files changed, 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5f8fd3e3535b..3265487b859f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -202,7 +202,6 @@ struct psp_memory_training_context { /*vram offset of the p2c training data*/ u64 p2c_train_data_offset; - struct amdgpu_bo *p2c_bo; /*vram offset of the c2p training data*/ u64 c2p_train_data_offset; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ce5cb854bdb9..6f0ad1d1d4d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1681,9 +1681,6 @@ static int amdgpu_ttm_training_reserve_vram_fini(struct amdgpu_device *adev) amdgpu_bo_free_kernel(&ctx->c2p_bo, NULL, NULL); ctx->c2p_bo = NULL; - amdgpu_bo_free_kernel(&ctx->p2c_bo, NULL, NULL); - ctx->p2c_bo = NULL; - return 0; } @@ -1718,17 +1715,6 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) ctx->p2c_train_data_offset, ctx->c2p_train_data_offset); - ret = amdgpu_bo_create_kernel_at(adev, -ctx->p2c_train_data_offset, 
-ctx->train_data_size, -AMDGPU_GEM_DOMAIN_VRAM, -&ctx->p2c_bo, -NULL); - if (ret) { - DRM_ERROR("alloc p2c_bo failed(%d)!\n", ret); - goto Err_out; - } - ret = amdgpu_bo_create_kernel_at(adev, ctx->c2p_train_data_offset, ctx->train_data_size, [Guchun] If we have to remove such buffer reservation, from coding style's perspective, I suggest removing error handler code by "goto" too in amdgpu_ttm_training_reserve_vram_init. After removing p2c buffer reservation from this function, there is only one buffer reservation case for c2p. So direct error handle and return should be better. -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cguchun.chen%40amd.com%7C888c561716c342aa9ecc08d782e397d0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637121786693411170&sdata=pH1rob4R5ljvEGo8PSjn1te7ctWLG1Wctv30lNCLyx4%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Tue, Dec 17, 2019 at 7:52 AM Daniel Vetter wrote: > > On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > > Hi Dave, Daniel, > > > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. We > > added a new firmware for display, and this just adds the version query > > to our existing firmware query interface. UMDs like mesa use this > > interface to > > query things like CP or UVD firmware versions to see what features are > > supported. > > I got bored, and a quick search for what the userspace for > AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any > patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't come > with pointers. From the patch series description I have no idea why you'd > even want this in userspace (but then I stopped being surprised by hw > design long ago). > We expose all the firmwares via the same interface for consistency, but the only ones user space generally cares about are the versions for the acceleration engines like gfx and multimedia. I can revert it if it's a big deal, but I'd prefer to keep it for consistency since all the others are already available via the same interface. It's not really a new interface with no user per se. 
Alex > Otherwise looks all good, no complaints from dim at least :-) > -Daniel > > > > > The following changes since commit 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > > > drm/amdgpu/vcn: finish delay work before release resources (2019-11-13 > > 15:29:42 -0500) > > > > are available in the Git repository at: > > > > git://people.freedesktop.org/~agd5f/linux tags/drm-next-5.6-2019-12-11 > > > > for you to fetch changes up to ad808910be68dcf8da5d837d4511d00ad5d3678a: > > > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 15:22:08 > > -0500) > > > > > > drm-next-5.6-2019-12-11: > > > > amdgpu: > > - Add MST atomic routines > > - Add support for DMCUB (new helper microengine for displays) > > - Add OEM i2c support in DC > > - Use vstartup for vblank events on DCN > > - Simplify Kconfig for DC > > - Renoir fixes for DC > > - Clean up function pointers in DC > > - Initial support for HDCP 2.x > > - Misc code cleanups > > - GFX10 fixes > > - Rework JPEG engine handling for VCN > > - Add clock and power gating support for JPEG > > - BACO support for Arcturus > > - Cleanup PSP ring handling > > - Add framework for using BACO with runtime pm to save power > > - Move core pci state handling out of the driver for pm ops > > - Allow guest power control in 1 VF case with SR-IOV > > - SR-IOV fixes > > - RAS fixes > > - Support for power metrics on renoir > > - Golden settings updates for gfx10 > > - Enable gfxoff on supported navi10 skus > > - Update MAINTAINERS > > > > amdkfd: > > - Clean up generational gfx code > > - Fixes for gfx10 > > - DIQ fixes > > - Share more code with amdgpu > > > > radeon: > > - PPC DMA fix > > - Register checker fixes for r1xx/r2xx > > - Misc cleanups > > > > > > Alex Deucher (34): > > drm/amdgpu/display: fix the build when CONFIG_DRM_AMD_DC_DCN is not > > set > > drm/amdgpu/display: fix warning when CONFIG_DRM_AMD_DC_DCN is not set > > drm/amdgpu/soc15: move struct definition around to align with other > > soc15 asics > > 
drm/amdgpu/nv: add asic func for fetching vbios from rom directly > > drm/amdgpu/powerplay: properly set PP_GFXOFF_MASK (v2) > > drm/amdgpu: disable gfxoff when using register read interface > > drm/amdgpu: remove experimental flag for Navi14 > > drm/amdgpu: disable gfxoff on original raven > > Revert "drm/amd/display: enable S/G for RAVEN chip" > > drm/amdgpu: add asic callback for BACO support > > drm/amdgpu: add supports_baco callback for soc15 asics. (v2) > > drm/amdgpu: add supports_baco callback for SI asics. > > drm/amdgpu: add supports_baco callback for CIK asics. > > drm/amdgpu: add supports_baco callback for VI asics. > > drm/amdgpu: add supports_baco callback for NV asics. > > drm/amdgpu: add a amdgpu_device_supports_baco helper > > drm/amdgpu: rename amdgpu_device_is_px to amdgpu_device_supports_boco > > (v2) > > drm/amdgpu: add additional boco checks to runtime suspend/resume (v2) > > drm/amdgpu: split swSMU baco_reset into enter and exit > > drm/amdgpu: add helpers for baco entry and exit > > drm/amdgpu: add baco support to runtime suspend/resume > > drm/amdgpu: start to disentangle boco from runtime pm > > drm/amdgpu: disentangle runtime pm and vga_switcheroo > > drm/amdgpu: enable runtime pm on BACO capable boards if runpm=1 > > drm/amdgpu: simplify runtime suspend > > drm/amd/display: add default clocks if not able to fetch them > > MAINTAINERS: Drop Rex Zhu for amdgpu powerplay > > drm/amdgpu: move pci handling
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > Hi Dave, Daniel, > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. We > added a new firmware for display, and this just adds the version query > to our existing firmware query interface. UMDs like mesa use this interface > to > query things like CP or UVD firmware versions to see what features are > supported. I got bored, and a quick serach for what the userspace for AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't come with pointers. From the patch series description I have no idea why you'd even want this in userspace (but then I stopped being surprised by hw design long ago). Otherwise looks all good, no complaints from dim at least :-) -Daniel > > The following changes since commit 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > drm/amdgpu/vcn: finish delay work before release resources (2019-11-13 > 15:29:42 -0500) > > are available in the Git repository at: > > git://people.freedesktop.org/~agd5f/linux tags/drm-next-5.6-2019-12-11 > > for you to fetch changes up to ad808910be68dcf8da5d837d4511d00ad5d3678a: > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 15:22:08 -0500) > > > drm-next-5.6-2019-12-11: > > amdgpu: > - Add MST atomic routines > - Add support for DMCUB (new helper microengine for displays) > - Add OEM i2c support in DC > - Use vstartup for vblank events on DCN > - Simplify Kconfig for DC > - Renoir fixes for DC > - Clean up function pointers in DC > - Initial support for HDCP 2.x > - Misc code cleanups > - GFX10 fixes > - Rework JPEG engine handling for VCN > - Add clock and power gating support for JPEG > - BACO support for Arcturus > - Cleanup PSP ring handling > - Add framework for using BACO with runtime pm to save power > - Move core pci state handling out of the driver for pm ops > - Allow guest power control in 1 VF case with SR-IOV > - SR-IOV fixes > 
- RAS fixes > - Support for power metrics on renoir > - Golden settings updates for gfx10 > - Enable gfxoff on supported navi10 skus > - Update MAINTAINERS > > amdkfd: > - Clean up generational gfx code > - Fixes for gfx10 > - DIQ fixes > - Share more code with amdgpu > > radeon: > - PPC DMA fix > - Register checker fixes for r1xx/r2xx > - Misc cleanups > > > Alex Deucher (34): > drm/amdgpu/display: fix the build when CONFIG_DRM_AMD_DC_DCN is not set > drm/amdgpu/display: fix warning when CONFIG_DRM_AMD_DC_DCN is not set > drm/amdgpu/soc15: move struct definition around to align with other > soc15 asics > drm/amdgpu/nv: add asic func for fetching vbios from rom directly > drm/amdgpu/powerplay: properly set PP_GFXOFF_MASK (v2) > drm/amdgpu: disable gfxoff when using register read interface > drm/amdgpu: remove experimental flag for Navi14 > drm/amdgpu: disable gfxoff on original raven > Revert "drm/amd/display: enable S/G for RAVEN chip" > drm/amdgpu: add asic callback for BACO support > drm/amdgpu: add supports_baco callback for soc15 asics. (v2) > drm/amdgpu: add supports_baco callback for SI asics. > drm/amdgpu: add supports_baco callback for CIK asics. > drm/amdgpu: add supports_baco callback for VI asics. > drm/amdgpu: add supports_baco callback for NV asics. 
> drm/amdgpu: add a amdgpu_device_supports_baco helper > drm/amdgpu: rename amdgpu_device_is_px to amdgpu_device_supports_boco > (v2) > drm/amdgpu: add additional boco checks to runtime suspend/resume (v2) > drm/amdgpu: split swSMU baco_reset into enter and exit > drm/amdgpu: add helpers for baco entry and exit > drm/amdgpu: add baco support to runtime suspend/resume > drm/amdgpu: start to disentangle boco from runtime pm > drm/amdgpu: disentangle runtime pm and vga_switcheroo > drm/amdgpu: enable runtime pm on BACO capable boards if runpm=1 > drm/amdgpu: simplify runtime suspend > drm/amd/display: add default clocks if not able to fetch them > MAINTAINERS: Drop Rex Zhu for amdgpu powerplay > drm/amdgpu: move pci handling out of pm ops > drm/amdgpu: flag vram lost on baco reset for VI/CIK > drm/amd/display: re-enable wait in pipelock, but add timeout > drm/radeon: fix r1xx/r2xx register checker for POT textures > drm/amdgpu: add header line for power profile on Arcturus > drm/amdgpu/display: add fallthrough comment > drm/amdgpu: fix license on Kconfig and Makefiles > > Alex Sierra (2): > drm/amdgpu: add flag to indicate amdgpu vm context > amd/amdgpu: force to trigger a no-retry-fault after a retry-fault > > Alvin Lee (1): > drm/amd/display: Changes in dc to allow full update in some cases > > Amanda Liu (1): >
[PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation
From: "Tianci.Yin" IP discovery TMR(occupied the top VRAM with size DISCOVERY_TMR_SIZE) has been reserved, and the p2c buffer is in the range of this TMR, so the p2c buffer reservation is unnecessary. Change-Id: Ib1f2f2b4a1f3869c03ffe22e2836cdbee17ba99f Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 14 -- 2 files changed, 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5f8fd3e3535b..3265487b859f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -202,7 +202,6 @@ struct psp_memory_training_context { /*vram offset of the p2c training data*/ u64 p2c_train_data_offset; - struct amdgpu_bo *p2c_bo; /*vram offset of the c2p training data*/ u64 c2p_train_data_offset; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ce5cb854bdb9..6f0ad1d1d4d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1681,9 +1681,6 @@ static int amdgpu_ttm_training_reserve_vram_fini(struct amdgpu_device *adev) amdgpu_bo_free_kernel(&ctx->c2p_bo, NULL, NULL); ctx->c2p_bo = NULL; - amdgpu_bo_free_kernel(&ctx->p2c_bo, NULL, NULL); - ctx->p2c_bo = NULL; - return 0; } @@ -1718,17 +1715,6 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) ctx->p2c_train_data_offset, ctx->c2p_train_data_offset); - ret = amdgpu_bo_create_kernel_at(adev, -ctx->p2c_train_data_offset, -ctx->train_data_size, -AMDGPU_GEM_DOMAIN_VRAM, -&ctx->p2c_bo, -NULL); - if (ret) { - DRM_ERROR("alloc p2c_bo failed(%d)!\n", ret); - goto Err_out; - } - ret = amdgpu_bo_create_kernel_at(adev, ctx->c2p_train_data_offset, ctx->train_data_size, -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/2] drm/amdgpu: update the method to get fb_loc of memory training
From: "Tianci.Yin" The method of getting fb_loc changed from parsing VBIOS to taking certain offset from top of VRAM Change-Id: I053b42fdb1d822722fa7980b2cd9f86b3fdce539 --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +- .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 36 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 7 drivers/gpu/drm/amd/include/atomfirmware.h| 14 5 files changed, 16 insertions(+), 50 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a78a363b1d71..fa2cf8e7bc07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -642,9 +642,8 @@ struct amdgpu_fw_vram_usage { struct amdgpu_bo *reserved_bo; void *va; - /* Offset on the top of VRAM, used as c2p write buffer. + /* GDDR6 training support flag. */ - u64 mem_train_fb_loc; bool mem_train_support; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index ff4eb96bdfb5..009cb0b03d13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -528,13 +528,9 @@ static int gddr6_mem_train_support(struct amdgpu_device *adev) int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) { struct atom_context *ctx = adev->mode_info.atom_context; - unsigned char *bios = ctx->bios; - struct vram_reserve_block *reserved_block; - int index, block_number; + int index; uint8_t frev, crev; uint16_t data_offset, size; - uint32_t start_address_in_kb; - uint64_t offset; int ret; adev->fw_vram_usage.mem_train_support = false; @@ -569,32 +565,6 @@ int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) return -EINVAL; } - reserved_block = (struct vram_reserve_block *) - (bios + data_offset + sizeof(struct atom_common_table_header)); - block_number = ((unsigned int)size - sizeof(struct atom_common_table_header)) - / sizeof(struct 
vram_reserve_block); - reserved_block += (block_number > 0) ? block_number-1 : 0; - DRM_DEBUG("block_number:0x%04x, last block: 0x%08xkb sz, %dkb fw, %dkb drv.\n", - block_number, - le32_to_cpu(reserved_block->start_address_in_kb), - le16_to_cpu(reserved_block->used_by_firmware_in_kb), - le16_to_cpu(reserved_block->used_by_driver_in_kb)); - if (reserved_block->used_by_firmware_in_kb > 0) { - start_address_in_kb = le32_to_cpu(reserved_block->start_address_in_kb); - offset = (uint64_t)start_address_in_kb * ONE_KiB; - if ((offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { - offset -= ONE_MiB; - } - - offset &= ~(ONE_MiB - 1); - adev->fw_vram_usage.mem_train_fb_loc = offset; - adev->fw_vram_usage.mem_train_support = true; - DRM_DEBUG("mem_train_fb_loc:0x%09llx.\n", offset); - ret = 0; - } else { - DRM_ERROR("used_by_firmware_in_kb is 0!\n"); - ret = -EINVAL; - } - - return ret; + adev->fw_vram_usage.mem_train_support = true; + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 2ff63d0414c9..ce5cb854bdb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1705,7 +1705,11 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) return 0; } - ctx->c2p_train_data_offset = adev->fw_vram_usage.mem_train_fb_loc; + ctx->c2p_train_data_offset = adev->gmc.mc_vram_size; + if ((ctx->c2p_train_data_offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { + ctx->c2p_train_data_offset -= ONE_MiB; + } + ctx->c2p_train_data_offset &= ~(ONE_MiB - 1); ctx->p2c_train_data_offset = (adev->gmc.mc_vram_size - GDDR6_MEM_TRAINING_OFFSET); ctx->train_data_size = GDDR6_MEM_TRAINING_DATA_SIZE_IN_BYTES; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index f1ebd424510c..19eb3e8456c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -66,6 +66,13 @@ struct amdgpu_copy_mem { 
unsigned long offset; }; +/* Definitions for constance */ +enum amdgpu_internal_constants +{ + ONE_KiB = 0x400, + ONE_MiB = 0x100000, +}; + extern const struct ttm_mem_type_manager_func amdgpu_gtt_mgr_func; extern const struct ttm_mem_type_manager_func amdgpu_vram_mgr_func; diff --git a/drivers/gpu/drm/amd/incl
RE: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV
[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Emily Deng >-Original Message- >From: amd-gfx On Behalf Of Monk Liu >Sent: Tuesday, December 17, 2019 6:20 PM >To: amd-gfx@lists.freedesktop.org >Cc: Liu, Monk >Subject: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV > >issues: >MEC is ruined by the amdkfd_pre_reset after VF FLR done > >fix: >amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the >correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation >to block this sequence: >if we do pre_reset() before VF FLR, it would go KIQ way to do register access >and >stuck there, because KIQ probably won't work by that time (e.g. you already >made GFX hang) > >so the best way right now is to simply remove it. > >Signed-off-by: Monk Liu >--- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- > 1 file changed, 2 deletions(-) > >diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >index 605cef6..ae962b9 100644 >--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >@@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct >amdgpu_device *adev, > if (r) > return r; > >- amdgpu_amdkfd_pre_reset(adev); >- > /* Resume IP prior to SMC */ > r = amdgpu_device_ip_reinit_early_sriov(adev); > if (r) >-- >2.7.4 > >___ >amd-gfx mailing list >amd-gfx@lists.freedesktop.org >https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.fre >edesktop.org%2Fmailman%2Flistinfo%2Famd- >gfx&data=02%7C01%7CEmily.Deng%40amd.com%7C74408803b49e4f328 >f7708d782daba6c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C6 >37121748318124859&sdata=4YbyHwEEGxVLEhuOg%2Frc%2FxdhFRwrdm >FuZ4vpHx%2FApAE%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 1/2] drm/amdgpu: fix double gpu_recovery for NV of SRIOV
[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Emily Deng >-Original Message- >From: amd-gfx On Behalf Of Monk Liu >Sent: Tuesday, December 17, 2019 6:20 PM >To: amd-gfx@lists.freedesktop.org >Cc: Liu, Monk >Subject: [PATCH 1/2] drm/amdgpu: fix double gpu_recovery for NV of SRIOV > >issues: >gpu_recover() is re-entered by the mailbox interrupt handler mxgpu_nv.c > >fix: >we need to bypass the gpu_recover() invoke in mailbox interrupt as long as the >timeout is not infinite (thus the TDR will be triggered automatically after >time >out, no need to invoke >gpu_recover() through mailbox interrupt. > >Signed-off-by: Monk Liu >--- > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 6 +- > 1 file changed, 5 insertions(+), 1 deletion(-) > >diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >index 0d8767e..1c3a7d4 100644 >--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >@@ -269,7 +269,11 @@ static void xgpu_nv_mailbox_flr_work(struct >work_struct *work) > } > > /* Trigger recovery for world switch failure if no TDR */ >- if (amdgpu_device_should_recover_gpu(adev)) >+ if (amdgpu_device_should_recover_gpu(adev) >+ && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || >+ adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || >+ adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || >+ adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) > amdgpu_device_gpu_recover(adev, NULL); } > >-- >2.7.4 > >___ >amd-gfx mailing list >amd-gfx@lists.freedesktop.org >https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.fre >edesktop.org%2Fmailman%2Flistinfo%2Famd- >gfx&data=02%7C01%7CEmily.Deng%40amd.com%7C029ef88677e744f2ad >8f08d782dab68c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C63 >7121748276776005&sdata=IiRwMTw6DQW8sh8Y7SkZ2PehohwnH6gSqkt >t64a73UU%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/2] drm/amdgpu: fix double gpu_recovery for NV of SRIOV
issues: gpu_recover() is re-entered by the mailbox interrupt handler mxgpu_nv.c fix: we need to bypass the gpu_recover() invoke in mailbox interrupt as long as the timeout is not infinite (thus the TDR will be triggered automatically after time out, no need to invoke gpu_recover() through mailbox interrupt. Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 0d8767e..1c3a7d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -269,7 +269,11 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } /* Trigger recovery for world switch failure if no TDR */ - if (amdgpu_device_should_recover_gpu(adev)) + if (amdgpu_device_should_recover_gpu(adev) + && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || + adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || + adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || + adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) amdgpu_device_gpu_recover(adev, NULL); } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV
issues: MEC is ruined by the amdkfd_pre_reset after VF FLR done fix: amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation to block this sequence: if we do pre_reset() before VF FLR, it would go KIQ way to do register access and stuck there, because KIQ probably won't work by that time (e.g. you already made GFX hang) so the best way right now is to simply remove it. Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 605cef6..ae962b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: fix KIQ ring test fail in TDR
issues: there are two issue may lead to TDR failure for SRIOV 1) gpu_recover() is re-entered by the mailbox interrupt handler mxgpu_nv.c 2) MEC is ruined by the amdkfd_pre_reset after VF FLR done fix: for 1) we need to bypass the gpu_recover() invoke in mailbox interrupt as long as the timeout is not infinite (thus the TDR will be triggered automatically after time out, no need to invoke gpu_recover() through mailbox interrupt. for 2) amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation to block this sequence: if we do pre_reset() before VF FLR, it would go KIQ way to do register access and stuck there, because KIQ probably won't work by that time (e.g. you already made GFX hang) Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 6 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 605cef6..ae962b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 0d8767e..1c3a7d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -269,7 +269,11 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } /* Trigger recovery for world switch failure if no TDR */ - if (amdgpu_device_should_recover_gpu(adev)) + if (amdgpu_device_should_recover_gpu(adev) + && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || + adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || + adev->compute_timeout == 
MAX_SCHEDULE_TIMEOUT || + adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) amdgpu_device_gpu_recover(adev, NULL); } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: move umc offset to one new header file for Arcturus
Fixes: 9686563c4c42 drm/amdgpu: Added RAS UMC error query support for Arcturus Code refactor and no functional change. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 17 +- .../include/asic_reg/umc/umc_6_1_2_offset.h | 32 +++ 2 files changed, 33 insertions(+), 16 deletions(-) create mode 100644 drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c index 515eb50cd0f8..5093965dbc24 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c @@ -28,17 +28,10 @@ #include "rsmu/rsmu_0_0_2_sh_mask.h" #include "umc/umc_6_1_1_offset.h" #include "umc/umc_6_1_1_sh_mask.h" +#include "umc/umc_6_1_2_offset.h" #define smnMCA_UMC0_MCUMC_ADDRT0 0x50f10 -/* UMC 6_1_2 register offsets */ -#define mmUMCCH0_0_EccErrCntSel_ARCT 0x0360 -#define mmUMCCH0_0_EccErrCntSel_ARCT_BASE_IDX1 -#define mmUMCCH0_0_EccErrCnt_ARCT0x0361 -#define mmUMCCH0_0_EccErrCnt_ARCT_BASE_IDX 1 -#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT 0x03c2 -#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT_BASE_IDX 1 - /* * (addr / 256) * 8192, the higher 26 bits in ErrorAddr * is the index of 8KB block @@ -105,7 +98,6 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT); ecc_err_cnt_addr = @@ -114,7 +106,6 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); ecc_err_cnt_addr = @@ -164,12 +155,10 @@ static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, 
mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); } @@ -211,12 +200,10 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); } @@ -282,14 +269,12 @@ static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT); ecc_err_cnt_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT); } else { /* UMC 6_1_1 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); ecc_err_cnt_addr = diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h new file mode 100644 index ..3e79a8056556 --- /dev/null +++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + *
RE: [PATCH v2 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
[AMD Official Use Only - Internal Distribution Only] Hi Andrey Please check the 3 minor comments in this patch. With that addressed, the V2s series is Reviewed-by: Le Ma mailto:le...@amd.com>> Regards, Ma Le -Original Message- From: Andrey Grodzovsky Sent: Saturday, December 14, 2019 12:54 AM To: dri-de...@lists.freedesktop.org; amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander ; Ma, Le ; Zhang, Hawking ; Quan, Evan ; Grodzovsky, Andrey Subject: [PATCH v2 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d. In preparation for doing XGMI reset synchronization using task barrier. Signed-off-by: Andrey Grodzovsky mailto:andrey.grodzov...@amd.com>> Reviewed-by: Le Ma mailto:le...@amd.com>> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h| 2 - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +- 2 files changed, 12 insertions(+), 66 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a78a363..50bab33 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1001,8 +1001,6 @@ struct amdgpu_device { boolpm_sysfs_en; boolucode_sysfs_en; - - boolin_baco; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7324a5f..1d19edfa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) adev->asic_reset_res = (adev->in_baco == false) ? amdgpu_device_baco_enter(adev->ddev) : - amdgpu_device_baco_exit(adev->ddev); + qamdgpu_device_baco_exit(adev->ddev); [Le] 1/3: Still unnecessary typo here, although it will be removed in patch #4. 
else adev->asic_reset_res = amdgpu_asic_reset(adev); @@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, return r; } -static int amdgpu_do_asic_reset(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive, +static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, struct list_head *device_list_handle, bool *need_full_reset_arg) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; int r = 0; - int cpu = smp_processor_id(); - bool use_baco = - (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? - true : false; /* * ASIC reset has to be done on all HGMI hive nodes ASAP @@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, */ if (need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - /* - * For XGMI run all resets in parallel to speed up the - * process by scheduling the highpri wq on different - * cpus. For XGMI with baco reset, all nodes must enter - * baco within close proximity before anyone exit. - */ + /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work_on(cpu, system_highpri_wq, - &tmp_adev->xgmi_reset_work)) + if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) r = -EALREADY; - cpu = cpumask_next(cpu, cpu_online_mask); } else r = amdgpu_asic_reset(tmp_adev); - if (r) - break; - } - - /* For XGMI wait for all work to complete before proceed */ - if