Re: [PATCH 1/2] drm/amdgpu: update the method to get fb_loc of memory training
[AMD Official Use Only - Internal Distribution Only] From: Tianci Yin Sent: Wednesday, December 18, 2019 10:21 AM To: amd-gfx@lists.freedesktop.org Cc: Tuikov, Luben ; Koenig, Christian ; Deucher, Alexander ; Zhang, Hawking ; Xu, Feifei ; Yuan, Xiaojie ; Long, Gang ; Wang, Kevin(Yang) ; Yin, Tianci (Rico) Subject: [PATCH 1/2] drm/amdgpu: update the method to get fb_loc of memory training From: "Tianci.Yin" The method of getting fb_loc changed from parsing VBIOS to taking certain offset from top of VRAM Change-Id: I053b42fdb1d822722fa7980b2cd9f86b3fdce539 --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +- .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 36 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 7 drivers/gpu/drm/amd/include/atomfirmware.h| 14 5 files changed, 16 insertions(+), 50 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a78a363b1d71..fa2cf8e7bc07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -642,9 +642,8 @@ struct amdgpu_fw_vram_usage { struct amdgpu_bo *reserved_bo; void *va; - /* Offset on the top of VRAM, used as c2p write buffer. + /* GDDR6 training support flag. 
*/ - u64 mem_train_fb_loc; bool mem_train_support; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index ff4eb96bdfb5..009cb0b03d13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -528,13 +528,9 @@ static int gddr6_mem_train_support(struct amdgpu_device *adev) int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) { struct atom_context *ctx = adev->mode_info.atom_context; - unsigned char *bios = ctx->bios; - struct vram_reserve_block *reserved_block; - int index, block_number; + int index; uint8_t frev, crev; uint16_t data_offset, size; - uint32_t start_address_in_kb; - uint64_t offset; int ret; adev->fw_vram_usage.mem_train_support = false; @@ -569,32 +565,6 @@ int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) [kevin]: this function is not return any address after change, i think we'd better to rename this function to another is well. the code can be merge to function gddr6_mem_train_support(). return -EINVAL; } - reserved_block = (struct vram_reserve_block *) - (bios + data_offset + sizeof(struct atom_common_table_header)); - block_number = ((unsigned int)size - sizeof(struct atom_common_table_header)) - / sizeof(struct vram_reserve_block); - reserved_block += (block_number > 0) ? 
block_number-1 : 0; - DRM_DEBUG("block_number:0x%04x, last block: 0x%08xkb sz, %dkb fw, %dkb drv.\n", - block_number, - le32_to_cpu(reserved_block->start_address_in_kb), - le16_to_cpu(reserved_block->used_by_firmware_in_kb), - le16_to_cpu(reserved_block->used_by_driver_in_kb)); - if (reserved_block->used_by_firmware_in_kb > 0) { - start_address_in_kb = le32_to_cpu(reserved_block->start_address_in_kb); - offset = (uint64_t)start_address_in_kb * ONE_KiB; - if ((offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { - offset -= ONE_MiB; - } - - offset &= ~(ONE_MiB - 1); - adev->fw_vram_usage.mem_train_fb_loc = offset; - adev->fw_vram_usage.mem_train_support = true; - DRM_DEBUG("mem_train_fb_loc:0x%09llx.\n", offset); - ret = 0; - } else { - DRM_ERROR("used_by_firmware_in_kb is 0!\n"); - ret = -EINVAL; - } - - return ret; + adev->fw_vram_usage.mem_train_support = true; + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 2ff63d0414c9..ce5cb854bdb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1705,7 +1705,11 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) return 0; } - ctx->c2p_train_data_offset = adev->fw_vram_usage.mem_train_fb_loc; + ctx->c2p_train_data_offset = adev->gmc.mc_vram_size; + if ((ctx->c2p_train_data_offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { + ctx->c2p_train_data_offset -= ONE_MiB; + } + ctx->c2p_train_data_offset &= ~(ONE_MiB - 1); ctx->p2c_train_data_offset = (adev->gmc.mc_vram_size - GDDR6_MEM_TRAINING_OFFSET); ctx->train_data_size = GDDR6_MEM_TRAINING_DATA_SIZE_IN_BYTE
[PATCH] drm/amdgpu: correctly report gpu recover status
Knowing whether gpu recovery was performed successfully or not is important for our BACO development. Change-Id: I0e3ca4dcb65a053eb26bc55ad7431e4a42e160de Signed-off-by: Evan Quan --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index e9efee04ca23..5dff5c0dd882 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -743,9 +743,7 @@ static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data) struct amdgpu_device *adev = dev->dev_private; seq_printf(m, "gpu recover\n"); - amdgpu_device_gpu_recover(adev, NULL); - - return 0; + return amdgpu_device_gpu_recover(adev, NULL); } static const struct drm_info_list amdgpu_debugfs_fence_list[] = { -- 2.24.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: no SMC firmware reloading for non-RAS baco reset
For non-RAS baco reset, there is no need to reset the SMC. Thus the firmware reloading should be avoided. Change-Id: I73f6284541d0ca0e82761380a27e32484fb0061c Signed-off-by: Evan Quan --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 3 ++- drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index c14f2ccd0677..9bf7e92394f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1439,7 +1439,8 @@ static int psp_np_fw_load(struct psp_context *psp) continue; if (ucode->ucode_id == AMDGPU_UCODE_ID_SMC && - (psp_smu_reload_quirk(psp) || psp->autoload_supported)) + ((adev->in_gpu_reset && psp_smu_reload_quirk(psp)) + || psp->autoload_supported)) continue; if (amdgpu_sriov_vf(adev) && diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index c66ca8cc2ebd..ba761e9366e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -676,6 +676,19 @@ static bool psp_v11_0_compare_sram_data(struct psp_context *psp, return true; } +/* + * Check whether SMU is still alive. If that's true + * (e.g. for non-RAS baco reset), we need to skip SMC firmware reloading. + */ +static bool psp_v11_0_smu_reload_quirk(struct psp_context *psp) +{ + struct amdgpu_device *adev = psp->adev; + uint32_t reg; + + reg = RREG32_PCIE(smnMP1_FIRMWARE_FLAGS | 0x03b00000); + return (reg & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) ? 
true : false; +} + static int psp_v11_0_mode1_reset(struct psp_context *psp) { int ret; @@ -1070,6 +1083,7 @@ static const struct psp_funcs psp_v11_0_funcs = { .ring_stop = psp_v11_0_ring_stop, .ring_destroy = psp_v11_0_ring_destroy, .compare_sram_data = psp_v11_0_compare_sram_data, + .smu_reload_quirk = psp_v11_0_smu_reload_quirk, .mode1_reset = psp_v11_0_mode1_reset, .xgmi_get_topology_info = psp_v11_0_xgmi_get_topology_info, .xgmi_set_topology_info = psp_v11_0_xgmi_set_topology_info, -- 2.24.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH v2 1/5] drm/amdgpu/smu: add metrics table lock
[AMD Official Use Only - Internal Distribution Only] The series patches are Reviewed-by: Kevin Wang From: amd-gfx on behalf of Alex Deucher Sent: Wednesday, December 18, 2019 5:45 AM To: amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander Subject: [PATCH v2 1/5] drm/amdgpu/smu: add metrics table lock This table is used for lots of things, add it's own lock. Bug: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fdrm%2Famd%2Fissues%2F900&data=02%7C01%7CKevin1.Wang%40amd.com%7C39da818e513e4cfb04fe08d7833a7fc2%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637122160270078326&sdata=jL5LpNDv7ZX%2FpGPAexqcUDKOE5%2B9kkAxKuIzWO1CE0Y%3D&reserved=0 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 + drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index f76a1717ffbd..936c68298786 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -872,6 +872,7 @@ static int smu_sw_init(void *handle) smu->smu_baco.platform_support = false; mutex_init(&smu->sensor_lock); + mutex_init(&smu->metrics_lock); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index a7d0ad831491..541cfde289ea 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -350,6 +350,7 @@ struct smu_context const struct pptable_funcs *ppt_funcs; struct mutexmutex; struct mutexsensor_lock; + struct mutexmetrics_lock; uint64_t pool_size; struct smu_table_contextsmu_table; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org 
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7CKevin1.Wang%40amd.com%7C39da818e513e4cfb04fe08d7833a7fc2%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637122160270078326&sdata=5Z573z93vZHHifVEOQoXgpkcgKoGvlm%2B5hC6oVQdTec%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/2] drm/amdgpu: update the method to get fb_loc of memory training
From: "Tianci.Yin" The method of getting fb_loc changed from parsing VBIOS to taking certain offset from top of VRAM Change-Id: I053b42fdb1d822722fa7980b2cd9f86b3fdce539 --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +- .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 36 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 7 drivers/gpu/drm/amd/include/atomfirmware.h| 14 5 files changed, 16 insertions(+), 50 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a78a363b1d71..fa2cf8e7bc07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -642,9 +642,8 @@ struct amdgpu_fw_vram_usage { struct amdgpu_bo *reserved_bo; void *va; - /* Offset on the top of VRAM, used as c2p write buffer. + /* GDDR6 training support flag. */ - u64 mem_train_fb_loc; bool mem_train_support; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index ff4eb96bdfb5..009cb0b03d13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -528,13 +528,9 @@ static int gddr6_mem_train_support(struct amdgpu_device *adev) int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) { struct atom_context *ctx = adev->mode_info.atom_context; - unsigned char *bios = ctx->bios; - struct vram_reserve_block *reserved_block; - int index, block_number; + int index; uint8_t frev, crev; uint16_t data_offset, size; - uint32_t start_address_in_kb; - uint64_t offset; int ret; adev->fw_vram_usage.mem_train_support = false; @@ -569,32 +565,6 @@ int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) return -EINVAL; } - reserved_block = (struct vram_reserve_block *) - (bios + data_offset + sizeof(struct atom_common_table_header)); - block_number = ((unsigned int)size - sizeof(struct atom_common_table_header)) - / sizeof(struct 
vram_reserve_block); - reserved_block += (block_number > 0) ? block_number-1 : 0; - DRM_DEBUG("block_number:0x%04x, last block: 0x%08xkb sz, %dkb fw, %dkb drv.\n", - block_number, - le32_to_cpu(reserved_block->start_address_in_kb), - le16_to_cpu(reserved_block->used_by_firmware_in_kb), - le16_to_cpu(reserved_block->used_by_driver_in_kb)); - if (reserved_block->used_by_firmware_in_kb > 0) { - start_address_in_kb = le32_to_cpu(reserved_block->start_address_in_kb); - offset = (uint64_t)start_address_in_kb * ONE_KiB; - if ((offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { - offset -= ONE_MiB; - } - - offset &= ~(ONE_MiB - 1); - adev->fw_vram_usage.mem_train_fb_loc = offset; - adev->fw_vram_usage.mem_train_support = true; - DRM_DEBUG("mem_train_fb_loc:0x%09llx.\n", offset); - ret = 0; - } else { - DRM_ERROR("used_by_firmware_in_kb is 0!\n"); - ret = -EINVAL; - } - - return ret; + adev->fw_vram_usage.mem_train_support = true; + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 2ff63d0414c9..ce5cb854bdb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1705,7 +1705,11 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) return 0; } - ctx->c2p_train_data_offset = adev->fw_vram_usage.mem_train_fb_loc; + ctx->c2p_train_data_offset = adev->gmc.mc_vram_size; + if ((ctx->c2p_train_data_offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { + ctx->c2p_train_data_offset -= ONE_MiB; + } + ctx->c2p_train_data_offset &= ~(ONE_MiB - 1); ctx->p2c_train_data_offset = (adev->gmc.mc_vram_size - GDDR6_MEM_TRAINING_OFFSET); ctx->train_data_size = GDDR6_MEM_TRAINING_DATA_SIZE_IN_BYTES; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index f1ebd424510c..19eb3e8456c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -66,6 +66,13 @@ struct amdgpu_copy_mem { 
unsigned long offset; }; +/* Definitions for constants */ +enum amdgpu_internal_constants +{ + ONE_KiB = 0x400, + ONE_MiB = 0x100000, +}; + extern const struct ttm_mem_type_manager_func amdgpu_gtt_mgr_func; extern const struct ttm_mem_type_manager_func amdgpu_vram_mgr_func; diff --git a/drivers/gpu/drm/amd/incl
[PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation(V2)
From: "Tianci.Yin" IP discovery TMR(occupied the top VRAM with size DISCOVERY_TMR_SIZE) has been reserved, and the p2c buffer is in the range of this TMR, so the p2c buffer reservation is unnecessary. Change-Id: Ib1f2f2b4a1f3869c03ffe22e2836cdbee17ba99f Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 21 ++--- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5f8fd3e3535b..3265487b859f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -202,7 +202,6 @@ struct psp_memory_training_context { /*vram offset of the p2c training data*/ u64 p2c_train_data_offset; - struct amdgpu_bo *p2c_bo; /*vram offset of the c2p training data*/ u64 c2p_train_data_offset; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ce5cb854bdb9..476ea4a4dc03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1681,9 +1681,6 @@ static int amdgpu_ttm_training_reserve_vram_fini(struct amdgpu_device *adev) amdgpu_bo_free_kernel(&ctx->c2p_bo, NULL, NULL); ctx->c2p_bo = NULL; - amdgpu_bo_free_kernel(&ctx->p2c_bo, NULL, NULL); - ctx->p2c_bo = NULL; - return 0; } @@ -1718,17 +1715,6 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) ctx->p2c_train_data_offset, ctx->c2p_train_data_offset); - ret = amdgpu_bo_create_kernel_at(adev, -ctx->p2c_train_data_offset, -ctx->train_data_size, -AMDGPU_GEM_DOMAIN_VRAM, -&ctx->p2c_bo, -NULL); - if (ret) { - DRM_ERROR("alloc p2c_bo failed(%d)!\n", ret); - goto Err_out; - } - ret = amdgpu_bo_create_kernel_at(adev, ctx->c2p_train_data_offset, ctx->train_data_size, @@ -1737,15 +1723,12 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) NULL); if (ret) { DRM_ERROR("alloc c2p_bo failed(%d)!\n", ret); - 
goto Err_out; + amdgpu_ttm_training_reserve_vram_fini(adev); + return ret; } ctx->init = PSP_MEM_TRAIN_RESERVE_SUCCESS; return 0; - -Err_out: - amdgpu_ttm_training_reserve_vram_fini(adev); - return ret; } /** -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation
Hi Guchun, Thanks very much for your suggestion. I will refine it and send it out later. Rico From: Chen, Guchun Sent: Tuesday, December 17, 2019 22:11 To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org Cc: Long, Gang ; Yin, Tianci (Rico) ; Xu, Feifei ; Wang, Kevin(Yang) ; Tuikov, Luben ; Deucher, Alexander ; Zhang, Hawking ; Koenig, Christian ; Yuan, Xiaojie Subject: RE: [PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation [AMD Official Use Only - Internal Distribution Only] -Original Message- From: amd-gfx On Behalf Of Tianci Yin Sent: Tuesday, December 17, 2019 7:23 PM To: amd-gfx@lists.freedesktop.org Cc: Long, Gang ; Yin, Tianci (Rico) ; Xu, Feifei ; Wang, Kevin(Yang) ; Tuikov, Luben ; Deucher, Alexander ; Zhang, Hawking ; Koenig, Christian ; Yuan, Xiaojie Subject: [PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation From: "Tianci.Yin" IP discovery TMR(occupied the top VRAM with size DISCOVERY_TMR_SIZE) has been reserved, and the p2c buffer is in the range of this TMR, so the p2c buffer reservation is unnecessary. 
Change-Id: Ib1f2f2b4a1f3869c03ffe22e2836cdbee17ba99f Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 14 -- 2 files changed, 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5f8fd3e3535b..3265487b859f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -202,7 +202,6 @@ struct psp_memory_training_context { /*vram offset of the p2c training data*/ u64 p2c_train_data_offset; - struct amdgpu_bo *p2c_bo; /*vram offset of the c2p training data*/ u64 c2p_train_data_offset; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ce5cb854bdb9..6f0ad1d1d4d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1681,9 +1681,6 @@ static int amdgpu_ttm_training_reserve_vram_fini(struct amdgpu_device *adev) amdgpu_bo_free_kernel(&ctx->c2p_bo, NULL, NULL); ctx->c2p_bo = NULL; - amdgpu_bo_free_kernel(&ctx->p2c_bo, NULL, NULL); - ctx->p2c_bo = NULL; - return 0; } @@ -1718,17 +1715,6 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) ctx->p2c_train_data_offset, ctx->c2p_train_data_offset); - ret = amdgpu_bo_create_kernel_at(adev, -ctx->p2c_train_data_offset, -ctx->train_data_size, -AMDGPU_GEM_DOMAIN_VRAM, -&ctx->p2c_bo, -NULL); - if (ret) { - DRM_ERROR("alloc p2c_bo failed(%d)!\n", ret); - goto Err_out; - } - ret = amdgpu_bo_create_kernel_at(adev, ctx->c2p_train_data_offset, ctx->train_data_size, [Guchun] If we have to remove such buffer reservation, from coding style's perspective, I suggest removing error handler code by "goto" too in amdgpu_ttm_training_reserve_vram_init. After removing p2c buffer reservation from this function, there is only one buffer reservation case for c2p. So direct error handle and return should be better. 
-- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cguchun.chen%40amd.com%7C888c561716c342aa9ecc08d782e397d0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637121786693411170&sdata=pH1rob4R5ljvEGo8PSjn1te7ctWLG1Wctv30lNCLyx4%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH v2 5/5] drm/amdgpu/smu: add metrics table lock for vega20 (v2)
It's fine with me to check them in as a temporary workaround. Series is reviewed-by: Evan Quan > -Original Message- > From: amd-gfx On Behalf Of Alex > Deucher > Sent: Wednesday, December 18, 2019 5:46 AM > To: amd-gfx@lists.freedesktop.org > Cc: Deucher, Alexander > Subject: [PATCH v2 5/5] drm/amdgpu/smu: add metrics table lock for vega20 > (v2) > > To protect access to the metrics table. > > v2: unlock on error > > Bug: > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.fre > edesktop.org%2Fdrm%2Famd%2Fissues%2F900&data=02%7C01%7Cevan.q > uan%40amd.com%7Cefa0dd86e5a74ead810708d7833a823e%7C3dd8961fe488 > 4e608e11a82d994e183d%7C0%7C0%7C637122159732784832&sdata=X0Z > UV1r90Dy3mvlp8zONFcxKQcSaciwkVt7GJabYH0I%3D&reserved=0 > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/powerplay/vega20_ppt.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c > b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c > index 2b1c3f8a0415..250ff5aa1305 100644 > --- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c > +++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c > @@ -1678,17 +1678,20 @@ static int vega20_get_metrics_table(struct > smu_context *smu, > struct smu_table_context *smu_table= &smu->smu_table; > int ret = 0; > > + mutex_lock(&smu->metrics_lock); > if (!smu_table->metrics_time || time_after(jiffies, smu_table- > >metrics_time + HZ / 1000)) { > ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, > (void *)smu_table->metrics_table, false); > if (ret) { > pr_info("Failed to export SMU metrics table!\n"); > + mutex_unlock(&smu->metrics_lock); > return ret; > } > smu_table->metrics_time = jiffies; > } > > memcpy(metrics_table, smu_table->metrics_table, > sizeof(SmuMetrics_t)); > + mutex_unlock(&smu->metrics_lock); > > return ret; > } > -- > 2.23.0 > > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.free > 
desktop.org%2Fmailman%2Flistinfo%2Famd- > gfx&data=02%7C01%7Cevan.quan%40amd.com%7Cefa0dd86e5a74ead81 > 0708d7833a823e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637 > 122159732784832&sdata=xFCKqTGqv57k9SucgTc7Ur5AGctpMO%2BbPvw > RKz53whI%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdkfd: expose num_cp_queues data field to topology node
See comment inline. Other than that, the series looks good to me. On 2019-12-16 2:02, Huang Rui wrote: Thunk driver would like to know the num_cp_queues data, however this data relied on different asic specific. So it's better to get it from kfd driver. Signed-off-by: Huang Rui --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 3 +++ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index cc01ccd..203c823 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -488,6 +488,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.num_sdma_xgmi_engines); sysfs_show_32bit_prop(buffer, "num_sdma_queues_per_engine", dev->node_props.num_sdma_queues_per_engine); + sysfs_show_32bit_prop(buffer, "num_cp_queues", + dev->node_props.num_cp_queues); if (dev->gpu) { log_max_watch_addr = @@ -1316,6 +1318,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) dev->node_props.num_gws = (hws_gws_support && dev->gpu->dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) ? amdgpu_amdkfd_get_num_gws(dev->gpu->kgd) : 0; + dev->node_props.num_cp_queues = get_queues_num(dev->gpu->dqm); kfd_fill_mem_clk_max_info(dev); kfd_fill_iolink_non_crat_info(dev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index 9346cc1..e447901 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -27,7 +27,7 @@ #include #include "kfd_crat.h" -#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 28 +#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 24 I don't see why you need to change the name size here. I'm not aware of any requirement that the structure size cannot change. This comment applies to patch 1 as well. 
Regards, Felix #define HSA_CAP_HOT_PLUGGABLE 0x0001 #define HSA_CAP_ATS_PRESENT 0x0002 @@ -82,6 +82,7 @@ struct kfd_node_properties { uint32_t num_sdma_engines; uint32_t num_sdma_xgmi_engines; uint32_t num_sdma_queues_per_engine; + uint32_t num_cp_queues; char name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; }; ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdgpu: attempt xgmi perfmon re-arm on failed arm
On 2019-12-17 12:28, Jonathan Kim wrote: The DF routines to arm xGMI performance will attempt to re-arm both on performance monitoring start and read on initial failure to arm. Signed-off-by: Jonathan Kim --- drivers/gpu/drm/amd/amdgpu/df_v3_6.c | 153 --- 1 file changed, 117 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c index 4043ebcea5de..af445054305f 100644 --- a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c +++ b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c @@ -162,25 +162,45 @@ static void df_v3_6_perfmon_rreg(struct amdgpu_device *adev, } /* - * df_v3_6_perfmon_wreg - write to perfmon lo and hi - * - * required to be atomic. no mmio method provided so subsequent reads after - * data writes cannot occur to preserve data fabrics finite state machine. + * retry arming counters every 100 usecs within 1 millisecond interval. + * if retry fails after time out, return error. */ -static void df_v3_6_perfmon_wreg(struct amdgpu_device *adev, uint32_t lo_addr, - uint32_t lo_val, uint32_t hi_addr, uint32_t hi_val) +#define ARM_RETRY_USEC_TIMEOUT 1000 +#define ARM_RETRY_USEC_INTERVAL100 +static int df_v3_6_perfmon_arm_with_retry(struct amdgpu_device *adev, + uint32_t lo_addr, uint32_t lo_val, + uint32_t hi_addr, uint32_t hi_val) { unsigned long flags, address, data; + uint32_t lo_val_rb, hi_val_rb; + int countdown = ARM_RETRY_USEC_TIMEOUT; address = adev->nbio.funcs->get_pcie_index_offset(adev); data = adev->nbio.funcs->get_pcie_data_offset(adev); spin_lock_irqsave(&adev->pcie_idx_lock, flags); - WREG32(address, lo_addr); - WREG32(data, lo_val); - WREG32(address, hi_addr); - WREG32(data, hi_val); + + while (countdown) { + WREG32(address, lo_addr); + WREG32(data, lo_val); + WREG32(address, hi_addr); + WREG32(data, hi_val); + + WREG32(address, lo_addr); + lo_val_rb = RREG32(data); + WREG32(address, hi_addr); + hi_val_rb = RREG32(data); + + if (lo_val == lo_val_rb && hi_val == hi_val_rb) + break; + + countdown -= 
ARM_RETRY_USEC_INTERVAL; + udelay(ARM_RETRY_USEC_INTERVAL); + } + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); I don't think it's a good idea to hold the spin lock for the entire duration of this retry loop. Maybe put that inside the loop and release the lock while waiting in udelay. + + return countdown > 0 ? 0 : -ETIME; } /* get the number of df counters available */ @@ -334,20 +354,20 @@ static void df_v3_6_pmc_get_addr(struct amdgpu_device *adev, switch (target_cntr) { case 0: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo0 : smnPerfMonCtrLo0; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi0 : smnPerfMonCtrHi0; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo4 : smnPerfMonCtrLo4; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi4 : smnPerfMonCtrHi4; break; case 1: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo1 : smnPerfMonCtrLo1; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi1 : smnPerfMonCtrHi1; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo5 : smnPerfMonCtrLo5; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi5 : smnPerfMonCtrHi5; break; case 2: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo2 : smnPerfMonCtrLo2; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi2 : smnPerfMonCtrHi2; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo6 : smnPerfMonCtrLo6; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi6 : smnPerfMonCtrHi6; break; case 3: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo3 : smnPerfMonCtrLo3; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi3 : smnPerfMonCtrHi3; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo7 : smnPerfMonCtrLo7; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi7 : smnPerfMonCtrHi7; break; } @@ -422,6 +442,42 @@ static int df_v3_6_pmc_add_cntr(struct amdgpu_device *adev, return -ENOSPC; } +#define DEFERRED_ARM_MASK (1 << 31) +static int df_v3_6_pmc_defer_cntr(struct amdgpu_device *adev, + uint64_t config, int err) Consider renaming this function. I found its usage confusing because it's used to defer arming as well as clearing the deferred flag. Maybe df_v3_6_pmc_set_deferred. 
The "err" parameter could be named "defer" to better indicate its meaning and maybe make it bool, since that's what's returned by the counterpart df_v3_6_pmc_is_deferred. +{ + int
[PATCH v2 5/5] drm/amdgpu/smu: add metrics table lock for vega20 (v2)
To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/vega20_ppt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c index 2b1c3f8a0415..250ff5aa1305 100644 --- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c @@ -1678,17 +1678,20 @@ static int vega20_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2 2/5] drm/amdgpu/smu: add metrics table lock for arcturus (v2)
To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c index 17eeb546c550..be4ae0aea9a0 100644 --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c @@ -867,18 +867,21 @@ static int arcturus_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2 3/5] drm/amdgpu/smu: add metrics table lock for navi (v2)
To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index 7b42e72dc939..bf87e93b26fc 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -564,17 +564,20 @@ static int navi10_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2 1/5] drm/amdgpu/smu: add metrics table lock
This table is used for lots of things, add its own lock. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 + drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index f76a1717ffbd..936c68298786 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -872,6 +872,7 @@ static int smu_sw_init(void *handle) smu->smu_baco.platform_support = false; mutex_init(&smu->sensor_lock); + mutex_init(&smu->metrics_lock); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index a7d0ad831491..541cfde289ea 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -350,6 +350,7 @@ struct smu_context const struct pptable_funcs *ppt_funcs; struct mutexmutex; struct mutexsensor_lock; + struct mutexmetrics_lock; uint64_t pool_size; struct smu_table_contextsmu_table; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH v2 4/5] drm/amdgpu/smu: add metrics table lock for renoir (v2)
To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/renoir_ppt.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c index 81520b0fca68..979772dbe6a9 100644 --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c @@ -171,17 +171,20 @@ static int renoir_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } @@ -239,8 +242,7 @@ static int renoir_print_clk_levels(struct smu_context *smu, memset(&metrics, 0, sizeof(metrics)); - ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, - (void *)&metrics, false); + ret = renoir_get_metrics_table(smu, &metrics); if (ret) return ret; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV
I agree. Removing the call to pre-reset probably breaks GPU reset for KFD. We call the KFD suspend function in pre-reset, which uses the HIQ to stop any user mode queues still running. If that is not possible because the HIQ is hanging, it should fail with a timeout. There may be something we can do if we know that the HIQ is hanging, so we only update the KFD-internal queue state without actually sending anything to the HIQ. Regards, Felix On 2019-12-17 10:37, shaoyunl wrote: I think amdkfd side depends on this call to stop the user queue, without this call, the user queue can submit to HW during the reset which could cause hang again ... Do we know the root cause why this function would ruin MEC ? From the logic, I think this function should be called before FLR since we need to disable the user queue submission first. I remembered the function should use hiq to communicate with HW , shouldn't use kiq to access HW register, has this been changed ? Regards shaoyun.liu On 2019-12-17 5:19 a.m., Monk Liu wrote: issues: MEC is ruined by the amdkfd_pre_reset after VF FLR done fix: amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation to block this sequence: if we do pre_reset() before VF FLR, it would go KIQ way to do register access and stuck there, because KIQ probably won't work by that time (e.g. you already made GFX hang) so the best way right now is to simply remove it. 
Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 605cef6..ae962b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cfelix.kuehling%40amd.com%7Cbd097404ba8b4e7f9d9308d7830717fe%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637121938908876710&sdata=bNGTZtFLiQ46UwjCa5u8hXG1KUtK%2Fs98g7rBmBtTaPs%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/2] drm/amdgpu/display: include delay.h
On 2019-12-17 3:47 p.m., Alex Deucher wrote: For udelay. This is needed for some platforms. Signed-off-by: Alex Deucher Reviewed-by: Nicholas Kazlauskas I wonder if it makes more sense to include this in os_types.h to avoid these errors in the future. Nicholas Kazlauskas --- drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c index 110c8620907b..bcbc0b8a9aa0 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c @@ -23,6 +23,8 @@ * */ +#include + #include "hdcp.h" static inline enum mod_hdcp_status check_receiver_id_list_ready(struct mod_hdcp *hdcp) ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/2] drm/amdgpu/display: use msleep rather than udelay for HDCP
ARM has a 2000us limit for udelay. Switch to msleep. This code executes in a worker thread so shouldn't be an atomic context. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c index bcbc0b8a9aa0..f730b94ac3c0 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c @@ -153,7 +153,7 @@ static enum mod_hdcp_status poll_l_prime_available(struct mod_hdcp *hdcp) { enum mod_hdcp_status status; uint8_t size; - uint16_t max_wait = 2; // units of us + uint16_t max_wait = 20; // units of ms uint16_t num_polls = 5; uint16_t wait_time = max_wait / num_polls; @@ -161,7 +161,7 @@ static enum mod_hdcp_status poll_l_prime_available(struct mod_hdcp *hdcp) status = MOD_HDCP_STATUS_INVALID_OPERATION; else for (; num_polls; num_polls--) { - udelay(wait_time); + msleep(wait_time); status = mod_hdcp_read_rxstatus(hdcp); if (status != MOD_HDCP_STATUS_SUCCESS) @@ -474,7 +474,7 @@ static enum mod_hdcp_status locality_check(struct mod_hdcp *hdcp, hdcp, "lc_init_write")) goto out; if (is_dp_hdcp(hdcp)) - udelay(16000); + msleep(16); else if (!mod_hdcp_execute_and_set(poll_l_prime_available, &input->l_prime_available_poll, &status, -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/2] drm/amdgpu/display: include delay.h
For udelay. This is needed for some platforms. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c index 110c8620907b..bcbc0b8a9aa0 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c @@ -23,6 +23,8 @@ * */ +#include + #include "hdcp.h" static inline enum mod_hdcp_status check_receiver_id_list_ready(struct mod_hdcp *hdcp) -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [CI-NOTIFY]: TCWG Bisect tcwg_kernel/llvm-release-aarch64-next-allmodconfig - Build # 48 - Successful!
On Tue, Dec 17, 2019 at 09:19:37AM -0800, 'Nick Desaulniers' via Clang Built Linux wrote: > Bhawanpreet, I suspect you're missing the header to include udelay in > drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c. > Can you please send a fix for this? > arm allyesconfig is also broken at link time, which I reported here previously: https://lists.freedesktop.org/archives/amd-gfx/2019-November/043109.html ld.lld: error: undefined symbol: __bad_udelay >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(locality_check) in >>> archive drivers/built-in.a >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(poll_l_prime_available) >>> in archive drivers/built-in.a >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(poll_l_prime_available) >>> in archive drivers/built-in.a >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(poll_l_prime_available) >>> in archive drivers/built-in.a >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(poll_l_prime_available) >>> in archive drivers/built-in.a >>> referenced by hdcp2_execution.c >>> >>> gpu/drm/amd/display/modules/hdcp/hdcp2_execution.o:(poll_l_prime_available) >>> in archive drivers/built-in.a > On Tue, Dec 17, 2019 at 7:07 AM wrote: > > > > Successfully identified regression in *linux* in CI configuration > > tcwg_kernel/llvm-release-aarch64-next-allmodconfig. 
So far, this commit > > has regressed CI configurations: > > - tcwg_kernel/gnu-release-aarch64-next-allmodconfig > > - tcwg_kernel/llvm-master-aarch64-next-allyesconfig > > - tcwg_kernel/llvm-master-arm-next-allmodconfig > > - tcwg_kernel/llvm-release-aarch64-next-allmodconfig > > - tcwg_kernel/llvm-release-arm-next-allmodconfig > > > > Culprit: > > > > commit 51466b3fd2725bfb0de629f71c0854ff276d50ae > > Author: Bhawanpreet Lakha > > > > drm/amd/display: Add execution and transition states for HDCP2.2 > > > > > > First few errors in logs of first_bad: > > 00:03:03 > > drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c:162:4: > > error: implicit declaration of function 'udelay' > > [-Werror,-Wimplicit-function-declaration] > > 00:03:03 > > drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c:472:3: > > error: implicit declaration of function 'udelay' > > [-Werror,-Wimplicit-function-declaration] > > 00:03:03 make[4]: *** > > [drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.o] > > Error 1 > > 00:03:06 make[3]: *** [drivers/gpu/drm/amd/amdgpu] Error 2 > > 00:03:26 make[2]: *** [drivers/gpu/drm] Error 2 > > 00:03:26 make[1]: *** [drivers/gpu] Error 2 > > 00:04:14 make: *** [drivers] Error 2 > > Configuration details: > > rr[llvm_url]="https://github.com/llvm/llvm-project.git"; > > rr[linux_url]="https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git"; > > rr[linux_branch]="32b8acf85223448973ca0bf0ee8149a01410f3a0" > > > > Results regressed to (for first_bad == > > 51466b3fd2725bfb0de629f71c0854ff276d50ae) > > reset_artifacts: > > -10 > > build_llvm: > > -1 > > linux_n_obj: > > 18938 > > > > from (for last_good == eff682f83c9c2030761e7536c5d97e1b20f71c15) > > reset_artifacts: > > -10 > > build_llvm: > > -1 > > linux_n_obj: > > 25911 > > linux build successful: > > all > > > > Artifacts of first_bad build: > > 
https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/build-51466b3fd2725bfb0de629f71c0854ff276d50ae/ > > Artifacts of last_good build: > > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/build-eff682f83c9c2030761e7536c5d97e1b20f71c15/ > > Build top page/logs: > > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/ > > > > Reproduce builds: > > > > mkdir investigate-linux-51466b3fd2725bfb0de629f71c0854ff276d50ae > > cd investigate-linux-51466b3fd2725bfb0de629f71c0854ff276d50ae > > > > git clone https://git.linaro.org/toolchain/jenkins-scripts > > > > mkdir -p artifacts/manifests > > curl -o artifacts/manifests/build-baseline.sh > > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/manifests/build-baseline.sh > > curl -o artifacts/manifests/build-parameters.sh > > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/manifests/build-parameters.sh > > curl -o artifacts/test.sh > > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/test.sh > > chmod +x artifacts/test.sh > > > > # Reproduce the baseline build (build all pre-requisites) > > ./jenkins-scripts/tcwg_kernel-build.sh @@
Re: [PATCH 3/5] drm/amdgpu/smu: add metrics table lock for navi
[AMD Official Use Only - Internal Distribution Only] yeah, they need some fixes. Alex From: Pelloux-prayer, Pierre-eric Sent: Tuesday, December 17, 2019 1:56 PM To: Alex Deucher ; amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander Subject: Re: [PATCH 3/5] drm/amdgpu/smu: add metrics table lock for navi Hi Alex, Isn't this patch missing something like this: pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; to release the lock in case of error? Regards, Pierre-Eric On 17/12/2019 15:55, Alex Deucher wrote: > To protect access to the metrics table. > > Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > index 15403b7979d6..102fddda925b 100644 > --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > @@ -564,6 +564,7 @@ static int navi10_get_metrics_table(struct smu_context > *smu, >struct smu_table_context *smu_table= &smu->smu_table; >int ret = 0; > > + mutex_lock(&smu->metrics_lock); >if (!smu_table->metrics_time || time_after(jiffies, > smu_table->metrics_time + msecs_to_jiffies(100))) { >ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, >(void *)smu_table->metrics_table, false); > @@ -575,6 +576,7 @@ static int navi10_get_metrics_table(struct smu_context > *smu, >} > >memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); > + mutex_unlock(&smu->metrics_lock); > >return ret; > } > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 3/5] drm/amdgpu/smu: add metrics table lock for navi
Hi Alex, Isn't this patch missing something like this: pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; to release the lock in case of error? Regards, Pierre-Eric On 17/12/2019 15:55, Alex Deucher wrote: > To protect access to the metrics table. > > Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > index 15403b7979d6..102fddda925b 100644 > --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c > @@ -564,6 +564,7 @@ static int navi10_get_metrics_table(struct smu_context > *smu, > struct smu_table_context *smu_table= &smu->smu_table; > int ret = 0; > > + mutex_lock(&smu->metrics_lock); > if (!smu_table->metrics_time || time_after(jiffies, > smu_table->metrics_time + msecs_to_jiffies(100))) { > ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, > (void *)smu_table->metrics_table, false); > @@ -575,6 +576,7 @@ static int navi10_get_metrics_table(struct smu_context > *smu, > } > > memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); > + mutex_unlock(&smu->metrics_lock); > > return ret; > } > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/5] drm/amdgpu/smu: add metrics table lock
[AMD Official Use Only - Internal Distribution Only] the swSMU should be add metrics lock to protect the maintenance data of the metrics table. The series patches are Reviewed-by: Kevin Wang Best Regards, Kevin From: amd-gfx on behalf of Alex Deucher Sent: Tuesday, December 17, 2019 10:55 PM To: amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander Subject: [PATCH 1/5] drm/amdgpu/smu: add metrics table lock This table is used for lots of things, add it's own lock. Bug: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fdrm%2Famd%2Fissues%2F900&data=02%7C01%7CKevin1.Wang%40amd.com%7C4ea0cd2cfad44f285ffb08d7830121d7%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637121913347060555&sdata=Az1dOiYWPr%2FJIvTgo35a7a9oTnnpCVvtSnA85mgExf8%3D&reserved=0 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 + drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 67818558..6177a6664737 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -872,6 +872,7 @@ static int smu_sw_init(void *handle) smu->smu_baco.platform_support = false; mutex_init(&smu->sensor_lock); + mutex_init(&smu->metrics_lock); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index ca3fdc6777cf..503099f254c1 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -350,6 +350,7 @@ struct smu_context const struct pptable_funcs *ppt_funcs; struct mutexmutex; struct mutexsensor_lock; + struct mutexmetrics_lock; uint64_t pool_size; struct smu_table_contextsmu_table; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org 
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7CKevin1.Wang%40amd.com%7C4ea0cd2cfad44f285ffb08d7830121d7%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637121913347070548&sdata=EwWmrrJWWxG14kfkuXeM4YPA9odQI2gWyq0iT4pOXCQ%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Tue, Dec 17, 2019 at 12:41:06PM -0500, Alex Deucher wrote: > On Tue, Dec 17, 2019 at 11:46 AM Daniel Vetter wrote: > > > > On Tue, Dec 17, 2019 at 09:17:51AM -0500, Alex Deucher wrote: > > > On Tue, Dec 17, 2019 at 8:47 AM Alex Deucher > > > wrote: > > > > > > > > On Tue, Dec 17, 2019 at 7:52 AM Daniel Vetter wrote: > > > > > > > > > > On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > > > > > > Hi Dave, Daniel, > > > > > > > > > > > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. > > > > > > We > > > > > > added a new firmware for display, and this just adds the version > > > > > > query > > > > > > to our existing firmware query interface. UMDs like mesa use this > > > > > > interface to > > > > > > query things like CP or UVD firmware versions to see what features > > > > > > are > > > > > > supported. > > > > > > > > > > I got bored, and a quick serach for what the userspace for > > > > > AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any > > > > > patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't > > > > > come > > > > > with pointers. From the patch series description I have no idea why > > > > > you'd > > > > > even want this in userspace (but then I stopped being surprised by hw > > > > > design long ago). > > > > > > > > > > > > > We expose all the firmwares via the same interface for consistency, > > > > but the only ones user space generally cares about are the versions > > > > for the acceleration engines like gfx and multimedia. I can revert it > > > > if it's a big deal, but I'd prefer to keep it for consistency since > > > > all the others are already available via the same interface. It's not > > > > really a new interace with no user per se. > > > > Imo not the best style adding uapi just in case. We have a lot of that > > which then ended up (in other drivers at least) being for some hacks for > > blobs or vendor trees and stuff like that. 
So personally I'd lean towards > > just taking all the ones out you don't need (but keep the debugfs ofc), > > but meh. > > > > > Also, there are a few minor conflicts. I backmerged drm-next into my > > > drm-next branch if that is helpful. I can also send another PR with > > > the backmerge if you'd prefer. > > > > Looks like you didn't push, and I've thrown in the towel on the wm stuff. > > I honestly wonder how exactly you validate this stuff internally, this is > > almost as bad as i915 :-) > > > > Fixing your scripts to also push your validated integration tree (whatever > > funny kernel version that's based on, at least I'm assuming you're testing > > the merged version somewhere) might be really good here. Or use dim push, > > so that the git rr-cache is shared. > > Here's the relevant backmerge in my drm-next branch: > https://cgit.freedesktop.org/~agd5f/linux/commit/?h=drm-next&id=a759ca47934e83a117a7095a5fd9b91e62a91a0c > > And here's the standalong branch with just the merge on top of my last PR: > https://cgit.freedesktop.org/~agd5f/linux/log/?h=drm-next-5.6-backmerge Oh silly me didn't scroll down enough. btw there's a bunch of other things now in drm/amd from drm-misc, I think you want to redo your backmerge (and double check what I've done). Pulled, thanks for the pile. -Daniel > > Thanks! 
> > Alex > > > > > Thanks, Daniel > > > > > > > > Alex > > > > > > > Alex > > > > > > > > > Otherwise looks all good, no complaints from dim at least :-) > > > > > -Daniel > > > > > > > > > > > > > > > > > The following changes since commit > > > > > > 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > > > > > > > > > > > drm/amdgpu/vcn: finish delay work before release resources > > > > > > (2019-11-13 15:29:42 -0500) > > > > > > > > > > > > are available in the Git repository at: > > > > > > > > > > > > git://people.freedesktop.org/~agd5f/linux > > > > > > tags/drm-next-5.6-2019-12-11 > > > > > > > > > > > > for you to fetch changes up to > > > > > > ad808910be68dcf8da5d837d4511d00ad5d3678a: > > > > > > > > > > > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 > > > > > > 15:22:08 -0500) > > > > > > > > > > > > > > > > > > drm-next-5.6-2019-12-11: > > > > > > > > > > > > amdgpu: > > > > > > - Add MST atomic routines > > > > > > - Add support for DMCUB (new helper microengine for displays) > > > > > > - Add OEM i2c support in DC > > > > > > - Use vstartup for vblank events on DCN > > > > > > - Simplify Kconfig for DC > > > > > > - Renoir fixes for DC > > > > > > - Clean up function pointers in DC > > > > > > - Initial support for HDCP 2.x > > > > > > - Misc code cleanups > > > > > > - GFX10 fixes > > > > > > - Rework JPEG engine handling for VCN > > > > > > - Add clock and power gating support for JPEG > > > > > > - BACO support for Arcturus > > > > > > - Cleanup PSP ring handling > > > > > > - Add framework for using BACO with runtime pm to save power > > > > > > -
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Tue, Dec 17, 2019 at 11:46 AM Daniel Vetter wrote: > > On Tue, Dec 17, 2019 at 09:17:51AM -0500, Alex Deucher wrote: > > On Tue, Dec 17, 2019 at 8:47 AM Alex Deucher wrote: > > > > > > On Tue, Dec 17, 2019 at 7:52 AM Daniel Vetter wrote: > > > > > > > > On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > > > > > Hi Dave, Daniel, > > > > > > > > > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. > > > > > We > > > > > added a new firmware for display, and this just adds the version query > > > > > to our existing firmware query interface. UMDs like mesa use this > > > > > interface to > > > > > query things like CP or UVD firmware versions to see what features are > > > > > supported. > > > > > > > > I got bored, and a quick serach for what the userspace for > > > > AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any > > > > patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't > > > > come > > > > with pointers. From the patch series description I have no idea why > > > > you'd > > > > even want this in userspace (but then I stopped being surprised by hw > > > > design long ago). > > > > > > > > > > We expose all the firmwares via the same interface for consistency, > > > but the only ones user space generally cares about are the versions > > > for the acceleration engines like gfx and multimedia. I can revert it > > > if it's a big deal, but I'd prefer to keep it for consistency since > > > all the others are already available via the same interface. It's not > > > really a new interace with no user per se. > > Imo not the best style adding uapi just in case. We have a lot of that > which then ended up (in other drivers at least) being for some hacks for > blobs or vendor trees and stuff like that. So personally I'd lean towards > just taking all the ones out you don't need (but keep the debugfs ofc), > but meh. > > > Also, there are a few minor conflicts. 
I backmerged drm-next into my > > drm-next branch if that is helpful. I can also send another PR with > > the backmerge if you'd prefer. > > Looks like you didn't push, and I've thrown in the towel on the wm stuff. > I honestly wonder how exactly you validate this stuff internally, this is > almost as bad as i915 :-) > > Fixing your scripts to also push your validated integration tree (whatever > funny kernel version that's based on, at least I'm assuming you're testing > the merged version somewhere) might be really good here. Or use dim push, > so that the git rr-cache is shared. Here's the relevant backmerge in my drm-next branch: https://cgit.freedesktop.org/~agd5f/linux/commit/?h=drm-next&id=a759ca47934e83a117a7095a5fd9b91e62a91a0c And here's the standalong branch with just the merge on top of my last PR: https://cgit.freedesktop.org/~agd5f/linux/log/?h=drm-next-5.6-backmerge Thanks! Alex > > Thanks, Daniel > > > > > Alex > > > > > Alex > > > > > > > Otherwise looks all good, no complaints from dim at least :-) > > > > -Daniel > > > > > > > > > > > > > > The following changes since commit > > > > > 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > > > > > > > > > drm/amdgpu/vcn: finish delay work before release resources > > > > > (2019-11-13 15:29:42 -0500) > > > > > > > > > > are available in the Git repository at: > > > > > > > > > > git://people.freedesktop.org/~agd5f/linux > > > > > tags/drm-next-5.6-2019-12-11 > > > > > > > > > > for you to fetch changes up to > > > > > ad808910be68dcf8da5d837d4511d00ad5d3678a: > > > > > > > > > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 > > > > > 15:22:08 -0500) > > > > > > > > > > > > > > > drm-next-5.6-2019-12-11: > > > > > > > > > > amdgpu: > > > > > - Add MST atomic routines > > > > > - Add support for DMCUB (new helper microengine for displays) > > > > > - Add OEM i2c support in DC > > > > > - Use vstartup for vblank events on DCN > > > > > - Simplify Kconfig for DC > > > > > - Renoir fixes for DC > 
> > > > - Clean up function pointers in DC > > > > > - Initial support for HDCP 2.x > > > > > - Misc code cleanups > > > > > - GFX10 fixes > > > > > - Rework JPEG engine handling for VCN > > > > > - Add clock and power gating support for JPEG > > > > > - BACO support for Arcturus > > > > > - Cleanup PSP ring handling > > > > > - Add framework for using BACO with runtime pm to save power > > > > > - Move core pci state handling out of the driver for pm ops > > > > > - Allow guest power control in 1 VF case with SR-IOV > > > > > - SR-IOV fixes > > > > > - RAS fixes > > > > > - Support for power metrics on renoir > > > > > - Golden settings updates for gfx10 > > > > > - Enable gfxoff on supported navi10 skus > > > > > - Update MAINTAINERS > > > > > > > > > > amdkfd: > > > > > - Clean up generational gfx code > > > > > - Fixes for gfx10 > > > > > - DIQ fixes > > > > > - Share more code with amdgpu > > > > > > > > > > radeon: > > > > >
Re: [CI-NOTIFY]: TCWG Bisect tcwg_kernel/llvm-release-aarch64-next-allmodconfig - Build # 48 - Successful!
Bhawanpreet, I suspect you're missing the header to include udelay in drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c. Can you please send a fix for this? On Tue, Dec 17, 2019 at 7:07 AM wrote: > > Successfully identified regression in *linux* in CI configuration > tcwg_kernel/llvm-release-aarch64-next-allmodconfig. So far, this commit has > regressed CI configurations: > - tcwg_kernel/gnu-release-aarch64-next-allmodconfig > - tcwg_kernel/llvm-master-aarch64-next-allyesconfig > - tcwg_kernel/llvm-master-arm-next-allmodconfig > - tcwg_kernel/llvm-release-aarch64-next-allmodconfig > - tcwg_kernel/llvm-release-arm-next-allmodconfig > > Culprit: > > commit 51466b3fd2725bfb0de629f71c0854ff276d50ae > Author: Bhawanpreet Lakha > > drm/amd/display: Add execution and transition states for HDCP2.2 > > > First few errors in logs of first_bad: > 00:03:03 > drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c:162:4: > error: implicit declaration of function 'udelay' > [-Werror,-Wimplicit-function-declaration] > 00:03:03 > drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.c:472:3: > error: implicit declaration of function 'udelay' > [-Werror,-Wimplicit-function-declaration] > 00:03:03 make[4]: *** > [drivers/gpu/drm/amd/amdgpu/../display/modules/hdcp/hdcp2_execution.o] Error 1 > 00:03:06 make[3]: *** [drivers/gpu/drm/amd/amdgpu] Error 2 > 00:03:26 make[2]: *** [drivers/gpu/drm] Error 2 > 00:03:26 make[1]: *** [drivers/gpu] Error 2 > 00:04:14 make: *** [drivers] Error 2 > Configuration details: > rr[llvm_url]="https://github.com/llvm/llvm-project.git"; > rr[linux_url]="https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git"; > rr[linux_branch]="32b8acf85223448973ca0bf0ee8149a01410f3a0" > > Results regressed to (for first_bad == > 51466b3fd2725bfb0de629f71c0854ff276d50ae) > reset_artifacts: > -10 > build_llvm: > -1 > linux_n_obj: > 18938 > > from (for last_good == eff682f83c9c2030761e7536c5d97e1b20f71c15) > 
reset_artifacts: > -10 > build_llvm: > -1 > linux_n_obj: > 25911 > linux build successful: > all > > Artifacts of first_bad build: > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/build-51466b3fd2725bfb0de629f71c0854ff276d50ae/ > Artifacts of last_good build: > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/build-eff682f83c9c2030761e7536c5d97e1b20f71c15/ > Build top page/logs: > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/ > > Reproduce builds: > > mkdir investigate-linux-51466b3fd2725bfb0de629f71c0854ff276d50ae > cd investigate-linux-51466b3fd2725bfb0de629f71c0854ff276d50ae > > git clone https://git.linaro.org/toolchain/jenkins-scripts > > mkdir -p artifacts/manifests > curl -o artifacts/manifests/build-baseline.sh > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/manifests/build-baseline.sh > curl -o artifacts/manifests/build-parameters.sh > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/manifests/build-parameters.sh > curl -o artifacts/test.sh > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/test.sh > chmod +x artifacts/test.sh > > # Reproduce the baseline build (build all pre-requisites) > ./jenkins-scripts/tcwg_kernel-build.sh @@ > artifacts/manifests/build-baseline.sh > > cd linux > > # Reproduce first_bad build > git checkout --detach 51466b3fd2725bfb0de629f71c0854ff276d50ae > ../artifacts/test.sh > > # Reproduce last_good build > git checkout --detach eff682f83c9c2030761e7536c5d97e1b20f71c15 > ../artifacts/test.sh > > cd .. 
> > > History of pending regressions and results: > https://git.linaro.org/toolchain/ci/base-artifacts.git/log/?h=linaro-local/ci/tcwg_kernel/llvm-release-aarch64-next-allmodconfig > > Artifacts: > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/artifact/artifacts/ > Build log: > https://ci.linaro.org/job/tcwg_kernel-bisect-llvm-release-aarch64-next-allmodconfig/48/consoleText > > Full commit: > > commit 51466b3fd2725bfb0de629f71c0854ff276d50ae > Author: Bhawanpreet Lakha > Date: Wed Sep 18 11:18:15 2019 -0400 > > drm/amd/display: Add execution and transition states for HDCP2.2 > > The module works like a state machine > > +-+ > --> | Execution.c | -- > | +-+ | > | V > ++ ++ +--+ > | DM |->| Hdcp.c | < | Transition.c | > ++<-++ +--+ > > This patch a
[PATCH 1/2] drm/amdgpu: add perfmons accessible during df c-states
During DF C-State, Perfmon counters outside of range 1D700-1D7FF will encounter SLVERR affecting xGMI performance monitoring. PerfmonCtr[7:4] is being added to avoid SLVERR during read since it falls within this range. PerfmonCtl[7:4] is being added in order to arm PerfmonCtr[7:4]. Since PerfmonCtl[7:4] exists outside of range 1D700-1D7FF, DF routines will be enabled to opportunistically re-arm PerfmonCtl[7:4] on retry after SLVERR. Signed-off-by: Jonathan Kim Acked-by: Alex Deucher --- .../drm/amd/include/asic_reg/df/df_3_6_offset.h | 16 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/include/asic_reg/df/df_3_6_offset.h b/drivers/gpu/drm/amd/include/asic_reg/df/df_3_6_offset.h index c2bd25589e84..f301e5fe2109 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/df/df_3_6_offset.h +++ b/drivers/gpu/drm/amd/include/asic_reg/df/df_3_6_offset.h @@ -38,6 +38,14 @@ #define smnPerfMonCtlHi2 0x01d464UL #define smnPerfMonCtlLo3 0x01d470UL #define smnPerfMonCtlHi3 0x01d474UL +#define smnPerfMonCtlLo4 0x01d880UL +#define smnPerfMonCtlHi4 0x01d884UL +#define smnPerfMonCtlLo5 0x01d888UL +#define smnPerfMonCtlHi5 0x01d88cUL +#define smnPerfMonCtlLo6 0x01d890UL +#define smnPerfMonCtlHi6 0x01d894UL +#define smnPerfMonCtlLo7 0x01d898UL +#define smnPerfMonCtlHi7 0x01d89cUL #define smnPerfMonCtrLo0 0x01d448UL #define smnPerfMonCtrHi0 0x01d44cUL @@ -47,6 +55,14 @@ #define smnPerfMonCtrHi2 0x01d46cUL #define smnPerfMonCtrLo3 0x01d478UL #define smnPerfMonCtrHi3 0x01d47cUL +#define smnPerfMonCtrLo4 0x01d790UL +#define smnPerfMonCtrHi4 0x01d794UL +#define smnPerfMonCtrLo5 0x01d798UL +#define smnPerfMonCtrHi5 0x01d79cUL +#define smnPerfMonCtrLo6 0x01d7a0UL +#define smnPerfMonCtrHi6 0x01d7a4UL +#define smnPerfMonCtrLo7 0x01d7a8UL +#define smnPerfMonCtrHi7 0x01d7acUL #define smnDF_PIE_AON_FabricIndirectConfigAccessAddress3 0x1d05cUL #define smnDF_PIE_AON_FabricIndirectConfigAccessDataLo3 0x1d098UL -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org 
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/2] drm/amdgpu: attempt xgmi perfmon re-arm on failed arm
The DF routines to arm xGMI performance will attempt to re-arm both on performance monitoring start and read on initial failure to arm. Signed-off-by: Jonathan Kim --- drivers/gpu/drm/amd/amdgpu/df_v3_6.c | 153 --- 1 file changed, 117 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c index 4043ebcea5de..af445054305f 100644 --- a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c +++ b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c @@ -162,25 +162,45 @@ static void df_v3_6_perfmon_rreg(struct amdgpu_device *adev, } /* - * df_v3_6_perfmon_wreg - write to perfmon lo and hi - * - * required to be atomic. no mmio method provided so subsequent reads after - * data writes cannot occur to preserve data fabrics finite state machine. + * retry arming counters every 100 usecs within 1 millisecond interval. + * if retry fails after time out, return error. */ -static void df_v3_6_perfmon_wreg(struct amdgpu_device *adev, uint32_t lo_addr, - uint32_t lo_val, uint32_t hi_addr, uint32_t hi_val) +#define ARM_RETRY_USEC_TIMEOUT 1000 +#define ARM_RETRY_USEC_INTERVAL100 +static int df_v3_6_perfmon_arm_with_retry(struct amdgpu_device *adev, + uint32_t lo_addr, uint32_t lo_val, + uint32_t hi_addr, uint32_t hi_val) { unsigned long flags, address, data; + uint32_t lo_val_rb, hi_val_rb; + int countdown = ARM_RETRY_USEC_TIMEOUT; address = adev->nbio.funcs->get_pcie_index_offset(adev); data = adev->nbio.funcs->get_pcie_data_offset(adev); spin_lock_irqsave(&adev->pcie_idx_lock, flags); - WREG32(address, lo_addr); - WREG32(data, lo_val); - WREG32(address, hi_addr); - WREG32(data, hi_val); + + while (countdown) { + WREG32(address, lo_addr); + WREG32(data, lo_val); + WREG32(address, hi_addr); + WREG32(data, hi_val); + + WREG32(address, lo_addr); + lo_val_rb = RREG32(data); + WREG32(address, hi_addr); + hi_val_rb = RREG32(data); + + if (lo_val == lo_val_rb && hi_val == hi_val_rb) + break; + + countdown -= ARM_RETRY_USEC_INTERVAL; + 
udelay(ARM_RETRY_USEC_INTERVAL); + } + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); + + return countdown > 0 ? 0 : -ETIME; } /* get the number of df counters available */ @@ -334,20 +354,20 @@ static void df_v3_6_pmc_get_addr(struct amdgpu_device *adev, switch (target_cntr) { case 0: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo0 : smnPerfMonCtrLo0; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi0 : smnPerfMonCtrHi0; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo4 : smnPerfMonCtrLo4; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi4 : smnPerfMonCtrHi4; break; case 1: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo1 : smnPerfMonCtrLo1; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi1 : smnPerfMonCtrHi1; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo5 : smnPerfMonCtrLo5; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi5 : smnPerfMonCtrHi5; break; case 2: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo2 : smnPerfMonCtrLo2; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi2 : smnPerfMonCtrHi2; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo6 : smnPerfMonCtrLo6; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi6 : smnPerfMonCtrHi6; break; case 3: - *lo_base_addr = is_ctrl ? smnPerfMonCtlLo3 : smnPerfMonCtrLo3; - *hi_base_addr = is_ctrl ? smnPerfMonCtlHi3 : smnPerfMonCtrHi3; + *lo_base_addr = is_ctrl ? smnPerfMonCtlLo7 : smnPerfMonCtrLo7; + *hi_base_addr = is_ctrl ? smnPerfMonCtlHi7 : smnPerfMonCtrHi7; break; } @@ -422,6 +442,42 @@ static int df_v3_6_pmc_add_cntr(struct amdgpu_device *adev, return -ENOSPC; } +#define DEFERRED_ARM_MASK (1 << 31) +static int df_v3_6_pmc_defer_cntr(struct amdgpu_device *adev, + uint64_t config, int err) +{ + int target_cntr; + + target_cntr = df_v3_6_pmc_config_2_cntr(adev, config); + + if (target_cntr < 0) + return -EINVAL; + + if (err) + adev->df_perfmon_config_assign_mask[target_cntr] |= + DEFERRED_ARM_MASK; + else + adev->df_perfmon_config_assign_mask[target_cntr] &= + ~DEFERRED_ARM_MASK; + + return 0; +} + +static bool df_v3_6_pmc_is_deferred(struct amdgpu_device *adev, +
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Tue, Dec 17, 2019 at 09:17:51AM -0500, Alex Deucher wrote: > On Tue, Dec 17, 2019 at 8:47 AM Alex Deucher wrote: > > > > On Tue, Dec 17, 2019 at 7:52 AM Daniel Vetter wrote: > > > > > > On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > > > > Hi Dave, Daniel, > > > > > > > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. We > > > > added a new firmware for display, and this just adds the version query > > > > to our existing firmware query interface. UMDs like mesa use this > > > > interface to > > > > query things like CP or UVD firmware versions to see what features are > > > > supported. > > > > > > I got bored, and a quick serach for what the userspace for > > > AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any > > > patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't come > > > with pointers. From the patch series description I have no idea why you'd > > > even want this in userspace (but then I stopped being surprised by hw > > > design long ago). > > > > > > > We expose all the firmwares via the same interface for consistency, > > but the only ones user space generally cares about are the versions > > for the acceleration engines like gfx and multimedia. I can revert it > > if it's a big deal, but I'd prefer to keep it for consistency since > > all the others are already available via the same interface. It's not > > really a new interace with no user per se. Imo not the best style adding uapi just in case. We have a lot of that which then ended up (in other drivers at least) being for some hacks for blobs or vendor trees and stuff like that. So personally I'd lean towards just taking all the ones out you don't need (but keep the debugfs ofc), but meh. > Also, there are a few minor conflicts. I backmerged drm-next into my > drm-next branch if that is helpful. I can also send another PR with > the backmerge if you'd prefer. 
Looks like you didn't push, and I've thrown in the towel on the wm stuff. I honestly wonder how exactly you validate this stuff internally, this is almost as bad as i915 :-) Fixing your scripts to also push your validated integration tree (whatever funny kernel version that's based on, at least I'm assuming you're testing the merged version somewhere) might be really good here. Or use dim push, so that the git rr-cache is shared. Thanks, Daniel > > Alex > > > Alex > > > > > Otherwise looks all good, no complaints from dim at least :-) > > > -Daniel > > > > > > > > > > > The following changes since commit > > > > 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > > > > > > > drm/amdgpu/vcn: finish delay work before release resources > > > > (2019-11-13 15:29:42 -0500) > > > > > > > > are available in the Git repository at: > > > > > > > > git://people.freedesktop.org/~agd5f/linux tags/drm-next-5.6-2019-12-11 > > > > > > > > for you to fetch changes up to ad808910be68dcf8da5d837d4511d00ad5d3678a: > > > > > > > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 15:22:08 > > > > -0500) > > > > > > > > > > > > drm-next-5.6-2019-12-11: > > > > > > > > amdgpu: > > > > - Add MST atomic routines > > > > - Add support for DMCUB (new helper microengine for displays) > > > > - Add OEM i2c support in DC > > > > - Use vstartup for vblank events on DCN > > > > - Simplify Kconfig for DC > > > > - Renoir fixes for DC > > > > - Clean up function pointers in DC > > > > - Initial support for HDCP 2.x > > > > - Misc code cleanups > > > > - GFX10 fixes > > > > - Rework JPEG engine handling for VCN > > > > - Add clock and power gating support for JPEG > > > > - BACO support for Arcturus > > > > - Cleanup PSP ring handling > > > > - Add framework for using BACO with runtime pm to save power > > > > - Move core pci state handling out of the driver for pm ops > > > > - Allow guest power control in 1 VF case with SR-IOV > > > > - SR-IOV fixes > > > > - RAS fixes > > > > - Support for 
power metrics on renoir > > > > - Golden settings updates for gfx10 > > > > - Enable gfxoff on supported navi10 skus > > > > - Update MAINTAINERS > > > > > > > > amdkfd: > > > > - Clean up generational gfx code > > > > - Fixes for gfx10 > > > > - DIQ fixes > > > > - Share more code with amdgpu > > > > > > > > radeon: > > > > - PPC DMA fix > > > > - Register checker fixes for r1xx/r2xx > > > > - Misc cleanups > > > > > > > > > > > > Alex Deucher (34): > > > > drm/amdgpu/display: fix the build when CONFIG_DRM_AMD_DC_DCN is > > > > not set > > > > drm/amdgpu/display: fix warning when CONFIG_DRM_AMD_DC_DCN is not > > > > set > > > > drm/amdgpu/soc15: move struct definition around to align with > > > > other soc15 asics > > > > drm/amdgpu/nv: add asic func for fetching vbios from rom directly > > > > drm/amdgpu/powerplay: properly set PP_GFXOFF_MASK (v2) > > > >
Re: [PATCH v3] drm/amd/display: Fix AppleDongle can't be detected
On 2019-12-11 2:33 a.m., Louis Li wrote: > [Why] > External monitor cannot be displayed consistently, if connecting > via this Apple dongle (A1621, USB Type-C to HDMI). > Experiments prove that the dongle needs 200ms at least to be ready > for communication, after it drives HPD signal high, and DPCD cannot > be read correctly during the period, even reading it repeatedly. > In such a case, driver does not perform link training because of no DPCD. > > [How] > When driver is run to the modified point, EDID is read correctly > and dpcd_sink_count of link is not zero. Therefore, link training > should be successfully performed. Which implies parameters should > be updated, e.g. lane count, link rate, etc. Checking parameters, > if values of those parameters are zero, link training is not > performed. So, do link-training to have detection completed. > > With this patch applied, the problem cannot be reproduced. > Testing other dongles, results are PASS. > Patch(v3) is verified PASS by both AMD internal lab and customer. > > > Signed-off-by: Louis Li > --- > drivers/gpu/drm/amd/display/dc/core/dc_link.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c > b/drivers/gpu/drm/amd/display/dc/core/dc_link.c > index 7372dedd2f48..6188edc92d0f 100644 > --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c > +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c > @@ -725,7 +725,9 @@ bool dc_link_detect(struct dc_link *link, enum > dc_detect_reason reason) > > if (link->connector_signal == SIGNAL_TYPE_DISPLAY_PORT && > sink_caps.transaction_type == > DDC_TRANSACTION_TYPE_I2C_OVER_AUX && > - reason != DETECT_REASON_HPDRX) { Do we need to drop this line? This looks like it'll break the previous fix here. It looks like Abdoulaye added this here to fix the 400.1.1 DP compliance test. 
If you can check with him that your solution is fine and make sure to test that you can get a consistent pass of 400.1.1 over 30 runs I'm okay to take the change. Harry > + link->verified_link_cap.lane_count == 0 && > + link->verified_link_cap.link_rate == 0 && > + link->verified_link_cap.link_spread == 0) { > /* >* TODO debug why Dell 2413 doesn't like >* two link trainings > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/3] drm/amdgpu: wait for all rings to drain before runtime suspending
Reviewed-by: Andrey Grodzovsky Andrey On 12/16/19 12:18 PM, Alex Deucher wrote: Add a safety check to runtime suspend to make sure all outstanding fences have signaled before we suspend. Doesn't fix any known issue. We already do this via the fence driver suspend function, but we just force completion rather than bailing. This bails on runtime suspend so we can try again later once the fences are signaled to avoid missing any outstanding work. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index d1e9946ac218..61dc26515c7e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1214,13 +1214,23 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct drm_device *drm_dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_dev->dev_private; - int ret; + int ret, i; if (!adev->runpm) { pm_runtime_forbid(dev); return -EBUSY; } + /* wait for all rings to drain before suspending */ + for (i = 0; i < AMDGPU_MAX_RINGS; i++) { + struct amdgpu_ring *ring = adev->rings[i]; + if (ring && ring->sched.ready) { + ret = amdgpu_fence_wait_empty(ring); + if (ret) + return -EBUSY; + } + } + if (amdgpu_device_supports_boco(drm_dev)) drm_dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; drm_kms_helper_poll_disable(drm_dev); ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV
I think amdkfd side depends on this call to stop the user queue, without this call, the user queue can submit to HW during the reset which could cause hang again ... Do we know the root cause why this function would ruin MEC ? From the logic, I think this function should be called before FLR since we need to disable the user queue submission first. I remembered the function should use hiq to communicate with HW, shouldn't use kiq to access HW register, has this been changed ? Regards shaoyun.liu On 2019-12-17 5:19 a.m., Monk Liu wrote: issues: MEC is ruined by the amdkfd_pre_reset after VF FLR done fix: amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation to block this sequence: if we do pre_reset() before VF FLR, it would go KIQ way to do register access and stuck there, because KIQ probably won't work by that time (e.g. you already made GFX hang) so the best way right now is to simply remove it. Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 605cef6..ae962b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu: move umc offset to one new header file for Arcturus
[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Alex Deucher From: Chen, Guchun Sent: Tuesday, December 17, 2019 4:08 AM To: Clements, John ; Zhang, Hawking ; Deucher, Alexander ; amd-gfx@lists.freedesktop.org Cc: Chen, Guchun Subject: [PATCH] drm/amdgpu: move umc offset to one new header file for Arcturus Fixes: 9686563c4c42 drm/amdgpu: Added RAS UMC error query support for Arcturus Code refactor and no functional change. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 17 +- .../include/asic_reg/umc/umc_6_1_2_offset.h | 32 +++ 2 files changed, 33 insertions(+), 16 deletions(-) create mode 100644 drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c index 515eb50cd0f8..5093965dbc24 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c @@ -28,17 +28,10 @@ #include "rsmu/rsmu_0_0_2_sh_mask.h" #include "umc/umc_6_1_1_offset.h" #include "umc/umc_6_1_1_sh_mask.h" +#include "umc/umc_6_1_2_offset.h" #define smnMCA_UMC0_MCUMC_ADDRT00x50f10 -/* UMC 6_1_2 register offsets */ -#define mmUMCCH0_0_EccErrCntSel_ARCT 0x0360 -#define mmUMCCH0_0_EccErrCntSel_ARCT_BASE_IDX1 -#define mmUMCCH0_0_EccErrCnt_ARCT0x0361 -#define mmUMCCH0_0_EccErrCnt_ARCT_BASE_IDX 1 -#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT 0x03c2 -#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT_BASE_IDX 1 - /* * (addr / 256) * 8192, the higher 26 bits in ErrorAddr * is the index of 8KB block @@ -105,7 +98,6 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT); ecc_err_cnt_addr = @@ -114,7 +106,6 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - 
ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); ecc_err_cnt_addr = @@ -164,12 +155,10 @@ static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); } @@ -211,12 +200,10 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); } @@ -282,14 +269,12 @@ static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT); ecc_err_cnt_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT); } else { /* UMC 6_1_1 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); ecc_err_cnt_addr = diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h new file mode 100644 index ..3e79a8056556 --- /dev/null +++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2019 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall b
[PATCH 3/5] drm/amdgpu/smu: add metrics table lock for navi
To protect access to the metrics table. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index 15403b7979d6..102fddda925b 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -564,6 +564,7 @@ static int navi10_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); @@ -575,6 +576,7 @@ static int navi10_get_metrics_table(struct smu_context *smu, } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/5] drm/amdgpu/smu: add metrics table lock for arcturus
To protect access to the metrics table. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c index 17eeb546c550..bd5bb7040638 100644 --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c @@ -867,6 +867,7 @@ static int arcturus_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, @@ -879,6 +880,7 @@ static int arcturus_get_metrics_table(struct smu_context *smu, } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/5] drm/amdgpu/smu: add metrics table lock
This table is used for lots of things, add it's own lock. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 + drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 67818558..6177a6664737 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -872,6 +872,7 @@ static int smu_sw_init(void *handle) smu->smu_baco.platform_support = false; mutex_init(&smu->sensor_lock); + mutex_init(&smu->metrics_lock); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index ca3fdc6777cf..503099f254c1 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -350,6 +350,7 @@ struct smu_context const struct pptable_funcs *ppt_funcs; struct mutexmutex; struct mutexsensor_lock; + struct mutexmetrics_lock; uint64_t pool_size; struct smu_table_contextsmu_table; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 5/5] drm/amdgpu/smu: add metrics table lock for vega20
To protect access to the metrics table. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/vega20_ppt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c index 12bcc3e3ba99..740cf62e74f3 100644 --- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c @@ -1678,6 +1678,7 @@ static int vega20_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); @@ -1689,6 +1690,7 @@ static int vega20_get_metrics_table(struct smu_context *smu, } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 4/5] drm/amdgpu/smu: add metrics table lock for renoir
To protect access to the metrics table. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/renoir_ppt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c index 81520b0fca68..8e723022be3e 100644 --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c @@ -171,6 +171,7 @@ static int renoir_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); @@ -182,6 +183,7 @@ static int renoir_get_metrics_table(struct smu_context *smu, } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } @@ -239,8 +241,7 @@ static int renoir_print_clk_levels(struct smu_context *smu, memset(&metrics, 0, sizeof(metrics)); - ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, - (void *)&metrics, false); + ret = renoir_get_metrics_table(smu, &metrics); if (ret) return ret; -- 2.23.0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Tue, Dec 17, 2019 at 8:47 AM Alex Deucher wrote: > > On Tue, Dec 17, 2019 at 7:52 AM Daniel Vetter wrote: > > > > On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > > > Hi Dave, Daniel, > > > > > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. We > > > added a new firmware for display, and this just adds the version query > > > to our existing firmware query interface. UMDs like mesa use this > > > interface to > > > query things like CP or UVD firmware versions to see what features are > > > supported. > > > > I got bored, and a quick serach for what the userspace for > > AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any > > patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't come > > with pointers. From the patch series description I have no idea why you'd > > even want this in userspace (but then I stopped being surprised by hw > > design long ago). > > > > We expose all the firmwares via the same interface for consistency, > but the only ones user space generally cares about are the versions > for the acceleration engines like gfx and multimedia. I can revert it > if it's a big deal, but I'd prefer to keep it for consistency since > all the others are already available via the same interface. It's not > really a new interace with no user per se. > Also, there are a few minor conflicts. I backmerged drm-next into my drm-next branch if that is helpful. I can also send another PR with the backmerge if you'd prefer. 
Alex > Alex > > > Otherwise looks all good, no complaints from dim at least :-) > > -Daniel > > > > > > > > The following changes since commit > > > 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > > > > > drm/amdgpu/vcn: finish delay work before release resources (2019-11-13 > > > 15:29:42 -0500) > > > > > > are available in the Git repository at: > > > > > > git://people.freedesktop.org/~agd5f/linux tags/drm-next-5.6-2019-12-11 > > > > > > for you to fetch changes up to ad808910be68dcf8da5d837d4511d00ad5d3678a: > > > > > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 15:22:08 > > > -0500) > > > > > > > > > drm-next-5.6-2019-12-11: > > > > > > amdgpu: > > > - Add MST atomic routines > > > - Add support for DMCUB (new helper microengine for displays) > > > - Add OEM i2c support in DC > > > - Use vstartup for vblank events on DCN > > > - Simplify Kconfig for DC > > > - Renoir fixes for DC > > > - Clean up function pointers in DC > > > - Initial support for HDCP 2.x > > > - Misc code cleanups > > > - GFX10 fixes > > > - Rework JPEG engine handling for VCN > > > - Add clock and power gating support for JPEG > > > - BACO support for Arcturus > > > - Cleanup PSP ring handling > > > - Add framework for using BACO with runtime pm to save power > > > - Move core pci state handling out of the driver for pm ops > > > - Allow guest power control in 1 VF case with SR-IOV > > > - SR-IOV fixes > > > - RAS fixes > > > - Support for power metrics on renoir > > > - Golden settings updates for gfx10 > > > - Enable gfxoff on supported navi10 skus > > > - Update MAINTAINERS > > > > > > amdkfd: > > > - Clean up generational gfx code > > > - Fixes for gfx10 > > > - DIQ fixes > > > - Share more code with amdgpu > > > > > > radeon: > > > - PPC DMA fix > > > - Register checker fixes for r1xx/r2xx > > > - Misc cleanups > > > > > > > > > Alex Deucher (34): > > > drm/amdgpu/display: fix the build when CONFIG_DRM_AMD_DC_DCN is not > > > set > > > drm/amdgpu/display: fix warning 
when CONFIG_DRM_AMD_DC_DCN is not > > > set > > > drm/amdgpu/soc15: move struct definition around to align with other > > > soc15 asics > > > drm/amdgpu/nv: add asic func for fetching vbios from rom directly > > > drm/amdgpu/powerplay: properly set PP_GFXOFF_MASK (v2) > > > drm/amdgpu: disable gfxoff when using register read interface > > > drm/amdgpu: remove experimental flag for Navi14 > > > drm/amdgpu: disable gfxoff on original raven > > > Revert "drm/amd/display: enable S/G for RAVEN chip" > > > drm/amdgpu: add asic callback for BACO support > > > drm/amdgpu: add supports_baco callback for soc15 asics. (v2) > > > drm/amdgpu: add supports_baco callback for SI asics. > > > drm/amdgpu: add supports_baco callback for CIK asics. > > > drm/amdgpu: add supports_baco callback for VI asics. > > > drm/amdgpu: add supports_baco callback for NV asics. > > > drm/amdgpu: add a amdgpu_device_supports_baco helper > > > drm/amdgpu: rename amdgpu_device_is_px to > > > amdgpu_device_supports_boco (v2) > > > drm/amdgpu: add additional boco checks to runtime suspend/resume > > > (v2) > > > drm/amdgpu: split swSMU baco_reset into enter and exit > > > drm/amdgpu: add helpers for baco entry and exit > > >
RE: [PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation
[AMD Official Use Only - Internal Distribution Only] -Original Message- From: amd-gfx On Behalf Of Tianci Yin Sent: Tuesday, December 17, 2019 7:23 PM To: amd-gfx@lists.freedesktop.org Cc: Long, Gang ; Yin, Tianci (Rico) ; Xu, Feifei ; Wang, Kevin(Yang) ; Tuikov, Luben ; Deucher, Alexander ; Zhang, Hawking ; Koenig, Christian ; Yuan, Xiaojie Subject: [PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation From: "Tianci.Yin" IP discovery TMR(occupied the top VRAM with size DISCOVERY_TMR_SIZE) has been reserved, and the p2c buffer is in the range of this TMR, so the p2c buffer reservation is unnecessary. Change-Id: Ib1f2f2b4a1f3869c03ffe22e2836cdbee17ba99f Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 14 -- 2 files changed, 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5f8fd3e3535b..3265487b859f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -202,7 +202,6 @@ struct psp_memory_training_context { /*vram offset of the p2c training data*/ u64 p2c_train_data_offset; - struct amdgpu_bo *p2c_bo; /*vram offset of the c2p training data*/ u64 c2p_train_data_offset; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ce5cb854bdb9..6f0ad1d1d4d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1681,9 +1681,6 @@ static int amdgpu_ttm_training_reserve_vram_fini(struct amdgpu_device *adev) amdgpu_bo_free_kernel(&ctx->c2p_bo, NULL, NULL); ctx->c2p_bo = NULL; - amdgpu_bo_free_kernel(&ctx->p2c_bo, NULL, NULL); - ctx->p2c_bo = NULL; - return 0; } @@ -1718,17 +1715,6 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) ctx->p2c_train_data_offset, ctx->c2p_train_data_offset); - ret = amdgpu_bo_create_kernel_at(adev, -ctx->p2c_train_data_offset, 
-ctx->train_data_size, -AMDGPU_GEM_DOMAIN_VRAM, -&ctx->p2c_bo, -NULL); - if (ret) { - DRM_ERROR("alloc p2c_bo failed(%d)!\n", ret); - goto Err_out; - } - ret = amdgpu_bo_create_kernel_at(adev, ctx->c2p_train_data_offset, ctx->train_data_size, [Guchun] If we have to remove such buffer reservation, from coding style's perspective, I suggest removing error handler code by "goto" too in amdgpu_ttm_training_reserve_vram_init. After removing p2c buffer reservation from this function, there is only one buffer reservation case for c2p. So direct error handle and return should be better. -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cguchun.chen%40amd.com%7C888c561716c342aa9ecc08d782e397d0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637121786693411170&sdata=pH1rob4R5ljvEGo8PSjn1te7ctWLG1Wctv30lNCLyx4%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Tue, Dec 17, 2019 at 7:52 AM Daniel Vetter wrote: > > On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > > Hi Dave, Daniel, > > > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. We > > added a new firmware for display, and this just adds the version query > > to our existing firmware query interface. UMDs like mesa use this > > interface to > > query things like CP or UVD firmware versions to see what features are > > supported. > > I got bored, and a quick search for what the userspace for > AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any > patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't come > with pointers. From the patch series description I have no idea why you'd > even want this in userspace (but then I stopped being surprised by hw > design long ago). > We expose all the firmwares via the same interface for consistency, but the only ones user space generally cares about are the versions for the acceleration engines like gfx and multimedia. I can revert it if it's a big deal, but I'd prefer to keep it for consistency since all the others are already available via the same interface. It's not really a new interface with no user per se. 
Alex > Otherwise looks all good, no complaints from dim at least :-) > -Daniel > > > > > The following changes since commit 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > > > drm/amdgpu/vcn: finish delay work before release resources (2019-11-13 > > 15:29:42 -0500) > > > > are available in the Git repository at: > > > > git://people.freedesktop.org/~agd5f/linux tags/drm-next-5.6-2019-12-11 > > > > for you to fetch changes up to ad808910be68dcf8da5d837d4511d00ad5d3678a: > > > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 15:22:08 > > -0500) > > > > > > drm-next-5.6-2019-12-11: > > > > amdgpu: > > - Add MST atomic routines > > - Add support for DMCUB (new helper microengine for displays) > > - Add OEM i2c support in DC > > - Use vstartup for vblank events on DCN > > - Simplify Kconfig for DC > > - Renoir fixes for DC > > - Clean up function pointers in DC > > - Initial support for HDCP 2.x > > - Misc code cleanups > > - GFX10 fixes > > - Rework JPEG engine handling for VCN > > - Add clock and power gating support for JPEG > > - BACO support for Arcturus > > - Cleanup PSP ring handling > > - Add framework for using BACO with runtime pm to save power > > - Move core pci state handling out of the driver for pm ops > > - Allow guest power control in 1 VF case with SR-IOV > > - SR-IOV fixes > > - RAS fixes > > - Support for power metrics on renoir > > - Golden settings updates for gfx10 > > - Enable gfxoff on supported navi10 skus > > - Update MAINTAINERS > > > > amdkfd: > > - Clean up generational gfx code > > - Fixes for gfx10 > > - DIQ fixes > > - Share more code with amdgpu > > > > radeon: > > - PPC DMA fix > > - Register checker fixes for r1xx/r2xx > > - Misc cleanups > > > > > > Alex Deucher (34): > > drm/amdgpu/display: fix the build when CONFIG_DRM_AMD_DC_DCN is not > > set > > drm/amdgpu/display: fix warning when CONFIG_DRM_AMD_DC_DCN is not set > > drm/amdgpu/soc15: move struct definition around to align with other > > soc15 asics > > 
drm/amdgpu/nv: add asic func for fetching vbios from rom directly > > drm/amdgpu/powerplay: properly set PP_GFXOFF_MASK (v2) > > drm/amdgpu: disable gfxoff when using register read interface > > drm/amdgpu: remove experimental flag for Navi14 > > drm/amdgpu: disable gfxoff on original raven > > Revert "drm/amd/display: enable S/G for RAVEN chip" > > drm/amdgpu: add asic callback for BACO support > > drm/amdgpu: add supports_baco callback for soc15 asics. (v2) > > drm/amdgpu: add supports_baco callback for SI asics. > > drm/amdgpu: add supports_baco callback for CIK asics. > > drm/amdgpu: add supports_baco callback for VI asics. > > drm/amdgpu: add supports_baco callback for NV asics. > > drm/amdgpu: add a amdgpu_device_supports_baco helper > > drm/amdgpu: rename amdgpu_device_is_px to amdgpu_device_supports_boco > > (v2) > > drm/amdgpu: add additional boco checks to runtime suspend/resume (v2) > > drm/amdgpu: split swSMU baco_reset into enter and exit > > drm/amdgpu: add helpers for baco entry and exit > > drm/amdgpu: add baco support to runtime suspend/resume > > drm/amdgpu: start to disentangle boco from runtime pm > > drm/amdgpu: disentangle runtime pm and vga_switcheroo > > drm/amdgpu: enable runtime pm on BACO capable boards if runpm=1 > > drm/amdgpu: simplify runtime suspend > > drm/amd/display: add default clocks if not able to fetch them > > MAINTAINERS: Drop Rex Zhu for amdgpu powerplay > > drm/amdgpu: move pci handling
Re: [pull] amdgpu, amdkfd, radeon drm-next-5.6
On Wed, Dec 11, 2019 at 05:30:20PM -0500, Alex Deucher wrote: > Hi Dave, Daniel, > > Kicking off 5.6 with new stuff from AMD. There is a UAPI addition. We > added a new firmware for display, and this just adds the version query > to our existing firmware query interface. UMDs like mesa use this interface > to > query things like CP or UVD firmware versions to see what features are > supported. I got bored, and a quick serach for what the userspace for AMDGPU_INFO_FW_DMCUB didn't turn up anything. At least didn't spot any patches on mesa-devel or mesa MR and the patch on amd-gfx also didn't come with pointers. From the patch series description I have no idea why you'd even want this in userspace (but then I stopped being surprised by hw design long ago). Otherwise looks all good, no complaints from dim at least :-) -Daniel > > The following changes since commit 622b2a0ab647d2755f2c1f1000d3403e86a69763: > > drm/amdgpu/vcn: finish delay work before release resources (2019-11-13 > 15:29:42 -0500) > > are available in the Git repository at: > > git://people.freedesktop.org/~agd5f/linux tags/drm-next-5.6-2019-12-11 > > for you to fetch changes up to ad808910be68dcf8da5d837d4511d00ad5d3678a: > > drm/amdgpu: fix license on Kconfig and Makefiles (2019-12-11 15:22:08 -0500) > > > drm-next-5.6-2019-12-11: > > amdgpu: > - Add MST atomic routines > - Add support for DMCUB (new helper microengine for displays) > - Add OEM i2c support in DC > - Use vstartup for vblank events on DCN > - Simplify Kconfig for DC > - Renoir fixes for DC > - Clean up function pointers in DC > - Initial support for HDCP 2.x > - Misc code cleanups > - GFX10 fixes > - Rework JPEG engine handling for VCN > - Add clock and power gating support for JPEG > - BACO support for Arcturus > - Cleanup PSP ring handling > - Add framework for using BACO with runtime pm to save power > - Move core pci state handling out of the driver for pm ops > - Allow guest power control in 1 VF case with SR-IOV > - SR-IOV fixes > 
- RAS fixes > - Support for power metrics on renoir > - Golden settings updates for gfx10 > - Enable gfxoff on supported navi10 skus > - Update MAINTAINERS > > amdkfd: > - Clean up generational gfx code > - Fixes for gfx10 > - DIQ fixes > - Share more code with amdgpu > > radeon: > - PPC DMA fix > - Register checker fixes for r1xx/r2xx > - Misc cleanups > > > Alex Deucher (34): > drm/amdgpu/display: fix the build when CONFIG_DRM_AMD_DC_DCN is not set > drm/amdgpu/display: fix warning when CONFIG_DRM_AMD_DC_DCN is not set > drm/amdgpu/soc15: move struct definition around to align with other > soc15 asics > drm/amdgpu/nv: add asic func for fetching vbios from rom directly > drm/amdgpu/powerplay: properly set PP_GFXOFF_MASK (v2) > drm/amdgpu: disable gfxoff when using register read interface > drm/amdgpu: remove experimental flag for Navi14 > drm/amdgpu: disable gfxoff on original raven > Revert "drm/amd/display: enable S/G for RAVEN chip" > drm/amdgpu: add asic callback for BACO support > drm/amdgpu: add supports_baco callback for soc15 asics. (v2) > drm/amdgpu: add supports_baco callback for SI asics. > drm/amdgpu: add supports_baco callback for CIK asics. > drm/amdgpu: add supports_baco callback for VI asics. > drm/amdgpu: add supports_baco callback for NV asics. 
> drm/amdgpu: add a amdgpu_device_supports_baco helper > drm/amdgpu: rename amdgpu_device_is_px to amdgpu_device_supports_boco > (v2) > drm/amdgpu: add additional boco checks to runtime suspend/resume (v2) > drm/amdgpu: split swSMU baco_reset into enter and exit > drm/amdgpu: add helpers for baco entry and exit > drm/amdgpu: add baco support to runtime suspend/resume > drm/amdgpu: start to disentangle boco from runtime pm > drm/amdgpu: disentangle runtime pm and vga_switcheroo > drm/amdgpu: enable runtime pm on BACO capable boards if runpm=1 > drm/amdgpu: simplify runtime suspend > drm/amd/display: add default clocks if not able to fetch them > MAINTAINERS: Drop Rex Zhu for amdgpu powerplay > drm/amdgpu: move pci handling out of pm ops > drm/amdgpu: flag vram lost on baco reset for VI/CIK > drm/amd/display: re-enable wait in pipelock, but add timeout > drm/radeon: fix r1xx/r2xx register checker for POT textures > drm/amdgpu: add header line for power profile on Arcturus > drm/amdgpu/display: add fallthrough comment > drm/amdgpu: fix license on Kconfig and Makefiles > > Alex Sierra (2): > drm/amdgpu: add flag to indicate amdgpu vm context > amd/amdgpu: force to trigger a no-retry-fault after a retry-fault > > Alvin Lee (1): > drm/amd/display: Changes in dc to allow full update in some cases > > Amanda Liu (1): >
[PATCH 2/2] drm/amdgpu: remove memory training p2c buffer reservation
From: "Tianci.Yin" IP discovery TMR(occupied the top VRAM with size DISCOVERY_TMR_SIZE) has been reserved, and the p2c buffer is in the range of this TMR, so the p2c buffer reservation is unnecessary. Change-Id: Ib1f2f2b4a1f3869c03ffe22e2836cdbee17ba99f Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 14 -- 2 files changed, 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5f8fd3e3535b..3265487b859f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -202,7 +202,6 @@ struct psp_memory_training_context { /*vram offset of the p2c training data*/ u64 p2c_train_data_offset; - struct amdgpu_bo *p2c_bo; /*vram offset of the c2p training data*/ u64 c2p_train_data_offset; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ce5cb854bdb9..6f0ad1d1d4d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1681,9 +1681,6 @@ static int amdgpu_ttm_training_reserve_vram_fini(struct amdgpu_device *adev) amdgpu_bo_free_kernel(&ctx->c2p_bo, NULL, NULL); ctx->c2p_bo = NULL; - amdgpu_bo_free_kernel(&ctx->p2c_bo, NULL, NULL); - ctx->p2c_bo = NULL; - return 0; } @@ -1718,17 +1715,6 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) ctx->p2c_train_data_offset, ctx->c2p_train_data_offset); - ret = amdgpu_bo_create_kernel_at(adev, -ctx->p2c_train_data_offset, -ctx->train_data_size, -AMDGPU_GEM_DOMAIN_VRAM, -&ctx->p2c_bo, -NULL); - if (ret) { - DRM_ERROR("alloc p2c_bo failed(%d)!\n", ret); - goto Err_out; - } - ret = amdgpu_bo_create_kernel_at(adev, ctx->c2p_train_data_offset, ctx->train_data_size, -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/2] drm/amdgpu: update the method to get fb_loc of memory training
From: "Tianci.Yin" The method of getting fb_loc changed from parsing VBIOS to taking certain offset from top of VRAM Change-Id: I053b42fdb1d822722fa7980b2cd9f86b3fdce539 --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +- .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 36 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 7 drivers/gpu/drm/amd/include/atomfirmware.h| 14 5 files changed, 16 insertions(+), 50 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a78a363b1d71..fa2cf8e7bc07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -642,9 +642,8 @@ struct amdgpu_fw_vram_usage { struct amdgpu_bo *reserved_bo; void *va; - /* Offset on the top of VRAM, used as c2p write buffer. + /* GDDR6 training support flag. */ - u64 mem_train_fb_loc; bool mem_train_support; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index ff4eb96bdfb5..009cb0b03d13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -528,13 +528,9 @@ static int gddr6_mem_train_support(struct amdgpu_device *adev) int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) { struct atom_context *ctx = adev->mode_info.atom_context; - unsigned char *bios = ctx->bios; - struct vram_reserve_block *reserved_block; - int index, block_number; + int index; uint8_t frev, crev; uint16_t data_offset, size; - uint32_t start_address_in_kb; - uint64_t offset; int ret; adev->fw_vram_usage.mem_train_support = false; @@ -569,32 +565,6 @@ int amdgpu_atomfirmware_get_mem_train_fb_loc(struct amdgpu_device *adev) return -EINVAL; } - reserved_block = (struct vram_reserve_block *) - (bios + data_offset + sizeof(struct atom_common_table_header)); - block_number = ((unsigned int)size - sizeof(struct atom_common_table_header)) - / sizeof(struct 
vram_reserve_block); - reserved_block += (block_number > 0) ? block_number-1 : 0; - DRM_DEBUG("block_number:0x%04x, last block: 0x%08xkb sz, %dkb fw, %dkb drv.\n", - block_number, - le32_to_cpu(reserved_block->start_address_in_kb), - le16_to_cpu(reserved_block->used_by_firmware_in_kb), - le16_to_cpu(reserved_block->used_by_driver_in_kb)); - if (reserved_block->used_by_firmware_in_kb > 0) { - start_address_in_kb = le32_to_cpu(reserved_block->start_address_in_kb); - offset = (uint64_t)start_address_in_kb * ONE_KiB; - if ((offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { - offset -= ONE_MiB; - } - - offset &= ~(ONE_MiB - 1); - adev->fw_vram_usage.mem_train_fb_loc = offset; - adev->fw_vram_usage.mem_train_support = true; - DRM_DEBUG("mem_train_fb_loc:0x%09llx.\n", offset); - ret = 0; - } else { - DRM_ERROR("used_by_firmware_in_kb is 0!\n"); - ret = -EINVAL; - } - - return ret; + adev->fw_vram_usage.mem_train_support = true; + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 2ff63d0414c9..ce5cb854bdb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1705,7 +1705,11 @@ static int amdgpu_ttm_training_reserve_vram_init(struct amdgpu_device *adev) return 0; } - ctx->c2p_train_data_offset = adev->fw_vram_usage.mem_train_fb_loc; + ctx->c2p_train_data_offset = adev->gmc.mc_vram_size; + if ((ctx->c2p_train_data_offset & (ONE_MiB - 1)) < (4 * ONE_KiB + 1) ) { + ctx->c2p_train_data_offset -= ONE_MiB; + } + ctx->c2p_train_data_offset &= ~(ONE_MiB - 1); ctx->p2c_train_data_offset = (adev->gmc.mc_vram_size - GDDR6_MEM_TRAINING_OFFSET); ctx->train_data_size = GDDR6_MEM_TRAINING_DATA_SIZE_IN_BYTES; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index f1ebd424510c..19eb3e8456c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -66,6 +66,13 @@ struct amdgpu_copy_mem { 
unsigned long offset; }; +/* Definitions for constance */ +enum amdgpu_internal_constants +{ + ONE_KiB = 0x400, + ONE_MiB = 0x100000, +}; + extern const struct ttm_mem_type_manager_func amdgpu_gtt_mgr_func; extern const struct ttm_mem_type_manager_func amdgpu_vram_mgr_func; diff --git a/drivers/gpu/drm/amd/incl
RE: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV
[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Emily Deng >-Original Message- >From: amd-gfx On Behalf Of Monk Liu >Sent: Tuesday, December 17, 2019 6:20 PM >To: amd-gfx@lists.freedesktop.org >Cc: Liu, Monk >Subject: [PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV > >issues: >MEC is ruined by the amdkfd_pre_reset after VF FLR done > >fix: >amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the >correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation >to block this sequence: >if we do pre_reset() before VF FLR, it would go KIQ way to do register access >and >stuck there, because KIQ probably won't work by that time (e.g. you already >made GFX hang) > >so the best way right now is to simply remove it. > >Signed-off-by: Monk Liu >--- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- > 1 file changed, 2 deletions(-) > >diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >index 605cef6..ae962b9 100644 >--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >@@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct >amdgpu_device *adev, > if (r) > return r; > >- amdgpu_amdkfd_pre_reset(adev); >- > /* Resume IP prior to SMC */ > r = amdgpu_device_ip_reinit_early_sriov(adev); > if (r) >-- >2.7.4 > >___ >amd-gfx mailing list >amd-gfx@lists.freedesktop.org >https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.fre >edesktop.org%2Fmailman%2Flistinfo%2Famd- >gfx&data=02%7C01%7CEmily.Deng%40amd.com%7C74408803b49e4f328 >f7708d782daba6c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C6 >37121748318124859&sdata=4YbyHwEEGxVLEhuOg%2Frc%2FxdhFRwrdm >FuZ4vpHx%2FApAE%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 1/2] drm/amdgpu: fix double gpu_recovery for NV of SRIOV
[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Emily Deng >-Original Message- >From: amd-gfx On Behalf Of Monk Liu >Sent: Tuesday, December 17, 2019 6:20 PM >To: amd-gfx@lists.freedesktop.org >Cc: Liu, Monk >Subject: [PATCH 1/2] drm/amdgpu: fix double gpu_recovery for NV of SRIOV > >issues: >gpu_recover() is re-entered by the mailbox interrupt handler mxgpu_nv.c > >fix: >we need to bypass the gpu_recover() invoke in mailbox interrupt as long as the >timeout is not infinite (thus the TDR will be triggered automatically after >time >out, no need to invoke >gpu_recover() through mailbox interrupt. > >Signed-off-by: Monk Liu >--- > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 6 +- > 1 file changed, 5 insertions(+), 1 deletion(-) > >diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >index 0d8767e..1c3a7d4 100644 >--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >@@ -269,7 +269,11 @@ static void xgpu_nv_mailbox_flr_work(struct >work_struct *work) > } > > /* Trigger recovery for world switch failure if no TDR */ >- if (amdgpu_device_should_recover_gpu(adev)) >+ if (amdgpu_device_should_recover_gpu(adev) >+ && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || >+ adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || >+ adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || >+ adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) > amdgpu_device_gpu_recover(adev, NULL); } > >-- >2.7.4 > >___ >amd-gfx mailing list >amd-gfx@lists.freedesktop.org >https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.fre >edesktop.org%2Fmailman%2Flistinfo%2Famd- >gfx&data=02%7C01%7CEmily.Deng%40amd.com%7C029ef88677e744f2ad >8f08d782dab68c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C63 >7121748276776005&sdata=IiRwMTw6DQW8sh8Y7SkZ2PehohwnH6gSqkt >t64a73UU%3D&reserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/2] drm/amdgpu: fix double gpu_recovery for NV of SRIOV
issues: gpu_recover() is re-entered by the mailbox interrupt handler mxgpu_nv.c fix: we need to bypass the gpu_recover() invoke in mailbox interrupt as long as the timeout is not infinite (thus the TDR will be triggered automatically after time out, no need to invoke gpu_recover() through mailbox interrupt. Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 0d8767e..1c3a7d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -269,7 +269,11 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } /* Trigger recovery for world switch failure if no TDR */ - if (amdgpu_device_should_recover_gpu(adev)) + if (amdgpu_device_should_recover_gpu(adev) + && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || + adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || + adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || + adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) amdgpu_device_gpu_recover(adev, NULL); } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/2] drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV
issues: MEC is ruined by the amdkfd_pre_reset after VF FLR done fix: amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation to block this sequence: if we do pre_reset() before VF FLR, it would go KIQ way to do register access and stuck there, because KIQ probably won't work by that time (e.g. you already made GFX hang) so the best way right now is to simply remove it. Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 605cef6..ae962b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: fix KIQ ring test fail in TDR
issues: there are two issue may lead to TDR failure for SRIOV 1) gpu_recover() is re-entered by the mailbox interrupt handler mxgpu_nv.c 2) MEC is ruined by the amdkfd_pre_reset after VF FLR done fix: for 1) we need to bypass the gpu_recover() invoke in mailbox interrupt as long as the timeout is not infinite (thus the TDR will be triggered automatically after time out, no need to invoke gpu_recover() through mailbox interrupt. for 2) amdkfd_pre_reset() would ruin MEC after hypervisor finished the VF FLR, the correct sequence is do amdkfd_pre_reset before VF FLR but there is a limitation to block this sequence: if we do pre_reset() before VF FLR, it would go KIQ way to do register access and stuck there, because KIQ probably won't work by that time (e.g. you already made GFX hang) Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 6 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 605cef6..ae962b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3672,8 +3672,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 0d8767e..1c3a7d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -269,7 +269,11 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } /* Trigger recovery for world switch failure if no TDR */ - if (amdgpu_device_should_recover_gpu(adev)) + if (amdgpu_device_should_recover_gpu(adev) + && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || + adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || + adev->compute_timeout == 
MAX_SCHEDULE_TIMEOUT || + adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) amdgpu_device_gpu_recover(adev, NULL); } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: move umc offset to one new header file for Arcturus
Fixes: 9686563c4c42 drm/amdgpu: Added RAS UMC error query support for Arcturus Code refactor and no functional change. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 17 +- .../include/asic_reg/umc/umc_6_1_2_offset.h | 32 +++ 2 files changed, 33 insertions(+), 16 deletions(-) create mode 100644 drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c index 515eb50cd0f8..5093965dbc24 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c @@ -28,17 +28,10 @@ #include "rsmu/rsmu_0_0_2_sh_mask.h" #include "umc/umc_6_1_1_offset.h" #include "umc/umc_6_1_1_sh_mask.h" +#include "umc/umc_6_1_2_offset.h" #define smnMCA_UMC0_MCUMC_ADDRT0 0x50f10 -/* UMC 6_1_2 register offsets */ -#define mmUMCCH0_0_EccErrCntSel_ARCT 0x0360 -#define mmUMCCH0_0_EccErrCntSel_ARCT_BASE_IDX1 -#define mmUMCCH0_0_EccErrCnt_ARCT0x0361 -#define mmUMCCH0_0_EccErrCnt_ARCT_BASE_IDX 1 -#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT 0x03c2 -#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT_BASE_IDX 1 - /* * (addr / 256) * 8192, the higher 26 bits in ErrorAddr * is the index of 8KB block @@ -105,7 +98,6 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT); ecc_err_cnt_addr = @@ -114,7 +106,6 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); ecc_err_cnt_addr = @@ -164,12 +155,10 @@ static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, 
mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); } @@ -211,12 +200,10 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); } else { /* UMC 6_1_1 registers */ - mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); } @@ -282,14 +269,12 @@ static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev, if (adev->asic_type == CHIP_ARCTURUS) { /* UMC 6_1_2 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT); ecc_err_cnt_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT); } else { /* UMC 6_1_1 registers */ - ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); ecc_err_cnt_addr = diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h new file mode 100644 index ..3e79a8056556 --- /dev/null +++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_6_1_2_offset.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + *
RE: [PATCH v2 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d.
[AMD Official Use Only - Internal Distribution Only] Hi Andrey Please check the 3 minor comments in this patch. With that addressed, the V2s series is Reviewed-by: Le Ma mailto:le...@amd.com>> Regards, Ma Le -Original Message- From: Andrey Grodzovsky Sent: Saturday, December 14, 2019 12:54 AM To: dri-de...@lists.freedesktop.org; amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander ; Ma, Le ; Zhang, Hawking ; Quan, Evan ; Grodzovsky, Andrey Subject: [PATCH v2 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d. In preparation for doing XGMI reset synchronization using task barrier. Signed-off-by: Andrey Grodzovsky mailto:andrey.grodzov...@amd.com>> Reviewed-by: Le Ma mailto:le...@amd.com>> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h| 2 - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +- 2 files changed, 12 insertions(+), 66 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a78a363..50bab33 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1001,8 +1001,6 @@ struct amdgpu_device { boolpm_sysfs_en; boolucode_sysfs_en; - - boolin_baco; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7324a5f..1d19edfa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) adev->asic_reset_res = (adev->in_baco == false) ? amdgpu_device_baco_enter(adev->ddev) : - amdgpu_device_baco_exit(adev->ddev); + qamdgpu_device_baco_exit(adev->ddev); [Le] 1/3: Still unnecessary typo here, although it will be removed in patch #4. 
else adev->asic_reset_res = amdgpu_asic_reset(adev); @@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, return r; } -static int amdgpu_do_asic_reset(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive, +static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, struct list_head *device_list_handle, bool *need_full_reset_arg) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; int r = 0; - int cpu = smp_processor_id(); - bool use_baco = - (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? - true : false; /* * ASIC reset has to be done on all HGMI hive nodes ASAP @@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, */ if (need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - /* - * For XGMI run all resets in parallel to speed up the - * process by scheduling the highpri wq on different - * cpus. For XGMI with baco reset, all nodes must enter - * baco within close proximity before anyone exit. - */ + /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work_on(cpu, system_highpri_wq, - &tmp_adev->xgmi_reset_work)) + if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) r = -EALREADY; - cpu = cpumask_next(cpu, cpu_online_mask); } else r = amdgpu_asic_reset(tmp_adev); - if (r) - break; - } - - /* For XGMI wait for all work to complete before proceed */ - if