[PATCH v3] drm/amdgpu: Move mca debug mode decision to ras
Refactor code such that ras block decides the default mca debug mode, and not swsmu block. By default mca debug mode is set to false. Signed-off-by: Lijo Lazar --- v3: Default mca debug mode is set to false v2: Set mca debug mode early before ras block late init as ras query is initiated during late init of ras blocks (KevinYang) drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c| 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 14 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 2 +- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 12 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index cf33eb219e25..54f2f346579e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -377,7 +377,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 val) struct amdgpu_device *adev = (struct amdgpu_device *)data; int ret; - ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false); + ret = amdgpu_ras_set_mca_debug_mode(adev, val ? 
true : false); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 84e5987b14e0..6747fbe4feab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3132,6 +3132,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) return 0; + amdgpu_ras_set_mca_debug_mode(adev, false); + list_for_each_entry_safe(node, tmp, >ras_list, node) { if (!node->ras_obj) { dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); @@ -3405,12 +3407,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) return 0; } -void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) +int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int ret; - if (con) - con->is_mca_debug_mode = enable; + if (con) { + ret = amdgpu_mca_smu_set_debug_mode(adev, enable); + if (!ret) + con->is_mca_debug_mode = enable; + } + + return ret; } bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 19161916ac46..6a941eb8fb8f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -773,7 +773,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); -void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); +int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, unsigned int *mode); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 6cbfb25a05de..f09f56efbdc3 100644 --- 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1516,7 +1516,6 @@ static int smu_v13_0_6_mca_set_debug_mode(struct smu_context *smu, bool enable) if (smu->smc_fw_version < 0x554800) return 0; - amdgpu_ras_set_mca_debug_mode(smu->adev, enable); return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead, enable ? 0 : ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK, NULL); @@ -2338,16 +2337,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct smu_context *smu, return ret; } -static int smu_v13_0_6_post_init(struct smu_context *smu) -{ - struct amdgpu_device *adev = smu->adev; - - if (!amdgpu_sriov_vf(adev) && adev->ras_enabled) - return smu_v13_0_6_mca_set_debug_mode(smu, false); - - return 0; -} - static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) { struct smu_context *smu = adev->powerplay.pp_handle; @@ -2904,7 +2893,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { .i2c_init = smu_v13_0_6_i2c_control_init, .i2c_fini = smu_v13_0_6_i2c_control_fini, .send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num, - .post_init = smu_v13_0_6_post_init, }; void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu) -- 2.25.1
[PATCH 2/2] drm/virtio: Modify RESOURCE_GET_LAYOUT ioctl
Modify RESOURCE_GET_LAYOUT ioctl to handle the use case that query correct stride for guest linear resource before it is created. Signed-off-by: Julia Zhang --- drivers/gpu/drm/virtio/virtgpu_drv.h | 26 -- drivers/gpu/drm/virtio/virtgpu_ioctl.c | 47 -- drivers/gpu/drm/virtio/virtgpu_vq.c| 35 +++ include/uapi/drm/virtgpu_drm.h | 6 ++-- include/uapi/linux/virtio_gpu.h| 8 ++--- 5 files changed, 66 insertions(+), 56 deletions(-) diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h b/drivers/gpu/drm/virtio/virtgpu_drv.h index d6fc0d4ecb7d..82dffb3e4c6b 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.h +++ b/drivers/gpu/drm/virtio/virtgpu_drv.h @@ -93,15 +93,6 @@ struct virtio_gpu_object { bool host3d_blob, guest_blob; uint32_t blob_mem, blob_flags; - atomic_t layout_state; - struct { - uint64_t offset; - uint64_t size; - uint32_t stride; - } planes[VIRTIO_GPU_RES_MAX_PLANES]; - uint64_t modifier; - uint32_t num_planes; - int uuid_state; uuid_t uuid; @@ -225,6 +216,16 @@ struct virtio_gpu_drv_cap_cache { atomic_t is_valid; }; +struct virtio_gpu_query_info { + uint32_t num_planes; + uint64_t modifier; + struct { + uint64_t offset; + uint32_t stride; + } planes [VIRTIO_GPU_MAX_RESOURCE_PLANES]; + atomic_t is_valid; +}; + struct virtio_gpu_device { struct drm_device *ddev; @@ -448,7 +449,12 @@ void virtio_gpu_cmd_host_wait(struct virtio_gpu_device *vgdev, int virtio_gpu_cmd_get_resource_layout(struct virtio_gpu_device *vgdev, - struct virtio_gpu_object *bo); + struct virtio_gpu_query_info *bo_info, + uint32_t width, + uint32_t height, + uint32_t format, + uint32_t bind, + uint32_t hw_res_handle); /* virtgpu_display.c */ int virtio_gpu_modeset_init(struct virtio_gpu_device *vgdev); diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c index 51d04460d0d8..034a7c0927a5 100644 --- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c +++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c @@ -685,9 +685,9 @@ static int virtio_gpu_resource_query_layout_ioctl(struct 
drm_device *dev, { struct drm_virtgpu_resource_query_layout *args = data; struct virtio_gpu_device *vgdev = dev->dev_private; - struct drm_gem_object *obj; - struct virtio_gpu_object *bo; - int layout_state; + struct drm_gem_object *obj = NULL; + struct virtio_gpu_object *bo = NULL; + struct virtio_gpu_query_info bo_info = {0}; int ret = 0; int i; @@ -696,50 +696,45 @@ static int virtio_gpu_resource_query_layout_ioctl(struct drm_device *dev, return -EINVAL; } - obj = drm_gem_object_lookup(file, args->handle); - if (obj == NULL) { - DRM_ERROR("invalid handle 0x%x\n", args->handle); - return -ENOENT; - } - bo = gem_to_virtio_gpu_obj(obj); - - layout_state = atomic_read(>layout_state); - if (layout_state == STATE_ERR) { - ret = -EINVAL; - goto out; - } else if (layout_state == STATE_OK) { - goto valid; + if (args->handle > 0) { + obj = drm_gem_object_lookup(file, args->handle); + if (obj == NULL) { + DRM_ERROR("invalid handle 0x%x\n", args->handle); + return -ENOENT; + } + bo = gem_to_virtio_gpu_obj(obj); } - ret = virtio_gpu_cmd_get_resource_layout(vgdev, bo); + ret = virtio_gpu_cmd_get_resource_layout(vgdev, _info, args->width, +args->height, args->format, +args->bind, bo ? bo->hw_res_handle : 0); if (ret) goto out; ret = wait_event_timeout(vgdev->resp_wq, -atomic_read(>layout_state) == STATE_OK, +atomic_read(_info.is_valid), 5 * HZ); if (!ret) goto out; valid: smp_rmb(); - WARN_ON(atomic_read(>layout_state) != STATE_OK); - args->num_planes = bo->num_planes; - args->modifier = bo->modifier; + WARN_ON(atomic_read(_info.is_valid)); + args->num_planes = bo_info.num_planes; + args->modifier = bo_info.modifier; for (i = 0; i < args->num_planes; i++) { - args->planes[i].offset = bo->planes[i].offset; - args->planes[i].size = bo->planes[i].size; - args->planes[i].stride =
[PATCH 1/2] drm/virtio: Implement RESOURCE_GET_LAYOUT ioctl
From: Daniel Stone This ioctl allows the guest to discover how the guest actually allocated the underlying buffer, which allows buffers to be used for GL<->Vulkan interop and through standard window systems. It's also a step towards properly supporting modifiers in the guest. --- drivers/gpu/drm/virtio/virtgpu_drv.c | 1 + drivers/gpu/drm/virtio/virtgpu_drv.h | 16 +- drivers/gpu/drm/virtio/virtgpu_ioctl.c | 71 ++ drivers/gpu/drm/virtio/virtgpu_kms.c | 8 ++- drivers/gpu/drm/virtio/virtgpu_vq.c| 56 include/uapi/drm/virtgpu_drm.h | 19 +++ include/uapi/linux/virtio_gpu.h| 30 +++ 7 files changed, 198 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c index 4f7140e27614..1ee09974d4b7 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.c +++ b/drivers/gpu/drm/virtio/virtgpu_drv.c @@ -190,6 +190,7 @@ static unsigned int features[] = { VIRTIO_GPU_F_RESOURCE_BLOB, VIRTIO_GPU_F_CONTEXT_INIT, VIRTIO_GPU_F_CONTEXT_FENCE_WAIT, + VIRTIO_GPU_F_RESOURCE_QUERY_LAYOUT, }; static struct virtio_driver virtio_gpu_driver = { .feature_table = features, diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h b/drivers/gpu/drm/virtio/virtgpu_drv.h index 7ef4b3df0ada..d6fc0d4ecb7d 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.h +++ b/drivers/gpu/drm/virtio/virtgpu_drv.h @@ -93,6 +93,15 @@ struct virtio_gpu_object { bool host3d_blob, guest_blob; uint32_t blob_mem, blob_flags; + atomic_t layout_state; + struct { + uint64_t offset; + uint64_t size; + uint32_t stride; + } planes[VIRTIO_GPU_RES_MAX_PLANES]; + uint64_t modifier; + uint32_t num_planes; + int uuid_state; uuid_t uuid; @@ -249,6 +258,7 @@ struct virtio_gpu_device { bool has_host_visible; bool has_context_init; bool has_host_fence_wait; + bool has_resource_query_layout; struct virtio_shm_region host_visible_region; struct drm_mm host_visible_mm; @@ -281,7 +291,7 @@ struct virtio_gpu_fpriv { }; /* virtgpu_ioctl.c */ -#define DRM_VIRTIO_NUM_IOCTLS 12 +#define 
DRM_VIRTIO_NUM_IOCTLS 13 extern struct drm_ioctl_desc virtio_gpu_ioctls[DRM_VIRTIO_NUM_IOCTLS]; void virtio_gpu_create_context(struct drm_device *dev, struct drm_file *file); @@ -436,6 +446,10 @@ int virtio_gpu_cmd_status_freezing(struct virtio_gpu_device *vgdev, void virtio_gpu_cmd_host_wait(struct virtio_gpu_device *vgdev, uint32_t ctx_id, uint64_t fence_id); +int +virtio_gpu_cmd_get_resource_layout(struct virtio_gpu_device *vgdev, + struct virtio_gpu_object *bo); + /* virtgpu_display.c */ int virtio_gpu_modeset_init(struct virtio_gpu_device *vgdev); void virtio_gpu_modeset_fini(struct virtio_gpu_device *vgdev); diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c index b6079d2bff69..51d04460d0d8 100644 --- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c +++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c @@ -107,6 +107,9 @@ static int virtio_gpu_getparam_ioctl(struct drm_device *dev, void *data, case VIRTGPU_PARAM_SUPPORTED_CAPSET_IDs: value = vgdev->capset_id_mask; break; + case VIRTGPU_PARAM_RESOURCE_QUERY_LAYOUT: + value = vgdev->has_resource_query_layout ? 
1 : 0; + break; default: return -EINVAL; } @@ -676,6 +679,70 @@ static int virtio_gpu_context_init_ioctl(struct drm_device *dev, return ret; } +static int virtio_gpu_resource_query_layout_ioctl(struct drm_device *dev, + void *data, + struct drm_file *file) +{ + struct drm_virtgpu_resource_query_layout *args = data; + struct virtio_gpu_device *vgdev = dev->dev_private; + struct drm_gem_object *obj; + struct virtio_gpu_object *bo; + int layout_state; + int ret = 0; + int i; + + if (!vgdev->has_resource_query_layout) { + DRM_ERROR("failing: no RQL on host\n"); + return -EINVAL; + } + + obj = drm_gem_object_lookup(file, args->handle); + if (obj == NULL) { + DRM_ERROR("invalid handle 0x%x\n", args->handle); + return -ENOENT; + } + bo = gem_to_virtio_gpu_obj(obj); + + layout_state = atomic_read(>layout_state); + if (layout_state == STATE_ERR) { + ret = -EINVAL; + goto out; + } else if (layout_state == STATE_OK) { + goto valid; + } + + ret = virtio_gpu_cmd_get_resource_layout(vgdev, bo); + if (ret) + goto out; + + ret = wait_event_timeout(vgdev->resp_wq, +
[PATCH 0/2] Add RESOURCE_GET_LAYOUT ioctl
This is to add a new ioctl RESOURCE_GET_LAYOUT to virtio-gpu to get the information about how the host has actually allocated the buffer. It is implemented to query the stride of linear buffer for dGPU prime on guest VM, related mesa mr: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23896 Daniel Stone (1): drm/virtio: Implement RESOURCE_GET_LAYOUT ioctl Julia Zhang (1): drm/virtio: Modify RESOURCE_GET_LAYOUT ioctl drivers/gpu/drm/virtio/virtgpu_drv.c | 1 + drivers/gpu/drm/virtio/virtgpu_drv.h | 22 - drivers/gpu/drm/virtio/virtgpu_ioctl.c | 66 ++ drivers/gpu/drm/virtio/virtgpu_kms.c | 8 +++- drivers/gpu/drm/virtio/virtgpu_vq.c| 63 include/uapi/drm/virtgpu_drm.h | 21 include/uapi/linux/virtio_gpu.h| 30 7 files changed, 208 insertions(+), 3 deletions(-) -- 2.34.1
RE: [PATCH] drm/amdgpu: correct mca ipid die/socket/addr decode
[AMD Official Use Only - General] Reviewed-by: Hawking Zhang Regards, Hawking -Original Message- From: Wang, Yang(Kevin) Sent: Friday, November 10, 2023 14:33 To: amd-gfx@lists.freedesktop.org Cc: Zhang, Hawking ; Wang, Yang(Kevin) Subject: [PATCH] drm/amdgpu: correct mca ipid die/socket/addr decode correct mca ipid die/socket/addr decode Signed-off-by: Yang Wang --- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 17 - 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 789552c66d1d..3f71490b779c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2381,8 +2381,8 @@ static const struct mca_bank_ipid smu_v13_0_6_mca_ipid_table[AMDGPU_MCA_IP_COUNT static void mca_bank_entry_info_decode(struct mca_bank_entry *entry, struct mca_bank_info *info) { - uint64_t ipid = entry->regs[MCA_REG_IDX_IPID]; - uint32_t insthi; + u64 ipid = entry->regs[MCA_REG_IDX_IPID]; + u32 instidhi, instid; /* NOTE: All MCA IPID register share the same format, * so the driver can share the MCMP1 register header file. @@ -2391,9 +2391,14 @@ static void mca_bank_entry_info_decode(struct mca_bank_entry *entry, struct mca_ info->hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); info->mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); - insthi = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi); - info->aid = ((insthi >> 2) & 0x03); - info->socket_id = insthi & 0x03; + /* Unfied DieID Format: SAASS. A:AID, S:Socket. 
+* Unified DieID[4] = InstanceId[0] +* Unified DieID[0:3] = InstanceIdHi[0:3] +* */ + instidhi = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi); + instid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo); + info->aid = ((instidhi >> 2) & 0x03); + info->socket_id = ((instid & 0x1) << 4) | (instidhi & 0x03); } static int mca_bank_read_reg(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, @@ -2567,6 +2572,7 @@ static bool mca_gfx_smu_bank_is_valid(const struct mca_ras_info *mca_ras, struct uint32_t instlo; instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, InstanceIdLo); + instlo &= GENMASK(31, 1); switch (instlo) { case 0x36430400: /* SMNAID XCD 0 */ case 0x38430400: /* SMNAID XCD 1 */ @@ -2585,6 +2591,7 @@ static bool mca_smu_bank_is_valid(const struct mca_ras_info *mca_ras, struct amd uint32_t errcode, instlo; instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, InstanceIdLo); + instlo &= GENMASK(31, 1); if (instlo != 0x03b30400) return false; -- 2.34.1
RE: [PATCH 2/3] drm/amdgpu: fall back to INPUT power for AVG power via INFO IOCTL
[AMD Official Use Only - General] Reviewed-by: Yang Wang Best Regards, Kevin -Original Message- From: amd-gfx On Behalf Of Alex Deucher Sent: Friday, November 10, 2023 6:14 AM To: amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander Subject: [PATCH 2/3] drm/amdgpu: fall back to INPUT power for AVG power via INFO IOCTL For backwards compatibility with userspace. Fixes: 47f1724db4fe ("drm/amd: Introduce `AMDGPU_PP_SENSOR_GPU_INPUT_POWER`") Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2897 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index b5ebafd4a3ad..bf4f48fe438d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1105,7 +1105,12 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) if (amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void *), _size)) { - return -EINVAL; + /* fall back to input power for backwards compat */ + if (amdgpu_dpm_read_sensor(adev, + AMDGPU_PP_SENSOR_GPU_INPUT_POWER, + (void *), _size)) { + return -EINVAL; + } } ui32 >>= 8; break; -- 2.41.0
[PATCH] drm/amdgpu: correct mca ipid die/socket/addr decode
correct mca ipid die/socket/addr decode Signed-off-by: Yang Wang --- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 17 - 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 789552c66d1d..3f71490b779c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2381,8 +2381,8 @@ static const struct mca_bank_ipid smu_v13_0_6_mca_ipid_table[AMDGPU_MCA_IP_COUNT static void mca_bank_entry_info_decode(struct mca_bank_entry *entry, struct mca_bank_info *info) { - uint64_t ipid = entry->regs[MCA_REG_IDX_IPID]; - uint32_t insthi; + u64 ipid = entry->regs[MCA_REG_IDX_IPID]; + u32 instidhi, instid; /* NOTE: All MCA IPID register share the same format, * so the driver can share the MCMP1 register header file. @@ -2391,9 +2391,14 @@ static void mca_bank_entry_info_decode(struct mca_bank_entry *entry, struct mca_ info->hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); info->mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); - insthi = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi); - info->aid = ((insthi >> 2) & 0x03); - info->socket_id = insthi & 0x03; + /* Unfied DieID Format: SAASS. A:AID, S:Socket. 
+* Unified DieID[4] = InstanceId[0] +* Unified DieID[0:3] = InstanceIdHi[0:3] +* */ + instidhi = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi); + instid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo); + info->aid = ((instidhi >> 2) & 0x03); + info->socket_id = ((instid & 0x1) << 4) | (instidhi & 0x03); } static int mca_bank_read_reg(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, @@ -2567,6 +2572,7 @@ static bool mca_gfx_smu_bank_is_valid(const struct mca_ras_info *mca_ras, struct uint32_t instlo; instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, InstanceIdLo); + instlo &= GENMASK(31, 1); switch (instlo) { case 0x36430400: /* SMNAID XCD 0 */ case 0x38430400: /* SMNAID XCD 1 */ @@ -2585,6 +2591,7 @@ static bool mca_smu_bank_is_valid(const struct mca_ras_info *mca_ras, struct amd uint32_t errcode, instlo; instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, InstanceIdLo); + instlo &= GENMASK(31, 1); if (instlo != 0x03b30400) return false; -- 2.34.1
Re: [Patch v13 0/9] Enable Wifi RFI interference mitigation feature support
ping... Any other comments? Regards, Ma Jun On 10/30/2023 3:18 PM, Ma Jun wrote: > Due to electrical and mechanical constraints in certain platform designs there > may be likely interference of relatively high-powered harmonics of the (G-)DDR > memory clocks with local radio module frequency bands used by Wifi 6/6e/7. To > mitigate possible RFI interference we introduced WBRF(Wifi Band RFI > mitigation Feature). > Producers can advertise the frequencies in use and consumers can use this > information > to avoid using these frequencies for sensitive features. > > The whole patch set is based on Linux 6.6.0-rc6. With some brief introductions > as below: > Patch1: Document about WBRF > Patch2: Core functionality setup for WBRF feature support > Patch3 - 4: Bring WBRF support to wifi subsystem. > Patch5 - 9: Bring WBRF support to AMD graphics driver. > > Evan Quan (6): > cfg80211: expose nl80211_chan_width_to_mhz for wide sharing > wifi: mac80211: Add support for WBRF features > drm/amd/pm: update driver_if and ppsmc headers for coming wbrf feature > drm/amd/pm: setup the framework to support Wifi RFI mitigation feature > drm/amd/pm: add flood detection for wbrf events > drm/amd/pm: enable Wifi RFI mitigation feature support for SMU13.0.7 > > Ma Jun (3): > Documentation/driver-api: Add document about WBRF mechanism > platform/x86/amd: Add support for AMD ACPI based Wifi band RFI > mitigation feature > drm/amd/pm: enable Wifi RFI mitigation feature support for SMU13.0.0 > > Documentation/driver-api/wbrf.rst | 76 > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 + > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 17 + > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 208 + > drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 42 ++ > .../inc/pmfw_if/smu13_driver_if_v13_0_0.h | 3 +- > .../inc/pmfw_if/smu13_driver_if_v13_0_7.h | 3 +- > .../pm/swsmu/inc/pmfw_if/smu_v13_0_0_ppsmc.h | 5 +- > .../pm/swsmu/inc/pmfw_if/smu_v13_0_7_ppsmc.h | 3 +- > drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 +-
> drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h | 4 + > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 48 ++ > .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 22 + > .../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 13 + > drivers/gpu/drm/amd/pm/swsmu/smu_internal.h | 3 + > drivers/platform/x86/amd/Kconfig | 15 + > drivers/platform/x86/amd/Makefile | 1 + > drivers/platform/x86/amd/wbrf.c | 413 ++ > include/linux/acpi_amd_wbrf.h | 94 > include/net/cfg80211.h| 9 + > net/mac80211/Makefile | 2 + > net/mac80211/chan.c | 9 + > net/mac80211/ieee80211_i.h| 7 + > net/mac80211/main.c | 2 + > net/mac80211/wbrf.c | 95 > net/wireless/chan.c | 3 +- > 26 files changed, 1094 insertions(+), 8 deletions(-) > create mode 100644 Documentation/driver-api/wbrf.rst > create mode 100644 drivers/platform/x86/amd/wbrf.c > create mode 100644 include/linux/acpi_amd_wbrf.h > create mode 100644 net/mac80211/wbrf.c >
RE: [RFC PATCH v2] drm/amdkfd: Run restore_workers on freezable WQs
[AMD Official Use Only - General] I once replaced the queue with the freezable one, but got hang in flush. Looks like Felix has fixed it. Acked-and-tested-by: xinhui pan -Original Message- From: Kuehling, Felix Sent: Wednesday, November 8, 2023 6:06 AM To: amd-gfx@lists.freedesktop.org Cc: Deng, Emily ; Pan, Xinhui ; Koenig, Christian Subject: [RFC PATCH v2] drm/amdkfd: Run restore_workers on freezable WQs Make restore workers freezable so we don't have to explicitly flush them in suspend and GPU reset code paths, and we don't accidentally try to restore BOs while the GPU is suspended. Not having to flush restore_work also helps avoid lock/fence dependencies in the GPU reset case where we're not allowed to wait for fences. A side effect of this is, that we can now have multiple concurrent threads trying to signal the same eviction fence. Rework eviction fence signaling and replacement to account for that. The GPU reset path can no longer rely on restore_process_worker to resume queues because evict/restore workers can run independently of it. Instead call a new restore_process_helper directly. This is an RFC and request for testing. 
v2: - Reworked eviction fence signaling - Introduced restore_process_helper Signed-off-by: Felix Kuehling --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 34 ++-- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 87 +++ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +- 3 files changed, 81 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 54f31a420229..1b33ddc0512e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1405,7 +1405,6 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, amdgpu_amdkfd_restore_userptr_worker); *process_info = info; - *ef = dma_fence_get(>eviction_fence->base); } vm->process_info = *process_info; @@ -1436,6 +1435,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, list_add_tail(>vm_list_node, &(vm->process_info->vm_list_head)); vm->process_info->n_vms++; + + *ef = dma_fence_get(>process_info->eviction_fence->base); mutex_unlock(>process_info->lock); return 0; @@ -1447,10 +1448,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, reserve_pd_fail: vm->process_info = NULL; if (info) { - /* Two fence references: one in info and one in *ef */ dma_fence_put(>eviction_fence->base); - dma_fence_put(*ef); - *ef = NULL; *process_info = NULL; put_pid(info->pid); create_evict_fence_fail: @@ -1644,7 +1642,8 @@ int amdgpu_amdkfd_criu_resume(void *p) goto out_unlock; } WRITE_ONCE(pinfo->block_mmu_notifications, false); - schedule_delayed_work(>restore_userptr_work, 0); + queue_delayed_work(system_freezable_wq, + >restore_userptr_work, 0); out_unlock: mutex_unlock(>lock); @@ -2458,7 +2457,8 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, KFD_QUEUE_EVICTION_TRIGGER_USERPTR); if (r) pr_err("Failed to quiesce KFD\n"); - schedule_delayed_work(_info->restore_userptr_work, + queue_delayed_work(system_freezable_wq, + 
_info->restore_userptr_work, msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); } mutex_unlock(_info->notifier_lock); @@ -2793,7 +2793,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) /* If validation failed, reschedule another attempt */ if (evicted_bos) { - schedule_delayed_work(_info->restore_userptr_work, + queue_delayed_work(system_freezable_wq, + _info->restore_userptr_work, msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); kfd_smi_event_queue_restore_rescheduled(mm); @@ -2802,6 +2803,23 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) put_task_struct(usertask); } +static void replace_eviction_fence(struct dma_fence **ef, + struct dma_fence *new_ef) +{ + struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true + /* protected by process_info->lock */); + + /* If we're replacing an unsignaled eviction fence, that fence will +* never be signaled, and if anyone is still waiting on that fence, +* they will hang forever. This should never happen. We should only +* replace the fence in restore_work
Re: [PATCH] drm/amdgpu: move UVD and VCE sched entity init after sched init
On Thu, Nov 9, 2023 at 4:35 PM Luben Tuikov wrote: > > On 2023-11-09 11:13, Alex Deucher wrote: > > Ping? > > > > On Wed, Nov 8, 2023 at 1:42 PM Alex Deucher > > wrote: > >> > >> We need kernel scheduling entities to deal with handle clean up > >> if apps are not cleaned up properly. With commit 56e449603f0ac5 > >> ("drm/sched: Convert the GPU scheduler to variable number of run-queues") > >> the scheduler entities have to be created after scheduler init, so > >> change the ordering to fix this. > >> > >> v2: Leave logic in UVD and VCE code > >> > >> Fixes: 56e449603f0ac5 ("drm/sched: Convert the GPU scheduler to variable > >> number of run-queues") > >> Signed-off-by: Alex Deucher > >> Cc: ltuiko...@gmail.com > > Reviewed-by: Luben Tuikov Thanks Luben! Alex > > Regards, > Luben > > >> --- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++ > >> drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c| 22 ++-- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h| 2 +- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c| 24 +++--- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h| 2 +- > >> drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 2 -- > >> drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 2 -- > >> drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 2 -- > >> drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 2 -- > >> drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 4 > >> drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 2 -- > >> drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 2 -- > >> drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 5 - > >> 13 files changed, 37 insertions(+), 46 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> index 43a95feba884..03e669c34033 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> @@ -2499,6 +2499,18 @@ static int amdgpu_device_init_schedulers(struct > >> amdgpu_device *adev) > >> ring->name); > >> return r; > >> } > >> + r = amdgpu_uvd_entity_init(adev, ring); > >> + if (r) { > 
>> + DRM_ERROR("Failed to create UVD scheduling entity > >> on ring %s.\n", > >> + ring->name); > >> + return r; > >> + } > >> + r = amdgpu_vce_entity_init(adev, ring); > >> + if (r) { > >> + DRM_ERROR("Failed to create VCE scheduling entity > >> on ring %s.\n", > >> + ring->name); > >> + return r; > >> + } > >> } > >> > >> amdgpu_xcp_update_partition_sched_list(adev); > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > >> index 815b7c34ed33..65949cc7abb9 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > >> @@ -399,20 +399,20 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev) > >> * > >> * @adev: amdgpu_device pointer > >> * > >> + * Initialize the entity used for handle management in the kernel driver. > >> */ > >> -int amdgpu_uvd_entity_init(struct amdgpu_device *adev) > >> +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring > >> *ring) > >> { > >> - struct amdgpu_ring *ring; > >> - struct drm_gpu_scheduler *sched; > >> - int r; > >> + if (ring == >uvd.inst[0].ring) { > >> + struct drm_gpu_scheduler *sched = >sched; > >> + int r; > >> > >> - ring = >uvd.inst[0].ring; > >> - sched = >sched; > >> - r = drm_sched_entity_init(>uvd.entity, > >> DRM_SCHED_PRIORITY_NORMAL, > >> - , 1, NULL); > >> - if (r) { > >> - DRM_ERROR("Failed setting up UVD kernel entity.\n"); > >> - return r; > >> + r = drm_sched_entity_init(>uvd.entity, > >> DRM_SCHED_PRIORITY_NORMAL, > >> + , 1, NULL); > >> + if (r) { > >> + DRM_ERROR("Failed setting up UVD kernel > >> entity.\n"); > >> + return r; > >> + } > >> } > >> > >> return 0; > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h > >> index a9f342537c68..9dfad2f48ef4 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h > >> @@ -73,7 +73,7 @@ struct amdgpu_uvd { > >> > >> int 
amdgpu_uvd_sw_init(struct amdgpu_device *adev); > >> int amdgpu_uvd_sw_fini(struct amdgpu_device *adev); > >> -int amdgpu_uvd_entity_init(struct amdgpu_device *adev); > >> +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring > >>
[PATCH 3/3] drm/amdgpu: add new INFO IOCTL query for input power
Some chips provide both average and input power. Previously we just exposed average power, add a new query for input power. Example userspace: https://github.com/Umio-Yasuno/libdrm-amdgpu-sys-rs/tree/input_power Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 9 + include/uapi/drm/amdgpu_drm.h | 2 ++ 2 files changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index bf4f48fe438d..48496bb585c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1114,6 +1114,15 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) } ui32 >>= 8; break; + case AMDGPU_INFO_SENSOR_GPU_INPUT_POWER: + /* get input GPU power */ + if (amdgpu_dpm_read_sensor(adev, + AMDGPU_PP_SENSOR_GPU_INPUT_POWER, + (void *), _size)) { + return -EINVAL; + } + ui32 >>= 8; + break; case AMDGPU_INFO_SENSOR_VDDNB: /* get VDDNB in millivolts */ if (amdgpu_dpm_read_sensor(adev, diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index ad21c613fec8..96e32dafd4f0 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -865,6 +865,8 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow { #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_SCLK 0xa /* Subquery id: Query GPU peak pstate memory clock */ #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_MCLK 0xb + /* Subquery id: Query input GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_INPUT_POWER 0xc /* Number of VRAM page faults on CPU access. */ #define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E #define AMDGPU_INFO_VRAM_LOST_COUNTER 0x1F -- 2.41.0
[PATCH 1/3] drm/amdgpu: fix avg vs input power reporting on smu7
Hawaii, Bonaire, Fiji, and Tonga support average power, the others support current power. Signed-off-by: Alex Deucher --- .../gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c index 11372fcc59c8..a2c7b2e111fa 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c @@ -3995,6 +3995,7 @@ static int smu7_read_sensor(struct pp_hwmgr *hwmgr, int idx, uint32_t sclk, mclk, activity_percent; uint32_t offset, val_vid; struct smu7_hwmgr *data = (struct smu7_hwmgr *)(hwmgr->backend); + struct amdgpu_device *adev = hwmgr->adev; /* size must be at least 4 bytes for all sensors */ if (*size < 4) @@ -4038,7 +4039,21 @@ static int smu7_read_sensor(struct pp_hwmgr *hwmgr, int idx, *size = 4; return 0; case AMDGPU_PP_SENSOR_GPU_INPUT_POWER: - return smu7_get_gpu_power(hwmgr, (uint32_t *)value); + if ((adev->asic_type != CHIP_HAWAII) && + (adev->asic_type != CHIP_BONAIRE) && + (adev->asic_type != CHIP_FIJI) && + (adev->asic_type != CHIP_TONGA)) + return smu7_get_gpu_power(hwmgr, (uint32_t *)value); + else + return -EOPNOTSUPP; + case AMDGPU_PP_SENSOR_GPU_AVG_POWER: + if ((adev->asic_type != CHIP_HAWAII) && + (adev->asic_type != CHIP_BONAIRE) && + (adev->asic_type != CHIP_FIJI) && + (adev->asic_type != CHIP_TONGA)) + return -EOPNOTSUPP; + else + return smu7_get_gpu_power(hwmgr, (uint32_t *)value); case AMDGPU_PP_SENSOR_VDDGFX: if ((data->vr_config & VRCONF_VDDGFX_MASK) == (VR_SVI2_PLANE_2 << VRCONF_VDDGFX_SHIFT)) -- 2.41.0
[PATCH 2/3] drm/amdgpu: fall back to INPUT power for AVG power via INFO IOCTL
For backwards compatibility with userspace. Fixes: 47f1724db4fe ("drm/amd: Introduce `AMDGPU_PP_SENSOR_GPU_INPUT_POWER`") Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2897 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index b5ebafd4a3ad..bf4f48fe438d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1105,7 +1105,12 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) if (amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void *), _size)) { - return -EINVAL; + /* fall back to input power for backwards compat */ + if (amdgpu_dpm_read_sensor(adev, + AMDGPU_PP_SENSOR_GPU_INPUT_POWER, + (void *), _size)) { + return -EINVAL; + } } ui32 >>= 8; break; -- 2.41.0
RE: [PATCH 2/2] drm/amdgpu: add amdgpu runpm usage trace for separate funcs
[Public] > -Original Message- > From: Liang, Prike > Sent: Thursday, November 9, 2023 2:37 AM > To: amd-gfx@lists.freedesktop.org > Cc: Deucher, Alexander ; Liang, Prike > > Subject: [PATCH 2/2] drm/amdgpu: add amdgpu runpm usage trace for > separate funcs > > Add trace for amdgpu runpm separate funcs usage and this will help > debugging on the case of runpm usage missed to dereference. > In the normal case the runpm usage count referred by one kind of > functionality pairwise and usage should be changed from 1 to 0, otherwise > there will be an issue in the amdgpu runpm usage dereference. > > Signed-off-by: Prike Liang Looks good. Not sure if you want to add tracepoints to the other call sites as well. These are probably the trickiest however. Acked-by: Alex Deucher > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 4 > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 7 ++- > drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 15 +++ > 3 files changed, 25 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > index e7e87a3b2601..decbbe3d4f06 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > @@ -42,6 +42,7 @@ > #include > #include > #include > +#include "amdgpu_trace.h" > > /** > * amdgpu_dma_buf_attach - _buf_ops.attach implementation @@ - > 63,6 +64,7 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf, > attach->peer2peer = false; > > r = pm_runtime_get_sync(adev_to_drm(adev)->dev); > + trace_amdgpu_runpm_reference_dumps(1, __func__); > if (r < 0) > goto out; > > @@ -70,6 +72,7 @@ static int amdgpu_dma_buf_attach(struct dma_buf > *dmabuf, > > out: > pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); > + trace_amdgpu_runpm_reference_dumps(0, __func__); > return r; > } > > @@ -90,6 +93,7 @@ static void amdgpu_dma_buf_detach(struct dma_buf > *dmabuf, > > pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); > 
pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); > + trace_amdgpu_runpm_reference_dumps(0, __func__); > } > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > index 709a2c1b9d63..1026a9fa0c0f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > @@ -183,6 +183,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, > struct dma_fence **f, struct amd > amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, > seq, flags | AMDGPU_FENCE_FLAG_INT); > pm_runtime_get_noresume(adev_to_drm(adev)->dev); > + trace_amdgpu_runpm_reference_dumps(1, __func__); > ptr = >fence_drv.fences[seq & ring- > >fence_drv.num_fences_mask]; > if (unlikely(rcu_dereference_protected(*ptr, 1))) { > struct dma_fence *old; > @@ -286,8 +287,11 @@ bool amdgpu_fence_process(struct amdgpu_ring > *ring) > seq != ring->fence_drv.sync_seq) > amdgpu_fence_schedule_fallback(ring); > > - if (unlikely(seq == last_seq)) > + if (unlikely(seq == last_seq)) { > + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); > + trace_amdgpu_runpm_reference_dumps(0, __func__); > return false; > + } > > last_seq &= drv->num_fences_mask; > seq &= drv->num_fences_mask; > @@ -310,6 +314,7 @@ bool amdgpu_fence_process(struct amdgpu_ring > *ring) > dma_fence_put(fence); > pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); > pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); > + trace_amdgpu_runpm_reference_dumps(0, __func__); > } while (last_seq != seq); > > return true; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h > index 2fd1bfb35916..5d4792645540 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h > @@ -554,6 +554,21 @@ TRACE_EVENT(amdgpu_reset_reg_dumps, > __entry->value) > ); > > +TRACE_EVENT(amdgpu_runpm_reference_dumps, > + TP_PROTO(uint32_t index, const char *func), > + TP_ARGS(index, func), > + 
TP_STRUCT__entry( > + __field(uint32_t, index) > + __string(func, func) > + ), > + TP_fast_assign( > +__entry->index = index; > +__assign_str(func, func); > +), > + TP_printk("amdgpu runpm reference dump 0x%d: 0x%s\n", > + __entry->index, > + __get_str(func)) > +); > #undef AMDGPU_JOB_GET_TIMELINE_NAME > #endif > > -- > 2.34.1
RE: [PATCH 1/2] drm/amdgpu: correct the amdgpu runtime dereference usage count
[Public] > -Original Message- > From: Liang, Prike > Sent: Thursday, November 9, 2023 2:37 AM > To: amd-gfx@lists.freedesktop.org > Cc: Deucher, Alexander ; Liang, Prike > > Subject: [PATCH 1/2] drm/amdgpu: correct the amdgpu runtime dereference > usage count > > Fix the amdgpu runpm dereference usage count. > > Signed-off-by: Prike Liang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 1 + > 2 files changed, 2 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > index a53f436fa9f1..f6e5d9f7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > @@ -1992,7 +1992,7 @@ static int amdgpu_debugfs_sclk_set(void *data, > u64 val) > > ret = amdgpu_dpm_set_soft_freq_range(adev, PP_SCLK, > (uint32_t)val, (uint32_t)val); > if (ret) > - ret = -EINVAL; > + goto out; I think this hunk can be dropped. It doesn't really change anything. Or you could just drop the whole ret check since we just return ret at the end anyway. Not sure if changing the error code is important here or not. > > out: > pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c > index 0cacd0b9f8be..ff1f42ae6d8e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c > @@ -346,6 +346,7 @@ int amdgpu_display_crtc_set_config(struct > drm_mode_set *set, > if (!active && adev->have_disp_power_ref) { > pm_runtime_put_autosuspend(dev->dev); > adev->have_disp_power_ref = false; > + return ret; > } I think it would be cleaner to just drop the runtime_put above and update the comment. We'll just fall through to the end of the function. Alex > > out: > -- > 2.34.1
Re: [PATCH] drm/amdgpu: move UVD and VCE sched entity init after sched init
On 2023-11-09 11:13, Alex Deucher wrote: > Ping? > > On Wed, Nov 8, 2023 at 1:42 PM Alex Deucher wrote: >> >> We need kernel scheduling entities to deal with handle clean up >> if apps are not cleaned up properly. With commit 56e449603f0ac5 >> ("drm/sched: Convert the GPU scheduler to variable number of run-queues") >> the scheduler entities have to be created after scheduler init, so >> change the ordering to fix this. >> >> v2: Leave logic in UVD and VCE code >> >> Fixes: 56e449603f0ac5 ("drm/sched: Convert the GPU scheduler to variable >> number of run-queues") >> Signed-off-by: Alex Deucher >> Cc: ltuiko...@gmail.com Reviewed-by: Luben Tuikov Regards, Luben >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c| 22 ++-- >> drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h| 2 +- >> drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c| 24 +++--- >> drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h| 2 +- >> drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 2 -- >> drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 2 -- >> drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 2 -- >> drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 2 -- >> drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 4 >> drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 2 -- >> drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 2 -- >> drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 5 - >> 13 files changed, 37 insertions(+), 46 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 43a95feba884..03e669c34033 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -2499,6 +2499,18 @@ static int amdgpu_device_init_schedulers(struct >> amdgpu_device *adev) >> ring->name); >> return r; >> } >> + r = amdgpu_uvd_entity_init(adev, ring); >> + if (r) { >> + DRM_ERROR("Failed to create UVD scheduling entity on >> ring %s.\n", >> + ring->name); >> + return r; >> + } >> + r = amdgpu_vce_entity_init(adev, ring); >> + if (r) { >> + 
DRM_ERROR("Failed to create VCE scheduling entity on >> ring %s.\n", >> + ring->name); >> + return r; >> + } >> } >> >> amdgpu_xcp_update_partition_sched_list(adev); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c >> index 815b7c34ed33..65949cc7abb9 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c >> @@ -399,20 +399,20 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev) >> * >> * @adev: amdgpu_device pointer >> * >> + * Initialize the entity used for handle management in the kernel driver. >> */ >> -int amdgpu_uvd_entity_init(struct amdgpu_device *adev) >> +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring >> *ring) >> { >> - struct amdgpu_ring *ring; >> - struct drm_gpu_scheduler *sched; >> - int r; >> + if (ring == >uvd.inst[0].ring) { >> + struct drm_gpu_scheduler *sched = >sched; >> + int r; >> >> - ring = >uvd.inst[0].ring; >> - sched = >sched; >> - r = drm_sched_entity_init(>uvd.entity, >> DRM_SCHED_PRIORITY_NORMAL, >> - , 1, NULL); >> - if (r) { >> - DRM_ERROR("Failed setting up UVD kernel entity.\n"); >> - return r; >> + r = drm_sched_entity_init(>uvd.entity, >> DRM_SCHED_PRIORITY_NORMAL, >> + , 1, NULL); >> + if (r) { >> + DRM_ERROR("Failed setting up UVD kernel entity.\n"); >> + return r; >> + } >> } >> >> return 0; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h >> index a9f342537c68..9dfad2f48ef4 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h >> @@ -73,7 +73,7 @@ struct amdgpu_uvd { >> >> int amdgpu_uvd_sw_init(struct amdgpu_device *adev); >> int amdgpu_uvd_sw_fini(struct amdgpu_device *adev); >> -int amdgpu_uvd_entity_init(struct amdgpu_device *adev); >> +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring >> *ring); >> int amdgpu_uvd_prepare_suspend(struct amdgpu_device *adev); >> int 
amdgpu_uvd_suspend(struct amdgpu_device *adev); >> int amdgpu_uvd_resume(struct amdgpu_device *adev); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c >> index 1904edf68407..0954447f689d
[PATCH 5/5] drm/amdgpu/gmc9: disable AGP aperture
We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. v2: leave it configurable via module parameter Reviewed-by: Yang Wang (v1) Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 22375fc23412..bd69cf61de06 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev, } else { amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); } /* base offset of vram pages */ -- 2.41.0
[PATCH 4/5] drm/amdgpu/gmc10: disable AGP aperture
We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. v2: leave it configurable via module parameter Reviewed-by: Yang Wang (v1) Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 23483bffa1c7..a5a05c16c10d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, >gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ -- 2.41.0
[PATCH 2/5] drm/amdgpu: add a module parameter to control the AGP aperture
Add a module parameter to control the AGP aperture. The AGP aperture is an aperture in the GPU's internal address space which provides direct non-paged access to the platform address space. This access is non-snooped so only uncached memory can be accessed. Add a knob so that we can toggle this for debugging. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++ drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 ++- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- 5 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 969d8ba8b8dc..8c1f88b4a58c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -250,6 +250,7 @@ extern int amdgpu_umsch_mm; extern int amdgpu_seamless; extern int amdgpu_user_partt_mode; +extern int amdgpu_agp; #define AMDGPU_VM_MAX_NUM_CTX 4096 #define AMDGPU_SG_THRESHOLD(256*1024*1024) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 0c0497d3a003..e8c42ff5d464 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -207,6 +207,7 @@ int amdgpu_user_partt_mode = AMDGPU_AUTO_COMPUTE_PARTITION_MODE; int amdgpu_umsch_mm; int amdgpu_seamless = -1; /* auto */ uint amdgpu_debug_mask; +int amdgpu_agp = -1; /* auto */ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work); @@ -961,6 +962,15 @@ module_param_named(seamless, amdgpu_seamless, int, 0444); MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default"); module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444); +/** + * DOC: agp (int) + * Enable the AGP aperture. This provides an aperture in the GPU's internal + * address space for direct access to system memory. 
Note that these accesses + * are non-snooped, so they are only used for access to uncached memory. + */ +MODULE_PARM_DESC(agp, "AGP (-1 = auto (default), 0 = disable, 1 = enable)"); +module_param_named(agp, amdgpu_agp, int, 0444); + /* These devices are not supported by amdgpu. * They are supported by the mach64, r128, radeon drivers */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 0ec7b061d7c2..23483bffa1c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, >gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index ba4c82f5e617..e1078b53e942 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -641,7 +641,8 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_vram_location(adev, >gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); if (!amdgpu_sriov_vf(adev) && - (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0))) + (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) && + (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index bde25eb4ed8e..22375fc23412 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev, } else { amdgpu_gmc_vram_location(adev, mc, base); 
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); } /* base offset of vram pages */ -- 2.41.0
[PATCH 1/5] drm/amdgpu/gmc11: fix logic typo in AGP check
Should be && rather than ||. Fixes: b2e1cbe6281f ("drm/amdgpu/gmc11: disable AGP on GC 11.5") Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 6dce9b29f675..ba4c82f5e617 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -640,7 +640,7 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, >gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); - if (!amdgpu_sriov_vf(adev) || + if (!amdgpu_sriov_vf(adev) && (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0))) amdgpu_gmc_agp_location(adev, mc); -- 2.41.0
[PATCH 3/5] drm/amdgpu/gmc11: disable AGP aperture
We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. v2: leave it configurable via module parameter Reviewed-by: Yang Wang (v1) Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index e1078b53e942..23d7b548d13f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -642,7 +642,7 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); if (!amdgpu_sriov_vf(adev) && (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) && - (amdgpu_agp != 0)) + (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ -- 2.41.0
Re: [PATCH] drm/amd: Explicitly check for GFXOFF to be enabled for s0ix
[Public] Reviewed-by: Alex Deucher From: amd-gfx on behalf of Mario Limonciello Sent: Thursday, November 9, 2023 11:27 AM To: amd-gfx@lists.freedesktop.org Cc: Limonciello, Mario Subject: [PATCH] drm/amd: Explicitly check for GFXOFF to be enabled for s0ix If a user has disabled GFXOFF this may cause problems for the suspend sequence. Ensure that it is enabled in amdgpu_acpi_is_s0ix_active(). The system won't reach the deepest state but it also won't hang. Signed-off-by: Mario Limonciello --- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index d62e49758635..e550067e5c5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1497,6 +1497,9 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) if (adev->asic_type < CHIP_RAVEN) return false; + if (!(adev->pm.pp_feature & PP_GFXOFF_MASK)) + return false; + /* * If ACPI_FADT_LOW_POWER_S0 is not set in the FADT, it is generally * risky to do any special firmware-related preparations for entering -- 2.34.1
[PATCH] drm/amd: Explicitly check for GFXOFF to be enabled for s0ix
If a user has disabled GFXOFF this may cause problems for the suspend sequence. Ensure that it is enabled in amdgpu_acpi_is_s0ix_active(). The system won't reach the deepest state but it also won't hang. Signed-off-by: Mario Limonciello --- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index d62e49758635..e550067e5c5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1497,6 +1497,9 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) if (adev->asic_type < CHIP_RAVEN) return false; + if (!(adev->pm.pp_feature & PP_GFXOFF_MASK)) + return false; + /* * If ACPI_FADT_LOW_POWER_S0 is not set in the FADT, it is generally * risky to do any special firmware-related preparations for entering -- 2.34.1
[PATCH v3] drm/amdgpu: Change extended-scope MTYPE on GC 9.4.3
Change local memory type to MTYPE_UC on revision id 0 Signed-off-by: David Yat Sin --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 7 +-- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 7 +-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 5464b08064c4..b5633cc2b97d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1197,7 +1197,10 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, if (uncached) { mtype = MTYPE_UC; } else if (ext_coherent) { - mtype = is_local ? MTYPE_CC : MTYPE_UC; + if (adev->rev_id) + mtype = is_local ? MTYPE_CC : MTYPE_UC; + else + mtype = MTYPE_UC; } else if (adev->flags & AMD_IS_APU) { mtype = is_local ? mtype_local : MTYPE_NC; } else { @@ -1318,7 +1321,7 @@ static void gmc_v9_0_override_vm_pte_flags(struct amdgpu_device *adev, *flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) | AMDGPU_PTE_MTYPE_VG10(mtype_local); - } else { + } else if (adev->rev_id) { /* MTYPE_UC case */ *flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) | AMDGPU_PTE_MTYPE_VG10(MTYPE_CC); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index e7f67e5226ec..08aa9b85c71a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1257,8 +1257,11 @@ svm_range_get_pte_flags(struct kfd_node *node, } break; case IP_VERSION(9, 4, 3): - mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC : -(amdgpu_mtype_local == 2 || ext_coherent ? + if (ext_coherent) + mtype_local = node->adev->rev_id ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_UC; + else + mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC : +(amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : (amdgpu_mtype_local == 3 ? AMDGPU_VM_MTYPE_RW : (node->adev->rev_id ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW))); -- 2.34.1
Re: [PATCH] drm/amdgpu: move UVD and VCE sched entity init after sched init
Ping? On Wed, Nov 8, 2023 at 1:42 PM Alex Deucher wrote: > > We need kernel scheduling entities to deal with handle clean up > if apps are not cleaned up properly. With commit 56e449603f0ac5 > ("drm/sched: Convert the GPU scheduler to variable number of run-queues") > the scheduler entities have to be created after scheduler init, so > change the ordering to fix this. > > v2: Leave logic in UVD and VCE code > > Fixes: 56e449603f0ac5 ("drm/sched: Convert the GPU scheduler to variable > number of run-queues") > Signed-off-by: Alex Deucher > Cc: ltuiko...@gmail.com > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c| 22 ++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h| 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c| 24 +++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h| 2 +- > drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 2 -- > drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 2 -- > drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 2 -- > drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 2 -- > drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 4 > drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 2 -- > drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 2 -- > drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 5 - > 13 files changed, 37 insertions(+), 46 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 43a95feba884..03e669c34033 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2499,6 +2499,18 @@ static int amdgpu_device_init_schedulers(struct > amdgpu_device *adev) > ring->name); > return r; > } > + r = amdgpu_uvd_entity_init(adev, ring); > + if (r) { > + DRM_ERROR("Failed to create UVD scheduling entity on > ring %s.\n", > + ring->name); > + return r; > + } > + r = amdgpu_vce_entity_init(adev, ring); > + if (r) { > + DRM_ERROR("Failed to create VCE scheduling entity on > ring %s.\n", > + ring->name); > + return r; > + } > } > > 
amdgpu_xcp_update_partition_sched_list(adev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > index 815b7c34ed33..65949cc7abb9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > @@ -399,20 +399,20 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev) > * > * @adev: amdgpu_device pointer > * > + * Initialize the entity used for handle management in the kernel driver. > */ > -int amdgpu_uvd_entity_init(struct amdgpu_device *adev) > +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring > *ring) > { > - struct amdgpu_ring *ring; > - struct drm_gpu_scheduler *sched; > - int r; > + if (ring == >uvd.inst[0].ring) { > + struct drm_gpu_scheduler *sched = >sched; > + int r; > > - ring = >uvd.inst[0].ring; > - sched = >sched; > - r = drm_sched_entity_init(>uvd.entity, > DRM_SCHED_PRIORITY_NORMAL, > - , 1, NULL); > - if (r) { > - DRM_ERROR("Failed setting up UVD kernel entity.\n"); > - return r; > + r = drm_sched_entity_init(>uvd.entity, > DRM_SCHED_PRIORITY_NORMAL, > + , 1, NULL); > + if (r) { > + DRM_ERROR("Failed setting up UVD kernel entity.\n"); > + return r; > + } > } > > return 0; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h > index a9f342537c68..9dfad2f48ef4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h > @@ -73,7 +73,7 @@ struct amdgpu_uvd { > > int amdgpu_uvd_sw_init(struct amdgpu_device *adev); > int amdgpu_uvd_sw_fini(struct amdgpu_device *adev); > -int amdgpu_uvd_entity_init(struct amdgpu_device *adev); > +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring > *ring); > int amdgpu_uvd_prepare_suspend(struct amdgpu_device *adev); > int amdgpu_uvd_suspend(struct amdgpu_device *adev); > int amdgpu_uvd_resume(struct amdgpu_device *adev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c > 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c > index 1904edf68407..0954447f689d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c > @@ -231,20 +231,20 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev) > * > * @adev:
Re: [PATCH] drm/amd/pm: replace 1-element arrays with flexible-array members
On Thu, Nov 9, 2023 at 7:14 AM José Pekkarinen wrote: > > On 2023-11-09 11:06, Greg KH wrote: > > On Thu, Nov 09, 2023 at 10:43:50AM +0200, José Pekkarinen wrote: > >> On 2023-11-08 09:29, Greg KH wrote: > >> > On Wed, Nov 08, 2023 at 08:54:35AM +0200, José Pekkarinen wrote: > >> > > The following case seems to be safe to be replaced with a flexible > >> > > array > >> > > to clean up the added coccinelle warning. This patch will just do it. > >> > > > >> > > drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h:76:38-63: > >> > > WARNING use flexible-array member instead > >> > > (https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays) > >> > > > >> > > Signed-off-by: José Pekkarinen > >> > > --- > >> > > drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h | 2 +- > >> > > 1 file changed, 1 insertion(+), 1 deletion(-) > >> > > > >> > > diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > >> > > b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > >> > > index c7b61222d258..1ce4087005f0 100644 > >> > > --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > >> > > +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > >> > > @@ -73,7 +73,7 @@ struct smu8_register_index_data_pair { > >> > > > >> > > struct smu8_ih_meta_data { > >> > > uint32_t command; > >> > > -struct smu8_register_index_data_pair > >> > > register_index_value_pair[1]; > >> > > +struct smu8_register_index_data_pair > >> > > register_index_value_pair[]; > >> > > >> > Did you just change this structure size without any need to change any > >> > code as well? How was this tested? > >> > >> I didn't find any use of that struct member, if I missed > >> something here, please let me know and I'll happily address any > >> needed further work. > > > > I don't think this is even a variable array. 
It's just a one element > > one, which is fine, don't be confused by the coccinelle "warning" here, > > it's fired many false-positives and you need to verify this properly > > with the driver authors first before changing anything. > > My apologies to you, and anybody that feels the same, it is not my > intention to bother with mistaken patches, I just assume that this patch > or any other from me, will go to review process, where it should be fine > if the patch is right, wrong, need further work, or further testing > either > from my side or anybody else, and at the end of the day I need to do > patches if I want to find my mentorship patches, and graduate. > > > In short, you just changed the size of this structure, are you _sure_ > > you can do that? And yes, it doesn't look like this field is used, but > > the structure is, so be careful. > > I don't know, let check it out together and see where this goes. I think it may have been used with the SMU firmware. I'll need to check with that team to determine if this was meant to be a variable sized array or not. Alex
RE: [PATCH 1/3] drm/amdgpu/gmc11: disable AGP aperture
Series is Reviewed-by: Yang Wang Best Regards, Kevin -----Original Message----- From: amd-gfx On Behalf Of Alex Deucher Sent: Thursday, November 9, 2023 10:42 PM To: amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander Subject: [PATCH 1/3] drm/amdgpu/gmc11: disable AGP aperture We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 6dce9b29f675..e1cac5da9c4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -640,9 +640,6 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); - if (!amdgpu_sriov_vf(adev) || - (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0))) - amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ if (amdgpu_sriov_vf(adev)) -- 2.41.0
Re: [PATCH] drm/amd/pm: replace 1-element arrays with flexible-array members
On 2023-11-09 11:06, Greg KH wrote: On Thu, Nov 09, 2023 at 10:43:50AM +0200, José Pekkarinen wrote: On 2023-11-08 09:29, Greg KH wrote: > On Wed, Nov 08, 2023 at 08:54:35AM +0200, José Pekkarinen wrote: > > The following case seems to be safe to be replaced with a flexible > > array > > to clean up the added coccinelle warning. This patch will just do it. > > > > drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h:76:38-63: > > WARNING use flexible-array member instead (https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays) > > > > Signed-off-by: José Pekkarinen > > --- > > drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > > b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > > index c7b61222d258..1ce4087005f0 100644 > > --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > > +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > > @@ -73,7 +73,7 @@ struct smu8_register_index_data_pair { > > > > struct smu8_ih_meta_data { > > uint32_t command; > > - struct smu8_register_index_data_pair register_index_value_pair[1]; > > + struct smu8_register_index_data_pair register_index_value_pair[]; > > Did you just change this structure size without any need to change any > code as well? How was this tested? I didn't find any use of that struct member, if I missed something here, please let me know and I'll happily address any needed further work. I don't think this is even a variable array. It's just a one element one, which is fine, don't be confused by the coccinelle "warning" here, it's fired many false-positives and you need to verify this properly with the driver authors first before changing anything. 
My apologies to you, and anybody that feels the same, it is not my intention to bother with mistaken patches, I just assume that this patch or any other from me, will go to review process, where it should be fine if the patch is right, wrong, need further work, or further testing either from my side or anybody else, and at the end of the day I need to do patches if I want to find my mentorship patches, and graduate. In short, you just changed the size of this structure, are you _sure_ you can do that? And yes, it doesn't look like this field is used, but the structure is, so be careful. I don't know, let check it out together and see where this goes. José.
[PATCH] drm/amd/pm: make power values signed
The following patch will convert the power values returned by amdgpu_hwmon_get_power to signed, fixing the following warnings reported by coccinelle: drivers/gpu/drm/amd/pm/amdgpu_pm.c:2801:5-8: WARNING: Unsigned expression compared with zero: val < 0 drivers/gpu/drm/amd/pm/amdgpu_pm.c:2814:5-8: WARNING: Unsigned expression compared with zero: val < 0 Signed-off-by: José Pekkarinen --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index e7bb1d324084..913ff62d5d5e 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -2795,7 +2795,7 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct device *dev, struct device_attribute *attr, char *buf) { - unsigned int val; + int val; val = amdgpu_hwmon_get_power(dev, AMDGPU_PP_SENSOR_GPU_AVG_POWER); @@ -2806,7 +2806,7 @@ static ssize_t amdgpu_hwmon_show_power_input(struct device *dev, struct device_attribute *attr, char *buf) { - unsigned int val; + int val; val = amdgpu_hwmon_get_power(dev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER); -- 2.39.2
Re: [PATCH] drm/amd/pm: replace 1-element arrays with flexible-array members
On Thu, Nov 09, 2023 at 10:43:50AM +0200, José Pekkarinen wrote: > On 2023-11-08 09:29, Greg KH wrote: > > On Wed, Nov 08, 2023 at 08:54:35AM +0200, José Pekkarinen wrote: > > > The following case seems to be safe to be replaced with a flexible > > > array > > > to clean up the added coccinelle warning. This patch will just do it. > > > > > > drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h:76:38-63: > > > WARNING use flexible-array member instead > > > (https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays) > > > > > > Signed-off-by: José Pekkarinen > > > --- > > > drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h | 2 +- > > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > > > diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > > > b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > > > index c7b61222d258..1ce4087005f0 100644 > > > --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > > > +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h > > > @@ -73,7 +73,7 @@ struct smu8_register_index_data_pair { > > > > > > struct smu8_ih_meta_data { > > > uint32_t command; > > > - struct smu8_register_index_data_pair register_index_value_pair[1]; > > > + struct smu8_register_index_data_pair register_index_value_pair[]; > > > > Did you just change this structure size without any need to change any > > code as well? How was this tested? > > I didn't find any use of that struct member, if I missed > something here, please let me know and I'll happily address any > needed further work. I don't think this is even a variable array. It's just a one element one, which is fine, don't be confused by the coccinelle "warning" here, it's fired many false-positives and you need to verify this properly with the driver authors first before changing anything. In short, you just changed the size of this structure, are you _sure_ you can do that? 
And yes, it doesn't look like this field is used, but the structure is, so be careful. thanks, greg k-h
Re: [PATCH] drm/amd/pm: replace 1-element arrays with flexible-array members
On 2023-11-08 09:29, Greg KH wrote: On Wed, Nov 08, 2023 at 08:54:35AM +0200, José Pekkarinen wrote: The following case seems to be safe to be replaced with a flexible array to clean up the added coccinelle warning. This patch will just do it. drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h:76:38-63: WARNING use flexible-array member instead (https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays) Signed-off-by: José Pekkarinen --- drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h index c7b61222d258..1ce4087005f0 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h @@ -73,7 +73,7 @@ struct smu8_register_index_data_pair { struct smu8_ih_meta_data { uint32_t command; - struct smu8_register_index_data_pair register_index_value_pair[1]; + struct smu8_register_index_data_pair register_index_value_pair[]; Did you just change this structure size without any need to change any code as well? How was this tested? I didn't find any use of that struct member, if I missed something here, please let me know and I'll happily address any needed further work. José.
[PATCH 2/3] drm/amdgpu/gmc10: disable AGP aperture
We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 0ec7b061d7c2..8eb4db3dd931 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -675,8 +675,6 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev)) - amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); -- 2.41.0
[PATCH 1/3] drm/amdgpu/gmc11: disable AGP aperture
We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 6dce9b29f675..e1cac5da9c4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -640,9 +640,6 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); - if (!amdgpu_sriov_vf(adev) || - (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0))) - amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ if (amdgpu_sriov_vf(adev)) -- 2.41.0
[PATCH 3/3] drm/amdgpu/gmc9: disable AGP aperture
We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 05fb57540e35..39a0251ba60e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1627,8 +1627,6 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev, } else { amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev)) - amdgpu_gmc_agp_location(adev, mc); } /* base offset of vram pages */ adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); -- 2.41.0
[PATCH 1/2] drm/amdgpu: fix error handling in amdgpu_bo_list_get()
We should not leak the pointer where we couldn't grab the reference on to the caller because it can be that the error handling still tries to put the reference then. Signed-off-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c index 4e7089a22139..8ffad1dabcb5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c @@ -168,6 +168,7 @@ int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id, } rcu_read_unlock(); + *result = NULL; return -ENOENT; } -- 2.34.1
[PATCH 2/2] drm/amdgpu: lower CS errors to debug severity
Otherwise userspace can spam the logs by using incorrect input values. Signed-off-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index aafedb344c1b..af844c98f295 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1442,7 +1442,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) if (r == -ENOMEM) DRM_ERROR("Not enough memory for command submission!\n"); else if (r != -ERESTARTSYS && r != -EAGAIN) - DRM_ERROR("Failed to process the buffer list %d!\n", r); + DRM_DEBUG("Failed to process the buffer list %d!\n", r); goto error_fini; } -- 2.34.1
Re: [PATCH] drm/amdkfd: Clear the VALU exception state in the trap handler
The trap handler could be entered with pending VALU exceptions, so clear the exception state before issuing vector instructions. Signed-off-by: Laurent Morichetti Reviewed-by: Jay Cornwall Hi, FYI, I tested this and it fixes the issue. Best, Lancelot. Tested-by: Lancelot Six
Re: [PATCH 5/6] drm/amdkfd: Import DMABufs for interop through DRM
Am 07.11.23 um 17:58 schrieb Felix Kuehling: Use drm_gem_prime_fd_to_handle to import DMABufs for interop. This ensures that a GEM handle is created on import and that obj->dma_buf will be set and remain set as long as the object is imported into KFD. Signed-off-by: Felix Kuehling Acked-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h| 9 ++- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 64 +-- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 15 ++--- 3 files changed, 52 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 4caf8cece028..88a0e0734270 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -318,11 +318,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, struct dma_fence **ef); int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev, struct kfd_vm_fault_info *info); -int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev, - struct dma_buf *dmabuf, - uint64_t va, void *drm_priv, - struct kgd_mem **mem, uint64_t *size, - uint64_t *mmap_offset); +int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd, +uint64_t va, void *drm_priv, +struct kgd_mem **mem, uint64_t *size, +uint64_t *mmap_offset); int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem, struct dma_buf **dmabuf); void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 4bb8b5fd7598..1077de8bced2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2006,8 +2006,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( /* Free the BO*/ drm_vma_node_revoke(>bo->tbo.base.vma_node, drm_priv); - if (!mem->is_imported) - drm_gem_handle_delete(adev->kfd.client.file, mem->gem_handle); + 
drm_gem_handle_delete(adev->kfd.client.file, mem->gem_handle); if (mem->dmabuf) { dma_buf_put(mem->dmabuf); mem->dmabuf = NULL; @@ -2363,34 +2362,26 @@ int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev, return 0; } -int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev, - struct dma_buf *dma_buf, - uint64_t va, void *drm_priv, - struct kgd_mem **mem, uint64_t *size, - uint64_t *mmap_offset) +static int import_obj_create(struct amdgpu_device *adev, +struct dma_buf *dma_buf, +struct drm_gem_object *obj, +uint64_t va, void *drm_priv, +struct kgd_mem **mem, uint64_t *size, +uint64_t *mmap_offset) { struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv); - struct drm_gem_object *obj; struct amdgpu_bo *bo; int ret; - obj = amdgpu_gem_prime_import(adev_to_drm(adev), dma_buf); - if (IS_ERR(obj)) - return PTR_ERR(obj); - bo = gem_to_amdgpu_bo(obj); if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | - AMDGPU_GEM_DOMAIN_GTT))) { + AMDGPU_GEM_DOMAIN_GTT))) /* Only VRAM and GTT BOs are supported */ - ret = -EINVAL; - goto err_put_obj; - } + return -EINVAL; *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); - if (!*mem) { - ret = -ENOMEM; - goto err_put_obj; - } + if (!*mem) + return -ENOMEM; ret = drm_vma_node_allow(>vma_node, drm_priv); if (ret) @@ -2440,8 +2431,41 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev, drm_vma_node_revoke(>vma_node, drm_priv); err_free_mem: kfree(*mem); + return ret; +} + +int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd, +uint64_t va, void *drm_priv, +struct kgd_mem **mem, uint64_t *size, +uint64_t *mmap_offset) +{ + struct drm_gem_object *obj; + uint32_t handle; + int ret; + + ret = drm_gem_prime_fd_to_handle(>ddev,
Re: [PATCH 2/6] drm/amdgpu: New VM state for evicted user BOs
Am 08.11.23 um 22:23 schrieb Felix Kuehling: On 2023-11-08 07:28, Christian König wrote: Not necessary objections to this patch here, but rather how this new state is used later on. The fundamental problem is that re-validating things in amdgpu_vm_handle_moved() won't work in all cases. The general approach for both classic CS IOCTL as well as user queues is the following: 1. Grab all the necessary locks The VM lock + either everything inside the VM (user queues) or just the BOs in the submission (CS IOCTL). 2. Validate everything locked It can be that additional BO locks are grabbed for eviction and even locked BOs are shuffled around during that. 3. Update the PDEs and PTEs This can be done only after validating everything because things can still shuffle around during validation. 4. Do your CS or make the userqueu / process runable etc... 5. Attach fences to the locked BOs. 6. Unlock everything again. I think that is part of the problem why handling all updates in amdgpu_vm_handle_moved() for the CS doesn't work and sooner or later you might run into the same issue during process restore as well. If I'm not completely mistaken this should be solvable by moving all validation to amdgpu_vm_validate_pt_bos() instead (but let us rename that function). I'm trying to understand what the problem is. As far as I can tell, amdgpu_vm_handle_moved just runs a bit later in the CS and process-restore code than amdgpu_vm_validate_pt_bos. But it runs with all the same reservations locked. My current implementation in amdgpu_vm_handle_moved has some failure cases when reservations cannot be acquired. I think your drm_exec patch series would make it possible to handle this more robustly. That said, in case of KFD process restore, we can retry in case of failures already. Anyway, moving my re-validation code to a renamed version of amdgpu_vm_validate_pt_bos shouldn't be a big problem. I just don't understand what problem that solves. 
Maybe it just makes the code cleaner to handle both evicted states in one place? Or maybe you're pointing to a way to merge the two states into one? Well yeah merging both state into one might be nice to have, but that isn't the fundamental problem. What basically happens during validation of a BO is that this BO is move to a desired place (as described by the placement parameter). During this it can happen that we shuffle out other BOs. Now the issue is that when you validate in amdgpu_vm_handle_moved() it can be that by validating BO we shuffle out other BOs which then end up of the evicted list again. This most likely won't happen with KFD, but for GFX CS it's certainly possible. So the idea is that we first validate everything and then update all the page tables and not inter mix the two operations. Regards, Christian. Regards, Felix Regards, Christian. Am 07.11.23 um 23:11 schrieb Felix Kuehling: Hi Christian, I know you have objected to this patch before. I still think this is the best solution for what I need. I can talk you through my reasoning by email or offline. If I can't convince you, I will need your guidance for a better solution. Thanks, Felix On 2023-11-07 11:58, Felix Kuehling wrote: Create a new VM state to track user BOs that are in the system domain. In the next patch this will be used do conditionally re-validate them in amdgpu_vm_handle_moved. 
Signed-off-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 5 - 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 1442d97ddd0f..0d685577243c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -232,6 +232,22 @@ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo) spin_unlock(&vm_bo->vm->status_lock); } +/** + * amdgpu_vm_bo_evicted_user - vm_bo is evicted + * + * @vm_bo: vm_bo which is evicted + * + * State for BOs used by user mode queues which are not at the location they + * should be. + */ +static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo) +{ + vm_bo->moved = true; + spin_lock(&vm_bo->vm->status_lock); + list_move(&vm_bo->vm_status, &vm_bo->vm->evicted_user); + spin_unlock(&vm_bo->vm->status_lock); +} + /** * amdgpu_vm_bo_relocated - vm_bo is reloacted * @@ -2110,6 +2126,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t xcp for (i = 0; i < AMDGPU_MAX_VMHUBS; i++) vm->reserved_vmid[i] = NULL; INIT_LIST_HEAD(&vm->evicted); + INIT_LIST_HEAD(&vm->evicted_user); INIT_LIST_HEAD(&vm->relocated); INIT_LIST_HEAD(&vm->moved); INIT_LIST_HEAD(&vm->idle); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 12cac06fa289..939d0c2219c0 100644 ---
[PATCH v2] drm/amdgpu: Move mca debug mode decision to ras
Refactor code such that ras block decides the default mca debug mode, and not swsmu block. By default mca debug mode is set to true for now. Signed-off-by: Lijo Lazar --- v2: Set mca debug mode early before ras block late init as ras query is initiated during late init of ras blocks (KevinYang) drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c| 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 14 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 2 +- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 12 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 5a828c175e3a..cb51a46e8535 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -218,7 +218,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 val) struct amdgpu_device *adev = (struct amdgpu_device *)data; int ret; - ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false); + ret = amdgpu_ras_set_mca_debug_mode(adev, val ? 
true : false); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b7fe5951b166..bb4f1e60c099 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3102,6 +3102,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) return 0; + amdgpu_ras_set_mca_debug_mode(adev, true); + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { if (!node->ras_obj) { dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); @@ -3375,12 +3377,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) return 0; } -void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) +int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int ret; - if (con) - con->is_mca_debug_mode = enable; + if (con) { + ret = amdgpu_mca_smu_set_debug_mode(adev, enable); + if (!ret) + con->is_mca_debug_mode = enable; + } + + return ret; } bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 665414c22ca9..e1e0fc66ea49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -767,7 +767,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); -void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); +int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index cecd210397d6..404db2b0fb71 100644 ---
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1524,7 +1524,6 @@ static int smu_v13_0_6_mca_set_debug_mode(struct smu_context *smu, bool enable) if (smu->smc_fw_version < 0x554800) return 0; - amdgpu_ras_set_mca_debug_mode(smu->adev, enable); return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead, enable ? 0 : ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK, NULL); @@ -2346,16 +2345,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct smu_context *smu, return ret; } -static int smu_v13_0_6_post_init(struct smu_context *smu) -{ - struct amdgpu_device *adev = smu->adev; - - if (!amdgpu_sriov_vf(adev) && adev->ras_enabled) - return smu_v13_0_6_mca_set_debug_mode(smu, true); - - return 0; -} - static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) { struct smu_context *smu = adev->powerplay.pp_handle; @@ -2945,7 +2934,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { .i2c_init = smu_v13_0_6_i2c_control_init, .i2c_fini = smu_v13_0_6_i2c_control_fini, .send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num, - .post_init = smu_v13_0_6_post_init, }; void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu) -- 2.25.1
RE: [PATCH] drm/amdgpu: Move mca debug mode decision to ras
Hi Lijo, after our private discussion, in driver polling mode, the RAS will return 0 error count in driver probe stage (at that time the debug mode is in off state). In the case of driver polling mode, debug mode needs to be turned on in advance. Best Regards, Kevin -Original Message- From: Lazar, Lijo Sent: Thursday, November 9, 2023 1:26 PM To: amd-gfx@lists.freedesktop.org Cc: Zhang, Hawking ; Deucher, Alexander ; Wang, Yang(Kevin) ; Zhou1, Tao Subject: [PATCH] drm/amdgpu: Move mca debug mode decision to ras Refactor code such that ras block decides the default mca debug mode, and not swsmu block. By default mca debug mode is set to true for now. Signed-off-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c| 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 14 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 2 +- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 12 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 5a828c175e3a..cb51a46e8535 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -218,7 +218,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 val) struct amdgpu_device *adev = (struct amdgpu_device *)data; int ret; - ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false); + ret = amdgpu_ras_set_mca_debug_mode(adev, val ? 
true : false); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b7fe5951b166..b2e9ed65a061 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3120,6 +3120,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); } + amdgpu_ras_set_mca_debug_mode(adev, true); + return 0; } @@ -3375,12 +3377,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) return 0; } -void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) +int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool +enable) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int ret; - if (con) - con->is_mca_debug_mode = enable; + if (con) { + ret = amdgpu_mca_smu_set_debug_mode(adev, enable); + if (!ret) + con->is_mca_debug_mode = enable; + } + + return ret; } bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 665414c22ca9..e1e0fc66ea49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -767,7 +767,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); -void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); +int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool +enable); bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index cecd210397d6..404db2b0fb71 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1524,7 +1524,6 @@ static
int smu_v13_0_6_mca_set_debug_mode(struct smu_context *smu, bool enable) if (smu->smc_fw_version < 0x554800) return 0; - amdgpu_ras_set_mca_debug_mode(smu->adev, enable); return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead, enable ? 0 : ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK, NULL); @@ -2346,16 +2345,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct smu_context *smu, return ret; } -static int smu_v13_0_6_post_init(struct smu_context *smu) -{ - struct amdgpu_device *adev = smu->adev; - - if (!amdgpu_sriov_vf(adev) && adev->ras_enabled) - return smu_v13_0_6_mca_set_debug_mode(smu, true); - - return 0; -} - static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) { struct smu_context *smu = adev->powerplay.pp_handle; @@ -2945,7 +2934,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { .i2c_init = smu_v13_0_6_i2c_control_init, .i2c_fini = smu_v13_0_6_i2c_control_fini, .send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,