[PATCH v3] drm/amdgpu: Move mca debug mode decision to ras

2023-11-09 Thread Lijo Lazar
Refactor code such that the ras block, and not the swsmu block, decides
the default mca debug mode.

By default mca debug mode is set to false.

Signed-off-by: Lijo Lazar 
---
v3: Default mca debug mode is set to false

v2: Set mca debug mode early before ras block late init as ras query is
initiated during late init of ras blocks (KevinYang)
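
A rough sketch of the resulting call flow (illustrative, not the literal
code): the default now lives in the ras block, and both callers funnel
through the same helper, which only caches the state if the SMU accepted it.

    amdgpu_ras_late_init(adev)
        -> amdgpu_ras_set_mca_debug_mode(adev, false)     /* default: off */
            -> amdgpu_mca_smu_set_debug_mode(adev, false)
            -> con->is_mca_debug_mode = false;            /* only on success */

    debugfs write
        -> amdgpu_mca_smu_debug_mode_set(data, val)
            -> amdgpu_ras_set_mca_debug_mode(adev, val ? true : false)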

 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 14 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  2 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 12 
 4 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index cf33eb219e25..54f2f346579e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -377,7 +377,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 
val)
struct amdgpu_device *adev = (struct amdgpu_device *)data;
int ret;
 
-   ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false);
+   ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);
if (ret)
return ret;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 84e5987b14e0..6747fbe4feab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3132,6 +3132,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
 
+   amdgpu_ras_set_mca_debug_mode(adev, false);
+
	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
		if (!node->ras_obj) {
			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
@@ -3405,12 +3407,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
return 0;
 }
 
-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   int ret = 0;
 
-   if (con)
-   con->is_mca_debug_mode = enable;
+   if (con) {
+   ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
+   if (!ret)
+   con->is_mca_debug_mode = enable;
+   }
+
+   return ret;
 }
 
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 19161916ac46..6a941eb8fb8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -773,7 +773,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev);
 
 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras 
*ras_con);
 
-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
 bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
 unsigned int *mode);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 6cbfb25a05de..f09f56efbdc3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1516,7 +1516,6 @@ static int smu_v13_0_6_mca_set_debug_mode(struct 
smu_context *smu, bool enable)
if (smu->smc_fw_version < 0x554800)
return 0;
 
-   amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
					       enable ? 0 : ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
					       NULL);
@@ -2338,16 +2337,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct 
smu_context *smu,
return ret;
 }
 
-static int smu_v13_0_6_post_init(struct smu_context *smu)
-{
-   struct amdgpu_device *adev = smu->adev;
-
-   if (!amdgpu_sriov_vf(adev) && adev->ras_enabled)
-   return smu_v13_0_6_mca_set_debug_mode(smu, false);
-
-   return 0;
-}
-
 static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
 {
struct smu_context *smu = adev->powerplay.pp_handle;
@@ -2904,7 +2893,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = 
{
.i2c_init = smu_v13_0_6_i2c_control_init,
.i2c_fini = smu_v13_0_6_i2c_control_fini,
.send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,
-   .post_init = smu_v13_0_6_post_init,
 };
 
 void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)
-- 
2.25.1



[PATCH 2/2] drm/virtio: Modify RESOURCE_GET_LAYOUT ioctl

2023-11-09 Thread Julia Zhang
Modify the RESOURCE_GET_LAYOUT ioctl to handle the use case of querying
the correct stride for a guest linear resource before that resource is
created.
Signed-off-by: Julia Zhang 
---
 drivers/gpu/drm/virtio/virtgpu_drv.h   | 26 --
 drivers/gpu/drm/virtio/virtgpu_ioctl.c | 47 --
 drivers/gpu/drm/virtio/virtgpu_vq.c| 35 +++
 include/uapi/drm/virtgpu_drm.h |  6 ++--
 include/uapi/linux/virtio_gpu.h|  8 ++---
 5 files changed, 66 insertions(+), 56 deletions(-)
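
A hedged userspace sketch of the new pre-creation query (ioctl and field
names as introduced by this series, so treat them as provisional; handle == 0
is the new case this patch adds, meaning "no GEM object yet, compute the
layout from the description alone"):

    struct drm_virtgpu_resource_query_layout args = {
            .handle = 0,             /* query before the resource exists */
            .width  = 1920,
            .height = 1080,
            .format = fmt,           /* a virtio-gpu format code */
            .bind   = bind_flags,    /* host bind flags for the resource */
    };
    if (drmIoctl(fd, DRM_IOCTL_VIRTGPU_RESOURCE_QUERY_LAYOUT, &args) == 0)
            stride = args.planes[0].stride;   /* use for the linear buffer */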

diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h 
b/drivers/gpu/drm/virtio/virtgpu_drv.h
index d6fc0d4ecb7d..82dffb3e4c6b 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.h
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.h
@@ -93,15 +93,6 @@ struct virtio_gpu_object {
bool host3d_blob, guest_blob;
uint32_t blob_mem, blob_flags;
 
-   atomic_t layout_state;
-   struct {
-   uint64_t offset;
-   uint64_t size;
-   uint32_t stride;
-   } planes[VIRTIO_GPU_RES_MAX_PLANES];
-   uint64_t modifier;
-   uint32_t num_planes;
-
int uuid_state;
uuid_t uuid;
 
@@ -225,6 +216,16 @@ struct virtio_gpu_drv_cap_cache {
atomic_t is_valid;
 };
 
+struct virtio_gpu_query_info {
+   uint32_t num_planes;
+   uint64_t modifier;
+   struct {
+   uint64_t offset;
+   uint32_t stride;
+   } planes[VIRTIO_GPU_MAX_RESOURCE_PLANES];
+   atomic_t is_valid;
+};
+
 struct virtio_gpu_device {
struct drm_device *ddev;
 
@@ -448,7 +449,12 @@ void virtio_gpu_cmd_host_wait(struct virtio_gpu_device 
*vgdev,
 
 int
 virtio_gpu_cmd_get_resource_layout(struct virtio_gpu_device *vgdev,
-  struct virtio_gpu_object *bo);
+  struct virtio_gpu_query_info *bo_info,
+  uint32_t width,
+  uint32_t height,
+  uint32_t format,
+  uint32_t bind,
+  uint32_t hw_res_handle);
 
 /* virtgpu_display.c */
 int virtio_gpu_modeset_init(struct virtio_gpu_device *vgdev);
diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c 
b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
index 51d04460d0d8..034a7c0927a5 100644
--- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c
+++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
@@ -685,9 +685,9 @@ static int virtio_gpu_resource_query_layout_ioctl(struct 
drm_device *dev,
 {
struct drm_virtgpu_resource_query_layout *args = data;
struct virtio_gpu_device *vgdev = dev->dev_private;
-   struct drm_gem_object *obj;
-   struct virtio_gpu_object *bo;
-   int layout_state;
+   struct drm_gem_object *obj = NULL;
+   struct virtio_gpu_object *bo = NULL;
+   struct virtio_gpu_query_info bo_info = {0};
int ret = 0;
int i;
 
@@ -696,50 +696,45 @@ static int virtio_gpu_resource_query_layout_ioctl(struct 
drm_device *dev,
return -EINVAL;
}
 
-   obj = drm_gem_object_lookup(file, args->handle);
-   if (obj == NULL) {
-   DRM_ERROR("invalid handle 0x%x\n", args->handle);
-   return -ENOENT;
-   }
-   bo = gem_to_virtio_gpu_obj(obj);
-
-   layout_state = atomic_read(&bo->layout_state);
-   if (layout_state == STATE_ERR) {
-   ret = -EINVAL;
-   goto out;
-   } else if (layout_state == STATE_OK) {
-   goto valid;
+   if (args->handle > 0) {
+   obj = drm_gem_object_lookup(file, args->handle);
+   if (obj == NULL) {
+   DRM_ERROR("invalid handle 0x%x\n", args->handle);
+   return -ENOENT;
+   }
+   bo = gem_to_virtio_gpu_obj(obj);
}
 
-   ret = virtio_gpu_cmd_get_resource_layout(vgdev, bo);
+   ret = virtio_gpu_cmd_get_resource_layout(vgdev, &bo_info, args->width,
+					     args->height, args->format,
+					     args->bind, bo ? bo->hw_res_handle : 0);
if (ret)
goto out;
 
ret = wait_event_timeout(vgdev->resp_wq,
-				 atomic_read(&bo->layout_state) == STATE_OK,
+				 atomic_read(&bo_info.is_valid),
 5 * HZ);
if (!ret)
goto out;
 
 valid:
smp_rmb();
-   WARN_ON(atomic_read(&bo->layout_state) != STATE_OK);
-   args->num_planes = bo->num_planes;
-   args->modifier = bo->modifier;
+   WARN_ON(!atomic_read(&bo_info.is_valid));
+   args->num_planes = bo_info.num_planes;
+   args->modifier = bo_info.modifier;
for (i = 0; i < args->num_planes; i++) {
-   args->planes[i].offset = bo->planes[i].offset;
-   args->planes[i].size = bo->planes[i].size;
-   args->planes[i].stride = 

[PATCH 1/2] drm/virtio: Implement RESOURCE_GET_LAYOUT ioctl

2023-11-09 Thread Julia Zhang
From: Daniel Stone 

This ioctl allows the guest to discover how the host actually allocated
the underlying buffer, which allows buffers to be used for GL<->Vulkan
interop and through standard window systems. It's also a step towards
properly supporting modifiers in the guest.
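
Userspace is expected to probe for host support first via the getparam this
patch adds; a minimal sketch (the param name is from this series and its
value follows the existing virtgpu getparam convention of writing an int
through the user pointer in .value):

    int has_rql = 0;
    struct drm_virtgpu_getparam gp = {
            .param = VIRTGPU_PARAM_RESOURCE_QUERY_LAYOUT,
            .value = (uintptr_t)&has_rql,
    };
    if (drmIoctl(fd, DRM_IOCTL_VIRTGPU_GETPARAM, &gp) == 0 && has_rql)
            /* RESOURCE_GET_LAYOUT may be used */;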
---
 drivers/gpu/drm/virtio/virtgpu_drv.c   |  1 +
 drivers/gpu/drm/virtio/virtgpu_drv.h   | 16 +-
 drivers/gpu/drm/virtio/virtgpu_ioctl.c | 71 ++
 drivers/gpu/drm/virtio/virtgpu_kms.c   |  8 ++-
 drivers/gpu/drm/virtio/virtgpu_vq.c| 56 
 include/uapi/drm/virtgpu_drm.h | 19 +++
 include/uapi/linux/virtio_gpu.h| 30 +++
 7 files changed, 198 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c 
b/drivers/gpu/drm/virtio/virtgpu_drv.c
index 4f7140e27614..1ee09974d4b7 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.c
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.c
@@ -190,6 +190,7 @@ static unsigned int features[] = {
VIRTIO_GPU_F_RESOURCE_BLOB,
VIRTIO_GPU_F_CONTEXT_INIT,
VIRTIO_GPU_F_CONTEXT_FENCE_WAIT,
+   VIRTIO_GPU_F_RESOURCE_QUERY_LAYOUT,
 };
 static struct virtio_driver virtio_gpu_driver = {
.feature_table = features,
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h 
b/drivers/gpu/drm/virtio/virtgpu_drv.h
index 7ef4b3df0ada..d6fc0d4ecb7d 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.h
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.h
@@ -93,6 +93,15 @@ struct virtio_gpu_object {
bool host3d_blob, guest_blob;
uint32_t blob_mem, blob_flags;
 
+   atomic_t layout_state;
+   struct {
+   uint64_t offset;
+   uint64_t size;
+   uint32_t stride;
+   } planes[VIRTIO_GPU_RES_MAX_PLANES];
+   uint64_t modifier;
+   uint32_t num_planes;
+
int uuid_state;
uuid_t uuid;
 
@@ -249,6 +258,7 @@ struct virtio_gpu_device {
bool has_host_visible;
bool has_context_init;
bool has_host_fence_wait;
+   bool has_resource_query_layout;
struct virtio_shm_region host_visible_region;
struct drm_mm host_visible_mm;
 
@@ -281,7 +291,7 @@ struct virtio_gpu_fpriv {
 };
 
 /* virtgpu_ioctl.c */
-#define DRM_VIRTIO_NUM_IOCTLS 12
+#define DRM_VIRTIO_NUM_IOCTLS 13
 extern struct drm_ioctl_desc virtio_gpu_ioctls[DRM_VIRTIO_NUM_IOCTLS];
 void virtio_gpu_create_context(struct drm_device *dev, struct drm_file *file);
 
@@ -436,6 +446,10 @@ int virtio_gpu_cmd_status_freezing(struct 
virtio_gpu_device *vgdev,
 void virtio_gpu_cmd_host_wait(struct virtio_gpu_device *vgdev,
  uint32_t ctx_id, uint64_t fence_id);
 
+int
+virtio_gpu_cmd_get_resource_layout(struct virtio_gpu_device *vgdev,
+  struct virtio_gpu_object *bo);
+
 /* virtgpu_display.c */
 int virtio_gpu_modeset_init(struct virtio_gpu_device *vgdev);
 void virtio_gpu_modeset_fini(struct virtio_gpu_device *vgdev);
diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c 
b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
index b6079d2bff69..51d04460d0d8 100644
--- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c
+++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
@@ -107,6 +107,9 @@ static int virtio_gpu_getparam_ioctl(struct drm_device 
*dev, void *data,
case VIRTGPU_PARAM_SUPPORTED_CAPSET_IDs:
value = vgdev->capset_id_mask;
break;
+   case VIRTGPU_PARAM_RESOURCE_QUERY_LAYOUT:
+   value = vgdev->has_resource_query_layout ? 1 : 0;
+   break;
default:
return -EINVAL;
}
@@ -676,6 +679,70 @@ static int virtio_gpu_context_init_ioctl(struct drm_device 
*dev,
return ret;
 }
 
+static int virtio_gpu_resource_query_layout_ioctl(struct drm_device *dev,
+ void *data,
+ struct drm_file *file)
+{
+   struct drm_virtgpu_resource_query_layout *args = data;
+   struct virtio_gpu_device *vgdev = dev->dev_private;
+   struct drm_gem_object *obj;
+   struct virtio_gpu_object *bo;
+   int layout_state;
+   int ret = 0;
+   int i;
+
+   if (!vgdev->has_resource_query_layout) {
+   DRM_ERROR("failing: no RQL on host\n");
+   return -EINVAL;
+   }
+
+   obj = drm_gem_object_lookup(file, args->handle);
+   if (obj == NULL) {
+   DRM_ERROR("invalid handle 0x%x\n", args->handle);
+   return -ENOENT;
+   }
+   bo = gem_to_virtio_gpu_obj(obj);
+
+   layout_state = atomic_read(&bo->layout_state);
+   if (layout_state == STATE_ERR) {
+   ret = -EINVAL;
+   goto out;
+   } else if (layout_state == STATE_OK) {
+   goto valid;
+   }
+
+   ret = virtio_gpu_cmd_get_resource_layout(vgdev, bo);
+   if (ret)
+   goto out;
+
+   ret = wait_event_timeout(vgdev->resp_wq,
+ 

[PATCH 0/2] Add RESOURCE_GET_LAYOUT ioctl

2023-11-09 Thread Julia Zhang
This is to add a new ioctl RESOURCE_GET_LAYOUT to virtio-gpu to get the
information about how the host has actually allocated the buffer. It is
implemented to query the correct stride of a linear buffer for dGPU prime on a
guest VM; related mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23896

Daniel Stone (1):
  drm/virtio: Implement RESOURCE_GET_LAYOUT ioctl

Julia Zhang (1):
  drm/virtio: Modify RESOURCE_GET_LAYOUT ioctl

 drivers/gpu/drm/virtio/virtgpu_drv.c   |  1 +
 drivers/gpu/drm/virtio/virtgpu_drv.h   | 22 -
 drivers/gpu/drm/virtio/virtgpu_ioctl.c | 66 ++
 drivers/gpu/drm/virtio/virtgpu_kms.c   |  8 +++-
 drivers/gpu/drm/virtio/virtgpu_vq.c| 63 
 include/uapi/drm/virtgpu_drm.h | 21 
 include/uapi/linux/virtio_gpu.h| 30 
 7 files changed, 208 insertions(+), 3 deletions(-)

-- 
2.34.1




RE: [PATCH] drm/amdgpu: correct mca ipid die/socket/addr decode

2023-11-09 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Wang, Yang(Kevin) 
Sent: Friday, November 10, 2023 14:33
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Wang, Yang(Kevin) 

Subject: [PATCH] drm/amdgpu: correct mca ipid die/socket/addr decode

correct mca ipid die/socket/addr decode

Signed-off-by: Yang Wang 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 789552c66d1d..3f71490b779c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2381,8 +2381,8 @@ static const struct mca_bank_ipid 
smu_v13_0_6_mca_ipid_table[AMDGPU_MCA_IP_COUNT

 static void mca_bank_entry_info_decode(struct mca_bank_entry *entry, struct 
mca_bank_info *info)  {
-   uint64_t ipid = entry->regs[MCA_REG_IDX_IPID];
-   uint32_t insthi;
+   u64 ipid = entry->regs[MCA_REG_IDX_IPID];
+   u32 instidhi, instid;

/* NOTE: All MCA IPID register share the same format,
 * so the driver can share the MCMP1 register header file.
@@ -2391,9 +2391,14 @@ static void mca_bank_entry_info_decode(struct 
mca_bank_entry *entry, struct mca_
info->hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
info->mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);

-   insthi = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi);
-   info->aid = ((insthi >> 2) & 0x03);
-   info->socket_id = insthi & 0x03;
+   /* Unified DieID Format: SAASS. A:AID, S:Socket.
+    * Unified DieID[4] = InstanceIdLo[0]
+    * Unified DieID[0:3] = InstanceIdHi[0:3]
+    */
+   instidhi = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi);
+   instid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo);
+   info->aid = ((instidhi >> 2) & 0x03);
+   info->socket_id = ((instid & 0x1) << 4) | (instidhi & 0x03);
 }

 static int mca_bank_read_reg(struct amdgpu_device *adev, enum 
amdgpu_mca_error_type type, @@ -2567,6 +2572,7 @@ static bool 
mca_gfx_smu_bank_is_valid(const struct mca_ras_info *mca_ras, struct
uint32_t instlo;

instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, 
InstanceIdLo);
+   instlo &= GENMASK(31, 1);
switch (instlo) {
case 0x36430400: /* SMNAID XCD 0 */
case 0x38430400: /* SMNAID XCD 1 */
@@ -2585,6 +2591,7 @@ static bool mca_smu_bank_is_valid(const struct 
mca_ras_info *mca_ras, struct amd
uint32_t errcode, instlo;

instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, 
InstanceIdLo);
+   instlo &= GENMASK(31, 1);
if (instlo != 0x03b30400)
return false;

--
2.34.1



RE: [PATCH 2/3] drm/amdgpu: fall back to INPUT power for AVG power via INFO IOCTL

2023-11-09 Thread Wang, Yang(Kevin)
[AMD Official Use Only - General]

Reviewed-by: Yang Wang 

Best Regards,
Kevin

-Original Message-
From: amd-gfx  On Behalf Of Alex Deucher
Sent: Friday, November 10, 2023 6:14 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: [PATCH 2/3] drm/amdgpu: fall back to INPUT power for AVG power via 
INFO IOCTL

For backwards compatibility with userspace.

Fixes: 47f1724db4fe ("drm/amd: Introduce `AMDGPU_PP_SENSOR_GPU_INPUT_POWER`")
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2897
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index b5ebafd4a3ad..bf4f48fe438d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1105,7 +1105,12 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file *filp)
	if (amdgpu_dpm_read_sensor(adev,
				   AMDGPU_PP_SENSOR_GPU_AVG_POWER,
				   (void *)&ui32, &ui32_size)) {
-	   return -EINVAL;
+	   /* fall back to input power for backwards compat */
+	   if (amdgpu_dpm_read_sensor(adev,
+				      AMDGPU_PP_SENSOR_GPU_INPUT_POWER,
+				      (void *)&ui32, &ui32_size)) {
+		   return -EINVAL;
+	   }
	}
ui32 >>= 8;
break;
--
2.41.0



[PATCH] drm/amdgpu: correct mca ipid die/socket/addr decode

2023-11-09 Thread Yang Wang
correct mca ipid die/socket/addr decode

Signed-off-by: Yang Wang 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)
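
A worked example of the new decode (values picked for illustration only):
with InstanceIdHi = 0x6 (0b0110) and InstanceIdLo bit 0 = 1,

    aid       = (0x6 >> 2) & 0x03                  = 1
    socket_id = ((1 & 0x1) << 4) | (0x6 & 0x03)    = 0x12

i.e. the unified DieID "SAASS" layout takes its top socket bit from
InstanceIdLo[0] and the remaining bits from InstanceIdHi, instead of
deriving both fields from InstanceIdHi alone as the old code did.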

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 789552c66d1d..3f71490b779c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2381,8 +2381,8 @@ static const struct mca_bank_ipid 
smu_v13_0_6_mca_ipid_table[AMDGPU_MCA_IP_COUNT
 
 static void mca_bank_entry_info_decode(struct mca_bank_entry *entry, struct 
mca_bank_info *info)
 {
-   uint64_t ipid = entry->regs[MCA_REG_IDX_IPID];
-   uint32_t insthi;
+   u64 ipid = entry->regs[MCA_REG_IDX_IPID];
+   u32 instidhi, instid;
 
/* NOTE: All MCA IPID register share the same format,
 * so the driver can share the MCMP1 register header file.
@@ -2391,9 +2391,14 @@ static void mca_bank_entry_info_decode(struct 
mca_bank_entry *entry, struct mca_
info->hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
info->mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
 
-   insthi = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi);
-   info->aid = ((insthi >> 2) & 0x03);
-   info->socket_id = insthi & 0x03;
+   /* Unified DieID Format: SAASS. A:AID, S:Socket.
+    * Unified DieID[4] = InstanceIdLo[0]
+    * Unified DieID[0:3] = InstanceIdHi[0:3]
+    */
+   instidhi = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi);
+   instid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo);
+   info->aid = ((instidhi >> 2) & 0x03);
+   info->socket_id = ((instid & 0x1) << 4) | (instidhi & 0x03);
 }
 
 static int mca_bank_read_reg(struct amdgpu_device *adev, enum 
amdgpu_mca_error_type type,
@@ -2567,6 +2572,7 @@ static bool mca_gfx_smu_bank_is_valid(const struct 
mca_ras_info *mca_ras, struct
uint32_t instlo;
 
instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, 
InstanceIdLo);
+   instlo &= GENMASK(31, 1);
switch (instlo) {
case 0x36430400: /* SMNAID XCD 0 */
case 0x38430400: /* SMNAID XCD 1 */
@@ -2585,6 +2591,7 @@ static bool mca_smu_bank_is_valid(const struct 
mca_ras_info *mca_ras, struct amd
uint32_t errcode, instlo;
 
instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, 
InstanceIdLo);
+   instlo &= GENMASK(31, 1);
if (instlo != 0x03b30400)
return false;
 
-- 
2.34.1



Re: [Patch v13 0/9] Enable Wifi RFI interference mitigation feature support

2023-11-09 Thread Ma, Jun
ping...
Any other comments?

Regards,
Ma Jun

On 10/30/2023 3:18 PM, Ma Jun wrote:
> Due to electrical and mechanical constraints in certain platform designs there
> may be interference from relatively high-powered harmonics of the (G-)DDR
> memory clocks with local radio module frequency bands used by Wifi 6/6e/7. To
> mitigate possible RFI interference we introduced WBRF (Wifi Band RFI
> mitigation Feature).
> Producers can advertise the frequencies in use and consumers can use this 
> information
> to avoid using these frequencies for sensitive features.
> 
> The whole patch set is based on Linux 6.6.0-rc6. With some brief introductions
> as below:
> Patch1:  Document about WBRF
> Patch2:  Core functionality setup for WBRF feature support
> Patch3 - 4:  Bring WBRF support to wifi subsystem.
> Patch5 - 9:  Bring WBRF support to AMD graphics driver.
> 
> Evan Quan (6):
>   cfg80211: expose nl80211_chan_width_to_mhz for wide sharing
>   wifi: mac80211: Add support for WBRF features
>   drm/amd/pm: update driver_if and ppsmc headers for coming wbrf feature
>   drm/amd/pm: setup the framework to support Wifi RFI mitigation feature
>   drm/amd/pm: add flood detection for wbrf events
>   drm/amd/pm: enable Wifi RFI mitigation feature support for SMU13.0.7
> 
> Ma Jun (3):
>   Documentation/driver-api: Add document about WBRF mechanism
>   platform/x86/amd: Add support for AMD ACPI based Wifi band RFI
> mitigation feature
>   drm/amd/pm: enable Wifi RFI mitigation feature support for SMU13.0.0
> 
>  Documentation/driver-api/wbrf.rst |  76 
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  17 +
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 208 +
>  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  42 ++
>  .../inc/pmfw_if/smu13_driver_if_v13_0_0.h |   3 +-
>  .../inc/pmfw_if/smu13_driver_if_v13_0_7.h |   3 +-
>  .../pm/swsmu/inc/pmfw_if/smu_v13_0_0_ppsmc.h  |   5 +-
>  .../pm/swsmu/inc/pmfw_if/smu_v13_0_7_ppsmc.h  |   3 +-
>  drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |   3 +-
>  drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h  |   4 +
>  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c|  48 ++
>  .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  |  22 +
>  .../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c  |  13 +
>  drivers/gpu/drm/amd/pm/swsmu/smu_internal.h   |   3 +
>  drivers/platform/x86/amd/Kconfig  |  15 +
>  drivers/platform/x86/amd/Makefile |   1 +
>  drivers/platform/x86/amd/wbrf.c   | 413 ++
>  include/linux/acpi_amd_wbrf.h |  94 
>  include/net/cfg80211.h|   9 +
>  net/mac80211/Makefile |   2 +
>  net/mac80211/chan.c   |   9 +
>  net/mac80211/ieee80211_i.h|   7 +
>  net/mac80211/main.c   |   2 +
>  net/mac80211/wbrf.c   |  95 
>  net/wireless/chan.c   |   3 +-
>  26 files changed, 1094 insertions(+), 8 deletions(-)
>  create mode 100644 Documentation/driver-api/wbrf.rst
>  create mode 100644 drivers/platform/x86/amd/wbrf.c
>  create mode 100644 include/linux/acpi_amd_wbrf.h
>  create mode 100644 net/mac80211/wbrf.c
> 
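
In rough terms the producer/consumer handshake the series sets up looks like
this (a conceptual sketch only; the real entry points are declared in
include/linux/acpi_amd_wbrf.h added by the platform/x86 patch, and the
function names below are illustrative, not the final API):

    /* producer (wifi driver): report the frequency ranges now in use */
    wbrf_add_remove(wifi_dev, WBRF_ADD, &freq_ranges);

    /* consumer (amdgpu SMU): register for change notifications, then
     * retrieve the ranges and steer (G-)DDR clocks away from harmonics */
    wbrf_register_notifier(&smu_wbrf_notifier);
    wbrf_retrieve_freq_band(adev->dev, &freq_ranges);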


RE: [RFC PATCH v2] drm/amdkfd: Run restore_workers on freezable WQs

2023-11-09 Thread Pan, Xinhui
[AMD Official Use Only - General]

I once replaced the queue with the freezable one, but got hang in flush.
Looks like Felix has fixed it.

Acked-and-tested-by: xinhui pan 


-Original Message-
From: Kuehling, Felix 
Sent: Wednesday, November 8, 2023 6:06 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deng, Emily ; Pan, Xinhui ; Koenig, 
Christian 
Subject: [RFC PATCH v2] drm/amdkfd: Run restore_workers on freezable WQs

Make restore workers freezable so we don't have to explicitly flush them in 
suspend and GPU reset code paths, and we don't accidentally try to restore BOs 
while the GPU is suspended. Not having to flush restore_work also helps avoid 
lock/fence dependencies in the GPU reset case where we're not allowed to wait 
for fences.

A side effect of this is, that we can now have multiple concurrent threads 
trying to signal the same eviction fence. Rework eviction fence signaling and 
replacement to account for that.

The GPU reset path can no longer rely on restore_process_worker to resume 
queues because evict/restore workers can run independently of it. Instead call 
a new restore_process_helper directly.

This is an RFC and request for testing.

v2:
- Reworked eviction fence signaling
- Introduced restore_process_helper

Signed-off-by: Felix Kuehling 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 34 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 87 +++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  4 +-
 3 files changed, 81 insertions(+), 44 deletions(-)
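
A minimal sketch of the freezable-workqueue pattern adopted here, using only
the standard workqueue API (the point is that work queued on
system_freezable_wq is quiesced by the freezer during suspend, so the
suspend path no longer has to flush it explicitly):

    #include <linux/workqueue.h>

    static void restore_worker(struct work_struct *work)
    {
            /* restore BOs; never runs while tasks are frozen for suspend */
    }
    static DECLARE_DELAYED_WORK(restore_work, restore_worker);

    /* instead of schedule_delayed_work(&restore_work, HZ): */
    queue_delayed_work(system_freezable_wq, &restore_work, HZ);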

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 54f31a420229..1b33ddc0512e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1405,7 +1405,6 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void 
**process_info,
  amdgpu_amdkfd_restore_userptr_worker);

*process_info = info;
-   *ef = dma_fence_get(&info->eviction_fence->base);
}

vm->process_info = *process_info;
@@ -1436,6 +1435,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void 
**process_info,
list_add_tail(>vm_list_node,
&(vm->process_info->vm_list_head));
vm->process_info->n_vms++;
+
+   *ef = dma_fence_get(&vm->process_info->eviction_fence->base);
	mutex_unlock(&vm->process_info->lock);

return 0;
@@ -1447,10 +1448,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void 
**process_info,
 reserve_pd_fail:
vm->process_info = NULL;
if (info) {
-   /* Two fence references: one in info and one in *ef */
	dma_fence_put(&info->eviction_fence->base);
-   dma_fence_put(*ef);
-   *ef = NULL;
*process_info = NULL;
put_pid(info->pid);
 create_evict_fence_fail:
@@ -1644,7 +1642,8 @@ int amdgpu_amdkfd_criu_resume(void *p)
goto out_unlock;
}
WRITE_ONCE(pinfo->block_mmu_notifications, false);
-   schedule_delayed_work(&pinfo->restore_userptr_work, 0);
+   queue_delayed_work(system_freezable_wq,
+		       &pinfo->restore_userptr_work, 0);

 out_unlock:
	mutex_unlock(&pinfo->lock);
@@ -2458,7 +2457,8 @@ int amdgpu_amdkfd_evict_userptr(struct 
mmu_interval_notifier *mni,
   KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
if (r)
pr_err("Failed to quiesce KFD\n");
-   schedule_delayed_work(&process_info->restore_userptr_work,
+   queue_delayed_work(system_freezable_wq,
+		       &process_info->restore_userptr_work,
	msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
	}
	mutex_unlock(&process_info->notifier_lock);
@@ -2793,7 +2793,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
work_struct *work)

/* If validation failed, reschedule another attempt */
if (evicted_bos) {
-   schedule_delayed_work(&process_info->restore_userptr_work,
+   queue_delayed_work(system_freezable_wq,
+		       &process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));

kfd_smi_event_queue_restore_rescheduled(mm);
@@ -2802,6 +2803,23 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
work_struct *work)
put_task_struct(usertask);
 }

+static void replace_eviction_fence(struct dma_fence **ef,
+  struct dma_fence *new_ef)
+{
+   struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
+   /* protected by process_info->lock */);
+
+   /* If we're replacing an unsignaled eviction fence, that fence will
+* never be signaled, and if anyone is still waiting on that fence,
+* they will hang forever. This should never happen. We should only
+* replace the fence in restore_work 

Re: [PATCH] drm/amdgpu: move UVD and VCE sched entity init after sched init

2023-11-09 Thread Alex Deucher
On Thu, Nov 9, 2023 at 4:35 PM Luben Tuikov  wrote:
>
> On 2023-11-09 11:13, Alex Deucher wrote:
> > Ping?
> >
> > On Wed, Nov 8, 2023 at 1:42 PM Alex Deucher  
> > wrote:
> >>
> >> We need kernel scheduling entities to deal with handle clean up
> >> if apps are not cleaned up properly.  With commit 56e449603f0ac5
> >> ("drm/sched: Convert the GPU scheduler to variable number of run-queues")
> >> the scheduler entities have to be created after scheduler init, so
> >> change the ordering to fix this.
> >>
> >> v2: Leave logic in UVD and VCE code
> >>
> >> Fixes: 56e449603f0ac5 ("drm/sched: Convert the GPU scheduler to variable 
> >> number of run-queues")
> >> Signed-off-by: Alex Deucher 
> >> Cc: ltuiko...@gmail.com
>
> Reviewed-by: Luben Tuikov 

Thanks Luben!

Alex

>
> Regards,
> Luben
>
> >> ---
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c| 22 ++--
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h|  2 +-
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c| 24 +++---
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h|  2 +-
> >>  drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c  |  2 --
> >>  drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c  |  2 --
> >>  drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c  |  2 --
> >>  drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c  |  2 --
> >>  drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c  |  4 
> >>  drivers/gpu/drm/amd/amdgpu/vce_v2_0.c  |  2 --
> >>  drivers/gpu/drm/amd/amdgpu/vce_v3_0.c  |  2 --
> >>  drivers/gpu/drm/amd/amdgpu/vce_v4_0.c  |  5 -
> >>  13 files changed, 37 insertions(+), 46 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >> index 43a95feba884..03e669c34033 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >> @@ -2499,6 +2499,18 @@ static int amdgpu_device_init_schedulers(struct 
> >> amdgpu_device *adev)
> >>   ring->name);
> >> return r;
> >> }
> >> +   r = amdgpu_uvd_entity_init(adev, ring);
> >> +   if (r) {
> >> +   DRM_ERROR("Failed to create UVD scheduling entity 
> >> on ring %s.\n",
> >> + ring->name);
> >> +   return r;
> >> +   }
> >> +   r = amdgpu_vce_entity_init(adev, ring);
> >> +   if (r) {
> >> +   DRM_ERROR("Failed to create VCE scheduling entity 
> >> on ring %s.\n",
> >> + ring->name);
> >> +   return r;
> >> +   }
> >> }
> >>
> >> amdgpu_xcp_update_partition_sched_list(adev);
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c 
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> >> index 815b7c34ed33..65949cc7abb9 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> >> @@ -399,20 +399,20 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev)
> >>   *
> >>   * @adev: amdgpu_device pointer
> >>   *
> >> + * Initialize the entity used for handle management in the kernel driver.
> >>   */
> >> -int amdgpu_uvd_entity_init(struct amdgpu_device *adev)
> >> +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring 
> >> *ring)
> >>  {
> >> -   struct amdgpu_ring *ring;
> >> -   struct drm_gpu_scheduler *sched;
> >> -   int r;
> >> +   if (ring == &adev->uvd.inst[0].ring) {
> >> +   struct drm_gpu_scheduler *sched = &ring->sched;
> >> +   int r;
> >>
> >> -   ring = &adev->uvd.inst[0].ring;
> >> -   sched = &ring->sched;
> >> -   r = drm_sched_entity_init(&adev->uvd.entity, DRM_SCHED_PRIORITY_NORMAL,
> >> - &sched, 1, NULL);
> >> -   if (r) {
> >> -   DRM_ERROR("Failed setting up UVD kernel entity.\n");
> >> -   return r;
> >> +   r = drm_sched_entity_init(&adev->uvd.entity, DRM_SCHED_PRIORITY_NORMAL,
> >> + &sched, 1, NULL);
> >> +   if (r) {
> >> +   DRM_ERROR("Failed setting up UVD kernel entity.\n");
> >> +   return r;
> >> +   }
> >> }
> >>
> >> return 0;
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h 
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h
> >> index a9f342537c68..9dfad2f48ef4 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h
> >> @@ -73,7 +73,7 @@ struct amdgpu_uvd {
> >>
> >>  int amdgpu_uvd_sw_init(struct amdgpu_device *adev);
> >>  int amdgpu_uvd_sw_fini(struct amdgpu_device *adev);
> >> -int amdgpu_uvd_entity_init(struct amdgpu_device *adev);
> >> +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring 
> >> 
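
The ordering constraint discussed in this thread reduces to the following
sketch: since commit 56e449603f0ac5 the scheduler's run-queues are allocated
in drm_sched_init(), so entity init must come second.

    r = drm_sched_init(&ring->sched, ...);           /* allocates run-queues */
    ...
    struct drm_gpu_scheduler *sched = &ring->sched;
    r = drm_sched_entity_init(&adev->uvd.entity, DRM_SCHED_PRIORITY_NORMAL,
                              &sched, 1, NULL);      /* picks a run-queue */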

[PATCH 3/3] drm/amdgpu: add new INFO IOCTL query for input power

2023-11-09 Thread Alex Deucher
Some chips provide both average and input power.  Previously
we just exposed average power; add a new query for input
power.

Example userspace:
https://github.com/Umio-Yasuno/libdrm-amdgpu-sys-rs/tree/input_power

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 9 +
 include/uapi/drm/amdgpu_drm.h   | 2 ++
 2 files changed, 11 insertions(+)
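
A hedged userspace sketch using libdrm's existing sensor helper (the new
subquery id is just another sensor type; the kernel returns the value in
watts after the ui32 >>= 8 fixed-point shift below):

    uint32_t power_w = 0;
    if (!amdgpu_query_sensor_info(dev, AMDGPU_INFO_SENSOR_GPU_INPUT_POWER,
                                  sizeof(power_w), &power_w))
            printf("GPU input power: %u W\n", power_w);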

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index bf4f48fe438d..48496bb585c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1114,6 +1114,15 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file *filp)
}
ui32 >>= 8;
break;
+   case AMDGPU_INFO_SENSOR_GPU_INPUT_POWER:
+   /* get input GPU power */
+   if (amdgpu_dpm_read_sensor(adev,
+			       AMDGPU_PP_SENSOR_GPU_INPUT_POWER,
+			       (void *)&ui32, &ui32_size)) {
+   return -EINVAL;
+   }
+   ui32 >>= 8;
+   break;
case AMDGPU_INFO_SENSOR_VDDNB:
/* get VDDNB in millivolts */
if (amdgpu_dpm_read_sensor(adev,
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index ad21c613fec8..96e32dafd4f0 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -865,6 +865,8 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow {
#define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_SCLK 0xa
/* Subquery id: Query GPU peak pstate memory clock */
#define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_MCLK 0xb
+   /* Subquery id: Query input GPU power   */
+   #define AMDGPU_INFO_SENSOR_GPU_INPUT_POWER  0xc
 /* Number of VRAM page faults on CPU access. */
 #define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS   0x1E
 #define AMDGPU_INFO_VRAM_LOST_COUNTER  0x1F
-- 
2.41.0



[PATCH 1/3] drm/amdgpu: fix avg vs input power reporting on smu7

2023-11-09 Thread Alex Deucher
Hawaii, Bonaire, Fiji, and Tonga support average power; the others
support input (current) power.

Signed-off-by: Alex Deucher 
---
 .../gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)
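
The resulting sensor support matrix, derived from the hunk below:

    Hawaii/Bonaire/Fiji/Tonga:  GPU_AVG_POWER   -> smu7_get_gpu_power()
                                GPU_INPUT_POWER -> -EOPNOTSUPP
    all other smu7 parts:       GPU_INPUT_POWER -> smu7_get_gpu_power()
                                GPU_AVG_POWER   -> -EOPNOTSUPP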

diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c 
b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
index 11372fcc59c8..a2c7b2e111fa 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
@@ -3995,6 +3995,7 @@ static int smu7_read_sensor(struct pp_hwmgr *hwmgr, int 
idx,
uint32_t sclk, mclk, activity_percent;
uint32_t offset, val_vid;
struct smu7_hwmgr *data = (struct smu7_hwmgr *)(hwmgr->backend);
+   struct amdgpu_device *adev = hwmgr->adev;
 
/* size must be at least 4 bytes for all sensors */
if (*size < 4)
@@ -4038,7 +4039,21 @@ static int smu7_read_sensor(struct pp_hwmgr *hwmgr, int 
idx,
*size = 4;
return 0;
case AMDGPU_PP_SENSOR_GPU_INPUT_POWER:
-   return smu7_get_gpu_power(hwmgr, (uint32_t *)value);
+   if ((adev->asic_type != CHIP_HAWAII) &&
+   (adev->asic_type != CHIP_BONAIRE) &&
+   (adev->asic_type != CHIP_FIJI) &&
+   (adev->asic_type != CHIP_TONGA))
+   return smu7_get_gpu_power(hwmgr, (uint32_t *)value);
+   else
+   return -EOPNOTSUPP;
+   case AMDGPU_PP_SENSOR_GPU_AVG_POWER:
+   if ((adev->asic_type != CHIP_HAWAII) &&
+   (adev->asic_type != CHIP_BONAIRE) &&
+   (adev->asic_type != CHIP_FIJI) &&
+   (adev->asic_type != CHIP_TONGA))
+   return -EOPNOTSUPP;
+   else
+   return smu7_get_gpu_power(hwmgr, (uint32_t *)value);
case AMDGPU_PP_SENSOR_VDDGFX:
if ((data->vr_config & VRCONF_VDDGFX_MASK) ==
(VR_SVI2_PLANE_2 << VRCONF_VDDGFX_SHIFT))
-- 
2.41.0



[PATCH 2/3] drm/amdgpu: fall back to INPUT power for AVG power via INFO IOCTL

2023-11-09 Thread Alex Deucher
For backwards compatibility with userspace.

Fixes: 47f1724db4fe ("drm/amd: Introduce `AMDGPU_PP_SENSOR_GPU_INPUT_POWER`")
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2897
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index b5ebafd4a3ad..bf4f48fe438d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1105,7 +1105,12 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file *filp)
	if (amdgpu_dpm_read_sensor(adev,
				   AMDGPU_PP_SENSOR_GPU_AVG_POWER,
				   (void *)&ui32, &ui32_size)) {
-	   return -EINVAL;
+	   /* fall back to input power for backwards compat */
+	   if (amdgpu_dpm_read_sensor(adev,
+				      AMDGPU_PP_SENSOR_GPU_INPUT_POWER,
+				      (void *)&ui32, &ui32_size)) {
+		   return -EINVAL;
+	   }
	}
ui32 >>= 8;
break;
-- 
2.41.0



RE: [PATCH 2/2] drm/amdgpu: add amdgpu runpm usage trace for separate funcs

2023-11-09 Thread Deucher, Alexander
[Public]

> -Original Message-
> From: Liang, Prike 
> Sent: Thursday, November 9, 2023 2:37 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Liang, Prike
> 
> Subject: [PATCH 2/2] drm/amdgpu: add amdgpu runpm usage trace for
> separate funcs
>
> Add a trace for amdgpu runpm usage in separate funcs; this will help
> debug cases where a runpm usage reference is taken but never
> dereferenced. In the normal case the runpm usage count is referenced
> pairwise by one kind of functionality and should change from 1 back to 0;
> otherwise there is an issue in the amdgpu runpm usage dereference.
>
> Signed-off-by: Prike Liang 

Looks good.  Not sure if you want to add tracepoints to the other call sites as 
well.  These are probably the trickiest however.

Acked-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  4 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   |  7 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h   | 15 +++
>  3 files changed, 25 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> index e7e87a3b2601..decbbe3d4f06 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> @@ -42,6 +42,7 @@
>  #include 
>  #include 
>  #include 
> +#include "amdgpu_trace.h"
>
>  /**
>   * amdgpu_dma_buf_attach - _buf_ops.attach implementation @@ -
> 63,6 +64,7 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
>   attach->peer2peer = false;
>
>   r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> + trace_amdgpu_runpm_reference_dumps(1, __func__);
>   if (r < 0)
>   goto out;
>
> @@ -70,6 +72,7 @@ static int amdgpu_dma_buf_attach(struct dma_buf
> *dmabuf,
>
>  out:
>   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> + trace_amdgpu_runpm_reference_dumps(0, __func__);
>   return r;
>  }
>
> @@ -90,6 +93,7 @@ static void amdgpu_dma_buf_detach(struct dma_buf
> *dmabuf,
>
>   pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
>   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> + trace_amdgpu_runpm_reference_dumps(0, __func__);
>  }
>
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 709a2c1b9d63..1026a9fa0c0f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -183,6 +183,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring,
> struct dma_fence **f, struct amd
>   amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>  seq, flags | AMDGPU_FENCE_FLAG_INT);
>   pm_runtime_get_noresume(adev_to_drm(adev)->dev);
> + trace_amdgpu_runpm_reference_dumps(1, __func__);
>   ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
>   if (unlikely(rcu_dereference_protected(*ptr, 1))) {
>   struct dma_fence *old;
> @@ -286,8 +287,11 @@ bool amdgpu_fence_process(struct amdgpu_ring
> *ring)
>   seq != ring->fence_drv.sync_seq)
>   amdgpu_fence_schedule_fallback(ring);
>
> - if (unlikely(seq == last_seq))
> + if (unlikely(seq == last_seq)) {
> + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> + trace_amdgpu_runpm_reference_dumps(0, __func__);
>   return false;
> + }
>
>   last_seq &= drv->num_fences_mask;
>   seq &= drv->num_fences_mask;
> @@ -310,6 +314,7 @@ bool amdgpu_fence_process(struct amdgpu_ring
> *ring)
>   dma_fence_put(fence);
>   pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
>   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> + trace_amdgpu_runpm_reference_dumps(0, __func__);
>   } while (last_seq != seq);
>
>   return true;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> index 2fd1bfb35916..5d4792645540 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> @@ -554,6 +554,21 @@ TRACE_EVENT(amdgpu_reset_reg_dumps,
> __entry->value)
>  );
>
> +TRACE_EVENT(amdgpu_runpm_reference_dumps,
> + TP_PROTO(uint32_t index, const char *func),
> + TP_ARGS(index, func),
> + TP_STRUCT__entry(
> +  __field(uint32_t, index)
> +  __string(func, func)
> +  ),
> + TP_fast_assign(
> +__entry->index = index;
> +__assign_str(func, func);
> +),
> + TP_printk("amdgpu runpm reference dump 0x%d: 0x%s\n",
> +   __entry->index,
> +   __get_str(func))
> +);
>  #undef AMDGPU_JOB_GET_TIMELINE_NAME
>  #endif
>
> --
> 2.34.1



RE: [PATCH 1/2] drm/amdgpu: correct the amdgpu runtime dereference usage count

2023-11-09 Thread Deucher, Alexander
[Public]

> -Original Message-
> From: Liang, Prike 
> Sent: Thursday, November 9, 2023 2:37 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Liang, Prike
> 
> Subject: [PATCH 1/2] drm/amdgpu: correct the amdgpu runtime dereference
> usage count
>
> Fix the amdgpu runpm dereference usage count.
>
> Signed-off-by: Prike Liang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 1 +
>  2 files changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index a53f436fa9f1..f6e5d9f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1992,7 +1992,7 @@ static int amdgpu_debugfs_sclk_set(void *data,
> u64 val)
>
>   ret = amdgpu_dpm_set_soft_freq_range(adev, PP_SCLK,
> (uint32_t)val, (uint32_t)val);
>   if (ret)
> - ret = -EINVAL;
> + goto out;

I think this hunk can be dropped.  It doesn't really change anything.  Or you 
could just drop the whole ret check since we just return ret at the end anyway. 
 Not sure if changing the error code is important here or not.

>
>  out:
>   pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
> index 0cacd0b9f8be..ff1f42ae6d8e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
> @@ -346,6 +346,7 @@ int amdgpu_display_crtc_set_config(struct
> drm_mode_set *set,
>   if (!active && adev->have_disp_power_ref) {
>   pm_runtime_put_autosuspend(dev->dev);
>   adev->have_disp_power_ref = false;
> + return ret;
>   }

I think it would be cleaner to just drop the runtime_put above and update the 
comment.  We'll just fall through to the end of the function.

Alex

>
>  out:
> --
> 2.34.1



Re: [PATCH] drm/amdgpu: move UVD and VCE sched entity init after sched init

2023-11-09 Thread Luben Tuikov
On 2023-11-09 11:13, Alex Deucher wrote:
> Ping?
> 
> On Wed, Nov 8, 2023 at 1:42 PM Alex Deucher  wrote:
>>
>> We need kernel scheduling entities to deal with handle clean up
>> if apps are not cleaned up properly.  With commit 56e449603f0ac5
>> ("drm/sched: Convert the GPU scheduler to variable number of run-queues")
>> the scheduler entities have to be created after scheduler init, so
>> change the ordering to fix this.
>>
>> v2: Leave logic in UVD and VCE code
>>
>> Fixes: 56e449603f0ac5 ("drm/sched: Convert the GPU scheduler to variable 
>> number of run-queues")
>> Signed-off-by: Alex Deucher 
>> Cc: ltuiko...@gmail.com

Reviewed-by: Luben Tuikov 

Regards,
Luben

>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c| 22 ++--
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h|  2 +-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c| 24 +++---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h|  2 +-
>>  drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c  |  2 --
>>  drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c  |  2 --
>>  drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c  |  2 --
>>  drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c  |  2 --
>>  drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c  |  4 
>>  drivers/gpu/drm/amd/amdgpu/vce_v2_0.c  |  2 --
>>  drivers/gpu/drm/amd/amdgpu/vce_v3_0.c  |  2 --
>>  drivers/gpu/drm/amd/amdgpu/vce_v4_0.c  |  5 -
>>  13 files changed, 37 insertions(+), 46 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 43a95feba884..03e669c34033 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2499,6 +2499,18 @@ static int amdgpu_device_init_schedulers(struct 
>> amdgpu_device *adev)
>>   ring->name);
>> return r;
>> }
>> +   r = amdgpu_uvd_entity_init(adev, ring);
>> +   if (r) {
>> +   DRM_ERROR("Failed to create UVD scheduling entity on 
>> ring %s.\n",
>> + ring->name);
>> +   return r;
>> +   }
>> +   r = amdgpu_vce_entity_init(adev, ring);
>> +   if (r) {
>> +   DRM_ERROR("Failed to create VCE scheduling entity on 
>> ring %s.\n",
>> + ring->name);
>> +   return r;
>> +   }
>> }
>>
>> amdgpu_xcp_update_partition_sched_list(adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
>> index 815b7c34ed33..65949cc7abb9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
>> @@ -399,20 +399,20 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev)
>>   *
>>   * @adev: amdgpu_device pointer
>>   *
>> + * Initialize the entity used for handle management in the kernel driver.
>>   */
>> -int amdgpu_uvd_entity_init(struct amdgpu_device *adev)
>> +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring 
>> *ring)
>>  {
>> -   struct amdgpu_ring *ring;
>> -   struct drm_gpu_scheduler *sched;
>> -   int r;
>> +   if (ring == &adev->uvd.inst[0].ring) {
>> +   struct drm_gpu_scheduler *sched = &ring->sched;
>> +   int r;
>>
>> -   ring = &adev->uvd.inst[0].ring;
>> -   sched = &ring->sched;
>> -   r = drm_sched_entity_init(&adev->uvd.entity, DRM_SCHED_PRIORITY_NORMAL,
>> - &sched, 1, NULL);
>> -   if (r) {
>> -   DRM_ERROR("Failed setting up UVD kernel entity.\n");
>> -   return r;
>> +   r = drm_sched_entity_init(&adev->uvd.entity, DRM_SCHED_PRIORITY_NORMAL,
>> + &sched, 1, NULL);
>> +   if (r) {
>> +   DRM_ERROR("Failed setting up UVD kernel entity.\n");
>> +   return r;
>> +   }
>> }
>>
>> return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h
>> index a9f342537c68..9dfad2f48ef4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h
>> @@ -73,7 +73,7 @@ struct amdgpu_uvd {
>>
>>  int amdgpu_uvd_sw_init(struct amdgpu_device *adev);
>>  int amdgpu_uvd_sw_fini(struct amdgpu_device *adev);
>> -int amdgpu_uvd_entity_init(struct amdgpu_device *adev);
>> +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring 
>> *ring);
>>  int amdgpu_uvd_prepare_suspend(struct amdgpu_device *adev);
>>  int amdgpu_uvd_suspend(struct amdgpu_device *adev);
>>  int amdgpu_uvd_resume(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
>> index 1904edf68407..0954447f689d 

[PATCH 5/5] drm/amdgpu/gmc9: disable AGP aperture

2023-11-09 Thread Alex Deucher
We've had misc reports of random IOMMU page faults when
this is used.  It's just a rarely used optimization anyway, so
let's just disable it.  It can still be toggled via the
module parameter for testing.

v2: leave it configurable via module parameter

Reviewed-by: Yang Wang  (v1)
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 22375fc23412..bd69cf61de06 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct 
amdgpu_device *adev,
} else {
amdgpu_gmc_vram_location(adev, mc, base);
amdgpu_gmc_gart_location(adev, mc, 
AMDGPU_GART_PLACEMENT_BEST_FIT);
-   if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0))
+   if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1))
amdgpu_gmc_agp_location(adev, mc);
}
/* base offset of vram pages */
-- 
2.41.0



[PATCH 4/5] drm/amdgpu/gmc10: disable AGP aperture

2023-11-09 Thread Alex Deucher
We've had misc reports of random IOMMU page faults when
this is used.  It's just a rarely used optimization anyway, so
let's just disable it.  It can still be toggled via the
module parameter for testing.

v2: leave it configurable via module parameter

Reviewed-by: Yang Wang  (v1)
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 23483bffa1c7..a5a05c16c10d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct 
amdgpu_device *adev,
amdgpu_gmc_set_agp_default(adev, mc);
	amdgpu_gmc_vram_location(adev, &adev->gmc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT);
-   if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0))
+   if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1))
amdgpu_gmc_agp_location(adev, mc);
 
/* base offset of vram pages */
-- 
2.41.0



[PATCH 2/5] drm/amdgpu: add a module parameter to control the AGP aperture

2023-11-09 Thread Alex Deucher
Add a module parameter to control the AGP aperture.  The AGP
aperture is an aperture in the GPU's internal address space
which provides direct non-paged access to the platform address
space.  This access is non-snooped so only uncached memory
can be accessed.

Add a knob so that we can toggle this for debugging.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  |  3 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  2 +-
 5 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 969d8ba8b8dc..8c1f88b4a58c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -250,6 +250,7 @@ extern int amdgpu_umsch_mm;
 extern int amdgpu_seamless;
 
 extern int amdgpu_user_partt_mode;
+extern int amdgpu_agp;
 
 #define AMDGPU_VM_MAX_NUM_CTX  4096
 #define AMDGPU_SG_THRESHOLD(256*1024*1024)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 0c0497d3a003..e8c42ff5d464 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -207,6 +207,7 @@ int amdgpu_user_partt_mode = 
AMDGPU_AUTO_COMPUTE_PARTITION_MODE;
 int amdgpu_umsch_mm;
 int amdgpu_seamless = -1; /* auto */
 uint amdgpu_debug_mask;
+int amdgpu_agp = -1; /* auto */
 
 static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
 
@@ -961,6 +962,15 @@ module_param_named(seamless, amdgpu_seamless, int, 0444);
 MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default");
 module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444);
 
+/**
+ * DOC: agp (int)
+ * Enable the AGP aperture.  This provides an aperture in the GPU's internal
+ * address space for direct access to system memory.  Note that these accesses
+ * are non-snooped, so they are only used for access to uncached memory.
+ */
+MODULE_PARM_DESC(agp, "AGP (-1 = auto (default), 0 = disable, 1 = enable)");
+module_param_named(agp, amdgpu_agp, int, 0444);
+
 /* These devices are not supported by amdgpu.
  * They are supported by the mach64, r128, radeon drivers
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 0ec7b061d7c2..23483bffa1c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct 
amdgpu_device *adev,
amdgpu_gmc_set_agp_default(adev, mc);
	amdgpu_gmc_vram_location(adev, &adev->gmc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT);
-   if (!amdgpu_sriov_vf(adev))
+   if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0))
amdgpu_gmc_agp_location(adev, mc);
 
/* base offset of vram pages */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index ba4c82f5e617..e1078b53e942 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -641,7 +641,8 @@ static void gmc_v11_0_vram_gtt_location(struct 
amdgpu_device *adev,
	amdgpu_gmc_vram_location(adev, &adev->gmc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH);
if (!amdgpu_sriov_vf(adev) &&
-   (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)))
+   (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) &&
+   (amdgpu_agp != 0))
amdgpu_gmc_agp_location(adev, mc);
 
/* base offset of vram pages */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index bde25eb4ed8e..22375fc23412 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct 
amdgpu_device *adev,
} else {
amdgpu_gmc_vram_location(adev, mc, base);
amdgpu_gmc_gart_location(adev, mc, 
AMDGPU_GART_PLACEMENT_BEST_FIT);
-   if (!amdgpu_sriov_vf(adev))
+   if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0))
amdgpu_gmc_agp_location(adev, mc);
}
/* base offset of vram pages */
-- 
2.41.0



[PATCH 1/5] drm/amdgpu/gmc11: fix logic typo in AGP check

2023-11-09 Thread Alex Deucher
Should be && rather than ||.

Fixes: b2e1cbe6281f ("drm/amdgpu/gmc11: disable AGP on GC 11.5")
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 6dce9b29f675..ba4c82f5e617 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -640,7 +640,7 @@ static void gmc_v11_0_vram_gtt_location(struct 
amdgpu_device *adev,
amdgpu_gmc_set_agp_default(adev, mc);
	amdgpu_gmc_vram_location(adev, &adev->gmc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH);
-   if (!amdgpu_sriov_vf(adev) ||
+   if (!amdgpu_sriov_vf(adev) &&
(amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)))
amdgpu_gmc_agp_location(adev, mc);
 
-- 
2.41.0



[PATCH 3/5] drm/amdgpu/gmc11: disable AGP aperture

2023-11-09 Thread Alex Deucher
We've had misc reports of random IOMMU page faults when
this is used.  It's just a rarely used optimization anyway, so
let's just disable it.  It can still be toggled via the
module parameter for testing.

v2: leave it configurable via module parameter

Reviewed-by: Yang Wang  (v1)
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index e1078b53e942..23d7b548d13f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -642,7 +642,7 @@ static void gmc_v11_0_vram_gtt_location(struct 
amdgpu_device *adev,
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH);
if (!amdgpu_sriov_vf(adev) &&
(amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) &&
-   (amdgpu_agp != 0))
+   (amdgpu_agp == 1))
amdgpu_gmc_agp_location(adev, mc);
 
/* base offset of vram pages */
-- 
2.41.0



Re: [PATCH] drm/amd: Explicitly check for GFXOFF to be enabled for s0ix

2023-11-09 Thread Deucher, Alexander
[Public]

Reviewed-by: Alex Deucher 

From: amd-gfx  on behalf of Mario 
Limonciello 
Sent: Thursday, November 9, 2023 11:27 AM
To: amd-gfx@lists.freedesktop.org 
Cc: Limonciello, Mario 
Subject: [PATCH] drm/amd: Explicitly check for GFXOFF to be enabled for s0ix

If a user has disabled GFXOFF this may cause problems for the suspend
sequence.  Ensure that it is enabled in amdgpu_acpi_is_s0ix_active().

The system won't reach the deepest state but it also won't hang.

Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
index d62e49758635..e550067e5c5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
@@ -1497,6 +1497,9 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device 
*adev)
 if (adev->asic_type < CHIP_RAVEN)
 return false;

+   if (!(adev->pm.pp_feature & PP_GFXOFF_MASK))
+   return false;
+
 /*
  * If ACPI_FADT_LOW_POWER_S0 is not set in the FADT, it is generally
  * risky to do any special firmware-related preparations for entering
--
2.34.1



[PATCH] drm/amd: Explicitly check for GFXOFF to be enabled for s0ix

2023-11-09 Thread Mario Limonciello
If a user has disabled GFXOFF this may cause problems for the suspend
sequence.  Ensure that it is enabled in amdgpu_acpi_is_s0ix_active().

The system won't reach the deepest state but it also won't hang.

Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
index d62e49758635..e550067e5c5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
@@ -1497,6 +1497,9 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device 
*adev)
if (adev->asic_type < CHIP_RAVEN)
return false;
 
+   if (!(adev->pm.pp_feature & PP_GFXOFF_MASK))
+   return false;
+
/*
 * If ACPI_FADT_LOW_POWER_S0 is not set in the FADT, it is generally
 * risky to do any special firmware-related preparations for entering
-- 
2.34.1



[PATCH v3] drm/amdgpu: Change extended-scope MTYPE on GC 9.4.3

2023-11-09 Thread David Yat Sin
Change local memory type to MTYPE_UC on revision id 0

Signed-off-by: David Yat Sin 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 7 +--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  | 7 +--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 5464b08064c4..b5633cc2b97d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1197,7 +1197,10 @@ static void gmc_v9_0_get_coherence_flags(struct 
amdgpu_device *adev,
if (uncached) {
mtype = MTYPE_UC;
} else if (ext_coherent) {
-   mtype = is_local ? MTYPE_CC : MTYPE_UC;
+   if (adev->rev_id)
+   mtype = is_local ? MTYPE_CC : MTYPE_UC;
+   else
+   mtype = MTYPE_UC;
} else if (adev->flags & AMD_IS_APU) {
mtype = is_local ? mtype_local : MTYPE_NC;
} else {
@@ -1318,7 +1321,7 @@ static void gmc_v9_0_override_vm_pte_flags(struct 
amdgpu_device *adev,
 
*flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) |
 AMDGPU_PTE_MTYPE_VG10(mtype_local);
-   } else {
+   } else if (adev->rev_id) {
/* MTYPE_UC case */
*flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) |
 AMDGPU_PTE_MTYPE_VG10(MTYPE_CC);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index e7f67e5226ec..08aa9b85c71a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1257,8 +1257,11 @@ svm_range_get_pte_flags(struct kfd_node *node,
}
break;
case IP_VERSION(9, 4, 3):
-   mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC :
-(amdgpu_mtype_local == 2 || ext_coherent ?
+   if (ext_coherent)
+   mtype_local = node->adev->rev_id ? AMDGPU_VM_MTYPE_CC : 
AMDGPU_VM_MTYPE_UC;
+   else
+   mtype_local = amdgpu_mtype_local == 1 ? 
AMDGPU_VM_MTYPE_NC :
+(amdgpu_mtype_local == 2 ?
 AMDGPU_VM_MTYPE_CC : (amdgpu_mtype_local == 3 ?
 AMDGPU_VM_MTYPE_RW : (node->adev->rev_id ?
AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW)));
-- 
2.34.1



Re: [PATCH] drm/amdgpu: move UVD and VCE sched entity init after sched init

2023-11-09 Thread Alex Deucher
Ping?

On Wed, Nov 8, 2023 at 1:42 PM Alex Deucher  wrote:
>
> We need kernel scheduling entities to deal with handle clean up
> if apps are not cleaned up properly.  With commit 56e449603f0ac5
> ("drm/sched: Convert the GPU scheduler to variable number of run-queues")
> the scheduler entities have to be created after scheduler init, so
> change the ordering to fix this.
>
> v2: Leave logic in UVD and VCE code
>
> Fixes: 56e449603f0ac5 ("drm/sched: Convert the GPU scheduler to variable 
> number of run-queues")
> Signed-off-by: Alex Deucher 
> Cc: ltuiko...@gmail.com
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c| 22 ++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h|  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c| 24 +++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h|  2 +-
>  drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c  |  2 --
>  drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c  |  2 --
>  drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c  |  2 --
>  drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c  |  2 --
>  drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c  |  4 
>  drivers/gpu/drm/amd/amdgpu/vce_v2_0.c  |  2 --
>  drivers/gpu/drm/amd/amdgpu/vce_v3_0.c  |  2 --
>  drivers/gpu/drm/amd/amdgpu/vce_v4_0.c  |  5 -
>  13 files changed, 37 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 43a95feba884..03e669c34033 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2499,6 +2499,18 @@ static int amdgpu_device_init_schedulers(struct 
> amdgpu_device *adev)
>   ring->name);
> return r;
> }
> +   r = amdgpu_uvd_entity_init(adev, ring);
> +   if (r) {
> +   DRM_ERROR("Failed to create UVD scheduling entity on 
> ring %s.\n",
> + ring->name);
> +   return r;
> +   }
> +   r = amdgpu_vce_entity_init(adev, ring);
> +   if (r) {
> +   DRM_ERROR("Failed to create VCE scheduling entity on 
> ring %s.\n",
> + ring->name);
> +   return r;
> +   }
> }
>
> amdgpu_xcp_update_partition_sched_list(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> index 815b7c34ed33..65949cc7abb9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> @@ -399,20 +399,20 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev)
>   *
>   * @adev: amdgpu_device pointer
>   *
> + * Initialize the entity used for handle management in the kernel driver.
>   */
> -int amdgpu_uvd_entity_init(struct amdgpu_device *adev)
> +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring 
> *ring)
>  {
> -   struct amdgpu_ring *ring;
> -   struct drm_gpu_scheduler *sched;
> -   int r;
> +   if (ring == &adev->uvd.inst[0].ring) {
> +   struct drm_gpu_scheduler *sched = &ring->sched;
> +   int r;
>
> -   ring = &adev->uvd.inst[0].ring;
> -   sched = &ring->sched;
> -   r = drm_sched_entity_init(&adev->uvd.entity, 
> DRM_SCHED_PRIORITY_NORMAL,
> - &sched, 1, NULL);
> -   if (r) {
> -   DRM_ERROR("Failed setting up UVD kernel entity.\n");
> -   return r;
> +   r = drm_sched_entity_init(&adev->uvd.entity, 
> DRM_SCHED_PRIORITY_NORMAL,
> + &sched, 1, NULL);
> +   if (r) {
> +   DRM_ERROR("Failed setting up UVD kernel entity.\n");
> +   return r;
> +   }
> }
>
> return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h
> index a9f342537c68..9dfad2f48ef4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h
> @@ -73,7 +73,7 @@ struct amdgpu_uvd {
>
>  int amdgpu_uvd_sw_init(struct amdgpu_device *adev);
>  int amdgpu_uvd_sw_fini(struct amdgpu_device *adev);
> -int amdgpu_uvd_entity_init(struct amdgpu_device *adev);
> +int amdgpu_uvd_entity_init(struct amdgpu_device *adev, struct amdgpu_ring 
> *ring);
>  int amdgpu_uvd_prepare_suspend(struct amdgpu_device *adev);
>  int amdgpu_uvd_suspend(struct amdgpu_device *adev);
>  int amdgpu_uvd_resume(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> index 1904edf68407..0954447f689d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> @@ -231,20 +231,20 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev)
>   *
>   * @adev: 

Re: [PATCH] drm/amd/pm: replace 1-element arrays with flexible-array members

2023-11-09 Thread Alex Deucher
On Thu, Nov 9, 2023 at 7:14 AM José Pekkarinen
 wrote:
>
> On 2023-11-09 11:06, Greg KH wrote:
> > On Thu, Nov 09, 2023 at 10:43:50AM +0200, José Pekkarinen wrote:
> >> On 2023-11-08 09:29, Greg KH wrote:
> >> > On Wed, Nov 08, 2023 at 08:54:35AM +0200, José Pekkarinen wrote:
> >> > > The following case seems to be safe to be replaced with a flexible
> >> > > array
> >> > > to clean up the added coccinelle warning. This patch will just do it.
> >> > >
> >> > > drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h:76:38-63:
> >> > > WARNING use flexible-array member instead 
> >> > > (https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays)
> >> > >
> >> > > Signed-off-by: José Pekkarinen 
> >> > > ---
> >> > >  drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h | 2 +-
> >> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> >> > >
> >> > > diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> >> > > b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> >> > > index c7b61222d258..1ce4087005f0 100644
> >> > > --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> >> > > +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> >> > > @@ -73,7 +73,7 @@ struct smu8_register_index_data_pair {
> >> > >
> >> > >  struct smu8_ih_meta_data {
> >> > >  uint32_t command;
> >> > > -struct smu8_register_index_data_pair 
> >> > > register_index_value_pair[1];
> >> > > +struct smu8_register_index_data_pair 
> >> > > register_index_value_pair[];
> >> >
> >> > Did you just change this structure size without any need to change any
> >> > code as well?  How was this tested?
> >>
> >> I didn't find any use of that struct member, if I missed
> >> something here, please let me know and I'll happily address any
> >> needed further work.
> >
> > I don't think this is even a variable array.  It's just a one element
> > one, which is fine, don't be confused by the coccinelle "warning" here,
> > it's fired many false-positives and you need to verify this properly
> > with the driver authors first before changing anything.
>
>   My apologies to you, and to anybody who feels the same; it is not my
> intention to bother the list with mistaken patches. I assumed that this
> patch, like any other from me, would go through the review process, where
> it would be determined whether the patch is right, wrong, or needs further
> work or testing, either from my side or anybody else's. At the end of the
> day I need to submit patches to complete my mentorship and graduate.
>
> > In short, you just changed the size of this structure, are you _sure_
> > you can do that?  And yes, it doesn't look like this field is used, but
> > the structure is, so be careful.
>
>  I don't know; let's check it out together and see where this goes.

I think it may have been used with the SMU firmware.  I'll need to
check with that team to determine if this was meant to be a variable
sized array or not.

Alex


RE: [PATCH 1/3] drm/amdgpu/gmc11: disable AGP aperture

2023-11-09 Thread Wang, Yang(Kevin)
Series is
Reviewed-by: Yang Wang 

Best Regards,
Kevin

-Original Message-
From: amd-gfx  On Behalf Of Alex Deucher
Sent: Thursday, November 9, 2023 10:42 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: [PATCH 1/3] drm/amdgpu/gmc11: disable AGP aperture

We've had misc reports of random IOMMU page faults when this is used.  It's 
just a rarely used optimization anyway, so let's just disable it.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 6dce9b29f675..e1cac5da9c4b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -640,9 +640,6 @@ static void gmc_v11_0_vram_gtt_location(struct 
amdgpu_device *adev,
amdgpu_gmc_set_agp_default(adev, mc);
	amdgpu_gmc_vram_location(adev, &adev->gmc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH);
-   if (!amdgpu_sriov_vf(adev) ||
-   (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)))
-   amdgpu_gmc_agp_location(adev, mc);
 
/* base offset of vram pages */
if (amdgpu_sriov_vf(adev))
--
2.41.0



Re: [PATCH] drm/amd/pm: replace 1-element arrays with flexible-array members

2023-11-09 Thread José Pekkarinen

On 2023-11-09 11:06, Greg KH wrote:

> On Thu, Nov 09, 2023 at 10:43:50AM +0200, José Pekkarinen wrote:
> > On 2023-11-08 09:29, Greg KH wrote:
> > > On Wed, Nov 08, 2023 at 08:54:35AM +0200, José Pekkarinen wrote:
> > > > The following case seems to be safe to be replaced with a flexible
> > > > array
> > > > to clean up the added coccinelle warning. This patch will just do it.
> > > >
> > > > drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h:76:38-63:
> > > > WARNING use flexible-array member instead 
> > > > (https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays)
> > > >
> > > > Signed-off-by: José Pekkarinen 
> > > > ---
> > > >  drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h | 2 +-
> > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > > > b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > > > index c7b61222d258..1ce4087005f0 100644
> > > > --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > > > +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > > > @@ -73,7 +73,7 @@ struct smu8_register_index_data_pair {
> > > >
> > > >  struct smu8_ih_meta_data {
> > > > 	uint32_t command;
> > > > -	struct smu8_register_index_data_pair register_index_value_pair[1];
> > > > +	struct smu8_register_index_data_pair register_index_value_pair[];
> > >
> > > Did you just change this structure size without any need to change any
> > > code as well?  How was this tested?
> >
> > I didn't find any use of that struct member, if I missed
> > something here, please let me know and I'll happily address any
> > needed further work.
>
> I don't think this is even a variable array.  It's just a one element
> one, which is fine, don't be confused by the coccinelle "warning" here,
> it's fired many false-positives and you need to verify this properly
> with the driver authors first before changing anything.

	My apologies to you, and to anybody who feels the same; it is not my
intention to bother the list with mistaken patches. I assumed that this
patch, like any other from me, would go through the review process, where
it would be determined whether the patch is right, wrong, or needs further
work or testing, either from my side or anybody else's. At the end of the
day I need to submit patches to complete my mentorship and graduate.

> In short, you just changed the size of this structure, are you _sure_
> you can do that?  And yes, it doesn't look like this field is used, but
> the structure is, so be careful.

	I don't know; let's check it out together and see where this goes.

	José.


[PATCH] drm/amd/pm: make power values signed

2023-11-09 Thread José Pekkarinen
The following patch will convert the power values returned by
amdgpu_hwmon_get_power to signed, fixing the following warnings reported
by coccinelle:

drivers/gpu/drm/amd/pm/amdgpu_pm.c:2801:5-8: WARNING: Unsigned expression 
compared with zero: val < 0
drivers/gpu/drm/amd/pm/amdgpu_pm.c:2814:5-8: WARNING: Unsigned expression 
compared with zero: val < 0

Signed-off-by: José Pekkarinen 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index e7bb1d324084..913ff62d5d5e 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2795,7 +2795,7 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct device 
*dev,
   struct device_attribute *attr,
   char *buf)
 {
-   unsigned int val;
+   int val;
 
val = amdgpu_hwmon_get_power(dev, AMDGPU_PP_SENSOR_GPU_AVG_POWER);
 
@@ -2806,7 +2806,7 @@ static ssize_t amdgpu_hwmon_show_power_input(struct 
device *dev,
 struct device_attribute *attr,
 char *buf)
 {
-   unsigned int val;
+   int val;
 
val = amdgpu_hwmon_get_power(dev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER);
 
-- 
2.39.2
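
The warning matters because amdgpu_hwmon_get_power() can return a negative
errno, and comparing an unsigned variable against zero is always false, so
the error path is silently skipped. A minimal stand-alone illustration
(get_power() is a hypothetical stand-in for the real helper):

#include <stdio.h>

/* stand-in for a helper returning a value or a negative errno */
static int get_power(void)
{
	return -22;	/* -EINVAL */
}

int main(void)
{
	unsigned int uval = get_power();	/* wraps to a huge positive value */
	int val = get_power();

	if (uval < 0)	/* always false, never taken */
		printf("unsigned: error caught\n");
	if (val < 0)	/* taken as expected */
		printf("signed: error caught (%d)\n", val);
	return 0;
}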



Re: [PATCH] drm/amd/pm: replace 1-element arrays with flexible-array members

2023-11-09 Thread Greg KH
On Thu, Nov 09, 2023 at 10:43:50AM +0200, José Pekkarinen wrote:
> On 2023-11-08 09:29, Greg KH wrote:
> > On Wed, Nov 08, 2023 at 08:54:35AM +0200, José Pekkarinen wrote:
> > > The following case seems to be safe to be replaced with a flexible
> > > array
> > > to clean up the added coccinelle warning. This patch will just do it.
> > > 
> > > drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h:76:38-63:
> > > WARNING use flexible-array member instead 
> > > (https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays)
> > > 
> > > Signed-off-by: José Pekkarinen 
> > > ---
> > >  drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > > b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > > index c7b61222d258..1ce4087005f0 100644
> > > --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > > +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > > @@ -73,7 +73,7 @@ struct smu8_register_index_data_pair {
> > > 
> > >  struct smu8_ih_meta_data {
> > >   uint32_t command;
> > > - struct smu8_register_index_data_pair register_index_value_pair[1];
> > > + struct smu8_register_index_data_pair register_index_value_pair[];
> > 
> > Did you just change this structure size without any need to change any
> > code as well?  How was this tested?
> 
> I didn't find any use of that struct member, if I missed
> something here, please let me know and I'll happily address any
> needed further work.

I don't think this is even a variable array.  It's just a one element
one, which is fine, don't be confused by the coccinelle "warning" here,
it's fired many false-positives and you need to verify this properly
with the driver authors first before changing anything.

In short, you just changed the size of this structure, are you _sure_
you can do that?  And yes, it doesn't look like this field is used, but
the structure is, so be careful.

thanks,

greg k-h
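
Greg's point about the structure size can be reproduced in isolation:
dropping the single element from the array changes sizeof(), which is
exactly what makes such conversions risky when the layout is shared with
firmware. A small sketch with hypothetical struct names mirroring the
smu8 layout:

#include <stdio.h>
#include <stdint.h>

struct pair { uint32_t offset; uint32_t value; };

/* one-element array, as smu8_ih_meta_data has today */
struct meta_one  { uint32_t command; struct pair pairs[1]; };

/* flexible-array member, as in the proposed change */
struct meta_flex { uint32_t command; struct pair pairs[]; };

int main(void)
{
	/* prints 12 vs 4: anything relying on the old size breaks */
	printf("one-element: %zu bytes\n", sizeof(struct meta_one));
	printf("flexible:    %zu bytes\n", sizeof(struct meta_flex));
	return 0;
}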


Re: [PATCH] drm/amd/pm: replace 1-element arrays with flexible-array members

2023-11-09 Thread José Pekkarinen

On 2023-11-08 09:29, Greg KH wrote:

> On Wed, Nov 08, 2023 at 08:54:35AM +0200, José Pekkarinen wrote:
> > The following case seems to be safe to be replaced with a flexible
> > array
> > to clean up the added coccinelle warning. This patch will just do it.
> >
> > drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h:76:38-63:
> > WARNING use flexible-array member instead 
> > (https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays)
> >
> > Signed-off-by: José Pekkarinen 
> > ---
> >  drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h 
> > b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > index c7b61222d258..1ce4087005f0 100644
> > --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu8_smumgr.h
> > @@ -73,7 +73,7 @@ struct smu8_register_index_data_pair {
> >
> >  struct smu8_ih_meta_data {
> > 	uint32_t command;
> > -	struct smu8_register_index_data_pair register_index_value_pair[1];
> > +	struct smu8_register_index_data_pair register_index_value_pair[];
>
> Did you just change this structure size without any need to change any
> code as well?  How was this tested?

	I didn't find any use of that struct member, if I missed
something here, please let me know and I'll happily address any
needed further work.

José.


[PATCH 2/3] drm/amdgpu/gmc10: disable AGP aperture

2023-11-09 Thread Alex Deucher
We've had misc reports of random IOMMU page faults when
this is used.  It's just a rarely used optimization anyway, so
let's just disable it.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 0ec7b061d7c2..8eb4db3dd931 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -675,8 +675,6 @@ static void gmc_v10_0_vram_gtt_location(struct 
amdgpu_device *adev,
amdgpu_gmc_set_agp_default(adev, mc);
	amdgpu_gmc_vram_location(adev, &adev->gmc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT);
-   if (!amdgpu_sriov_vf(adev))
-   amdgpu_gmc_agp_location(adev, mc);
 
/* base offset of vram pages */
adev->vm_manager.vram_base_offset = 
adev->gfxhub.funcs->get_mc_fb_offset(adev);
-- 
2.41.0



[PATCH 1/3] drm/amdgpu/gmc11: disable AGP aperture

2023-11-09 Thread Alex Deucher
We've had misc reports of random IOMMU page faults when
this is used.  It's just a rarely used optimization anyway, so
let's just disable it.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 6dce9b29f675..e1cac5da9c4b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -640,9 +640,6 @@ static void gmc_v11_0_vram_gtt_location(struct 
amdgpu_device *adev,
amdgpu_gmc_set_agp_default(adev, mc);
	amdgpu_gmc_vram_location(adev, &adev->gmc, base);
amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH);
-   if (!amdgpu_sriov_vf(adev) ||
-   (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)))
-   amdgpu_gmc_agp_location(adev, mc);
 
/* base offset of vram pages */
if (amdgpu_sriov_vf(adev))
-- 
2.41.0



[PATCH 3/3] drm/amdgpu/gmc9: disable AGP aperture

2023-11-09 Thread Alex Deucher
We've had misc reports of random IOMMU page faults when
this is used.  It's just a rarely used optimization anyway, so
let's just disable it.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 05fb57540e35..39a0251ba60e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1627,8 +1627,6 @@ static void gmc_v9_0_vram_gtt_location(struct 
amdgpu_device *adev,
} else {
amdgpu_gmc_vram_location(adev, mc, base);
amdgpu_gmc_gart_location(adev, mc, 
AMDGPU_GART_PLACEMENT_BEST_FIT);
-   if (!amdgpu_sriov_vf(adev))
-   amdgpu_gmc_agp_location(adev, mc);
}
/* base offset of vram pages */
adev->vm_manager.vram_base_offset = 
adev->gfxhub.funcs->get_mc_fb_offset(adev);
-- 
2.41.0



[PATCH 1/2] drm/amdgpu: fix error handling in amdgpu_bo_list_get()

2023-11-09 Thread Christian König
We should not leak a pointer we couldn't grab a reference on to the
caller, because the caller's error handling might still try to put
that reference.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index 4e7089a22139..8ffad1dabcb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
@@ -168,6 +168,7 @@ int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
}
 
rcu_read_unlock();
+   *result = NULL;
return -ENOENT;
 }
 
-- 
2.34.1
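
The failure mode being fixed, condensed into a stand-alone sketch with
hypothetical names (the real code walks an RCU-protected list and uses
kref_get_unless_zero()):

#include <stdbool.h>
#include <stdio.h>

struct obj { int refcount; };

static bool try_get_reference(struct obj *o)
{
	if (o->refcount == 0)	/* object is already dying */
		return false;
	o->refcount++;
	return true;
}

static int lookup(struct obj *candidate, struct obj **result)
{
	*result = candidate;	/* pointer escapes to the caller... */
	if (!try_get_reference(candidate)) {
		/* ...so it must be cleared again on failure, which is
		 * what the one-line fix above adds: *result = NULL; */
		return -2;	/* -ENOENT */
	}
	return 0;
}

int main(void)
{
	struct obj dying = { .refcount = 0 };
	struct obj *o = NULL;

	if (lookup(&dying, &o) && o)
		dying.refcount--;	/* caller "puts" a ref it never took */

	printf("refcount: %d\n", dying.refcount);	/* -1: underflow */
	return 0;
}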



[PATCH 2/2] drm/amdgpu: lower CS errors to debug severity

2023-11-09 Thread Christian König
Otherwise userspace can spam the logs by using incorrect input values.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index aafedb344c1b..af844c98f295 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1442,7 +1442,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
if (r == -ENOMEM)
DRM_ERROR("Not enough memory for command 
submission!\n");
else if (r != -ERESTARTSYS && r != -EAGAIN)
-   DRM_ERROR("Failed to process the buffer list %d!\n", r);
+   DRM_DEBUG("Failed to process the buffer list %d!\n", r);
goto error_fini;
}
 
-- 
2.34.1



Re: [PATCH] drm/amdkfd: Clear the VALU exception state in the trap handler

2023-11-09 Thread Lancelot SIX

The trap handler could be entered with pending VALU exceptions, so
clear the exception state before issuing vector instructions.

Signed-off-by: Laurent Morichetti 


Reviewed-by: Jay Cornwall 


Hi,

FYI, I tested this and it fixes the issue.

Best,
Lancelot.

Tested-by: Lancelot Six 


Re: [PATCH 5/6] drm/amdkfd: Import DMABufs for interop through DRM

2023-11-09 Thread Christian König

On 07.11.23 at 17:58, Felix Kuehling wrote:

Use drm_gem_prime_fd_to_handle to import DMABufs for interop. This
ensures that a GEM handle is created on import and that obj->dma_buf
will be set and remain set as long as the object is imported into KFD.

Signed-off-by: Felix Kuehling 


Acked-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  9 ++-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 64 +--
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 15 ++---
  3 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4caf8cece028..88a0e0734270 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -318,11 +318,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void 
*process_info,
struct dma_fence **ef);
  int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
  struct kfd_vm_fault_info *info);
-int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
- struct dma_buf *dmabuf,
- uint64_t va, void *drm_priv,
- struct kgd_mem **mem, uint64_t *size,
- uint64_t *mmap_offset);
+int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd,
+uint64_t va, void *drm_priv,
+struct kgd_mem **mem, uint64_t *size,
+uint64_t *mmap_offset);
  int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem,
  struct dma_buf **dmabuf);
  void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 4bb8b5fd7598..1077de8bced2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2006,8 +2006,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
  
  	/* Free the BO*/

	drm_vma_node_revoke(&mem->bo->tbo.base.vma_node, drm_priv);
-   if (!mem->is_imported)
-   drm_gem_handle_delete(adev->kfd.client.file, mem->gem_handle);
+   drm_gem_handle_delete(adev->kfd.client.file, mem->gem_handle);
if (mem->dmabuf) {
dma_buf_put(mem->dmabuf);
mem->dmabuf = NULL;
@@ -2363,34 +2362,26 @@ int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct 
amdgpu_device *adev,
return 0;
  }
  
-int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,

- struct dma_buf *dma_buf,
- uint64_t va, void *drm_priv,
- struct kgd_mem **mem, uint64_t *size,
- uint64_t *mmap_offset)
+static int import_obj_create(struct amdgpu_device *adev,
+struct dma_buf *dma_buf,
+struct drm_gem_object *obj,
+uint64_t va, void *drm_priv,
+struct kgd_mem **mem, uint64_t *size,
+uint64_t *mmap_offset)
  {
struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
-   struct drm_gem_object *obj;
struct amdgpu_bo *bo;
int ret;
  
-	obj = amdgpu_gem_prime_import(adev_to_drm(adev), dma_buf);

-   if (IS_ERR(obj))
-   return PTR_ERR(obj);
-
bo = gem_to_amdgpu_bo(obj);
if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
-   AMDGPU_GEM_DOMAIN_GTT))) {
+   AMDGPU_GEM_DOMAIN_GTT)))
/* Only VRAM and GTT BOs are supported */
-   ret = -EINVAL;
-   goto err_put_obj;
-   }
+   return -EINVAL;
  
  	*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);

-   if (!*mem) {
-   ret = -ENOMEM;
-   goto err_put_obj;
-   }
+   if (!*mem)
+   return -ENOMEM;
  
	ret = drm_vma_node_allow(&obj->vma_node, drm_priv);

if (ret)
@@ -2440,8 +2431,41 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct 
amdgpu_device *adev,
	drm_vma_node_revoke(&obj->vma_node, drm_priv);
  err_free_mem:
kfree(*mem);
+   return ret;
+}
+
+int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd,
+uint64_t va, void *drm_priv,
+struct kgd_mem **mem, uint64_t *size,
+uint64_t *mmap_offset)
+{
+   struct drm_gem_object *obj;
+   uint32_t handle;
+   int ret;
+
+   ret = drm_gem_prime_fd_to_handle(&adev->ddev, 
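
The gist of the new import path is the standard PRIME fd-to-handle lookup.
A condensed sketch, assuming the KFD client file used elsewhere in the
diff, with error handling abbreviated:

/* resolve a dma-buf fd to a GEM handle plus object reference, so a
 * GEM handle exists and obj->dma_buf stays set while imported */
uint32_t handle;
struct drm_gem_object *obj;
int ret;

ret = drm_gem_prime_fd_to_handle(&adev->ddev, adev->kfd.client.file,
				 fd, &handle);
if (ret)
	return ret;

/* the lookup takes its own reference on the object */
obj = drm_gem_object_lookup(adev->kfd.client.file, handle);
if (!obj)
	return -EINVAL;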

Re: [PATCH 2/6] drm/amdgpu: New VM state for evicted user BOs

2023-11-09 Thread Christian König

On 08.11.23 at 22:23, Felix Kuehling wrote:


On 2023-11-08 07:28, Christian König wrote:
Not necessarily objections to this patch here, but rather to how this
new state is used later on.


The fundamental problem is that re-validating things in 
amdgpu_vm_handle_moved() won't work in all cases.


The general approach for both classic CS IOCTL as well as user queues 
is the following:


1. Grab all the necessary locks
    The VM lock + either everything inside the VM (user queues) or 
just the BOs in the submission (CS IOCTL).


2. Validate everything locked
    It can be that additional BO locks are grabbed for eviction and 
even locked BOs are shuffled around during that.


3. Update the PDEs and PTEs
    This can be done only after validating everything because things 
can still shuffle around during validation.


4. Do your CS or make the user queue / process runnable etc...

5. Attach fences to the locked BOs.

6. Unlock everything again.
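
Spelled out as code, that ordering looks roughly like the following
(hypothetical helper names; the locking detail with drm_exec is elided):

/* rough sketch of the intended flow, not actual driver code */
static int submit(struct submission *sub)
{
	int r;

	/* 1. grab the VM lock plus the relevant BO locks */
	r = lock_vm_and_bos(sub);
	if (r)
		return r;

	/* 2. validate everything locked; BOs may still shuffle here */
	r = validate_locked_bos(sub);
	if (r)
		goto unlock;

	/* 3. only now update PDEs and PTEs, placements are final */
	r = update_page_tables(sub);
	if (r)
		goto unlock;

	/* 4. do the CS or make the user queue / process runnable */
	r = run_submission(sub);
	if (r)
		goto unlock;

	/* 5. attach the resulting fences to the locked BOs */
	attach_fences(sub);

unlock:
	/* 6. unlock everything again */
	unlock_all(sub);
	return r;
}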

I think that is part of the reason why handling all updates in 
amdgpu_vm_handle_moved() doesn't work for the CS, and sooner or later 
you might run into the same issue during process restore as well.


If I'm not completely mistaken this should be solvable by moving all 
validation to amdgpu_vm_validate_pt_bos() instead (but let us rename 
that function).


I'm trying to understand what the problem is. As far as I can tell, 
amdgpu_vm_handle_moved just runs a bit later in the CS and 
process-restore code than amdgpu_vm_validate_pt_bos. But it runs with 
all the same reservations locked. My current implementation in 
amdgpu_vm_handle_moved has some failure cases when reservations cannot 
be acquired. I think your drm_exec patch series would make it possible 
to handle this more robustly. That said, in case of KFD process 
restore, we can retry in case of failures already.


Anyway, moving my re-validation code to a renamed version of 
amdgpu_vm_validate_pt_bos shouldn't be a big problem. I just don't 
understand what problem that solves. Maybe it just makes the code 
cleaner to handle both evicted states in one place? Or maybe you're 
pointing to a way to merge the two states into one?


Well yeah, merging both states into one might be nice to have, but that 
isn't the fundamental problem.


What basically happens during validation of a BO is that this BO is moved 
to a desired place (as described by the placement parameter). During 
this it can happen that we shuffle out other BOs.


Now the issue is that when you validate in amdgpu_vm_handle_moved(), 
validating one BO can shuffle out other BOs, which then end up on the 
evicted list again.


This most likely won't happen with KFD, but for GFX CS it's certainly 
possible. So the idea is that we first validate everything and then 
update all the page tables, and not intermix the two operations.


Regards,
Christian.



Regards,
  Felix




Regards,
Christian.

On 07.11.23 at 23:11, Felix Kuehling wrote:

Hi Christian,

I know you have objected to this patch before. I still think this is 
the best solution for what I need. I can talk you through my 
reasoning by email or offline. If I can't convince you, I will need 
your guidance for a better solution.


Thanks,
  Felix


On 2023-11-07 11:58, Felix Kuehling wrote:

Create a new VM state to track user BOs that are in the system domain.
In the next patch this will be used to conditionally re-validate them in
amdgpu_vm_handle_moved.

Signed-off-by: Felix Kuehling 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  5 -
  2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index 1442d97ddd0f..0d685577243c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -232,6 +232,22 @@ static void amdgpu_vm_bo_invalidated(struct 
amdgpu_vm_bo_base *vm_bo)

  spin_unlock(&vm_bo->vm->status_lock);
  }
  +/**
+ * amdgpu_vm_bo_evicted_user - vm_bo is evicted
+ *
+ * @vm_bo: vm_bo which is evicted
+ *
+ * State for BOs used by user mode queues which are not at the 
location they

+ * should be.
+ */
+static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base 
*vm_bo)

+{
+    vm_bo->moved = true;
+    spin_lock(&vm_bo->vm->status_lock);
+    list_move(&vm_bo->vm_status, &vm_bo->vm->evicted_user);
+    spin_unlock(&vm_bo->vm->status_lock);
+}
+
  /**
   * amdgpu_vm_bo_relocated - vm_bo is reloacted
   *
@@ -2110,6 +2126,7 @@ int amdgpu_vm_init(struct amdgpu_device 
*adev, struct amdgpu_vm *vm, int32_t xcp

  for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
  vm->reserved_vmid[i] = NULL;
  INIT_LIST_HEAD(&vm->evicted);
+    INIT_LIST_HEAD(&vm->evicted_user);
  INIT_LIST_HEAD(&vm->relocated);
  INIT_LIST_HEAD(&vm->moved);
  INIT_LIST_HEAD(&vm->idle);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

index 12cac06fa289..939d0c2219c0 100644
--- 

[PATCH v2] drm/amdgpu: Move mca debug mode decision to ras

2023-11-09 Thread Lijo Lazar
Refactor code such that ras block decides the default mca debug mode,
and not swsmu block.

By default mca debug mode is set to true for now.

Signed-off-by: Lijo Lazar 
---
v2: Set mca debug mode early before ras block late init as ras query is
initiated during late init of ras blocks (KevinYang)

 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 14 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  2 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 12 
 4 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 5a828c175e3a..cb51a46e8535 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -218,7 +218,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 
val)
struct amdgpu_device *adev = (struct amdgpu_device *)data;
int ret;
 
-   ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false);
+   ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);
if (ret)
return ret;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b7fe5951b166..bb4f1e60c099 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3102,6 +3102,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
 
+   amdgpu_ras_set_mca_debug_mode(adev, true);
+
	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
dev_warn(adev->dev, "Warning: abnormal ras list 
node.\n");
@@ -3375,12 +3377,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
return 0;
 }
 
-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   int ret;
 
-   if (con)
-   con->is_mca_debug_mode = enable;
+   if (con) {
+   ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
+   if (!ret)
+   con->is_mca_debug_mode = enable;
+   }
+
+   return ret;
 }
 
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 665414c22ca9..e1e0fc66ea49 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -767,7 +767,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev);
 
 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras 
*ras_con);
 
-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
 
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index cecd210397d6..404db2b0fb71 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1524,7 +1524,6 @@ static int smu_v13_0_6_mca_set_debug_mode(struct 
smu_context *smu, bool enable)
if (smu->smc_fw_version < 0x554800)
return 0;
 
-   amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
   enable ? 0 : 
ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
   NULL);
@@ -2346,16 +2345,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct 
smu_context *smu,
return ret;
 }
 
-static int smu_v13_0_6_post_init(struct smu_context *smu)
-{
-   struct amdgpu_device *adev = smu->adev;
-
-   if (!amdgpu_sriov_vf(adev) && adev->ras_enabled)
-   return smu_v13_0_6_mca_set_debug_mode(smu, true);
-
-   return 0;
-}
-
 static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
 {
struct smu_context *smu = adev->powerplay.pp_handle;
@@ -2945,7 +2934,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = 
{
.i2c_init = smu_v13_0_6_i2c_control_init,
.i2c_fini = smu_v13_0_6_i2c_control_fini,
.send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,
-   .post_init = smu_v13_0_6_post_init,
 };
 
 void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)
-- 
2.25.1



RE: [PATCH] drm/amdgpu: Move mca debug mode decision to ras

2023-11-09 Thread Wang, Yang(Kevin)
Hi Lijo,

Following our private discussion: in driver polling mode, RAS will return an 
error count of 0 during the driver probe stage (at that point debug mode is 
still off). So for driver polling mode, debug mode needs to be turned on in 
advance.

Best Regards,
Kevin

-Original Message-
From: Lazar, Lijo  
Sent: Thursday, November 9, 2023 1:26 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Wang, Yang(Kevin) ; Zhou1, 
Tao 
Subject: [PATCH] drm/amdgpu: Move mca debug mode decision to ras

Refactor code such that ras block decides the default mca debug mode, and not 
swsmu block.

By default mca debug mode is set to true for now.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 14 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  2 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 12 
 4 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 5a828c175e3a..cb51a46e8535 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -218,7 +218,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 
val)
struct amdgpu_device *adev = (struct amdgpu_device *)data;
int ret;
 
-   ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false);
+   ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);
if (ret)
return ret;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b7fe5951b166..b2e9ed65a061 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3120,6 +3120,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
amdgpu_ras_block_late_init_default(adev, 
&obj->ras_comm);
}
 
+   amdgpu_ras_set_mca_debug_mode(adev, true);
+
return 0;
 }
 
@@ -3375,12 +3377,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
return 0;
 }
 
-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool 
+enable)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   int ret;
 
-   if (con)
-   con->is_mca_debug_mode = enable;
+   if (con) {
+   ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
+   if (!ret)
+   con->is_mca_debug_mode = enable;
+   }
+
+   return ret;
 }
 
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev) diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 665414c22ca9..e1e0fc66ea49 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -767,7 +767,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev);
 
 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras 
*ras_con);
 
-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool 
+enable);
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
 
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index cecd210397d6..404db2b0fb71 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1524,7 +1524,6 @@ static int smu_v13_0_6_mca_set_debug_mode(struct 
smu_context *smu, bool enable)
if (smu->smc_fw_version < 0x554800)
return 0;
 
-   amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
   enable ? 0 : 
ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
   NULL);
@@ -2346,16 +2345,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct 
smu_context *smu,
return ret;
 }
 
-static int smu_v13_0_6_post_init(struct smu_context *smu) -{
-   struct amdgpu_device *adev = smu->adev;
-
-   if (!amdgpu_sriov_vf(adev) && adev->ras_enabled)
-   return smu_v13_0_6_mca_set_debug_mode(smu, true);
-
-   return 0;
-}
-
 static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)  {
struct smu_context *smu = adev->powerplay.pp_handle; @@ -2945,7 +2934,6 
@@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
.i2c_init = smu_v13_0_6_i2c_control_init,
.i2c_fini = smu_v13_0_6_i2c_control_fini,
.send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,