Re: [PATCH 06/13] drm/amdgpu: use the new drm_exec object for CS v2

2023-06-19 Thread Tatsuyuki Ishi

On 6/20/23 13:07, Tatsuyuki Ishi wrote:
@@ -1296,9 +1271,8 @@ static int amdgpu_cs_submit(struct 
amdgpu_cs_parser *p,

   */
  r = 0;
  amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
-    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
-
-    r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
+    r |= !amdgpu_ttm_tt_get_user_pages_done(e->bo->tbo.ttm,
+    e->range);
  e->range = NULL;


e->range = NULL; needs to be removed, or it's causing *massive* memory 
leaks.


Actually, I quoted the wrong hunk, the correct one is below.


@@ -928,18 +874,56 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser 
*p,
e->user_invalidated = userpage_invalidated;
}
 
-	r = ttm_eu_reserve_buffers(>ticket, >validated, true,

-  );
-   if (unlikely(r != 0)) {
-   if (r != -ERESTARTSYS)
-   DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
-   goto out_free_user_pages;
+   drm_exec_while_not_all_locked(>exec) {
+   r = amdgpu_vm_lock_pd(>vm, >exec);
+   drm_exec_continue_on_contention(>exec);
+   if (unlikely(r))
+   goto out_free_user_pages;
+
+   amdgpu_bo_list_for_each_entry(e, p->bo_list) {
+   r = drm_exec_prepare_obj(>exec, >bo->tbo.base, 2);
+   drm_exec_break_on_contention(>exec);
+   if (unlikely(r))
+   goto out_free_user_pages;
+
+   e->bo_va = amdgpu_vm_bo_find(vm, e->bo);
+   e->range = NULL;


This causes the leak.


+   }
+   drm_exec_continue_on_contention(>exec);
+
+   if (p->uf_bo) {
+   r = drm_exec_prepare_obj(>exec, >uf_bo->tbo.base,
+2);
+   drm_exec_continue_on_contention(>exec);
+   if (unlikely(r))
+   goto out_free_user_pages;
+   }
}


Tatsuyuki


Re: [PATCH 06/13] drm/amdgpu: use the new drm_exec object for CS v2

2023-06-19 Thread Tatsuyuki Ishi

+Boris and +Matthew in case you want to take over this patch set.

Here are some review / testing comments, including those I posted before 
to ease tracking.


On 5/4/23 20:51, Christian König wrote:

Use the new component here as well and remove the old handling.

v2: drop duplicate handling

Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |   1 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  71 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |   5 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 210 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |   7 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  22 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |   3 -
  7 files changed, 115 insertions(+), 204 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 02b827785e39..eba3e4f01ea6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -133,6 +141,8 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, 
struct drm_file *filp,
  
  	list->first_userptr = first_userptr;

list->num_entries = num_entries;
+   sort(array, last_entry, sizeof(struct amdgpu_bo_list_entry),
+amdgpu_bo_list_entry_cmp, NULL);


Previously amdgpu_bo_list_get_list sorted all entries, but this one only 
sorts userptr entries. I think this changes behavior?



@@ -928,18 +874,56 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser 
*p,
e->user_invalidated = userpage_invalidated;
}
  
-	r = ttm_eu_reserve_buffers(>ticket, >validated, true,

-  );
-   if (unlikely(r != 0)) {
-   if (r != -ERESTARTSYS)
-   DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
-   goto out_free_user_pages;
+   drm_exec_while_not_all_locked(>exec) {
+   r = amdgpu_vm_lock_pd(>vm, >exec);
+   drm_exec_continue_on_contention(>exec);


Duplicate handling is needed for pretty much every call of 
amdgpu_vm_lock_pd, as bo->tbo.base.resv == vm->root.bo->tbo.base.resv 
for AMDGPU_GEM_CREATE_VM_ALWAYS_VALID.


I think Boris's suggestion of handling this through a common
DRM_EXEC_FLAG_ALLOW_DUPLICATES flag fits well.
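
For reference (editorial illustration only, not taken from the series), a
minimal sketch of what such flag-based handling could look like inside
drm_exec_prepare_obj(); it mirrors the -EALREADY handling at the end of the
drm_exec v4 patch (01/13) quoted later in this digest:

	if (exec->flags & DRM_EXEC_FLAG_INTERRUPTIBLE)
		ret = dma_resv_lock_interruptible(obj->resv, &exec->ticket);
	else
		ret = dma_resv_lock(obj->resv, &exec->ticket);

	/*
	 * With DRM_EXEC_FLAG_ALLOW_DUPLICATES set, hitting the same
	 * reservation twice (e.g. a BO sharing the PD's resv for
	 * AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) is not an error: just go on
	 * and reserve fence slots on the already locked object.
	 */
	if (unlikely(ret == -EALREADY &&
		     (exec->flags & DRM_EXEC_FLAG_ALLOW_DUPLICATES)))
		goto reserve_fences;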



+   if (unlikely(r))
+   goto out_free_user_pages;
+
+   amdgpu_bo_list_for_each_entry(e, p->bo_list) {
+   r = drm_exec_prepare_obj(>exec, >bo->tbo.base, 2);


Previously there were comments explaining how the fence count is
calculated; now they seem to be removed. I'd prefer that they were
properly retained, as finding out who calls dma_resv_add_fence() is not
trivial, and a wrong reservation count can be really hard to debug.


Likewise for amdgpu_vm_lock_pd (which was added in another commit).
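
Purely as an illustration of the kind of comment being asked for (the
wording is a placeholder, not taken from the patch), it could look like:

	/*
	 * Reserve two fence slots: enumerate here which callers of
	 * dma_resv_add_fence() consume them, so the count stays
	 * verifiable.
	 */
	r = drm_exec_prepare_obj(&p->exec, &e->bo->tbo.base, 2);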


+   drm_exec_break_on_contention(>exec);
+   if (unlikely(r))
+   goto out_free_user_pages;
+
+   e->bo_va = amdgpu_vm_bo_find(vm, e->bo);
+   e->range = NULL;
+   }
+   drm_exec_continue_on_contention(>exec);
+
+   if (p->uf_bo) {
+   r = drm_exec_prepare_obj(>exec, >uf_bo->tbo.base,
+2);
+   drm_exec_continue_on_contention(>exec);
+   if (unlikely(r))
+   goto out_free_user_pages;
+   }
}
  
-	amdgpu_bo_list_for_each_entry(e, p->bo_list) {

-   struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
+   amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
+   struct mm_struct *usermm;
  
-		e->bo_va = amdgpu_vm_bo_find(vm, bo);

+   usermm = amdgpu_ttm_tt_get_usermm(e->bo->tbo.ttm);
+   if (usermm && usermm != current->mm) {
+   r = -EPERM;
+   goto out_free_user_pages;
+   }
+
+   if (amdgpu_ttm_tt_is_userptr(e->bo->tbo.ttm) &&
+   e->user_invalidated && e->user_pages) {
+   amdgpu_bo_placement_from_domain(e->bo,
+   AMDGPU_GEM_DOMAIN_CPU);
+   r = ttm_bo_validate(>bo->tbo, >bo->placement,
+   );
+   if (r)
+   goto out_free_user_pages;
+
+   amdgpu_ttm_tt_set_user_pages(e->bo->tbo.ttm,
+e->user_pages);
+   }
+
+   kvfree(e->user_pages);
+   e->user_pages = NULL;
}
  
  	amdgpu_cs_get_threshold_for_moves(p->adev, >bytes_moved_threshold,

@@ -1296,9 +1271,8 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 */
r = 0;

[PATCH] drm/amd/pm: add abnormal fan detection for smu 13.0.0

2023-06-19 Thread Kenneth Feng
add abnormal fan detection for smu 13.0.0

Signed-off-by: Kenneth Feng 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index a6083957ae51..124287cbbff8 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -1710,6 +1710,7 @@ static int 
smu_v13_0_0_get_thermal_temperature_range(struct smu_context *smu,
range->mem_emergency_max = 
(pptable->SkuTable.TemperatureLimit[TEMP_MEM] + CTF_OFFSET_MEM)*
SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
range->software_shutdown_temp = powerplay_table->software_shutdown_temp;
+   range->software_shutdown_temp_offset = 
pptable->SkuTable.FanAbnormalTempLimitOffset;
 
return 0;
 }
-- 
2.34.1



Re: [PATCH v6 2/8] PCI/VGA: Deal only with VGA class devices

2023-06-19 Thread Sui Jingfeng

Hi,

On 2023/6/20 02:12, Limonciello, Mario wrote:


On 6/12/2023 2:25 PM, Sui Jingfeng wrote:

From: Sui Jingfeng 

Deal only with VGA class devices (pdev->class == 0x0300), so replace the
pci_get_subsys() function with pci_get_class(). Filter out the display
devices that are not VGA class (pdev->class != 0x0300); there is no need
to process non-display PCI devices.

Signed-off-by: Sui Jingfeng 
---

This also means that deleting a PCI device no longer needs
to walk the list.

Reviewed-by: Mario Limonciello 


Thanks a lot,

could you help by resending this precious R-B to the V7 of this series [1]?

This is V6.

[1] https://patchwork.freedesktop.org/series/119250/


  drivers/pci/vgaarb.c | 22 --
  1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/vgaarb.c b/drivers/pci/vgaarb.c
index c1bc6c983932..22a505e877dc 100644
--- a/drivers/pci/vgaarb.c
+++ b/drivers/pci/vgaarb.c
@@ -754,10 +754,6 @@ static bool vga_arbiter_add_pci_device(struct 
pci_dev *pdev)

  struct pci_dev *bridge;
  u16 cmd;
  -    /* Only deal with VGA class devices */
-    if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
-    return false;
-
  /* Allocate structure */
  vgadev = kzalloc(sizeof(struct vga_device), GFP_KERNEL);
  if (vgadev == NULL) {
@@ -1500,7 +1496,9 @@ static int pci_notify(struct notifier_block 
*nb, unsigned long action,

  struct pci_dev *pdev = to_pci_dev(dev);
  bool notify = false;
  -    vgaarb_dbg(dev, "%s\n", __func__);
+    /* Only deal with VGA class devices */
+    if (pdev->class != PCI_CLASS_DISPLAY_VGA << 8)
+    return 0;
    /* For now we're only intereted in devices added and removed. 
I didn't

   * test this thing here, so someone needs to double check for the
@@ -1510,6 +1508,8 @@ static int pci_notify(struct notifier_block 
*nb, unsigned long action,

  else if (action == BUS_NOTIFY_DEL_DEVICE)
  notify = vga_arbiter_del_pci_device(pdev);
  +    vgaarb_dbg(dev, "%s: action = %lu\n", __func__, action);
+
  if (notify)
  vga_arbiter_notify_clients();
  return 0;
@@ -1534,8 +1534,8 @@ static struct miscdevice vga_arb_device = {
    static int __init vga_arb_device_init(void)
  {
+    struct pci_dev *pdev = NULL;
  int rc;
-    struct pci_dev *pdev;
    rc = misc_register(_arb_device);
  if (rc < 0)
@@ -1545,11 +1545,13 @@ static int __init vga_arb_device_init(void)
    /* We add all PCI devices satisfying VGA class in the arbiter by
   * default */
-    pdev = NULL;
-    while ((pdev =
-    pci_get_subsys(PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
-   PCI_ANY_ID, pdev)) != NULL)
+    while (1) {
+    pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev);
+    if (!pdev)
+    break;
+
  vga_arbiter_add_pci_device(pdev);
+    }
    pr_info("loaded\n");
  return rc;


--
Jingfeng



Re: [PATCH] drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute

2023-06-19 Thread Felix Kuehling

On 2023-06-19 17:28, Xiaogang.Chen wrote:

From: Xiaogang Chen 

Since we allow KFD and graphics to operate on the same GPU VM for
interoperation between them, the GPU VM may already have been used by
graphics VM operations before KFD turns it into a compute VM. Remove the
VM clean check from amdgpu_vm_make_compute.

Signed-off-by: Xiaogang Chen


Reviewed-by: Felix Kuehling 

As discussed, we can follow this up with a change that enables ATS for 
graphics VMs as well, so we don't need to enable ATS in 
amdgpu_vm_make_compute. This would improve interop for Raven. We only 
enable ATS for the lower half of the address space, so it should not 
affect graphics clients that use the upper half.


Thanks,
  Felix



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index eff73c428b12..291977b93b1d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2245,16 +2245,16 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, 
struct amdgpu_vm *vm)
if (r)
return r;
  
-	/* Sanity checks */

-   if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
-   r = -EINVAL;
-   goto unreserve_bo;
-   }
-
/* Check if PD needs to be reinitialized and do it before
 * changing any other state, in case it fails.
 */
if (pte_support_ats != vm->pte_support_ats) {
+   /* Sanity checks */
+   if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
+   r = -EINVAL;
+   goto unreserve_bo;
+   }
+
vm->pte_support_ats = pte_support_ats;
r = amdgpu_vm_pt_clear(adev, vm, to_amdgpu_bo_vm(vm->root.bo),
   false);


[PATCH] drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute

2023-06-19 Thread Xiaogang . Chen
From: Xiaogang Chen 

Since we allow KFD and graphics to operate on the same GPU VM for
interoperation between them, the GPU VM may already have been used by
graphics VM operations before KFD turns it into a compute VM. Remove the
VM clean check from amdgpu_vm_make_compute.

Signed-off-by: Xiaogang Chen
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index eff73c428b12..291977b93b1d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2245,16 +2245,16 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, 
struct amdgpu_vm *vm)
if (r)
return r;
 
-   /* Sanity checks */
-   if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
-   r = -EINVAL;
-   goto unreserve_bo;
-   }
-
/* Check if PD needs to be reinitialized and do it before
 * changing any other state, in case it fails.
 */
if (pte_support_ats != vm->pte_support_ats) {
+   /* Sanity checks */
+   if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
+   r = -EINVAL;
+   goto unreserve_bo;
+   }
+
vm->pte_support_ats = pte_support_ats;
r = amdgpu_vm_pt_clear(adev, vm, to_amdgpu_bo_vm(vm->root.bo),
   false);
-- 
2.25.1



Re: [PATCH] drm/amd: Disable PSR-SU on Parade 0803 TCON

2023-06-19 Thread Hamza Mahfooz

On 6/19/23 16:13, Mario Limonciello wrote:

A number of users have reported random hangs caused by PSR-SU,
specifically on panels that contain the Parade 0803 TCON. Users have
been able to work around the issue by disabling PSR entirely.

To avoid these hangs, disable PSR-SU when this TCON is found.

Cc: Sean Wang 
Cc: Marc Rossi 
Cc: Hamza Mahfooz 
Suggested-by: Tsung-hua (Ryan) Lin 
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2443
Signed-off-by: Mario Limonciello 


Reviewed-by: Hamza Mahfooz 


---
  drivers/gpu/drm/amd/display/modules/power/power_helpers.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/modules/power/power_helpers.c 
b/drivers/gpu/drm/amd/display/modules/power/power_helpers.c
index 30349881a283..b9e78451a3d5 100644
--- a/drivers/gpu/drm/amd/display/modules/power/power_helpers.c
+++ b/drivers/gpu/drm/amd/display/modules/power/power_helpers.c
@@ -839,6 +839,8 @@ bool is_psr_su_specific_panel(struct dc_link *link)
((dpcd_caps->sink_dev_id_str[1] == 0x08 && 
dpcd_caps->sink_dev_id_str[0] == 0x08) ||
(dpcd_caps->sink_dev_id_str[1] == 0x08 && 
dpcd_caps->sink_dev_id_str[0] == 0x07)))
isPSRSUSupported = false;
+   else if (dpcd_caps->sink_dev_id_str[1] == 0x08 && 
dpcd_caps->sink_dev_id_str[0] == 0x03)
+   isPSRSUSupported = false;
else if (dpcd_caps->psr_info.force_psrsu_cap == 0x1)
isPSRSUSupported = true;
}

--
Hamza



[PATCH] drm/amd: Disable PSR-SU on Parade 0803 TCON

2023-06-19 Thread Mario Limonciello
A number of users have reported random hangs caused by PSR-SU,
specifically on panels that contain the Parade 0803 TCON. Users have
been able to work around the issue by disabling PSR entirely.

To avoid these hangs, disable PSR-SU when this TCON is found.

Cc: Sean Wang 
Cc: Marc Rossi 
Cc: Hamza Mahfooz 
Suggested-by: Tsung-hua (Ryan) Lin 
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2443
Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/display/modules/power/power_helpers.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/modules/power/power_helpers.c 
b/drivers/gpu/drm/amd/display/modules/power/power_helpers.c
index 30349881a283..b9e78451a3d5 100644
--- a/drivers/gpu/drm/amd/display/modules/power/power_helpers.c
+++ b/drivers/gpu/drm/amd/display/modules/power/power_helpers.c
@@ -839,6 +839,8 @@ bool is_psr_su_specific_panel(struct dc_link *link)
((dpcd_caps->sink_dev_id_str[1] == 0x08 && 
dpcd_caps->sink_dev_id_str[0] == 0x08) ||
(dpcd_caps->sink_dev_id_str[1] == 0x08 && 
dpcd_caps->sink_dev_id_str[0] == 0x07)))
isPSRSUSupported = false;
+   else if (dpcd_caps->sink_dev_id_str[1] == 0x08 && 
dpcd_caps->sink_dev_id_str[0] == 0x03)
+   isPSRSUSupported = false;
else if (dpcd_caps->psr_info.force_psrsu_cap == 0x1)
isPSRSUSupported = true;
}
-- 
2.34.1



Re: [PATCH] drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute

2023-06-19 Thread Felix Kuehling



On 2023-06-19 15:06, Xiaogang.Chen wrote:

From: Xiaogang Chen 

Since we allow KFD and graphics to operate on the same GPU VM for
interoperation between them, the GPU VM may already have been used by
graphics VM operations before KFD turns a GFX VM into a compute VM.
Remove the VM clean check from amdgpu_vm_make_compute.

Signed-off-by: Xiaogang Chen
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index eff73c428b12..33f05297ab7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2246,7 +2246,7 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, 
struct amdgpu_vm *vm)
return r;
  
  	/* Sanity checks */

-   if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
+   if (pte_support_ats && !amdgpu_vm_pt_is_root_clean(adev, vm)) {


I think the correct condition here would be "pte_support_ats != 
vm->pte_support_ats", because that's what's used to reinitialize the 
page table just below. I think it would be even cleaner if you moved 
that check inside the "if (pte_support_ats != vm->pte_support_ats)" 
block below.
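
A minimal sketch of that restructuring (this is effectively what the v2 of
the patch, quoted earlier in this digest, ends up doing):

	/* Check if PD needs to be reinitialized and do it before
	 * changing any other state, in case it fails.
	 */
	if (pte_support_ats != vm->pte_support_ats) {
		/* Sanity checks */
		if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
			r = -EINVAL;
			goto unreserve_bo;
		}

		vm->pte_support_ats = pte_support_ats;
		r = amdgpu_vm_pt_clear(adev, vm, to_amdgpu_bo_vm(vm->root.bo),
				       false);
		if (r)
			goto unreserve_bo;
	}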


Regards,
  Felix



r = -EINVAL;
goto unreserve_bo;
}


[PATCH] drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute

2023-06-19 Thread Xiaogang . Chen
From: Xiaogang Chen 

Since we allow KFD and graphics to operate on the same GPU VM for
interoperation between them, the GPU VM may already have been used by
graphics VM operations before KFD turns a GFX VM into a compute VM.
Remove the VM clean check from amdgpu_vm_make_compute.

Signed-off-by: Xiaogang Chen
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index eff73c428b12..33f05297ab7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2246,7 +2246,7 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, 
struct amdgpu_vm *vm)
return r;
 
/* Sanity checks */
-   if (!amdgpu_vm_pt_is_root_clean(adev, vm)) {
+   if (pte_support_ats && !amdgpu_vm_pt_is_root_clean(adev, vm)) {
r = -EINVAL;
goto unreserve_bo;
}
-- 
2.25.1



Re: [PATCH v6 2/8] PCI/VGA: Deal only with VGA class devices

2023-06-19 Thread Limonciello, Mario



On 6/12/2023 2:25 PM, Sui Jingfeng wrote:

From: Sui Jingfeng 

Deal only with VGA class devices (pdev->class == 0x0300), so replace the
pci_get_subsys() function with pci_get_class(). Filter out the display
devices that are not VGA class (pdev->class != 0x0300); there is no need
to process non-display PCI devices.

Signed-off-by: Sui Jingfeng 
---

This also means that deleting a PCI device no longer needs
to walk the list.

Reviewed-by: Mario Limonciello 


  drivers/pci/vgaarb.c | 22 --
  1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/vgaarb.c b/drivers/pci/vgaarb.c
index c1bc6c983932..22a505e877dc 100644
--- a/drivers/pci/vgaarb.c
+++ b/drivers/pci/vgaarb.c
@@ -754,10 +754,6 @@ static bool vga_arbiter_add_pci_device(struct pci_dev 
*pdev)
struct pci_dev *bridge;
u16 cmd;
  
-	/* Only deal with VGA class devices */

-   if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
-   return false;
-
/* Allocate structure */
vgadev = kzalloc(sizeof(struct vga_device), GFP_KERNEL);
if (vgadev == NULL) {
@@ -1500,7 +1496,9 @@ static int pci_notify(struct notifier_block *nb, unsigned 
long action,
struct pci_dev *pdev = to_pci_dev(dev);
bool notify = false;
  
-	vgaarb_dbg(dev, "%s\n", __func__);

+   /* Only deal with VGA class devices */
+   if (pdev->class != PCI_CLASS_DISPLAY_VGA << 8)
+   return 0;
  
  	/* For now we're only intereted in devices added and removed. I didn't

 * test this thing here, so someone needs to double check for the
@@ -1510,6 +1508,8 @@ static int pci_notify(struct notifier_block *nb, unsigned 
long action,
else if (action == BUS_NOTIFY_DEL_DEVICE)
notify = vga_arbiter_del_pci_device(pdev);
  
+	vgaarb_dbg(dev, "%s: action = %lu\n", __func__, action);

+
if (notify)
vga_arbiter_notify_clients();
return 0;
@@ -1534,8 +1534,8 @@ static struct miscdevice vga_arb_device = {
  
  static int __init vga_arb_device_init(void)

  {
+   struct pci_dev *pdev = NULL;
int rc;
-   struct pci_dev *pdev;
  
  	rc = misc_register(_arb_device);

if (rc < 0)
@@ -1545,11 +1545,13 @@ static int __init vga_arb_device_init(void)
  
  	/* We add all PCI devices satisfying VGA class in the arbiter by

 * default */
-   pdev = NULL;
-   while ((pdev =
-   pci_get_subsys(PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
-  PCI_ANY_ID, pdev)) != NULL)
+   while (1) {
+   pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev);
+   if (!pdev)
+   break;
+
vga_arbiter_add_pci_device(pdev);
+   }
  
  	pr_info("loaded\n");

return rc;


[PATCHv4] drm/amdgpu: Update invalid PTE flag setting

2023-06-19 Thread Mukul Joshi
Update the invalid PTE flag setting used when TF (translate further)
is enabled. This is to ensure that, in addition to transitioning the
retry fault to a no-retry fault, it also causes the wavefront to enter
the trap handler. With the current setting, the fault only transitions
to a no-retry fault. Additionally, have two sets of invalid PTE
settings, one for TF enabled and the other for TF disabled; the setting
for TF disabled doesn't work with TF enabled.

Signed-off-by: Mukul Joshi 
---
v1->v2:
- Update handling according to Christian's feedback.

v2->v3:
- Remove ASIC specific callback (Felix).

v3->v4:
- Add noretry_flags to adev->gmc. This allows setting
  ASIC-specific flags.

 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h   |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 31 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c|  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  1 +
 9 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 56d73fade568..fdc25cd559b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -331,6 +331,8 @@ struct amdgpu_gmc {
u64 VM_CONTEXT_PAGE_TABLE_END_ADDR_LO32[16];
u64 VM_CONTEXT_PAGE_TABLE_END_ADDR_HI32[16];
u64 MC_VM_MX_L1_TLB_CNTL;
+
+   u64 noretry_flags;
 };
 
 #define amdgpu_gmc_flush_gpu_tlb(adev, vmid, vmhub, type) 
((adev)->gmc.gmc_funcs->flush_gpu_tlb((adev), (vmid), (vmhub), (type)))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index eff73c428b12..8c7861a4d75d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2604,7 +2604,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,
/* Intentionally setting invalid PTE flag
 * combination to force a no-retry-fault
 */
-   flags = AMDGPU_PTE_SNOOPED | AMDGPU_PTE_PRT;
+   flags = AMDGPU_VM_NORETRY_FLAGS;
value = 0;
} else if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_NEVER) {
/* Redirect the access to the dummy page */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 9c85d494f2a2..b81fcb962d8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -84,7 +84,13 @@ struct amdgpu_mem_stats;
 /* PDE Block Fragment Size for VEGA10 */
 #define AMDGPU_PDE_BFS(a)  ((uint64_t)a << 59)
 
+/* Flag combination to set no-retry with TF disabled */
+#define AMDGPU_VM_NORETRY_FLAGS	(AMDGPU_PTE_EXECUTABLE | AMDGPU_PDE_PTE | \
+   AMDGPU_PTE_TF)
 
+/* Flag combination to set no-retry with TF enabled */
+#define AMDGPU_VM_NORETRY_FLAGS_TF (AMDGPU_PTE_VALID | AMDGPU_PTE_SYSTEM | \
+  AMDGPU_PTE_PRT)
 /* For GFX9 */
 #define AMDGPU_PTE_MTYPE_VG10(a)   ((uint64_t)(a) << 57)
 #define AMDGPU_PTE_MTYPE_VG10_MASK AMDGPU_PTE_MTYPE_VG10(3ULL)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index dea1a64be44d..24ddf6a0512a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -778,6 +778,27 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params 
*params,
1, 0, flags);
 }
 
+/**
+ * amdgpu_vm_pte_update_noretry_flags - Update PTE no-retry flags
+ *
+ * @adev - amdgpu_device pointer
+ * @flags: pointer to PTE flags
+ *
+ * Update PTE no-retry flags when TF is enabled.
+ */
+static void amdgpu_vm_pte_update_noretry_flags(struct amdgpu_device *adev,
+   uint64_t *flags)
+{
+   /*
+* Update no-retry flags with the corresponding TF
+* no-retry combination.
+*/
+   if ((*flags & AMDGPU_VM_NORETRY_FLAGS) == AMDGPU_VM_NORETRY_FLAGS) {
+   *flags &= ~AMDGPU_VM_NORETRY_FLAGS;
+   *flags |= adev->gmc.noretry_flags;
+   }
+}
+
 /*
  * amdgpu_vm_pte_update_flags - figure out flags for PTE updates
  *
@@ -804,6 +825,16 @@ static void amdgpu_vm_pte_update_flags(struct 
amdgpu_vm_update_params *params,
flags |= AMDGPU_PTE_EXECUTABLE;
}
 
+   /*
+* Update no-retry flags to use the no-retry flag combination
+* with TF enabled. The AMDGPU_VM_NORETRY_FLAGS flag combination
+* does not work when TF is enabled. So, replace them with
+* AMDGPU_VM_NORETRY_FLAGS_TF flag combination which works for
+* all cases.
+*/
+   if (level == 

Re: [PATCH V3 4/7] drm/amd/pm: setup the framework to support Wifi RFI mitigation feature

2023-06-19 Thread Lazar, Lijo




On 6/16/2023 12:27 PM, Evan Quan wrote:

With the WBRF feature supported, the amdgpu driver, as a driver responding
to the reported frequencies, is able to do shadow pstate switching to
mitigate possible interference between its (G-)DDR memory clocks and the
local radio module frequency bands used by Wifi 6/6e/7.

To make the WBRF feature functional, the kernel needs to be configured with
CONFIG_ACPI_WBRF, and the platform needs to be equipped with the necessary
ACPI based mechanism to get the amdgpu driver notified.
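
Purely as an illustration (not part of the patch), a kernel config fragment
for exercising this path, assuming the Kconfig symbol keeps the name used in
this series, would be:

	CONFIG_ACPI=y
	CONFIG_ACPI_WBRF=y
	CONFIG_DRM_AMDGPU=m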

Signed-off-by: Evan Quan 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  26 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c  |  63 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  19 ++
  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 184 ++
  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  20 ++
  drivers/gpu/drm/amd/pm/swsmu/smu_internal.h   |   3 +
  6 files changed, 315 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 02b827785e39..2f2ec64ed1b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -50,6 +50,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #include 

  #include 
@@ -241,6 +242,7 @@ extern int amdgpu_num_kcq;
  #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024)
  extern int amdgpu_vcnfw_log;
  extern int amdgpu_sg_display;
+extern int amdgpu_wbrf;
  
  #define AMDGPU_VM_MAX_NUM_CTX			4096

  #define AMDGPU_SG_THRESHOLD   (256*1024*1024)
@@ -741,6 +743,9 @@ struct amdgpu_reset_domain;
   */
  #define AMDGPU_HAS_VRAM(_adev) ((_adev)->gmc.real_vram_size)
  
+typedef

+void (*wbrf_notify_handler) (struct amdgpu_device *adev);
+
  struct amdgpu_device {
struct device   *dev;
struct pci_dev  *pdev;
@@ -1050,6 +1055,8 @@ struct amdgpu_device {
  
  	booljob_hang;

booldc_enabled;
+
+   wbrf_notify_handler wbrf_event_handler;
  };
  
  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)

@@ -1381,6 +1388,25 @@ static inline int amdgpu_acpi_smart_shift_update(struct 
drm_device *dev,
 enum amdgpu_ss ss_state) { 
return 0; }
  #endif
  
+#if defined(CONFIG_ACPI_WBRF)

+bool amdgpu_acpi_is_wbrf_supported(struct amdgpu_device *adev);
+int amdgpu_acpi_wbrf_retrieve_exclusions(struct amdgpu_device *adev,
+struct wbrf_ranges_out 
*exclusions_out);
+int amdgpu_acpi_register_wbrf_notify_handler(struct amdgpu_device *adev,
+wbrf_notify_handler handler);
+int amdgpu_acpi_unregister_wbrf_notify_handler(struct amdgpu_device *adev);
+#else
+static inline bool amdgpu_acpi_is_wbrf_supported(struct amdgpu_device *adev) { 
return false; }
+static inline
+int amdgpu_acpi_wbrf_retrieve_exclusions(struct amdgpu_device *adev,
+struct wbrf_ranges_out 
*exclusions_out) { return 0; }
+static inline
+int amdgpu_acpi_register_wbrf_notify_handler(struct amdgpu_device *adev,
+wbrf_notify_handler handler) { 
return 0; }
+static inline
+int amdgpu_acpi_unregister_wbrf_notify_handler(struct amdgpu_device *adev) { 
return 0; }
+#endif
+
  #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
  bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev);
  bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
index aeeec211861c..efbe6dd91d1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
@@ -1105,3 +1105,66 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device 
*adev)
  }
  
  #endif /* CONFIG_SUSPEND */

+
+#ifdef CONFIG_ACPI_WBRF
+bool amdgpu_acpi_is_wbrf_supported(struct amdgpu_device *adev)
+{
+   struct acpi_device *acpi_dev = ACPI_COMPANION(adev->dev);
+
+   if (!acpi_dev)
+   return false;
+
+   return wbrf_supported_consumer(acpi_dev);
+}
+
+int amdgpu_acpi_wbrf_retrieve_exclusions(struct amdgpu_device *adev,
+struct wbrf_ranges_out *exclusions_out)
+{
+   struct acpi_device *acpi_dev = ACPI_COMPANION(adev->dev);
+
+   if (!acpi_dev)
+   return -ENODEV;
+
+   return wbrf_retrieve_exclusions(acpi_dev, exclusions_out);
+}
+
+#define CPM_GPU_NOTIFY_COMMAND 0x55
+static void amdgpu_acpi_wbrf_event(acpi_handle handle, u32 event, void *data)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)data;
+
+   if (event == CPM_GPU_NOTIFY_COMMAND &&
+   adev->wbrf_event_handler)
+   adev->wbrf_event_handler(adev);
+}
+
+int amdgpu_acpi_register_wbrf_notify_handler(struct amdgpu_device *adev,
+wbrf_notify_handler handler)

Re: [PATCH] drm/amdgpu: Remove struct drm_driver.gem_prime_mmap

2023-06-19 Thread Thomas Zimmermann

Hi Christian

Am 19.06.23 um 16:13 schrieb Christian König:



Am 19.06.23 um 16:11 schrieb Thomas Zimmermann:

The callback struct drm_driver.gem_prime_mmap has been removed in
commit 0adec22702d4 ("drm: Remove struct drm_driver.gem_prime_mmap").
Do not assign to it. The assigned function, drm_gem_prime_mmap(), is
now the default for the operation, so there is no change in 
functionality.


Signed-off-by: Thomas Zimmermann 
Fixes: 0adec22702d4 ("drm: Remove struct drm_driver.gem_prime_mmap")
Cc: Thomas Zimmermann 
Cc: Alex Deucher 
Cc: "Christian König" 
Cc: "Pan, Xinhui" 
Cc: amd-gfx@lists.freedesktop.org
Cc: dri-de...@lists.freedesktop.org


Reviewed-by: Christian König 


Thanks for the quick response. I'll add the patch to drm-misc-next 
immediately, to make the tree's amdgpu build again.


Best regards
Thomas




---
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 -
  1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index 43613569801b6..07e16ad465d06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2879,7 +2879,6 @@ const struct drm_driver amdgpu_partition_driver = {
  .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
  .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
  .gem_prime_import = amdgpu_gem_prime_import,
-    .gem_prime_mmap = drm_gem_prime_mmap,
  .name = DRIVER_NAME,
  .desc = DRIVER_DESC,




--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstrasse 146, 90461 Nuernberg, Germany
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman
HRB 36809 (AG Nuernberg)




Re: [PATCH] drm/amdgpu: Remove struct drm_driver.gem_prime_mmap

2023-06-19 Thread Christian König




Am 19.06.23 um 16:11 schrieb Thomas Zimmermann:

The callback struct drm_driver.gem_prime_mmap has been removed in
commit 0adec22702d4 ("drm: Remove struct drm_driver.gem_prime_mmap").
Do not assign to it. The assigned function, drm_gem_prime_mmap(), is
now the default for the operation, so there is no change in functionality.

Signed-off-by: Thomas Zimmermann 
Fixes: 0adec22702d4 ("drm: Remove struct drm_driver.gem_prime_mmap")
Cc: Thomas Zimmermann 
Cc: Alex Deucher 
Cc: "Christian König" 
Cc: "Pan, Xinhui" 
Cc: amd-gfx@lists.freedesktop.org
Cc: dri-de...@lists.freedesktop.org


Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 -
  1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 43613569801b6..07e16ad465d06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2879,7 +2879,6 @@ const struct drm_driver amdgpu_partition_driver = {
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
.gem_prime_import = amdgpu_gem_prime_import,
-   .gem_prime_mmap = drm_gem_prime_mmap,
  
  	.name = DRIVER_NAME,

.desc = DRIVER_DESC,




[PATCH] drm/amdgpu: Remove struct drm_driver.gem_prime_mmap

2023-06-19 Thread Thomas Zimmermann
The callback struct drm_driver.gem_prime_mmap has been removed in
commit 0adec22702d4 ("drm: Remove struct drm_driver.gem_prime_mmap").
Do not assign to it. The assigned function, drm_gem_prime_mmap(), is
now the default for the operation, so there is no change in functionality.

Signed-off-by: Thomas Zimmermann 
Fixes: 0adec22702d4 ("drm: Remove struct drm_driver.gem_prime_mmap")
Cc: Thomas Zimmermann 
Cc: Alex Deucher 
Cc: "Christian König" 
Cc: "Pan, Xinhui" 
Cc: amd-gfx@lists.freedesktop.org
Cc: dri-de...@lists.freedesktop.org
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 43613569801b6..07e16ad465d06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2879,7 +2879,6 @@ const struct drm_driver amdgpu_partition_driver = {
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
.gem_prime_import = amdgpu_gem_prime_import,
-   .gem_prime_mmap = drm_gem_prime_mmap,
 
.name = DRIVER_NAME,
.desc = DRIVER_DESC,
-- 
2.41.0



Re: Warning appeared after c8b5a95 ("drm/amdgpu: Fix desktop freezed after gpu-reset")

2023-06-19 Thread Alex Deucher
On Mon, Jun 19, 2023 at 9:05 AM Christian Kastner  wrote:
>
> Hi,
>
> On a Debian 12 ("bookworm") system, I observed a new warning when I
> upgraded from kernel 6.1.25 to 6.1.27. This is on a system with an RX
> 6800 XT GPU and 3500X processor.
>
> I've traced it down to commit c8b5a95 ("drm/amdgpu: Fix desktop freezed
> after gpu-reset"). Rebuilding the 6.1.27 kernel without this change
> makes the warning disappear.
>
> I can reliably trigger this (and another) warning with
>
>   $ sudo cat /sys/kernel/debug/dri/0/amdgpu_test_ib
>   run ib test:
>   ib ring tests passed.
>
> 5 or 6 seconds after this, two warnings are printed. I see these same
> two warnings on system shutdown (or, at least, they looked similar
> enough to the above that I didn't check for identity).
>
> I've attached
>   (1) the dmesg output after modprobe'ing amdgpu
>   (2) the dmesg output after triggering amdgpu_test_ib
>
> The system in question is only used for ROCm development. I haven't
> observed any other side effects there, other than the warning. There's
> no monitor attached. So I can't speak to the effect of a desktop freeze.

The warnings are harmless, but they have been fixed[1] and the fixes
are making their way back to stable kernels.

[1] - 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=08c677cb0b436a96a836792bb35a8ec5de4999c2

Alex


[PATCH Review V2 1/1] drm/amdgpu: Remove redundant poison consumption handler function

2023-06-19 Thread Stanley . Yang
The handle_poison_consumption callback and the poison_consumption_handler
callback do almost the same thing to handle poison consumption, so remove
poison_consumption_handler.

Changed from V1:
Add a handle_poison_consumption function for VCN 2.6, VCN 4.0,
JPEG 2.6 and JPEG 4.0; return false when handling VCN/JPEG poison
consumption.

Signed-off-by: Stanley.Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  9 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  4 
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  8 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  3 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 12 +---
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c   |  9 +
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c   |  9 +
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c|  9 +
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c|  9 +
 9 files changed, 50 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a33d4bc34cee..c15dbdb2e0f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -840,15 +840,6 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
return 0;
 }
 
-int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
-   struct amdgpu_iv_entry *entry)
-{
-   if (adev->gfx.ras && adev->gfx.ras->poison_consumption_handler)
-   return adev->gfx.ras->poison_consumption_handler(adev, entry);
-
-   return 0;
-}
-
 int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
void *err_data,
struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index d0c3f2955821..95b80bc8cdb9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -273,8 +273,6 @@ struct amdgpu_gfx_ras {
int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
-   int (*poison_consumption_handler)(struct amdgpu_device *adev,
-   struct amdgpu_iv_entry *entry);
 };
 
 struct amdgpu_gfx_shadow_info {
@@ -538,8 +536,6 @@ int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev);
 void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev, uint32_t 
ucode_id);
 
 int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev);
-int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
-   struct amdgpu_iv_entry *entry);
 
 bool amdgpu_gfx_is_master_xcc(struct amdgpu_device *adev, int xcc_id);
 int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5b6525d8dace..9ce7c7537751 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1668,7 +1668,7 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct 
amdgpu_device *adev)
 static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager 
*obj,
struct amdgpu_iv_entry *entry)
 {
-   bool poison_stat = false;
+   bool poison_stat = true;
struct amdgpu_device *adev = obj->adev;
struct amdgpu_ras_block_object *block_obj =
amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
@@ -1694,15 +1694,13 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
amdgpu_umc_poison_handler(adev, false);
 
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
-   poison_stat = 
block_obj->hw_ops->handle_poison_consumption(adev);
+   poison_stat = 
block_obj->hw_ops->handle_poison_consumption(adev, entry);
 
/* gpu reset is fallback for failed and default cases */
-   if (poison_stat) {
+   if (poison_stat != true) {
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is 
issued!\n",
block_obj->ras_comm.name);
amdgpu_ras_reset_gpu(adev);
-   } else {
-   amdgpu_gfx_poison_consumption_handler(adev, entry);
}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 46bf1889a9d7..03f3b3774b85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -564,7 +564,8 @@ struct amdgpu_ras_block_hw_ops {
void (*reset_ras_error_count)(struct amdgpu_device *adev);
void (*reset_ras_error_status)(struct amdgpu_device *adev);
bool (*query_poison_status)(struct amdgpu_device *adev);
-   bool (*handle_poison_consumption)(struct amdgpu_device *adev);
+   

Requests For Proposals for hosting XDC 2024 are now open

2023-06-19 Thread Ricardo Garcia
Hello everyone!

The X.org board is soliciting proposals to host XDC in 2024. Since XDC
2023 is being held in Europe this year, we've decided to host in North
America. However, the board is open to other locations, especially if
there's an interesting co-location with another conference.

If you're considering hosting XDC, we've assembled a wiki page with
what's generally expected and needed:

https://www.x.org/wiki/Events/RFP/

When submitting your proposal, please make sure to include at least the
key information about the potential location in question, possible
dates along with estimated costs. Proposals can be submitted to board
at foundation.x.org until the deadline of *September 17th, 2023*. 

Additionally, a quick early heads-up to the board if you're
considering hosting would be appreciated, in case we need to adjust the
schedule a bit. Also, earlier is better since there generally will be a
bit of Q&A with organizers.

And if you just have some questions about what organizing XDC entails,
please feel free to chat with previous organizers, or someone from the
board.

Thanks,
Ricardo Garcia, on behalf of X.Org


RE: [PATCH] drm/amdgpu/vcn: Need to pause dpg before stop dpg

2023-06-19 Thread Liu, Leo
[AMD Official Use Only - General]

Hi Emily,

Do you want to pause or un-pause dpg mode, based on the change and the
commit message?

With bare metal, before calling the stop, the state of dpg should be
un-paused within the call of amdgpu_vcn_idle_work_handler(). Is that not
the case for SRIOV?

Regards,
Leo


-Original Message-
From: amd-gfx  On Behalf Of Emily Deng
Sent: Monday, June 19, 2023 6:24 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deng, Emily 
Subject: [PATCH] drm/amdgpu/vcn: Need to pause dpg before stop dpg

Need to pause dpg first, or it will hit the following error during dpg stop:
"[drm] Register(1) [regUVD_POWER_STATUS] failed to reach value 0x0001 != 
0xn"

Signed-off-by: Emily Deng 
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index b48bb5212488..259795098173 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -1424,8 +1424,10 @@ static int vcn_v4_0_start_sriov(struct amdgpu_device 
*adev)
  */
 static void vcn_v4_0_stop_dpg_mode(struct amdgpu_device *adev, int inst_idx)  {
+   struct dpg_pause_state state = {.fw_based = VCN_DPG_STATE__UNPAUSE};
uint32_t tmp;

+   vcn_v4_0_pause_dpg_mode(adev, inst_idx, &state);
/* Wait for power status to be 1 */
SOC15_WAIT_ON_RREG(VCN, inst_idx, regUVD_POWER_STATUS, 1,
UVD_POWER_STATUS__UVD_POWER_STATUS_MASK);
--
2.36.1



Warning appeared after c8b5a95 ("drm/amdgpu: Fix desktop freezed after gpu-reset")

2023-06-19 Thread Christian Kastner
Hi,

On a Debian 12 ("bookworm") system, I observed a new warning when I
upgraded from kernel 6.1.25 to 6.1.27. This is on a system with an RX
6800 XT GPU and 3500X processor.

I've traced it down to commit c8b5a95 ("drm/amdgpu: Fix desktop freezed
after gpu-reset"). Rebuilding the 6.1.27 kernel without this change
makes the warning disappear.

I can reliably trigger this (and another) warning with

  $ sudo cat /sys/kernel/debug/dri/0/amdgpu_test_ib
  run ib test:
  ib ring tests passed.

5 or 6 seconds after this, two warnings are printed. I see these same
two warnings on system shutdown (or, at least, they looked similar
enough to the above that I didn't check for identity).

I've attached
  (1) the dmesg output after modprobe'ing amdgpu
  (2) the dmesg output after triggering amdgpu_test_ib

The system in question is only used for ROCm development. I haven't
observed any other side effects there, other than the warning. There's
no monitor attached. So I can't speak to the effect of a desktop freeze.

Best,
Christian[  266.669251] [drm] PCIE GART of 512M enabled (table at 0x0083FEB0).
[  266.669268] [drm] PSP is resuming...
[  266.739148] [drm] reserve 0xa0 from 0x83fd00 for PSP TMR
[  266.876401] amdgpu :09:00.0: amdgpu: SECUREDISPLAY: securedisplay ta 
ucode is not available
[  266.876404] amdgpu :09:00.0: amdgpu: SMU is resuming...
[  266.876407] amdgpu :09:00.0: amdgpu: smu driver if version = 0x0040, 
smu fw if version = 0x0041, smu fw program = 0, version = 0x003a5600 
(58.86.0)
[  266.876410] amdgpu :09:00.0: amdgpu: SMU driver if version not matched
[  266.876428] amdgpu :09:00.0: amdgpu: dpm has been enabled
[  266.879972] amdgpu :09:00.0: amdgpu: SMU is resumed successfully!
[  266.881457] [drm] DMUB hardware initialized: version=0x02020017
[  266.904086] [drm] kiq ring mec 2 pipe 1 q 0
[  266.910932] [drm] VCN decode and encode initialized successfully(under DPG 
Mode).
[  266.911082] [drm] JPEG decode initialized successfully.
[  266.911104] amdgpu :09:00.0: amdgpu: ring gfx_0.0.0 uses VM inv eng 0 on 
hub 0
[  266.911106] amdgpu :09:00.0: amdgpu: ring comp_1.0.0 uses VM inv eng 1 
on hub 0
[  266.911107] amdgpu :09:00.0: amdgpu: ring comp_1.1.0 uses VM inv eng 4 
on hub 0
[  266.911108] amdgpu :09:00.0: amdgpu: ring comp_1.2.0 uses VM inv eng 5 
on hub 0
[  266.911109] amdgpu :09:00.0: amdgpu: ring comp_1.3.0 uses VM inv eng 6 
on hub 0
[  266.90] amdgpu :09:00.0: amdgpu: ring comp_1.0.1 uses VM inv eng 7 
on hub 0
[  266.90] amdgpu :09:00.0: amdgpu: ring comp_1.1.1 uses VM inv eng 8 
on hub 0
[  266.91] amdgpu :09:00.0: amdgpu: ring comp_1.2.1 uses VM inv eng 9 
on hub 0
[  266.92] amdgpu :09:00.0: amdgpu: ring comp_1.3.1 uses VM inv eng 10 
on hub 0
[  266.93] amdgpu :09:00.0: amdgpu: ring kiq_2.1.0 uses VM inv eng 11 
on hub 0
[  266.94] amdgpu :09:00.0: amdgpu: ring sdma0 uses VM inv eng 12 on 
hub 0
[  266.95] amdgpu :09:00.0: amdgpu: ring sdma1 uses VM inv eng 13 on 
hub 0
[  266.96] amdgpu :09:00.0: amdgpu: ring sdma2 uses VM inv eng 14 on 
hub 0
[  266.97] amdgpu :09:00.0: amdgpu: ring sdma3 uses VM inv eng 15 on 
hub 0
[  266.97] amdgpu :09:00.0: amdgpu: ring vcn_dec_0 uses VM inv eng 0 on 
hub 1
[  266.98] amdgpu :09:00.0: amdgpu: ring vcn_enc_0.0 uses VM inv eng 1 
on hub 1
[  266.99] amdgpu :09:00.0: amdgpu: ring vcn_enc_0.1 uses VM inv eng 4 
on hub 1
[  266.911120] amdgpu :09:00.0: amdgpu: ring vcn_dec_1 uses VM inv eng 5 on 
hub 1
[  266.911121] amdgpu :09:00.0: amdgpu: ring vcn_enc_1.0 uses VM inv eng 6 
on hub 1
[  266.911122] amdgpu :09:00.0: amdgpu: ring vcn_enc_1.1 uses VM inv eng 7 
on hub 1
[  266.911123] amdgpu :09:00.0: amdgpu: ring jpeg_dec uses VM inv eng 8 on 
hub 1
[  266.916173] amdgpu :09:00.0: [drm] Cannot find any crtc or sizes
[  266.916177] amdgpu :09:00.0: [drm] Cannot find any crtc or sizes
[  272.409887] [ cut here ]
[  272.409891] WARNING: CPU: 1 PID: 259 at 
drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c:656 amdgpu_irq_put+0x45/0x70 [amdgpu]
[  272.410166] Modules linked in: amdgpu gpu_sched drm_buddy drm_display_helper 
cec rc_core drm_ttm_helper ttm drm_kms_helper i2c_algo_bit ipt_REJECT 
xt_multiport nft_compat ctr ccm wireguard libchacha20poly1305 chacha_x86_64 
poly1305_x86_64 curve25519_x86_64 libcurve25519_generic libchacha 
ip6_udp_tunnel udp_tunnel nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib 
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat 
nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables libcrc32c 
nfnetlink overlay binfmt_misc nls_ascii nls_cp437 vfat fat intel_rapl_msr 
intel_rapl_common amd64_edac edac_mce_amd kvm_amd iwlmvm kvm mac80211 
snd_hda_codec_realtek irqbypass snd_hda_codec_generic ghash_clmulni_intel 
snd_hda_codec_hdmi sha512_ssse3 sha512_generic libarc4 snd_hda_intel 

[PATCH] gpu: drm/amd: Fix traditional comparison using max method

2023-06-19 Thread Li Dong
It would be better to replace the traditional ternary conditional operator with 
max()

Signed-off-by: Li Dong 
---
 drivers/gpu/drm/amd/display/modules/freesync/freesync.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c 
b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
index dbd60811f95d..a5eabde53fa4 100644
--- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
+++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
@@ -1005,8 +1005,7 @@ void mod_freesync_build_vrr_params(struct mod_freesync 
*mod_freesync,
(stream->timing.h_total * 
stream->ctx->dc->caps.max_v_total));
}
/* Limit minimum refresh rate to what can be supported by hardware */
-   min_refresh_in_uhz = min_hardware_refresh_in_uhz > 
in_config->min_refresh_in_uhz ?
-   min_hardware_refresh_in_uhz : in_config->min_refresh_in_uhz;
+   min_refresh_in_uhz = max(min_hardware_refresh_in_uhz, 
in_config->min_refresh_in_uhz);
max_refresh_in_uhz = in_config->max_refresh_in_uhz;
 
/* Full range may be larger than current video timing, so cap at 
nominal */
-- 
2.31.1.windows.1



Re: [PATCH V3 2/7] wifi: mac80211: Add support for ACPI WBRF

2023-06-19 Thread Johannes Berg
On Sun, 2023-06-18 at 21:17 -0500, Mario Limonciello wrote:
> 
> > --- a/include/net/cfg80211.h
> > +++ b/include/net/cfg80211.h
> > @@ -920,6 +920,14 @@ const struct cfg80211_chan_def *
> >   cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1,
> > const struct cfg80211_chan_def *chandef2);
> >   
> > +/**
> > + * nl80211_chan_width_to_mhz - get the channel width in Mhz
> > + * @chan_width: the channel width from  nl80211_chan_width
> > + * Return: channel width in Mhz if the chan_width from  
> > nl80211_chan_width
> > + * is valid. -1 otherwise.
> > + */
> > +int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width);
> > +
> 
> It's up to mac80211 maintainers, but I would think that the change to
> make nl80211_chan_width_to_mhz non-static and exported should be
> separate from the patch that introduces WBRF support in the series.

Yeah, that'd be nice :)


> > +#define KHZ_TO_HZ(freq)((freq) * 1000ULL)

Together with MHZ_TO_KHZ() for example :)

johannes


[PATCH] drm/amd/pm: Provide energy data in 15.625mJ units

2023-06-19 Thread Lijo Lazar
Publish energy data in 15.625 mJ units for SMU v13.0.6. The same unit is
also used on Aldebaran.

Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index a92ea4601ea4..6ef12252beb5 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -200,7 +200,6 @@ struct PPTable_t {
 };
 
 #define SMUQ10_TO_UINT(x) ((x) >> 10)
-#define SMUQ16_TO_UINT(x) ((x) >> 16)
 
 struct smu_v13_0_6_dpm_map {
enum smu_clk_type clk_type;
@@ -1994,8 +1993,9 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct 
smu_context *smu, void **table
 
gpu_metrics->average_socket_power =
SMUQ10_TO_UINT(metrics->SocketPower);
+   /* Energy is reported in 15.625mJ units */
gpu_metrics->energy_accumulator =
-   SMUQ16_TO_UINT(metrics->SocketEnergyAcc);
+   SMUQ10_TO_UINT(metrics->SocketEnergyAcc);
 
gpu_metrics->current_gfxclk =
SMUQ10_TO_UINT(metrics->GfxclkFrequency[xcc0]);
-- 
2.25.1



Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Boris Brezillon
On Mon, 19 Jun 2023 12:44:06 +0200
Christian König  wrote:

> Am 19.06.23 um 12:12 schrieb Boris Brezillon:
> > [SNIP]
> > Note that the drm_exec_until_all_locked() helper I introduced is taking
> > an expression, so in theory, you don't have to define a separate
> > function.
> >
> > drm_exec_until_all_locked(, {
> > /* inlined-code */
> > int ret;
> >
> > ret = blabla()
> > if (ret)
> > goto error;
> >
> > ...
> >
> > error:
> > /* return value. */
> > ret;
> > });
> >
> > This being said, as soon as you have several failure paths,
> > it makes things a lot easier/controllable if you make it a function,
> > and I honestly don't think the readability would suffer from having a
> > function defined just above the user. My main concern with the original
> > approach was the risk of calling continue/break_if_contended() in the
> > wrong place, and also the fact you can't really externalize things to
> > a function if you're looking for a cleaner split. At least with
> > drm_exec_until_all_locked() you can do both.  
> 
> Yeah, but that means that you can't use return inside your code block 
> and instead has to define an error label for handling "normal" 
> contention which is what I'm trying to avoid here.

Sorry, didn't pay attention to this particular concern. Indeed, if you
want to return inside the expression, that's a problem.

> 
> How about:
> 
> #define drm_exec_until_all_locked(exec)    \
>      __drm_exec_retry: if (drm_exec_cleanup(exec))
> 
> 
> #define drm_exec_retry_on_contention(exec)  \
>      if (unlikely(drm_exec_is_contended(exec)))  \
>      goto __drm_exec_retry
> 
> 
> And then use it like:
> 
> drm_exec_until_all_locked(exec)
> {
>      ret = drm_exec_prepare_obj(exec, obj);
>      drm_exec_retry_on_contention(exec);
> }
> 
> The only problem I can see with this is that the __drm_exec_retry label 
> would be function local.

Yeah, I'm not sure it's safe to use non-local labels for that, because,
as soon as you have more than one drm_exec_until_all_locked() call in a
given function it won't work, which is why I placed things in a block
with local labels, which in turn means you can't return directly,
unfortunately.


[PATCH] drm/amd/display: Remove else after return statement in 'dm_update_plane_state'

2023-06-19 Thread Srinivasan Shanmugam
Else is not necessary after return statements, hence remove it.

Reported by checkpatch:

WARNING: else is not generally useful after a break or return
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c:9776:
   return -EINVAL;
   else

Cc: Bhawanpreet Lakha 
Cc: Qingqing Zhuo 
Cc: Nicholas Kazlauskas 
Cc: Rodrigo Siqueira 
Cc: Aurabindo Pillai 
Signed-off-by: Srinivasan Shanmugam 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 2446529c329a..e657214e0104 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -9777,8 +9777,8 @@ static int dm_update_plane_state(struct dc *dc,
if (plane->type == DRM_PLANE_TYPE_OVERLAY) {
if 
(is_video_format(new_plane_state->fb->format->format) && *is_top_most_overlay)
return -EINVAL;
-   else
-   *is_top_most_overlay = false;
+
+   *is_top_most_overlay = false;
}
 
DRM_DEBUG_ATOMIC("Enabling DRM plane: %d on DRM crtc %d\n",
-- 
2.25.1



Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Boris Brezillon
On Mon, 19 Jun 2023 13:05:02 +0200
Boris Brezillon  wrote:

> On Mon, 19 Jun 2023 12:44:06 +0200
> Christian König  wrote:
> 
> > Am 19.06.23 um 12:12 schrieb Boris Brezillon:  
> > > [SNIP]
> > > Note that the drm_exec_until_all_locked() helper I introduced is taking
> > > an expression, so in theory, you don't have to define a separate
> > > function.
> > >
> > >   drm_exec_until_all_locked(, {
> > >   /* inlined-code */
> > >   int ret;
> > >
> > >   ret = blabla()
> > >   if (ret)
> > >   goto error;
> > >
> > >   ...
> > >
> > > error:
> > >   /* return value. */
> > >   ret;
> > >   });
> > >
> > > This being said, as soon as you have several failure paths,
> > > it makes things a lot easier/controllable if you make it a function,
> > > and I honestly don't think the readability would suffer from having a
> > > function defined just above the user. My main concern with the original
> > > approach was the risk of calling continue/break_if_contended() in the
> > > wrong place, and also the fact you can't really externalize things to
> > > a function if you're looking for a cleaner split. At least with
> > > drm_exec_until_all_locked() you can do both.
> > 
> > Yeah, but that means that you can't use return inside your code block 
> > and instead has to define an error label for handling "normal" 
> > contention which is what I'm trying to avoid here.
> > 
> > How about:
> > 
> > #define drm_exec_until_all_locked(exec)    \
> >      __drm_exec_retry: if (drm_exec_cleanup(exec))
> > 
> > 
> > #define drm_exec_retry_on_contention(exec)  \
> >      if (unlikely(drm_exec_is_contended(exec)))  \
> >      goto __drm_exec_retry
> > 
> > 
> > And then use it like:
> > 
> > drm_exec_until_all_locked(exec)
> > {
> >      ret = drm_exec_prepare_obj(exec, obj);
> >      drm_exec_retry_on_contention(exec);
> > }  
> 
> That would work, and I was about to suggest extending my proposal with
> a drm_exec_retry_on_contention() to support both use cases. The only
> downside is the fact you might be able to break out of a loop that has
> local variables, which will leak stack space.

Nevermind, brain fart on my end. It shouldn't leak any stack space, so
yeah, I think that's a good compromise.


Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Intel



On 6/19/23 11:48, Christian König wrote:

Hi,

Am 19.06.23 um 11:33 schrieb Thomas Hellström (Intel):

[SNIP]
Sometimes you want to just drop the contended lock after the above 
relaxation and not add it as prelocked, if the contended object goes out 
of scope. Eviction would in some situations be one such example; 
-EDEADLOCK leading to an error path where the object should otherwise be 
freed is another. Perhaps we could add an argument to prepare_obj() as to 
whether the object should be immediately put after relaxation.


I was considering a try_prepare version as well, that should cover 
this use case.


That sounds a bit different from this use-case. The use-case above 
would, on -EDEADLOCK actually unlock everything, then lock-slow the 
contending lock and then immediately unlock it and drop.


Hui? What would that be good for?


It's for the case where you have nested locking, the contended lock has 
gone out-of-scope and you're probably not going to need it on the next 
attempt. I think the refcounted "prelocked" object that is lacking in 
the i915 variant will resolve all correctness / uaf issues, but still 
the object might be needlessly carried around for yet another locking round.





It sounds like try_prepare would just skip locking and continue with 
everything locked so far still locked?


Correct.








+    ret = drm_exec_obj_locked(exec, obj);
+    if (unlikely(ret)) {
+    dma_resv_unlock(obj->resv);
+    goto error_dropref;
+    }
+
+    swap(exec->prelocked, obj);
+
+error_dropref:
+    /* Always cleanup the contention so that error handling can 
kick in */

+    drm_gem_object_put(obj);
+    exec->contended = NULL;
+    return ret;
+}
+
+/**
+ * drm_exec_prepare_obj - prepare a GEM object for use
+ * @exec: the drm_exec object with the state
+ * @obj: the GEM object to prepare
+ * @num_fences: how many fences to reserve
+ *
+ * Prepare a GEM object for use by locking it and reserving fence 
slots. All

+ * successfully locked objects are put into the locked container.
+ *
+ * Returns: -EDEADLK if a contention is detected, -EALREADY when 
object is
+ * already locked, -ENOMEM when memory allocation failed and zero 
for success.

+ */
+int drm_exec_prepare_obj(struct drm_exec *exec, struct 
drm_gem_object *obj,

+ unsigned int num_fences)
+{
+    int ret;
+
+    ret = drm_exec_lock_contended(exec);
+    if (unlikely(ret))
+    return ret;
+
+    if (exec->prelocked == obj) {
+    drm_gem_object_put(exec->prelocked);
+    exec->prelocked = NULL;
+
+    return dma_resv_reserve_fences(obj->resv, num_fences);
+    }
+
+    if (exec->flags & DRM_EXEC_FLAG_INTERRUPTIBLE)
+    ret = dma_resv_lock_interruptible(obj->resv, &exec->ticket);
+    else
+    ret = dma_resv_lock(obj->resv, &exec->ticket);
+
+    if (unlikely(ret == -EDEADLK)) {
+    drm_gem_object_get(obj);
+    exec->contended = obj;
+    return -EDEADLK;
+    }
+
+    if (unlikely(ret == -EALREADY &&
+    (exec->flags & DRM_EXEC_FLAG_ALLOW_DUPLICATES)))
+    goto reserve_fences;
+
+    if (unlikely(ret))
+    return ret;
+
+    ret = drm_exec_obj_locked(exec, obj);
+    if (ret)
+    goto error_unlock;
+
+reserve_fences:
+    /* Keep locked when reserving fences fails */
+    return dma_resv_reserve_fences(obj->resv, num_fences);


Ugh, what is the use-case for keeping things locked here? How would 
a caller tell the difference between an error where everything is 
locked and nothing is locked? IMO, we should unlock on error here. 
If there indeed is a use-case we should add a separate function for 
reserving fences for all locked objects, rather than returning 
sometimes locked on error sometime not.


We return the object locked here because it was too much churn to 
remove it again from the array, and we are getting fully cleaned up 
at the end anyway.


OK, so if we add an unlock functionality, we could just have a 
consistent locking state on error return?


Yeah, that should work. Going to work on this.


Great.

Thanks,

Thomas




Regards,
Christian.



Thanks,
Thomas



Regards,
Christian.



Thanks,

Thomas



+
+error_unlock:
+    dma_resv_unlock(obj->resv);
+    return ret;
+}
+EXPORT_SYMBOL(drm_exec_prepare_obj);
+
+/**
+ * drm_exec_prepare_array - helper to prepare an array of objects
+ * @exec: the drm_exec object with the state
+ * @objects: array of GEM object to prepare
+ * @num_objects: number of GEM objects in the array
+ * @num_fences: number of fences to reserve on each GEM object
+ *
+ * Prepares all GEM objects in an array, handles contention but 
aports on first
+ * error otherwise. Reserves @num_fences on each GEM object after 
locking it.

+ *
+ * Returns: -EALREADY when object is already locked, -ENOMEM when 
memory

+ * allocation failed and zero for success.
+ */
+int drm_exec_prepare_array(struct drm_exec *exec,
+   struct drm_gem_object **objects,
+   unsigned int num_objects,
+   

Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Boris Brezillon
On Mon, 19 Jun 2023 12:44:06 +0200
Christian König  wrote:

> Am 19.06.23 um 12:12 schrieb Boris Brezillon:
> > [SNIP]
> > Note that the drm_exec_until_all_locked() helper I introduced is taking
> > an expression, so in theory, you don't have to define a separate
> > function.
> >
> > drm_exec_until_all_locked(, {
> > /* inlined-code */
> > int ret;
> >
> > ret = blabla()
> > if (ret)
> > goto error;
> >
> > ...
> >
> > error:
> > /* return value. */
> > ret;
> > });
> >
> > This being said, as soon as you have several failure paths,
> > it makes things a lot easier/controllable if you make it a function,
> > and I honestly don't think the readability would suffer from having a
> > function defined just above the user. My main concern with the original
> > approach was the risk of calling continue/break_if_contended() in the
> > wrong place, and also the fact you can't really externalize things to
> > a function if you're looking for a cleaner split. At least with
> > drm_exec_until_all_locked() you can do both.  
> 
> Yeah, but that means that you can't use return inside your code block 
> and instead has to define an error label for handling "normal" 
> contention which is what I'm trying to avoid here.
> 
> How about:
> 
> #define drm_exec_until_all_locked(exec)    \
>      __drm_exec_retry: if (drm_exec_cleanup(exec))
> 
> 
> #define drm_exec_retry_on_contention(exec)  \
>      if (unlikely(drm_exec_is_contended(exec)))  \
>      goto __drm_exec_retry
> 
> 
> And then use it like:
> 
> drm_exec_until_all_locked(exec)
> {
>      ret = drm_exec_prepare_obj(exec, obj);
>      drm_exec_retry_on_contention(exec);
> }

That would work, and I was about to suggest extending my proposal with
a drm_exec_retry_on_contention() to support both use cases. The only
downside is the fact you might be able to break out of a loop that has
local variables, which will leak stack space.

> 
> The only problem I can see with this is that the __drm_exec_retry label 
> would be function local.

You can use local labels [1] to make it local to a block (see my
version, just need to rename the retry label into __drm_exec_retry). I
checked, and this is used elsewhere in the kernel (like in
linux/wait.h, which is a core feature), so it should be safe to use.

[1]https://gcc.gnu.org/onlinedocs/gcc/Local-Labels.html
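
For reference, the extension in question behaves as in this minimal,
stand-alone C sketch; the helper and label names are made up for
illustration and are not drm_exec API, only the __label__ declaration
itself is the point:

/* Minimal user-space demo of GCC/Clang local labels (__label__).
 * Because the label is declared inside the block, a second block in
 * the same function could declare its own "retry" label without the
 * two colliding -- the property discussed above.  All names here are
 * illustrative only.
 */
#include <stdio.h>

static int attempts;

static int try_lock_all(void)
{
        /* pretend the first two attempts hit contention */
        return ++attempts < 3 ? -1 : 0;
}

int main(void)
{
        {
                __label__ retry;        /* visible only inside this block */
                int ret;

retry:
                ret = try_lock_all();
                if (ret)
                        goto retry;     /* jumps to the block-local label */
        }
        printf("locked after %d attempts\n", attempts);
        return 0;
}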


Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Christian König

Am 19.06.23 um 12:12 schrieb Boris Brezillon:

[SNIP]
Note that the drm_exec_until_all_locked() helper I introduced is taking
an expression, so in theory, you don't have to define a separate
function.

drm_exec_until_all_locked(&exec, {
/* inlined-code */
int ret;

ret = blabla()
if (ret)
goto error;

...

error:
/* return value. */
ret;
});

This being said, as soon as you have several failure paths,
it makes things a lot easier/controllable if you make it a function,
and I honestly don't think the readability would suffer from having a
function defined just above the user. My main concern with the original
approach was the risk of calling continue/break_if_contended() in the
wrong place, and also the fact you can't really externalize things to
a function if you're looking for a cleaner split. At least with
drm_exec_until_all_locked() you can do both.


Yeah, but that means that you can't use return inside your code block 
and instead have to define an error label for handling "normal" 
contention, which is what I'm trying to avoid here.


How about:

#define drm_exec_until_all_locked(exec)    \
    __drm_exec_retry: if (drm_exec_cleanup(exec))


#define drm_exec_retry_on_contention(exec)  \
    if (unlikely(drm_exec_is_contended(exec)))  \
    goto __drm_exec_retry


And then use it like:

drm_exec_until_all_locked(exec)
{
    ret = drm_exec_prepare_obj(exec, obj);
    drm_exec_retry_on_contention(exec);
}

The only problem I can see with this is that the __drm_exec_retry label 
would be function local.
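
As a rough sketch only, assuming nothing beyond the two macros proposed
above, usage with the prepare step split into a helper could look like
the following; struct my_job and its bo_a/bo_b members are invented
stand-ins, not driver code:

/* Hedged sketch against the drm_exec_until_all_locked() /
 * drm_exec_retry_on_contention() pair proposed above.  struct my_job
 * and its members are invented; the point is that contention restarts
 * the whole block while other errors can simply return.
 */
static int my_prepare_bos(struct drm_exec *exec, struct my_job *job)
{
        int ret;

        ret = drm_exec_prepare_obj(exec, job->bo_a, 1);
        if (ret)
                return ret;

        return drm_exec_prepare_obj(exec, job->bo_b, 1);
}

static int my_lock_bos(struct drm_exec *exec, struct my_job *job)
{
        int ret;

        drm_exec_until_all_locked(exec) {
                ret = my_prepare_bos(exec, job);
                drm_exec_retry_on_contention(exec);
                if (ret)
                        return ret;     /* direct return, no error label */
        }
        return 0;
}

With a function-local __drm_exec_retry label this only works for a single
drm_exec_until_all_locked() use per function, which is exactly the
limitation noted above.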


Regards,
Christian.



Regards,

Boris




RE: [PATCH Review 1/1] drm/amdgpu: Remove redundant poison consumption handler function

2023-06-19 Thread Yang, Stanley
[AMD Official Use Only - General]

Please ignore this patch, I will send V2.

Regards,
Stanley
> -Original Message-
> From: Stanley.Yang 
> Sent: Monday, June 19, 2023 4:24 PM
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> ; Zhou1, Tao ; Chai,
> Thomas 
> Cc: Yang, Stanley 
> Subject: [PATCH Review 1/1] drm/amdgpu: Remove redundant poison
> consumption handler function
>
> The function callback handle_poison_consumption and callback function
> poison_consumption_handler are almost same to handle poison
> consumption, remove poison_consumption_handler.
>
> Signed-off-by: Stanley.Yang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  9 -
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  4 
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  6 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  3 ++-
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 12 +---
>  5 files changed, 13 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a33d4bc34cee..c15dbdb2e0f9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -840,15 +840,6 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device
> *adev)
>   return 0;
>  }
>
> -int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
> - struct amdgpu_iv_entry
> *entry)
> -{
> - if (adev->gfx.ras && adev->gfx.ras->poison_consumption_handler)
> - return adev->gfx.ras->poison_consumption_handler(adev,
> entry);
> -
> - return 0;
> -}
> -
>  int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
>   void *err_data,
>   struct amdgpu_iv_entry *entry)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index d0c3f2955821..95b80bc8cdb9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -273,8 +273,6 @@ struct amdgpu_gfx_ras {
>   int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
>   struct amdgpu_irq_src *source,
>   struct amdgpu_iv_entry *entry);
> - int (*poison_consumption_handler)(struct amdgpu_device *adev,
> - struct amdgpu_iv_entry
> *entry);
>  };
>
>  struct amdgpu_gfx_shadow_info {
> @@ -538,8 +536,6 @@ int amdgpu_gfx_get_num_kcq(struct amdgpu_device
> *adev);  void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev,
> uint32_t ucode_id);
>
>  int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev); -int
> amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
> - struct amdgpu_iv_entry
> *entry);
>
>  bool amdgpu_gfx_is_master_xcc(struct amdgpu_device *adev, int xcc_id);  int
> amdgpu_gfx_sysfs_init(struct amdgpu_device *adev); diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5b6525d8dace..7be289473034 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1694,15 +1694,13 @@ static void
> amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
>   amdgpu_umc_poison_handler(adev, false);
>
>   if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
> - poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
> + poison_stat = block_obj->hw_ops->handle_poison_consumption(adev, entry);
>
>   /* gpu reset is fallback for failed and default cases */
> - if (poison_stat) {
> + if (poison_stat != true) {
>   dev_info(adev->dev, "GPU reset for %s RAS poison
> consumption is issued!\n",
>   block_obj->ras_comm.name);
>   amdgpu_ras_reset_gpu(adev);
> - } else {
> - amdgpu_gfx_poison_consumption_handler(adev, entry);
>   }
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 46bf1889a9d7..03f3b3774b85 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -564,7 +564,8 @@ struct amdgpu_ras_block_hw_ops {
>   void (*reset_ras_error_count)(struct amdgpu_device *adev);
>   void (*reset_ras_error_status)(struct amdgpu_device *adev);
>   bool (*query_poison_status)(struct amdgpu_device *adev);
> - bool (*handle_poison_consumption)(struct amdgpu_device *adev);
> + bool (*handle_poison_consumption)(struct amdgpu_device *adev,
> + struct amdgpu_iv_entry *entry);
>  };
>
>  /* work flow
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
> index 26d6286d86c9..5b7eac547a05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
> 

[PATCH] drm/amdgpu/vcn: Need to pause dpg before stop dpg

2023-06-19 Thread Emily Deng
Need to pause dpg first, or it will hit the following error during dpg stop:
"[drm] Register(1) [regUVD_POWER_STATUS] failed to reach value 0x0001 != 
0xn"

Signed-off-by: Emily Deng 
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index b48bb5212488..259795098173 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -1424,8 +1424,10 @@ static int vcn_v4_0_start_sriov(struct amdgpu_device 
*adev)
  */
 static void vcn_v4_0_stop_dpg_mode(struct amdgpu_device *adev, int inst_idx)
 {
+   struct dpg_pause_state state = {.fw_based = VCN_DPG_STATE__UNPAUSE};
uint32_t tmp;
 
+   vcn_v4_0_pause_dpg_mode(adev, inst_idx, &state);
/* Wait for power status to be 1 */
SOC15_WAIT_ON_RREG(VCN, inst_idx, regUVD_POWER_STATUS, 1,
UVD_POWER_STATUS__UVD_POWER_STATUS_MASK);
-- 
2.36.1



Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Boris Brezillon
On Mon, 19 Jun 2023 11:20:06 +0200
Christian König  wrote:

> Hi guys,
> 
> Am 19.06.23 um 10:59 schrieb Thomas Hellström (Intel):
> > [SNIP]  
> 
>  I really need to find some time to work on that anyway.  
> >> I've been playing with drm_exec for a couple weeks now, and I wanted
> >> to share something I hacked to try and make the API simpler and
> >> more robust against misuse (see the below diff, which is a slightly
> >> adjusted version of your work).  
> >
> > It would be good if we could have someone taking charge of this series 
> > and address all review comments, I see some of my comments getting 
> > lost, we have multiple submitters and I can't find a dri-devel 
> > patchwork entry for this. Anyway some comments below.  
> 
> I can try to find some time for the series this week (As long as nobody 
> comes along and has any burning roof).

That's great news!

> 
> >  
> >>
> >> In this version, the user is no longer in control of the retry
> >> loop. Instead, it provides an expression (a call to a
> >> sub-function) to be re-evaluated each time a contention is
> >> detected. IMHO, this makes the 'prepare-objs' functions easier to
> >> apprehend, and avoids any mistake like calling
> >> drm_exec_continue_on_contention() in an inner loop, or breaking
> >> out of the drm_exec_while_all_locked() loop unintentionally.  
> >
> > In i915 we've had a very similar helper to this, and while I agree 
> > this newer version would probably help make code cleaner, but OTOH 
> > there also are some places where the short drm_exec_while_all_locked() 
> > -likeblock don't really motivate a separate function. Porting i915 to 
> > the current version will take some work, For  the xe driver both 
> > versions would work fine.  
> 
> Yeah, this is actually what my first version of this looked like. But I 
> abandoned that approach because we have a lot of cases were we just 
> quickly want to lock a few GEM objects and don't want the extra overhead 
> of putting all the state into some bag to forward it to a function.

If you're talking about verbosity, it might be the case, though I guess
it's mostly a matter of taste (I do like it when things are well isolated).
As for runtime overhead, I'd expect the compiler to inline the function
anyway, so it's unlikely to change anything.

> >> +/* Track the locked object in the array */
> >> +static int drm_exec_obj_locked(struct drm_exec *exec,
> >> +   struct drm_gem_object *obj)
> >> +{
> >> +    if (unlikely(exec->num_objects == exec->max_objects)) {
> >> +    size_t size = exec->max_objects * sizeof(void *);
> >> +    void *tmp;
> >> +
> >> +    tmp = kvrealloc(exec->objects, size, size + PAGE_SIZE,
> >> +    GFP_KERNEL);
> >> +    if (!tmp)
> >> +    return -ENOMEM;  
> >
> > Sometimes you need to just temporarily lock an object and then unlock 
> > it again if it goes out of scope before reaching the end of 
> > _until_all_locked(). In that case you might need to remove a lock from 
> > the array. I *think* for all use-cases in i915 it would suffice to 
> > take a snapshot of num_objects, and unlock everything above that, 
> > having exec->objects behave like a stack, but was ever a list 
> > considered instead of a realloced array?  
> 
> Yes, the problem is that linked lists really suck regarding their cache 
> line locality. That's why I've came up with this approach here.

Hm, maybe I'm missing something, but if you place the list_head obj you
use to stack the locked objects close enough to the resv pointer, and
aligned on cache line, it shouldn't really be a problem, given you have
to dereference the GEM object to retrieve its resv anyway.
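
To make the snapshot idea above concrete, here is a hedged sketch of what
such an unwind helper could look like, written against the
objects/num_objects fields of the quoted diff; drm_exec_unlock_to() is an
invented name and the put assumes the array holds a reference on each
tracked object:

/* Hypothetical helper sketching the "treat exec->objects as a stack"
 * idea: remember num_objects before taking temporary locks, then
 * unwind back to that snapshot.  Not part of the posted series; drop
 * the put if the array does not hold its own reference.
 */
static void drm_exec_unlock_to(struct drm_exec *exec, unsigned int snapshot)
{
        while (exec->num_objects > snapshot) {
                struct drm_gem_object *obj =
                        exec->objects[--exec->num_objects];

                dma_resv_unlock(obj->resv);
                drm_gem_object_put(obj);
        }
}

A caller would record snapshot = exec->num_objects before taking the
temporary locks and call drm_exec_unlock_to(exec, snapshot) once they go
out of scope.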


Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Boris Brezillon
Hello Thomas,

On Mon, 19 Jun 2023 10:59:16 +0200
Thomas Hellström (Intel)  wrote:

>    
> > +/**
> > + * DOC: Overview
> > + *
> > + * This component mainly abstracts the retry loop necessary for locking
> > + * multiple GEM objects while preparing hardware operations (e.g. 
> > command
> > + * submissions, page table updates etc..).
> > + *
> > + * If a contention is detected while locking a GEM object the cleanup 
> > procedure
> > + * unlocks all previously locked GEM objects and locks the contended 
> > one first
> > + * before locking any further objects.
> > + *
> > + * After an object is locked fences slots can optionally be reserved 
> > on the
> > + * dma_resv object inside the GEM object.
> > + *
> > + * A typical usage pattern should look like this::
> > + *
> > + * struct drm_gem_object *obj;
> > + * struct drm_exec exec;
> > + * unsigned long index;
> > + * int ret;
> > + *
+ * drm_exec_init(&exec, true);
+ * drm_exec_while_not_all_locked(&exec) {
+ * ret = drm_exec_prepare_obj(&exec, boA, 1);
+ * drm_exec_continue_on_contention(&exec);
> > + * if (ret)
> > + * goto error;
> > + *  
>  Have you considered defining a drm_exec_try_prepare_obj_or_retry()
>  combining drm_exec_prepare_obj() and drm_exec_continue_on_contention()?
> 
>  #define drm_exec_try_prepare_obj_or_retry(exec, obj, num_fences) \
>    ({ \
>    int __ret = drm_exec_prepare_obj(exec, bo, 
>  num_fences); \
>    if (unlikely(drm_exec_is_contended(exec))) \
>    continue; \
>    __ret; \
>    })
> 
>  This way the following pattern
> 
>   ret = drm_exec_prepare_obj(, boA, 1);
>   drm_exec_continue_on_contention();
>   if (ret)
>   goto error;
> 
>  can be turned into something more conventional:
> 
>   ret = drm_exec_try_prepare_obj_or_retry(, boA, 1);
>   if (ret)
>   goto error;  
> >>> Yeah, I was considering that as well. But then abandoned it as to
> >>> complicated.
> >>>
> >>> I really need to find some time to work on that anyway.  
> > I've been playing with drm_exec for a couple weeks now, and I wanted
> > to share something I hacked to try and make the API simpler and
> > more robust against misuse (see the below diff, which is a slightly
> > adjusted version of your work).  
> 
> It would be good if we could have someone taking charge of this series 
> and address all review comments, I see some of my comments getting lost, 
> we have multiple submitters and I can't find a dri-devel patchwork entry 
> for this.

My bad, I wasn't intending to submit a new version. I just added a
diff to show what I had in mind. This being said, it'd be great if we
could make some progress on this series, because we have quite a few
drivers depending on it now.

> 
> >
> > In this version, the user is no longer in control of the retry
> > loop. Instead, it provides an expression (a call to a
> > sub-function) to be re-evaluated each time a contention is
> > detected. IMHO, this makes the 'prepare-objs' functions easier to
> > apprehend, and avoids any mistake like calling
> > drm_exec_continue_on_contention() in an inner loop, or breaking
> > out of the drm_exec_while_all_locked() loop unintentionally.  
> 
> In i915 we've had a very similar helper to this, and while I agree this 
> newer version would probably help make code cleaner, but OTOH there also 
> are some places where the short drm_exec_while_all_locked() -likeblock 
> don't really motivate a separate function. Porting i915 to the current 
> version will take some work, For  the xe driver both versions would work 
> fine.

Note that the drm_exec_until_all_locked() helper I introduced is taking
an expression, so in theory, you don't have to define a separate
function.

drm_exec_until_all_locked(&exec, {
/* inlined-code */
int ret;

ret = blabla()
if (ret)
goto error;

...

error:
/* return value. */
ret;
});

This being said, as soon as you have several failure paths,
it makes things a lot easier/controllable if you make it a function,
and I honestly don't think the readability would suffer from having a
function defined just above the user. My main concern with the original
approach was the risk of calling continue/break_if_contended() in the
wrong place, and also the fact you can't really externalize things to
a function if you're looking for a cleaner split. At least with
drm_exec_until_all_locked() you can do both.

Regards,

Boris


Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Christian König

Hi,

Am 19.06.23 um 11:33 schrieb Thomas Hellström (Intel):

[SNIP]
Sometimes you want to just drop the contended lock after the above 
relaxation and not add it as prelocked, if the contended object goes out 
of scope. Eviction would in some situations be one such example; 
-EDEADLOCK leading to an error path where the object should otherwise be 
freed is another. Perhaps we could add an argument to prepare_obj() as to 
whether the object should be immediately put after relaxation.


I was considering a try_prepare version as well, that should cover 
this use case.


That sounds a bit different from this use-case. The use-case above 
would, on -EDEADLOCK actually unlock everything, then lock-slow the 
contending lock and then immediately unlock it and drop.


Hui? What would that be good for?

It sounds like try_prepare would just skip locking and continue with 
everything locked so far still locked?


Correct.








+    ret = drm_exec_obj_locked(exec, obj);
+    if (unlikely(ret)) {
+    dma_resv_unlock(obj->resv);
+    goto error_dropref;
+    }
+
+    swap(exec->prelocked, obj);
+
+error_dropref:
+    /* Always cleanup the contention so that error handling can 
kick in */

+    drm_gem_object_put(obj);
+    exec->contended = NULL;
+    return ret;
+}
+
+/**
+ * drm_exec_prepare_obj - prepare a GEM object for use
+ * @exec: the drm_exec object with the state
+ * @obj: the GEM object to prepare
+ * @num_fences: how many fences to reserve
+ *
+ * Prepare a GEM object for use by locking it and reserving fence 
slots. All

+ * successfully locked objects are put into the locked container.
+ *
+ * Returns: -EDEADLK if a contention is detected, -EALREADY when 
object is
+ * already locked, -ENOMEM when memory allocation failed and zero 
for success.

+ */
+int drm_exec_prepare_obj(struct drm_exec *exec, struct 
drm_gem_object *obj,

+ unsigned int num_fences)
+{
+    int ret;
+
+    ret = drm_exec_lock_contended(exec);
+    if (unlikely(ret))
+    return ret;
+
+    if (exec->prelocked == obj) {
+    drm_gem_object_put(exec->prelocked);
+    exec->prelocked = NULL;
+
+    return dma_resv_reserve_fences(obj->resv, num_fences);
+    }
+
+    if (exec->flags & DRM_EXEC_FLAG_INTERRUPTIBLE)
+    ret = dma_resv_lock_interruptible(obj->resv, &exec->ticket);
+    else
+    ret = dma_resv_lock(obj->resv, &exec->ticket);
+
+    if (unlikely(ret == -EDEADLK)) {
+    drm_gem_object_get(obj);
+    exec->contended = obj;
+    return -EDEADLK;
+    }
+
+    if (unlikely(ret == -EALREADY &&
+    (exec->flags & DRM_EXEC_FLAG_ALLOW_DUPLICATES)))
+    goto reserve_fences;
+
+    if (unlikely(ret))
+    return ret;
+
+    ret = drm_exec_obj_locked(exec, obj);
+    if (ret)
+    goto error_unlock;
+
+reserve_fences:
+    /* Keep locked when reserving fences fails */
+    return dma_resv_reserve_fences(obj->resv, num_fences);


Ugh, what is the use-case for keeping things locked here? How would 
a caller tell the difference between an error where everything is 
locked and nothing is locked? IMO, we should unlock on error here. 
If there indeed is a use-case we should add a separate function for 
reserving fences for all locked objects, rather than returning 
sometimes locked on error sometime not.


We return the object locked here because it was too much churn to 
remove it again from the array, and we are getting fully cleaned up at 
the end anyway.


OK, so if we add an unlock functionality, we could just have a 
consistent locking state on error return?


Yeah, that should work. Going to work on this.

Regards,
Christian.



Thanks,
Thomas



Regards,
Christian.



Thanks,

Thomas



+
+error_unlock:
+    dma_resv_unlock(obj->resv);
+    return ret;
+}
+EXPORT_SYMBOL(drm_exec_prepare_obj);
+
+/**
+ * drm_exec_prepare_array - helper to prepare an array of objects
+ * @exec: the drm_exec object with the state
+ * @objects: array of GEM object to prepare
+ * @num_objects: number of GEM objects in the array
+ * @num_fences: number of fences to reserve on each GEM object
+ *
+ * Prepares all GEM objects in an array, handles contention but 
aports on first
+ * error otherwise. Reserves @num_fences on each GEM object after 
locking it.

+ *
+ * Returns: -EALREADY when object is already locked, -ENOMEM when 
memory

+ * allocation failed and zero for success.
+ */
+int drm_exec_prepare_array(struct drm_exec *exec,
+   struct drm_gem_object **objects,
+   unsigned int num_objects,
+   unsigned int num_fences)
+{
+    int ret;
+
+    for (unsigned int i = 0; i < num_objects; ++i) {
+    ret = drm_exec_prepare_obj(exec, objects[i], num_fences);
+    if (ret)
+    return ret;
+    }
+
+    return 0;
+}
+EXPORT_SYMBOL(drm_exec_prepare_array);
+
+MODULE_DESCRIPTION("DRM execution context");
+MODULE_LICENSE("Dual MIT/GPL");
diff --git a/include/drm/drm_exec.h b/include/drm/drm_exec.h
new file mode 

Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Intel

Hi!

On 6/19/23 11:20, Christian König wrote:

Hi guys,

Am 19.06.23 um 10:59 schrieb Thomas Hellström (Intel):

[SNIP]


I really need to find some time to work on that anyway.

I've been playing with drm_exec for a couple weeks now, and I wanted
to share something I hacked to try and make the API simpler and
more robust against misuse (see the below diff, which is a slightly
adjusted version of your work).


It would be good if we could have someone taking charge of this 
series and address all review comments, I see some of my comments 
getting lost, we have multiple submitters and I can't find a 
dri-devel patchwork entry for this. Anyway some comments below.


I can try to find some time for the series this week (As long as 
nobody comes along and has any burning roof).






In this version, the user is no longer in control of the retry
loop. Instead, it provides an expression (a call to a
sub-function) to be re-evaluated each time a contention is
detected. IMHO, this makes the 'prepare-objs' functions easier to
apprehend, and avoids any mistake like calling
drm_exec_continue_on_contention() in an inner loop, or breaking
out of the drm_exec_while_all_locked() loop unintentionally.


In i915 we've had a very similar helper to this, and while I agree 
this newer version would probably help make code cleaner, but OTOH 
there also are some places where the short 
drm_exec_while_all_locked() -likeblock don't really motivate a 
separate function. Porting i915 to the current version will take some 
work, For  the xe driver both versions would work fine.


Yeah, this is actually what my first version of this looked like. But 
I abandoned that approach because we have a lot of cases where we just 
quickly want to lock a few GEM objects and don't want the extra 
overhead of putting all the state into some bag to forward it to a 
function.




Some additional review comments not related to the interface change 
below:




It also makes the internal management a bit simpler, since we
no longer call drm_exec_cleanup() on the first attempt, and can
thus get rid of the DRM_EXEC_DUMMY trick.

In the below diff, I also re-introduced native support for
duplicates as an opt-in, so we don't have to do things like:

ret = drm_exec_prepare_obj(exec, obj, num_fences);
if (ret == -EALREADY)
    ret = dma_resv_reserve_fences(obj->resv, num_fences);
if (ret)
    return ret;

and can just do:

ret = drm_exec_prepare_obj(exec, obj, num_fences);
if (ret)
    return;

Of course drivers can open-code a wrapper doing the same thing, but
given at least pvr and radeon need this, it'd be nice if the core
could support it natively.
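
For completeness, the open-coded wrapper referred to here is tiny; a
hedged sketch (my_exec_prepare_shared() is an invented name, the body
just combines the two snippets above):

/* Driver-side duplicate handling as described above: -EALREADY means
 * the object is already locked in this exec context, so only the
 * fence slots still need to be reserved.  Wrapper name is invented.
 */
static int my_exec_prepare_shared(struct drm_exec *exec,
                                  struct drm_gem_object *obj,
                                  unsigned int num_fences)
{
        int ret;

        ret = drm_exec_prepare_obj(exec, obj, num_fences);
        if (ret == -EALREADY)
                ret = dma_resv_reserve_fences(obj->resv, num_fences);

        return ret;
}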

That's mostly it. Just wanted to share what I had in case you're
interested. If not, that's fine too.

Regards,

Boris
---
  Documentation/gpu/drm-mm.rst |  12 ++
  drivers/gpu/drm/Kconfig  |   6 +
  drivers/gpu/drm/Makefile |   2 +
  drivers/gpu/drm/drm_exec.c   | 274 
+++

  include/drm/drm_exec.h   | 130 +
  5 files changed, 424 insertions(+)
  create mode 100644 drivers/gpu/drm/drm_exec.c
  create mode 100644 include/drm/drm_exec.h

diff --git a/Documentation/gpu/drm-mm.rst 
b/Documentation/gpu/drm-mm.rst

index fe40ee686f6e..c9f120cfe730 100644
--- a/Documentation/gpu/drm-mm.rst
+++ b/Documentation/gpu/drm-mm.rst
@@ -524,6 +524,18 @@ DRM Sync Objects
  .. kernel-doc:: drivers/gpu/drm/drm_syncobj.c
 :export:
  +DRM Execution context
+=
+
+.. kernel-doc:: drivers/gpu/drm/drm_exec.c
+   :doc: Overview
+
+.. kernel-doc:: include/drm/drm_exec.h
+   :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_exec.c
+   :export:
+
  GPU Scheduler
  =
  diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 76991720637c..01a38fcdb1c4 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -194,6 +194,12 @@ config DRM_TTM
    GPU memory types. Will be enabled automatically if a device 
driver

    uses it.
  +config DRM_EXEC
+    tristate
+    depends on DRM
+    help
+  Execution context for command submissions
+
  config DRM_BUDDY
  tristate
  depends on DRM
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 1873f64db171..18a02eaf2d49 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -79,6 +79,8 @@ obj-$(CONFIG_DRM_PANEL_ORIENTATION_QUIRKS) += 
drm_panel_orientation_quirks.o

  #
  # Memory-management helpers
  #
+#
+obj-$(CONFIG_DRM_EXEC) += drm_exec.o
    obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o
  diff --git a/drivers/gpu/drm/drm_exec.c b/drivers/gpu/drm/drm_exec.c
new file mode 100644
index ..e0ad1a3e1610
--- /dev/null
+++ b/drivers/gpu/drm/drm_exec.c
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+
+#include 
+#include 
+#include 
+
+/**
+ * DOC: Overview
+ *
+ * This component mainly abstracts the retry loop necessary for 
locking
+ * multiple GEM objects while preparing 

Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Christian König

Hi guys,

Am 19.06.23 um 10:59 schrieb Thomas Hellström (Intel):

[SNIP]


I really need to find some time to work on that anyway.

I've been playing with drm_exec for a couple weeks now, and I wanted
to share something I hacked to try and make the API simpler and
more robust against misuse (see the below diff, which is a slightly
adjusted version of your work).


It would be good if we could have someone taking charge of this series 
and address all review comments, I see some of my comments getting 
lost, we have multiple submitters and I can't find a dri-devel 
patchwork entry for this. Anyway some comments below.


I can try to find some time for the series this week (As long as nobody 
comes along and has any burning roof).






In this version, the user is no longer in control of the retry
loop. Instead, it provides an expression (a call to a
sub-function) to be re-evaluated each time a contention is
detected. IMHO, this makes the 'prepare-objs' functions easier to
apprehend, and avoids any mistake like calling
drm_exec_continue_on_contention() in an inner loop, or breaking
out of the drm_exec_while_all_locked() loop unintentionally.


In i915 we've had a very similar helper to this, and while I agree 
this newer version would probably help make code cleaner, but OTOH 
there also are some places where the short drm_exec_while_all_locked() 
-likeblock don't really motivate a separate function. Porting i915 to 
the current version will take some work, For  the xe driver both 
versions would work fine.


Yeah, this is actually what my first version of this looked like. But I 
abandoned that approach because we have a lot of cases where we just 
quickly want to lock a few GEM objects and don't want the extra overhead 
of putting all the state into some bag to forward it to a function.




Some additional review comments not related to the interface change 
below:




It also makes the internal management a bit simpler, since we
no longer call drm_exec_cleanup() on the first attempt, and can
thus get rid of the DRM_EXEC_DUMMY trick.

In the below diff, I also re-introduced native support for
duplicates as an opt-in, so we don't have to do things like:

ret = drm_exec_prepare_obj(exec, obj, num_fences);
if (ret == -EALREADY)
    ret = dma_resv_reserve_fences(obj->resv, num_fences);
if (ret)
    return ret;

and can just do:

ret = drm_exec_prepare_obj(exec, obj, num_fences);
if (ret)
    return;

Of course drivers can open-code a wrapper doing the same thing, but
given at least pvr and radeon need this, it'd be nice if the core
could support it natively.

That's mostly it. Just wanted to share what I had in case you're
interested. If not, that's fine too.

Regards,

Boris
---
  Documentation/gpu/drm-mm.rst |  12 ++
  drivers/gpu/drm/Kconfig  |   6 +
  drivers/gpu/drm/Makefile |   2 +
  drivers/gpu/drm/drm_exec.c   | 274 +++
  include/drm/drm_exec.h   | 130 +
  5 files changed, 424 insertions(+)
  create mode 100644 drivers/gpu/drm/drm_exec.c
  create mode 100644 include/drm/drm_exec.h

diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
index fe40ee686f6e..c9f120cfe730 100644
--- a/Documentation/gpu/drm-mm.rst
+++ b/Documentation/gpu/drm-mm.rst
@@ -524,6 +524,18 @@ DRM Sync Objects
  .. kernel-doc:: drivers/gpu/drm/drm_syncobj.c
 :export:
  +DRM Execution context
+=
+
+.. kernel-doc:: drivers/gpu/drm/drm_exec.c
+   :doc: Overview
+
+.. kernel-doc:: include/drm/drm_exec.h
+   :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_exec.c
+   :export:
+
  GPU Scheduler
  =
  diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 76991720637c..01a38fcdb1c4 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -194,6 +194,12 @@ config DRM_TTM
    GPU memory types. Will be enabled automatically if a device 
driver

    uses it.
  +config DRM_EXEC
+    tristate
+    depends on DRM
+    help
+  Execution context for command submissions
+
  config DRM_BUDDY
  tristate
  depends on DRM
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 1873f64db171..18a02eaf2d49 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -79,6 +79,8 @@ obj-$(CONFIG_DRM_PANEL_ORIENTATION_QUIRKS) += 
drm_panel_orientation_quirks.o

  #
  # Memory-management helpers
  #
+#
+obj-$(CONFIG_DRM_EXEC) += drm_exec.o
    obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o
  diff --git a/drivers/gpu/drm/drm_exec.c b/drivers/gpu/drm/drm_exec.c
new file mode 100644
index ..e0ad1a3e1610
--- /dev/null
+++ b/drivers/gpu/drm/drm_exec.c
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+
+#include 
+#include 
+#include 
+
+/**
+ * DOC: Overview
+ *
+ * This component mainly abstracts the retry loop necessary for locking
+ * multiple GEM objects while preparing hardware operations (e.g. 
command

+ * submissions, 

Re: [PATCH 01/13] drm: execution context for GEM buffers v4

2023-06-19 Thread Intel



On 6/17/23 13:54, Boris Brezillon wrote:

+Matthew who's been using drm_exec in Xe if I'm correct.

Hello Christian,

On Wed, 14 Jun 2023 15:02:52 +0200
Boris Brezillon  wrote:


On Wed, 14 Jun 2023 14:30:53 +0200
Christian König  wrote:


Am 14.06.23 um 14:23 schrieb Boris Brezillon:

Hi Christian,

On Thu,  4 May 2023 13:51:47 +0200
"Christian König"  wrote:


This adds the infrastructure for an execution context for GEM buffers
which is similar to the existing TTMs execbuf util and intended to replace
it in the long term.

The basic functionality is that we abstracts the necessary loop to lock
many different GEM buffers with automated deadlock and duplicate handling.

As many other drivers do already, we are considering using drm_exec()
for our resv locking in the PowerVR driver, so we might have more
questions/comments in the coming days/weeks, but I already have a
couple right now (see below).


v3: drop duplicate tracking, radeon is really the only one needing that

I think we'd actually be interested in duplicate tracking. Is there any
way we can make it an optional feature through some extra helpers/flags?
Doesn't have to be done in this patch series, I'm just wondering if this
is something we can share as well.

You can still capture the -EALREADY error and act appropriately in your
driver.

For radeon it just means ignoring the error code and going ahead, but
that behavior doesn't seem to be desired in most cases.

Initially I though we need to separately track how many and how often
BOs are duplicated, but there is simply no use for this.
   

[...]


+/**
+ * DOC: Overview
+ *
+ * This component mainly abstracts the retry loop necessary for locking
+ * multiple GEM objects while preparing hardware operations (e.g. command
+ * submissions, page table updates etc..).
+ *
+ * If a contention is detected while locking a GEM object the cleanup procedure
+ * unlocks all previously locked GEM objects and locks the contended one first
+ * before locking any further objects.
+ *
+ * After an object is locked fences slots can optionally be reserved on the
+ * dma_resv object inside the GEM object.
+ *
+ * A typical usage pattern should look like this::
+ *
+ * struct drm_gem_object *obj;
+ * struct drm_exec exec;
+ * unsigned long index;
+ * int ret;
+ *
+ * drm_exec_init(&exec, true);
+ * drm_exec_while_not_all_locked(&exec) {
+ * ret = drm_exec_prepare_obj(&exec, boA, 1);
+ * drm_exec_continue_on_contention(&exec);
+ * if (ret)
+ * goto error;
+ *

Have you considered defining a drm_exec_try_prepare_obj_or_retry()
combining drm_exec_prepare_obj() and drm_exec_continue_on_contention()?

#define drm_exec_try_prepare_obj_or_retry(exec, obj, num_fences) \
  ({ \
  int __ret = drm_exec_prepare_obj(exec, bo, num_fences); \
  if (unlikely(drm_exec_is_contended(exec))) \
  continue; \
  __ret; \
  })

This way the following pattern

ret = drm_exec_prepare_obj(&exec, boA, 1);
drm_exec_continue_on_contention(&exec);
if (ret)
goto error;

can be turned into something more conventional:

ret = drm_exec_try_prepare_obj_or_retry(&exec, boA, 1);
if (ret)
goto error;

Yeah, I was considering that as well. But then abandoned it as to
complicated.

I really need to find some time to work on that anyway.

I've been playing with drm_exec for a couple weeks now, and I wanted
to share something I hacked to try and make the API simpler and
more robust against misuse (see the below diff, which is a slightly
adjusted version of your work).


It would be good if we could have someone take charge of this series 
and address all review comments. I see some of my comments getting lost; 
we have multiple submitters, and I can't find a dri-devel patchwork entry 
for this. Anyway, some comments below.




In this version, the user is no longer in control of the retry
loop. Instead, it provides an expression (a call to a
sub-function) to be re-evaluated each time a contention is
detected. IMHO, this makes the 'prepare-objs' functions easier to
apprehend, and avoids any mistake like calling
drm_exec_continue_on_contention() in an inner loop, or breaking
out of the drm_exec_while_all_locked() loop unintentionally.


In i915 we've had a very similar helper to this, and while I agree this 
newer version would probably help make code cleaner, OTOH there also 
are some places where a short drm_exec_while_all_locked()-like block 
doesn't really motivate a separate function. Porting i915 to the current 
version will take some work. For the xe driver both versions would work 
fine.


Some additional review comments not related to the interface change below:



It also makes the internal management a bit simpler, since we
no longer call drm_exec_cleanup() on the first 

Re: [PATCH v7 2/8] PCI/VGA: Deal only with VGA class devices

2023-06-19 Thread Maciej W. Rozycki
On Tue, 13 Jun 2023, Sui Jingfeng wrote:

> Deal only with the VGA devcie(pdev->class == 0x0300), so replace the

 Typo here: s/devcie/device/.

> pci_get_subsys() function with pci_get_class(). Filter the non-PCI display
> device(pdev->class != 0x0300) out. There no need to process the non-display
> PCI device.

 I've only come across this patch series now.  Without diving into what 
this code actually does I have just one question as a matter of interest.

> diff --git a/drivers/pci/vgaarb.c b/drivers/pci/vgaarb.c
> index c1bc6c983932..22a505e877dc 100644
> --- a/drivers/pci/vgaarb.c
> +++ b/drivers/pci/vgaarb.c
> @@ -1500,7 +1496,9 @@ static int pci_notify(struct notifier_block *nb, 
> unsigned long action,
>   struct pci_dev *pdev = to_pci_dev(dev);
>   bool notify = false;
>  
> - vgaarb_dbg(dev, "%s\n", __func__);
> + /* Only deal with VGA class devices */
> + if (pdev->class != PCI_CLASS_DISPLAY_VGA << 8)
> + return 0;

 Hmm, shouldn't this also handle PCI_CLASS_NOT_DEFINED_VGA?  As far as I 
know it is the equivalent of PCI_CLASS_DISPLAY_VGA for PCI <= 2.0 devices 
that were implemented before the idea of PCI device classes had developed 
into its current form.  I may have such a VGA device somewhere.

  Maciej
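
For context, the check being discussed can be tried out in isolation;
the constants below mirror include/linux/pci_ids.h, pdev->class packs
base class, subclass and prog-if into 24 bits, and whether vgaarb should
really accept the legacy class is exactly the open question raised here:

/* Stand-alone illustration of the VGA class check under discussion.
 * Constants mirror include/linux/pci_ids.h; this only shows what such
 * a check would match, it is not a proposed vgaarb change.
 */
#include <stdbool.h>
#include <stdio.h>

#define PCI_CLASS_NOT_DEFINED_VGA 0x0001        /* pre-PCI-2.0 VGA */
#define PCI_CLASS_DISPLAY_VGA     0x0300

/* pdev->class = base class << 16 | subclass << 8 | prog-if */
static bool is_vga_class(unsigned int pdev_class)
{
        unsigned int base_sub = pdev_class >> 8;

        return base_sub == PCI_CLASS_DISPLAY_VGA ||
               base_sub == PCI_CLASS_NOT_DEFINED_VGA;
}

int main(void)
{
        printf("0x030000 (VGA controller)     -> %d\n", is_vga_class(0x030000));
        printf("0x000100 (legacy VGA device)  -> %d\n", is_vga_class(0x000100));
        printf("0x038000 (other display ctrl) -> %d\n", is_vga_class(0x038000));
        return 0;
}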


[PATCH Review 1/1] drm/amdgpu: Remove redundant poison consumption handler function

2023-06-19 Thread Stanley . Yang
The callback handle_poison_consumption and the callback
poison_consumption_handler do almost the same poison consumption
handling, so remove poison_consumption_handler.

Signed-off-by: Stanley.Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  9 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  4 
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  3 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 12 +---
 5 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a33d4bc34cee..c15dbdb2e0f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -840,15 +840,6 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
return 0;
 }
 
-int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
-   struct amdgpu_iv_entry *entry)
-{
-   if (adev->gfx.ras && adev->gfx.ras->poison_consumption_handler)
-   return adev->gfx.ras->poison_consumption_handler(adev, entry);
-
-   return 0;
-}
-
 int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
void *err_data,
struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index d0c3f2955821..95b80bc8cdb9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -273,8 +273,6 @@ struct amdgpu_gfx_ras {
int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
-   int (*poison_consumption_handler)(struct amdgpu_device *adev,
-   struct amdgpu_iv_entry *entry);
 };
 
 struct amdgpu_gfx_shadow_info {
@@ -538,8 +536,6 @@ int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev);
 void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev, uint32_t 
ucode_id);
 
 int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev);
-int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
-   struct amdgpu_iv_entry *entry);
 
 bool amdgpu_gfx_is_master_xcc(struct amdgpu_device *adev, int xcc_id);
 int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5b6525d8dace..7be289473034 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1694,15 +1694,13 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
amdgpu_umc_poison_handler(adev, false);
 
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
-   poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
+   poison_stat = block_obj->hw_ops->handle_poison_consumption(adev, entry);
 
/* gpu reset is fallback for failed and default cases */
-   if (poison_stat) {
+   if (poison_stat != true) {
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is 
issued!\n",
block_obj->ras_comm.name);
amdgpu_ras_reset_gpu(adev);
-   } else {
-   amdgpu_gfx_poison_consumption_handler(adev, entry);
}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 46bf1889a9d7..03f3b3774b85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -564,7 +564,8 @@ struct amdgpu_ras_block_hw_ops {
void (*reset_ras_error_count)(struct amdgpu_device *adev);
void (*reset_ras_error_status)(struct amdgpu_device *adev);
bool (*query_poison_status)(struct amdgpu_device *adev);
-   bool (*handle_poison_consumption)(struct amdgpu_device *adev);
+   bool (*handle_poison_consumption)(struct amdgpu_device *adev,
+   struct amdgpu_iv_entry *entry);
 };
 
 /* work flow
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 26d6286d86c9..5b7eac547a05 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -78,7 +78,7 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device 
*adev,
return 0;
 }
 
-static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
+static bool gfx_v11_0_3_handle_poison_consumption(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
 {
/* Workaround: when vmid and pasid are both zero, trigger gpu reset in 
KGD. */
@@ -99,10 +99,16 @@ static int gfx_v11_0_3_poison_consumption_handler(struct 
amdgpu_device 

[PATCH v2] drm/amdgpu: Modify for_each_inst macro

2023-06-19 Thread Lijo Lazar
Modify it such that it doesn't change the instance mask parameter.

Signed-off-by: Lijo Lazar 
---
v2: 
Take care of bit-shift beyond width (Victor)

 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4029c13a9be..aa42347bd67d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1295,9 +1295,10 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 
 #define amdgpu_inc_vram_lost(adev) atomic_inc(&((adev)->vram_lost_counter));
 
-#define for_each_inst(i, inst_mask)
\
-   for (i = ffs(inst_mask) - 1; inst_mask;\
-inst_mask &= ~(1U << i), i = ffs(inst_mask) - 1)
+#define BIT_MASK_UPPER(i) ((i) >= BITS_PER_LONG ? 0 : ~0UL << (i))
+#define for_each_inst(i, inst_mask)\
+   for (i = ffs(inst_mask); i-- != 0; \
+i = ffs(inst_mask & BIT_MASK_UPPER(i + 1)))
 
 #define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
 
-- 
2.25.1
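
For anyone wanting to sanity-check the new iteration behaviour outside
the kernel, the macro can be reproduced in user space as below; ffs()
comes from <strings.h>, the mask value is arbitrary, and the printout
confirms that inst_mask is left untouched after the loop:

/* User-space reproduction of the v2 for_each_inst() macro so its
 * behaviour can be checked in isolation.
 */
#include <stdio.h>
#include <strings.h>    /* ffs() */

#define BITS_PER_LONG ((int)(8 * sizeof(unsigned long)))
#define BIT_MASK_UPPER(i) ((i) >= BITS_PER_LONG ? 0 : ~0UL << (i))
#define for_each_inst(i, inst_mask)                             \
        for (i = ffs(inst_mask); i-- != 0;                      \
             i = ffs(inst_mask & BIT_MASK_UPPER(i + 1)))

int main(void)
{
        unsigned int inst_mask = 0x15;  /* instances 0, 2 and 4 */
        int i;

        for_each_inst(i, inst_mask)
                printf("instance %d\n", i);

        /* unlike the old macro, the mask is not consumed */
        printf("inst_mask is still 0x%x\n", inst_mask);
        return 0;
}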