RE: [PATCH 3/3] drm/amdgpu: Remove pcie bw sys entry

2024-02-19 Thread Zhang, Hawking
[AMD Official Use Only - General]

Series is

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Kamal, Asad 
Sent: Friday, February 16, 2024 20:59
To: amd-gfx@lists.freedesktop.org
Cc: Lazar, Lijo ; Zhang, Hawking ; 
Ma, Le ; Zhang, Morris ; Kamal, Asad 

Subject: [PATCH 3/3] drm/amdgpu: Remove pcie bw sys entry

Remove pcie bw sys entry for asics not supporting such function

Signed-off-by: Asad Kamal 
Reviewed-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 1 -  drivers/gpu/drm/amd/pm/amdgpu_pm.c | 
3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 7fc55e3262eb..20a4582885cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -895,7 +895,6 @@ static const struct amdgpu_asic_funcs 
aqua_vanjaram_asic_funcs =
.get_config_memsize = &soc15_get_config_memsize,
.need_full_reset = &soc15_need_full_reset,
.init_doorbell_index = &aqua_vanjaram_doorbell_index_init,
-   .get_pcie_usage = &vega20_get_pcie_usage,
.need_reset_on_init = &soc15_need_reset_on_init,
.get_pcie_replay_count = &amdgpu_nbio_get_pcie_replay_count,
.supports_baco = &soc15_supports_baco, diff --git 
a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 087d57850304..1ff7fc821871 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2174,7 +2174,8 @@ static int default_attr_update(struct amdgpu_device 
*adev, struct amdgpu_device_
*states = ATTR_STATE_UNSUPPORTED;
} else if (DEVICE_ATTR_IS(pcie_bw)) {
/* PCIe Perf counters won't work on APU nodes */
-   if (adev->flags & AMD_IS_APU)
+   if (adev->flags & AMD_IS_APU ||
+   !adev->asic_funcs->get_pcie_usage)
*states = ATTR_STATE_UNSUPPORTED;
} else if (DEVICE_ATTR_IS(unique_id)) {
switch (gc_ver) {
--
2.42.0



Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces

2024-02-19 Thread Lazar, Lijo



On 2/16/2024 8:43 PM, Alex Deucher wrote:
> Use the new reset critical section accessors for debugfs, sysfs,
> and the INFO IOCTL to provide proper mutual exclusivity
> to hardware with respect the GPU resets.
> 

This looks more like a priority inversion. When the device needs reset,
it doesn't make sense for reset handling to wait on these. Instead,
reset should be done at the earliest.

Thanks,
Lijo

> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  20 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  10 +
>  drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 
>  drivers/gpu/drm/amd/pm/amdgpu_pm.c  |  91 -
>  4 files changed, 235 insertions(+), 101 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 72eceb7d6667..d0e4a8729703 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
> *m, void *unused)
>   }
>  
>   /* Avoid accidently unparking the sched thread during GPU reset */
> - r = down_write_killable(&adev->reset_domain->sem);
> + r = amdgpu_reset_domain_access_write_start(adev);
>   if (r)
>   return r;
>  
> @@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
> *m, void *unused)
>   kthread_unpark(ring->sched.thread);
>   }
>  
> - up_write(&adev->reset_domain->sem);
> + amdgpu_reset_domain_access_write_end(adev);
>  
>   pm_runtime_mark_last_busy(dev->dev);
>   pm_runtime_put_autosuspend(dev->dev);
> @@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
> val)
>   return -ENOMEM;
>  
>   /* Avoid accidently unparking the sched thread during GPU reset */
> - r = down_read_killable(&adev->reset_domain->sem);
> + r = amdgpu_reset_domain_access_read_start(adev);
>   if (r)
>   goto pro_end;
>  
> @@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
> val)
>   /* restart the scheduler */
>   kthread_unpark(ring->sched.thread);
>  
> - up_read(&adev->reset_domain->sem);
> + amdgpu_reset_domain_access_read_end(adev);
>  
>  pro_end:
>   kfree(fences);
> @@ -2031,23 +2031,23 @@ static ssize_t 
> amdgpu_reset_dump_register_list_read(struct file *f,
>   return 0;
>  
>   memset(reg_offset, 0, 12);
> - ret = down_read_killable(&adev->reset_domain->sem);
> + ret = amdgpu_reset_domain_access_read_start(adev);
>   if (ret)
>   return ret;
>  
>   for (i = 0; i < adev->reset_info.num_regs; i++) {
>   sprintf(reg_offset, "0x%x\n", 
> adev->reset_info.reset_dump_reg_list[i]);
> - up_read(&adev->reset_domain->sem);
> + amdgpu_reset_domain_access_read_end(adev);
>   if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
>   return -EFAULT;
>  
>   len += strlen(reg_offset);
> - ret = down_read_killable(&adev->reset_domain->sem);
> + ret = amdgpu_reset_domain_access_read_start(adev);
>   if (ret)
>   return ret;
>   }
>  
> - up_read(&adev->reset_domain->sem);
> + amdgpu_reset_domain_access_read_end(adev);
>   *pos += len;
>  
>   return len;
> @@ -2089,14 +2089,14 @@ static ssize_t 
> amdgpu_reset_dump_register_list_write(struct file *f,
>   ret = -ENOMEM;
>   goto error_free;
>   }
> - ret = down_write_killable(&adev->reset_domain->sem);
> + ret = amdgpu_reset_domain_access_write_start(adev);
>   if (ret)
>   goto error_free;
>  
>   swap(adev->reset_info.reset_dump_reg_list, tmp);
>   swap(adev->reset_info.reset_dump_reg_value, new);
>   adev->reset_info.num_regs = i;
> - up_write(&adev->reset_domain->sem);
> + amdgpu_reset_domain_access_write_end(adev);
>   ret = size;
>  
>  error_free:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index a2df3025a754..4efb44a964ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -43,6 +43,7 @@
>  #include "amdgpu_gem.h"
>  #include "amdgpu_display.h"
>  #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>  #include "amd_pcie.h"
>  
>  void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev)
> @@ -704,7 +705,11 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
> *data, struct drm_file *filp)
>   return copy_to_user(out, &count, min(size, 4u)) ? -EFAULT : 0;
>   }
>   case AMDGPU_INFO_TIMESTAMP:
> + ret = amdgpu_reset_domain_access_read_start(adev);
> + if (ret)
> + return -EFAULT;
>   ui64 = amdgpu_gfx_get_gpu_clock_cou

RE: [PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling

2024-02-19 Thread Zhou1, Tao
[AMD Official Use Only - General]

> -Original Message-
> From: Lazar, Lijo 
> Sent: Monday, February 19, 2024 8:40 PM
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling
>
>
>
> On 2/19/2024 1:45 PM, Tao Zhou wrote:
> > Let kfd interrupt handler process it.
> >
> > Signed-off-by: Tao Zhou 
> > ---
> >  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +-
> >  1 file changed, 9 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 773725a92cf1..70defc394b7b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct
> > amdgpu_device *adev,  {
> > bool retry_fault = !!(entry->src_data[1] & 0x80);
> > bool write_fault = !!(entry->src_data[1] & 0x20);
> > -   uint32_t status = 0, cid = 0, rw = 0;
> > +   uint32_t status = 0, cid = 0, rw = 0, fed = 0;
> > struct amdgpu_task_info task_info;
> > struct amdgpu_vmhub *hub;
> > const char *mmhub_cid;
> > @@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct
> amdgpu_device *adev,
> > status = RREG32(hub->vm_l2_pro_fault_status);
> > cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);
> > rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
> > +   fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS,
> FED);
> > +
> > +   /* for gfx fed error, kfd will handle it, return directly */
> > +   if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
> > +   amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2) &&
> > +   !strcmp(hub_name, "gfxhub0"))
> > +   return 1;
>
> amdgpu_irq_dispatch() gives the impression that return value of 1 is treated 
> as
> handled, hence won't be passed to kfd. The commit description says it is 
> intended
> to pass to kfd for handling.

[Tao] good catch, it should return 0 here, will update it in v2, thanks.

>
> Also, FED status check may be moved up so that it's not misunderstood as a
> regular page fault with the extra prints coming to dmesg log.
> Otherwise, poison status also needs to be added to dmesg.

[Tao] there is poison consumption dmesg log in kfd interrupt handler, no neeed 
to add extra print here.
My intention is to skip " WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1)", moving 
up the check will make the change a little bit more and I think the page fault 
log is acceptable.

>
> Thanks,
> Lijo
>
> > +
> > WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);  #ifdef
> > HAVE_STRUCT_XARRAY
> > amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
> vmhub);


[pull] amdgpu, amdkfd, radeon drm-next-6.9

2024-02-19 Thread Alex Deucher
Hi Dave, Sima,

More new stuff for 6.9.

The following changes since commit d5597444032b2f5c8624918fb5b29be5bba78a3c:

  drm/amdgpu: Fix HDP flush for VFs on nbio v7.9 (2024-02-07 12:26:24 -0500)

are available in the Git repository at:

  https://gitlab.freedesktop.org/agd5f/linux.git 
tags/amd-drm-next-6.9-2024-02-19

for you to fetch changes up to 31e0a586f3385134bcad00d8194eb0728cb1a17d:

  drm/amdgpu: add MMHUB 3.3.1 support (2024-02-19 14:50:46 -0500)


amd-drm-next-6.9-2024-02-19:

amdgpu:
- ATHUB 4.1 support
- EEPROM support updates
- RAS updates
- LSDMA 7.0 support
- JPEG DPG support
- IH 7.0 support
- HDP 7.0 support
- VCN 5.0 support
- Misc display fixes
- Retimer fixes
- DCN 3.5 fixes
- VCN 4.x fixes
- PSR fixes
- PSP 14.0 support
- VA_RESERVED cleanup
- SMU 13.0.6 updates
- NBIO 7.11 updates
- SDMA 6.1 updates
- MMHUB 3.3 updates
- Suspend/resume fixes
- DMUB updates

amdkfd:
- Trap handler enhancements
- Fix cache size reporting
- Relocate the trap handler

radeon:
- fix typo in print statement


Alex Deucher (1):
  drm/amdgpu/psp: update define to better align with its meaning

Aric Cyr (1):
  drm/amd/display: 3.2.272

Charlene Liu (2):
  drm/amd/display: enable fgcg by default
  drm/amd/display: allow psr-su/replay for z8

Dan Carpenter (1):
  drm/amd/display: Fix && vs || typos

Felix Kuehling (2):
  drm/amdgpu: Reduce VA_RESERVED_BOTTOM to 64KB
  drm/amdkfd: Relocate TBA/TMA to opposite side of VM hole

Gabe Teeger (1):
  Revert "drm/amd/display: Send DTBCLK disable message on first commit"

George Shen (1):
  Revert "drm/amd/display: Add left edge pixel for YCbCr422/420 + ODM pipe 
split"

Hamza Mahfooz (2):
  drm/amdgpu: make damage clips support configurable
  drm/amdgpu: respect the abmlevel module parameter value if it is set

Hawking Zhang (8):
  drm/amdgpu: Add athub v4_1_0 ip headers (v5)
  drm/amdgpu: Add athub v4_1_0 ip block support
  drm/amdgpu: Add lsdma v7_0_0 ip headers (v3)
  drm/amdgpu: Add osssys v7_0_0 ip headers (v4)
  drm/amdgpu: Add hdp v7_0_0 ip headers (v3)
  drm/amdgpu: Add vcn v5_0_0 ip headers (v5)
  drm/amdgpu: Add mp v14_0_2 ip headers (v5)
  drm/amdgpu: Add psp v14_0 ip block support

Jonathan Kim (1):
  drm/amdkfd: fill in data for control stack header for gfx10

Kent Russell (1):
  drm/amdkfd: Fix L2 cache size reporting in GFX9.4.3

Laurent Morichetti (1):
  drm/amdkfd: pass debug exceptions to second-level trap handler

Lijo Lazar (1):
  drm/amd/pm: Allow setting max UCLK on SMU v13.0.6

Likun Gao (16):
  drm/amd/swsmu: add judgement for vcn jpeg dpm set
  drm/amdgpu: skip ucode bo reserve for RLC AUTOLOAD
  drm/amdgpu: support rlc auotload type set
  drm/amdgpu: Add lsdma v7_0 ip block support
  drm/amdgpu/discovery: Add lsdma v7_0 ip block
  drm/amdgpu: Add ih v7_0 ip block support
  drm/amdgpu/discovery: Add ih v7_0 ip block
  drm/amdgpu: Add hdp v7_0 ip block support
  drm/amdgpu/discovery: Add hdp v7_0 ip block
  drm/amdgpu: use spirom update wait_for helper for psp v14
  drm/amdgpu: support psp ip block for psp v14
  drm/amdgpu/psp: set autoload support by default
  drm/amdgpu/psp: handle TMR type via flag
  drm/amdgpu/psp: set boot_time_tmr flag
  drm/amdgpu: add psp_timeout to limit PSP related operation
  drm/amdgpu: support psp ip block discovery for psp v14

Mario Limonciello (3):
  drm/amd: Stop evicting resources on APUs in suspend
  Revert "drm/amd: flush any delayed gfxoff on suspend entry"
  drm/amd: Change `jpeg_v4_0_5_start_dpg_mode()` to void

Martin Tsai (1):
  drm/amd/display: should support dmub hw lock on Replay

Michael Strauss (1):
  drm/amd/display: Update FIXED_VS Retimer HWSS Test Pattern Sequences

Nicholas Kazlauskas (2):
  drm/amd/display: Add shared firmware state for DMUB IPS handshake
  drm/amd/display: Increase ips2_eval delay for DCN35

Nikita Zhandarovich (2):
  drm/radeon/ni: Fix wrong firmware size logging in ni_init_microcode()
  drm/amd/display: fix NULL checks for adev->dm.dc in amdgpu_dm_fini()

Rajneesh Bhardwaj (2):
  drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards
  drm/amdgpu: Fix implicit assumtion in gfx11 debug flags

Rodrigo Siqueira (1):
  drm/amd/display: Remove break after return

Roman Li (1):
  drm/amd/display: Fix array-index-out-of-bounds in dcn35_clkmgr

Saleemkhan Jamadar (2):
  drm/amdgpu: add ucode id for jpeg DPG support
  drm/amdgpu/jpeg: add support for jpeg DPG mode

Sohaib Nadeem (2):
  Revert "drm/amd/display: increased min_dcfclk_mhz and min_fclk_mhz"
  drm/amd/display: fixed integer types and null check locations

Sonny Jiang (7):
  drm/amdgpu: add VCN_5_0_0 firmware support
  drm/amdgpu: add VCN_5_0_0 IP block supp

Re: [PATCH] drm/amdgpu: Drop redundant parameter in amdgpu_gfx_kiq_init_ring

2024-02-19 Thread Deucher, Alexander
[Public]

Reviewed-by: Alex Deucher 

From: Ma, Jun 
Sent: Monday, February 19, 2024 1:40 AM
To: amd-gfx@lists.freedesktop.org ; Koenig, 
Christian ; Deucher, Alexander 

Cc: Ma, Jun 
Subject: [PATCH] drm/amdgpu: Drop redundant parameter in 
amdgpu_gfx_kiq_init_ring

Drop redundant parameters in function amdgpu_gfx_kiq_init_ring
to simplify the code

Signed-off-by: Ma Jun 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 6 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 +---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 5 ++---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  | 5 ++---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 5 ++---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 5 ++---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 4 +---
 7 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index e114694d1131..4835d6d899e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -304,11 +304,11 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device 
*adev,
 return -EINVAL;
 }

-int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
-struct amdgpu_ring *ring,
-struct amdgpu_irq_src *irq, int xcc_id)
+int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev, int xcc_id)
 {
 struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id];
+   struct amdgpu_irq_src *irq = &kiq->irq;
+   struct amdgpu_ring *ring = &kiq->ring;
 int r = 0;

 spin_lock_init(&kiq->ring_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index f23bafec71c5..8fcf889ddce9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -471,9 +471,7 @@ static inline u32 amdgpu_gfx_create_bitmask(u32 bit_width)
 void amdgpu_gfx_parse_disable_cu(unsigned *mask, unsigned max_se,
  unsigned max_sh);

-int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
-struct amdgpu_ring *ring,
-struct amdgpu_irq_src *irq, int xcc_id);
+int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev, int xcc_id);

 void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring);

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index b02d63328f1c..691fa40e4e01 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4490,7 +4490,7 @@ static int gfx_v10_0_compute_ring_init(struct 
amdgpu_device *adev, int ring_id,
 static int gfx_v10_0_sw_init(void *handle)
 {
 int i, j, k, r, ring_id = 0;
-   struct amdgpu_kiq *kiq;
+   int xcc_id = 0;
 struct amdgpu_device *adev = (struct amdgpu_device *)handle;

 switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
@@ -4619,8 +4619,7 @@ static int gfx_v10_0_sw_init(void *handle)
 return r;
 }

-   kiq = &adev->gfx.kiq[0];
-   r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq, 0);
+   r = amdgpu_gfx_kiq_init_ring(adev, xcc_id);
 if (r)
 return r;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 2fb1342d5bd9..9d8ec709cd52 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1329,7 +1329,7 @@ static int gfx_v11_0_rlc_backdoor_autoload_enable(struct 
amdgpu_device *adev)
 static int gfx_v11_0_sw_init(void *handle)
 {
 int i, j, k, r, ring_id = 0;
-   struct amdgpu_kiq *kiq;
+   int xcc_id = 0;
 struct amdgpu_device *adev = (struct amdgpu_device *)handle;

 switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
@@ -1454,8 +1454,7 @@ static int gfx_v11_0_sw_init(void *handle)
 return r;
 }

-   kiq = &adev->gfx.kiq[0];
-   r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq, 0);
+   r = amdgpu_gfx_kiq_init_ring(adev, xcc_id);
 if (r)
 return r;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index ea174b76ee70..b97ea62212b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1900,8 +1900,8 @@ static void gfx_v8_0_sq_irq_work_func(struct work_struct 
*work);
 static int gfx_v8_0_sw_init(void *handle)
 {
 int i, j, k, r, ring_id;
+   int xcc_id = 0;
 struct amdgpu_ring *ring;
-   struct amdgpu_kiq *kiq;
 struct amdgpu_device *adev = (struct amdgpu_device *)handle;

 switch (adev->asic_type) {
@@ -2022,8 +2022,7 @@ static int gfx_v8_0_sw_init(void *handle)
 return r;
 }

-   k

Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces

2024-02-19 Thread Christian König

Am 16.02.24 um 16:13 schrieb Alex Deucher:

Use the new reset critical section accessors for debugfs, sysfs,
and the INFO IOCTL to provide proper mutual exclusivity
to hardware with respect the GPU resets.

Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  20 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  10 +
  drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 
  drivers/gpu/drm/amd/pm/amdgpu_pm.c  |  91 -
  4 files changed, 235 insertions(+), 101 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 72eceb7d6667..d0e4a8729703 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_write_killable(&adev->reset_domain->sem);
+   r = amdgpu_reset_domain_access_write_start(adev);
if (r)
return r;
  
@@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)

kthread_unpark(ring->sched.thread);
}
  
-	up_write(&adev->reset_domain->sem);

+   amdgpu_reset_domain_access_write_end(adev);
  
  	pm_runtime_mark_last_busy(dev->dev);

pm_runtime_put_autosuspend(dev->dev);
@@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_read_killable(&adev->reset_domain->sem);
+   r = amdgpu_reset_domain_access_read_start(adev);
if (r)
goto pro_end;
  
@@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)

/* restart the scheduler */
kthread_unpark(ring->sched.thread);
  
-	up_read(&adev->reset_domain->sem);

+   amdgpu_reset_domain_access_read_end(adev);
  
  pro_end:

kfree(fences);
@@ -2031,23 +2031,23 @@ static ssize_t 
amdgpu_reset_dump_register_list_read(struct file *f,
return 0;
  
  	memset(reg_offset, 0, 12);

-   ret = down_read_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_read_start(adev);
if (ret)
return ret;
  
  	for (i = 0; i < adev->reset_info.num_regs; i++) {

sprintf(reg_offset, "0x%x\n", 
adev->reset_info.reset_dump_reg_list[i]);
-   up_read(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_read_end(adev);
if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
return -EFAULT;
  
  		len += strlen(reg_offset);

-   ret = down_read_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_read_start(adev);
if (ret)
return ret;
}
  
-	up_read(&adev->reset_domain->sem);

+   amdgpu_reset_domain_access_read_end(adev);
*pos += len;
  
  	return len;

@@ -2089,14 +2089,14 @@ static ssize_t 
amdgpu_reset_dump_register_list_write(struct file *f,
ret = -ENOMEM;
goto error_free;
}
-   ret = down_write_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_write_start(adev);
if (ret)
goto error_free;
  
  	swap(adev->reset_info.reset_dump_reg_list, tmp);

swap(adev->reset_info.reset_dump_reg_value, new);
adev->reset_info.num_regs = i;
-   up_write(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_write_end(adev);
ret = size;
  
  error_free:

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index a2df3025a754..4efb44a964ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -43,6 +43,7 @@
  #include "amdgpu_gem.h"
  #include "amdgpu_display.h"
  #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
  #include "amd_pcie.h"
  
  void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev)

@@ -704,7 +705,11 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
return copy_to_user(out, &count, min(size, 4u)) ? -EFAULT : 0;
}
case AMDGPU_INFO_TIMESTAMP:
+   ret = amdgpu_reset_domain_access_read_start(adev);
+   if (ret)
+   return -EFAULT;


Probably better to return ret here and on other occasions as well.

Shouldn't make a practical different at the moment, but should we ever 
change the killable to interruptible easier to handle.


Apart from that looks like a massive cleanup to me.

Regards,
Christian.


ui64 = amdgpu_gfx_get_gpu_clock_counter(adev);
+   amdgpu_reset_domain_access_read_end(adev

Re: [PATCH 1/2] drm/amdgpu: add new accessors for GPU reset-affected critcal sections

2024-02-19 Thread Christian König

Am 16.02.24 um 16:13 schrieb Alex Deucher:

Provide helper functions for code which needs to be mutually
exclusive with GPU resets.  While we are here, move
amdgpu_in_reset in amdgpu_reset.c since that is the more
logical location for it and add documentation.

Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  2 -
  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|  1 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c |  1 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c |  1 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  5 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 86 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  6 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c  |  1 +
  drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c|  1 +
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c|  1 +
  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c|  1 +
  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c |  1 +
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  1 +
  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c   |  1 +
  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c   |  1 +
  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c|  1 +
  drivers/gpu/drm/amd/amdgpu/mes_v10_1.c|  1 +
  drivers/gpu/drm/amd/amdgpu/mes_v11_0.c|  1 +
  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c  |  1 +
  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  1 +
  .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c|  1 +
  drivers/gpu/drm/amd/pm/amdgpu_pm.c|  1 +
  .../drm/amd/pm/powerplay/inc/amd_powerplay.h  |  1 +
  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c |  1 +
  .../gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c |  1 +
  .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c|  1 +
  .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  |  1 +
  33 files changed, 121 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9246bca0a008..0e365cadcc3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1602,8 +1602,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device 
*adev)
 return adev->gmc.tmz_enabled;
  }
  
-int amdgpu_in_reset(struct amdgpu_device *adev);

-
  extern const struct attribute_group amdgpu_vram_mgr_attr_group;
  extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
  extern const struct attribute_group amdgpu_flash_attr_group;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 69810b3f1c63..78bec0b6c502 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -22,6 +22,7 @@
  #include "amdgpu.h"
  #include "amdgpu_amdkfd.h"
  #include "amdgpu_amdkfd_gfx_v10.h"
+#include "amdgpu_reset.h"
  #include "gc/gc_10_1_0_offset.h"
  #include "gc/gc_10_1_0_sh_mask.h"
  #include "athub/athub_2_0_0_offset.h"
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index ca4a6b82817f..717a6d10b01c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -22,6 +22,7 @@
  
  #include "amdgpu.h"

  #include "amdgpu_amdkfd.h"
+#include "amdgpu_reset.h"
  #include "cikd.h"
  #include "cik_sdma.h"
  #include "gfx_v7_0.h"
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index 0f3e2944edd7..d08e9c308835 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -22,6 +22,7 @@
  
  #include "amdgpu.h"

  #include "amdgpu_amdkfd.h"
+#include "amdgpu_reset.h"
  #include "gfx_v8_0.h"
  #include "gca/gfx_8_0_sh_mask.h"
  #include "gca/gfx_8_0_d.h"
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 5a35a8ca8922..b0ff1065491e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -21,6 +21,7 @@
   */
  #include "amdgpu.h"
  #include "amdgpu_amdkfd.h"
+#include "amdgpu_reset.h"
  #include "gc/gc_9_0_offset.h"
  #include "gc/gc_9_0_sh_mask.h"
  #include "vega10_enum.h"
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index e2ae9ba147ba..0eed6bb213b8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -27,6 +27,7 @@
  #include "amdgpu.h"
  #include "amdgpu_sched.h"
  #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
  #in

Re: [PATCH] drm/amd: Only allow one entity to control ABM

2024-02-19 Thread Alex Deucher
On Mon, Feb 19, 2024 at 10:19 AM Christian König
 wrote:
>
> Am 16.02.24 um 19:37 schrieb Alex Deucher:
> > On Fri, Feb 16, 2024 at 10:42 AM Christian König
> >  wrote:
> >> Am 16.02.24 um 16:12 schrieb Mario Limonciello:
> >>> On 2/16/2024 09:05, Harry Wentland wrote:
> 
>  On 2024-02-16 09:47, Christian König wrote:
> > Am 16.02.24 um 15:42 schrieb Mario Limonciello:
> >> On 2/16/2024 08:38, Christian König wrote:
> >>> Am 16.02.24 um 15:07 schrieb Mario Limonciello:
>  By exporting ABM to sysfs it's possible that DRM master and software
>  controlling the sysfs file fight over the value programmed for ABM.
> 
>  Adjust the module parameter behavior to control who control ABM:
>  -2: DRM
>  -1: sysfs (IE via software like power-profiles-daemon)
> >>> Well that sounds extremely awkward. Why should a
> >>> power-profiles-deamon has control over the panel power saving
> >>> features?
> >>>
> >>> I mean we are talking about things like reducing backlight level
> >>> when the is inactivity, don't we?
> >> We're talking about activating the ABM algorithm when the system is
> >> in power saving mode; not from inactivity.  This allows the user to
> >> squeeze out some extra power "just" in that situation.
> >>
> >> But given the comments on the other patch, I tend to agree with
> >> Harry's proposal instead that we just drop the DRM property
> >> entirely as there are no consumers of it.
> > Yeah, but even then the design to let this be controlled by an
> > userspace deamon is questionable. Stuff like that is handled inside
> > the kernel and not exposed to userspace usually.
> >
> >>> Regarding the "how" and "why" of PPD; besides this panel power savings
> >>> sysfs file there are two other things that are nominally changed.
> >>>
> >>> ACPI platform profile:
> >>> https://www.kernel.org/doc/html/latest/userspace-api/sysfs-platform_profile.html
> >>>
> >>> AMD-Pstate EPP value:
> >>> https://www.kernel.org/doc/html//latest/admin-guide/pm/amd-pstate.html
> >>>
> >>> When a user goes into "power saving" mode both of those are tweaked.
> >>> Before we introduced the EPP tweaking in PPD we did discuss a callback
> >>> within the kernel so that userspace could change "just" the ACPI
> >>> platform profile and everything else would react.  There was pushback
> >>> on this, and so instead knobs are offered for things that should be
> >>> tweaked and the userspace daemon can set up policy for what to do when
> >>> a a user uses a userspace client (such as GNOME or KDE) to change the
> >>> desired system profile.
> >> Ok, well who came up with the idea of the userspace deamon? Cause I
> >> think there will be even more push back on this approach.
> >>
> >> Basically when we go from AC to battery (or whatever) the drivers
> >> usually handle that all inside the kernel today. Involving userspace is
> >> only done when there is a need for that, e.g. inactivity detection or
> >> similar.
> > Well, we don't want policy in the kernel unless it's a platform or
> > hardware requirement.  Kernel should provide the knobs and then
> > userspace can set them however they want depending on user preference.
>
> Well, you not have the policy itself but usually the handling inside the
> kernel.
>
> In other words when I connect/disconnect AC from my laptop I can hear
> the fan changing, which is a switch in power state. Only the beep which
> comes out of the speakers as conformation is handled in userspace I think.
>
> And IIRC changing background light is also handled completely inside the
> kernel and when I close the lid the display turns off on its own and not
> because of some userspace deamon.
>
> So why is for this suddenly a userspace deamon involved?

It's a user preference.  Some people won't like ABM, some will.  They
set the policy from user space.  It's similar to the backlight level.
Some users always prefer a bright backlight regardless of AC/DC state,
others want the backlight to get brighter when on AC power.  The
kernel provides the knobs to set the ABM level and then user space can
specify the level and also device when they want it enabled (never,
only on DC, etc.).  The kernel driver for the backlight doesn't change
the backlight at AC/DC switch, userspace gets an event when the power
source changes and it then talks to the kernel to change the backlight
level.  I think lid is handled the same way.  Userspace gets a lid
event and it turns off the displays and maybe enters suspend or maybe
not depending on what the user wants.

Alex

>
> Christian.
>
> >
> > Alex
> >
> >
>  I think we'll need a bit in our kernel docs describing ABM.
>  Assumptions around what it is and does seem to lead to confusion.
> 
>  The thing is that ABM has a visual impact that not all users like. It
>  would make sense for users to be able to change the ABM level as
>  desired, and/o

Re: [PATCH] drm/amd: Only allow one entity to control ABM

2024-02-19 Thread Christian König

Am 16.02.24 um 19:37 schrieb Alex Deucher:

On Fri, Feb 16, 2024 at 10:42 AM Christian König
 wrote:

Am 16.02.24 um 16:12 schrieb Mario Limonciello:

On 2/16/2024 09:05, Harry Wentland wrote:


On 2024-02-16 09:47, Christian König wrote:

Am 16.02.24 um 15:42 schrieb Mario Limonciello:

On 2/16/2024 08:38, Christian König wrote:

Am 16.02.24 um 15:07 schrieb Mario Limonciello:

By exporting ABM to sysfs it's possible that DRM master and software
controlling the sysfs file fight over the value programmed for ABM.

Adjust the module parameter behavior to control who control ABM:
-2: DRM
-1: sysfs (IE via software like power-profiles-daemon)

Well that sounds extremely awkward. Why should a
power-profiles-deamon has control over the panel power saving
features?

I mean we are talking about things like reducing backlight level
when the is inactivity, don't we?

We're talking about activating the ABM algorithm when the system is
in power saving mode; not from inactivity.  This allows the user to
squeeze out some extra power "just" in that situation.

But given the comments on the other patch, I tend to agree with
Harry's proposal instead that we just drop the DRM property
entirely as there are no consumers of it.

Yeah, but even then the design to let this be controlled by an
userspace deamon is questionable. Stuff like that is handled inside
the kernel and not exposed to userspace usually.


Regarding the "how" and "why" of PPD; besides this panel power savings
sysfs file there are two other things that are nominally changed.

ACPI platform profile:
https://www.kernel.org/doc/html/latest/userspace-api/sysfs-platform_profile.html

AMD-Pstate EPP value:
https://www.kernel.org/doc/html//latest/admin-guide/pm/amd-pstate.html

When a user goes into "power saving" mode both of those are tweaked.
Before we introduced the EPP tweaking in PPD we did discuss a callback
within the kernel so that userspace could change "just" the ACPI
platform profile and everything else would react.  There was pushback
on this, and so instead knobs are offered for things that should be
tweaked and the userspace daemon can set up policy for what to do when
a a user uses a userspace client (such as GNOME or KDE) to change the
desired system profile.

Ok, well who came up with the idea of the userspace deamon? Cause I
think there will be even more push back on this approach.

Basically when we go from AC to battery (or whatever) the drivers
usually handle that all inside the kernel today. Involving userspace is
only done when there is a need for that, e.g. inactivity detection or
similar.

Well, we don't want policy in the kernel unless it's a platform or
hardware requirement.  Kernel should provide the knobs and then
userspace can set them however they want depending on user preference.


Well, you not have the policy itself but usually the handling inside the 
kernel.


In other words when I connect/disconnect AC from my laptop I can hear 
the fan changing, which is a switch in power state. Only the beep which 
comes out of the speakers as conformation is handled in userspace I think.


And IIRC changing background light is also handled completely inside the 
kernel and when I close the lid the display turns off on its own and not 
because of some userspace deamon.


So why is for this suddenly a userspace deamon involved?

Christian.



Alex



I think we'll need a bit in our kernel docs describing ABM.
Assumptions around what it is and does seem to lead to confusion.

The thing is that ABM has a visual impact that not all users like. It
would make sense for users to be able to change the ABM level as
desired, and/or configure their power profiles to select a certain
ABM level.

ABM will reduce the backlight and compensate by adjusting brightness
and contrast of the image. It has 5 levels: 0, 1, 2, 3, 4. 0 means
off. 4 means maximum backlight reduction. IMO, 1 and 2 look okay. 3
and 4 can be quite impactful, both to power and visual fidelity.


Aside from this patch Hamza did make some improvements to the kdoc in
the recent patches, can you read through again and see if you think it
still needs improvements?

Well what exactly is the requirement? That we have different ABM
settings for AC and battery? If yes providing different DRM properties
would sound like the right approach to me.

Regards,
Christian.


Harry


Regards,
Christian.


Regards,
Christian.


0-4: User via command line

Also introduce a Kconfig option that allows distributions to choose
the default policy that is appropriate for them.

Fixes: f97e4303da16 ("drm/amd/display: add panel_power_savings
sysfs entry to eDP connectors")
Signed-off-by: Mario Limonciello 
---
Cc: Hamza Mahfooz 
Cc: Harry Wentland 
Cc: Sun peng (Leo) Li 
drivers/gpu/drm/amd/amdgpu/Kconfig| 72
+++
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   | 23 +++---
.../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  6 +-
3 files changed, 90 insertions(+), 11 deletions(

Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)

2024-02-19 Thread Roman Benes

Hello everyone,

patch by user @fililip was posted there, but not submitted:

/"I think I'd have to submit it to the linux kernel mailing list, which 
I am kinda scared of 😅. It could be better to submit that patch to Arch 
Linux maintainers; they could include it in their kernel builds."/


Implementation of this patch can be simplified by simply setting:

|smu->min_power_limit = amdgpu_ignore_min_pcap ? 0 : 
whatever_default_smuxx;|


and then leave rest of the code unchanged(except defining 
|amdgpu_ignore_min_pcap |variable of course). Nothing tricky nor need to 
revert anything should be needed I hope. Please add it to the general 
kernel as an option, it certainly should not be related to Archlinux only.


Roman


On 2/19/24 12:15, Linux regression tracking (Thorsten Leemhuis) wrote:

On 17.02.24 14:30, Greg KH wrote:

On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote:

Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 6700XT,
mesa, archlinux) and I cannot get power cap as low as before(to 115W),
neither with Corectrl, LACT or TuxClocker and /sys have a variable read-only
even for root. This is not of above apps issue but of the kernel, I read
similar issues from other bug reports of above apps. I downgraded to v6.6.10
kernel and my 115W(under power)cap work again as before.

Any chance you can use 'git bisect' to figure out the offending change?

For the record and everyone that lands here: the cause is known now
(it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min
value") [v6.7-rc1]) and the issue afaics tracked here:

https://gitlab.freedesktop.org/drm/amd/-/issues/3183

Other mentions:
https://gitlab.freedesktop.org/drm/amd/-/issues/3137
https://gitlab.freedesktop.org/drm/amd/-/issues/2992

Haven't seen any statement from the amdgpu developers (now CCed) yet on
this there (but might have missed something!). From what I can see I
assume this will likely be somewhat tricky to handle, as a revert
overall might be a bad idea here. We'll see I guess.

Roman posted something that apparently was meant to go to the list, so
let me put it here:

"""
UPDATE: User fililip already posted patch, but it need to be merged,
discussion is on gitlab link below.

(PS: I hope I am replying correctly to "all" now? - using original addr.)



it seems that commit was already found(see user's 'fililip' comment):

https://gitlab.freedesktop.org/drm/amd/-/issues/3183
commit 1958946858a62b6b5392ed075aa219d199bcae39
Author: Ma Jun
Date:   Thu Oct 12 09:33:45 2023 +0800

 drm/amd/pm: Support for getting power1_cap_min value

 Support for getting power1_cap_min value on smu13 and smu11.
 For other Asics, we still use 0 as the default value.

 Signed-off-by: Ma Jun
 Reviewed-by: Kenneth Feng
 Signed-off-by: Alex Deucher

However, this is not good as it remove under-powering range too far. I

was getting only about 7% less performance but 90W(!) less consumption
when set to my 115W before. Also I wonder if we as a OS of options and
freedom have to stick to such very high reference for min values without
ability to override them through some sys ctrls. Commit was done by amd
guy and I wonder if because of maybe this post that I made few months
ago(business strategy?):



https://www.reddit.com/r/Amd/comments/183gye7/rx_6700xt_from_230w_to_capped_115w_at_only_10/

This is not a dangerous OC upwards where I can understand desire to

protect HW, it is downward, having min cap at 190W when card pull on
115W almost same speed is IMO crazy to deny. We don't talk about default
or reference values here either, just a move to lower the range of
options for whatever reason.

I don't know how much power you guys have over them, but please

consider either reverting this change, or give us an option to set
min_cap through say /sys (right now param is readonly, even for root).


Thank you in advance for looking into this, with regards:  Romano

"""

And while at it, let me add this issue to the tracking as well

[TLDR: I'm adding this report to the list of tracked Linux kernel
regressions; the text you find below is based on a few templates
paragraphs you might have encountered already in similar form.
See link in footer if these mails annoy you.]

Thanks for the report. To be sure the issue doesn't fall through the
cracks unnoticed, I'm adding it to regzbot, the Linux kernel regression
tracking bot:

#regzbot introduced 1958946858a62b /
#regzbot title drm: amdgpu: under-powering broke

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
--
Everything you wanna know about Linux kernel regression tracking:
https://linux-regtracking.leemhuis.info/about/#tldr
That page also explains what to do if mails like this annoy you.

Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)

2024-02-19 Thread Romano

Hello everyone,

patch by user @fililip was posted there, but not submitted:

"I think I'd have to submit it to the linux kernel mailing list, which I 
am kinda scared of 😅. It could be better to submit that patch to Arch 
Linux maintainers; they could include it in their kernel builds."


Implementation of this patch can be simplified by simply setting:

smu->min_power_limit = amdgpu_ignore_min_pcap ? 0 : whatever_default_smuxx;

and then leave rest of the code unchanged(except defining 
amdgpu_ignore_min_pcap variable of course). Nothing tricky nor need to 
revert anything should be needed I hope. Please add it to the general 
kernel as an option, it certainly should not be related to Archlinux only.


Roman



On 2/19/24 12:15, Linux regression tracking (Thorsten Leemhuis) wrote:

On 17.02.24 14:30, Greg KH wrote:

On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote:

Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 6700XT,
mesa, archlinux) and I cannot get power cap as low as before(to 115W),
neither with Corectrl, LACT or TuxClocker and /sys have a variable read-only
even for root. This is not of above apps issue but of the kernel, I read
similar issues from other bug reports of above apps. I downgraded to v6.6.10
kernel and my 115W(under power)cap work again as before.

Any chance you can use 'git bisect' to figure out the offending change?

For the record and everyone that lands here: the cause is known now
(it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min
value") [v6.7-rc1]) and the issue afaics tracked here:

https://gitlab.freedesktop.org/drm/amd/-/issues/3183

Other mentions:
https://gitlab.freedesktop.org/drm/amd/-/issues/3137
https://gitlab.freedesktop.org/drm/amd/-/issues/2992

Haven't seen any statement from the amdgpu developers (now CCed) yet on
this there (but might have missed something!). From what I can see I
assume this will likely be somewhat tricky to handle, as a revert
overall might be a bad idea here. We'll see I guess.

Roman posted something that apparently was meant to go to the list, so
let me put it here:

"""
UPDATE: User fililip already posted patch, but it need to be merged,
discussion is on gitlab link below.

(PS: I hope I am replying correctly to "all" now? - using original addr.)



it seems that commit was already found(see user's 'fililip' comment):

https://gitlab.freedesktop.org/drm/amd/-/issues/3183
commit 1958946858a62b6b5392ed075aa219d199bcae39
Author: Ma Jun 
Date:   Thu Oct 12 09:33:45 2023 +0800

 drm/amd/pm: Support for getting power1_cap_min value

 Support for getting power1_cap_min value on smu13 and smu11.
 For other Asics, we still use 0 as the default value.

 Signed-off-by: Ma Jun 
 Reviewed-by: Kenneth Feng 
 Signed-off-by: Alex Deucher 

However, this is not good as it remove under-powering range too far. I

was getting only about 7% less performance but 90W(!) less consumption
when set to my 115W before. Also I wonder if we as a OS of options and
freedom have to stick to such very high reference for min values without
ability to override them through some sys ctrls. Commit was done by amd
guy and I wonder if because of maybe this post that I made few months
ago(business strategy?):



https://www.reddit.com/r/Amd/comments/183gye7/rx_6700xt_from_230w_to_capped_115w_at_only_10/

This is not a dangerous OC upwards where I can understand desire to

protect HW, it is downward, having min cap at 190W when card pull on
115W almost same speed is IMO crazy to deny. We don't talk about default
or reference values here either, just a move to lower the range of
options for whatever reason.

I don't know how much power you guys have over them, but please

consider either reverting this change, or give us an option to set
min_cap through say /sys (right now param is readonly, even for root).


Thank you in advance for looking into this, with regards:  Romano

"""

And while at it, let me add this issue to the tracking as well

[TLDR: I'm adding this report to the list of tracked Linux kernel
regressions; the text you find below is based on a few templates
paragraphs you might have encountered already in similar form.
See link in footer if these mails annoy you.]

Thanks for the report. To be sure the issue doesn't fall through the
cracks unnoticed, I'm adding it to regzbot, the Linux kernel regression
tracking bot:

#regzbot introduced 1958946858a62b /
#regzbot title drm: amdgpu: under-powering broke

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
--
Everything you wanna know about Linux kernel regression tracking:
https://linux-regtracking.leemhuis.info/about/#tldr
That page also explains what to do if mails like this annoy you.


Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)

2024-02-19 Thread Linux regression tracking (Thorsten Leemhuis)
On 17.02.24 14:30, Greg KH wrote:
> On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote:
>> Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 6700XT,
>> mesa, archlinux) and I cannot get power cap as low as before(to 115W),
>> neither with Corectrl, LACT or TuxClocker and /sys have a variable read-only
>> even for root. This is not of above apps issue but of the kernel, I read
>> similar issues from other bug reports of above apps. I downgraded to v6.6.10
>> kernel and my 115W(under power)cap work again as before.
> 
> Any chance you can use 'git bisect' to figure out the offending change?

For the record and everyone that lands here: the cause is known now
(it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min
value") [v6.7-rc1]) and the issue afaics tracked here:

https://gitlab.freedesktop.org/drm/amd/-/issues/3183

Other mentions:
https://gitlab.freedesktop.org/drm/amd/-/issues/3137
https://gitlab.freedesktop.org/drm/amd/-/issues/2992

Haven't seen any statement from the amdgpu developers (now CCed) yet on
this there (but might have missed something!). From what I can see I
assume this will likely be somewhat tricky to handle, as a revert
overall might be a bad idea here. We'll see I guess.

Roman posted something that apparently was meant to go to the list, so
let me put it here:

"""
UPDATE: User fililip already posted patch, but it need to be merged,
discussion is on gitlab link below.

(PS: I hope I am replying correctly to "all" now? - using original addr.)


> it seems that commit was already found(see user's 'fililip' comment):
>
> https://gitlab.freedesktop.org/drm/amd/-/issues/3183
> commit 1958946858a62b6b5392ed075aa219d199bcae39
> Author: Ma Jun 
> Date:   Thu Oct 12 09:33:45 2023 +0800
>
> drm/amd/pm: Support for getting power1_cap_min value
>
> Support for getting power1_cap_min value on smu13 and smu11.
> For other Asics, we still use 0 as the default value.
>
> Signed-off-by: Ma Jun 
> Reviewed-by: Kenneth Feng 
> Signed-off-by: Alex Deucher 
>
> However, this is not good as it remove under-powering range too far. I
was getting only about 7% less performance but 90W(!) less consumption
when set to my 115W before. Also I wonder if we as a OS of options and
freedom have to stick to such very high reference for min values without
ability to override them through some sys ctrls. Commit was done by amd
guy and I wonder if because of maybe this post that I made few months
ago(business strategy?):
>
>
https://www.reddit.com/r/Amd/comments/183gye7/rx_6700xt_from_230w_to_capped_115w_at_only_10/
>
> This is not a dangerous OC upwards where I can understand desire to
protect HW, it is downward, having min cap at 190W when card pull on
115W almost same speed is IMO crazy to deny. We don't talk about default
or reference values here either, just a move to lower the range of
options for whatever reason.
>
> I don't know how much power you guys have over them, but please
consider either reverting this change, or give us an option to set
min_cap through say /sys (right now param is readonly, even for root).
>
>
> Thank you in advance for looking into this, with regards:  Romano
"""

And while at it, let me add this issue to the tracking as well

[TLDR: I'm adding this report to the list of tracked Linux kernel
regressions; the text you find below is based on a few templates
paragraphs you might have encountered already in similar form.
See link in footer if these mails annoy you.]

Thanks for the report. To be sure the issue doesn't fall through the
cracks unnoticed, I'm adding it to regzbot, the Linux kernel regression
tracking bot:

#regzbot introduced 1958946858a62b /
#regzbot title drm: amdgpu: under-powering broke

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
--
Everything you wanna know about Linux kernel regression tracking:
https://linux-regtracking.leemhuis.info/about/#tldr
That page also explains what to do if mails like this annoy you.


Re: [PATCH v3 3/9] drm/ci: mediatek: Add job to test panfrost and powervr GPU driver

2024-02-19 Thread Helen Koike




On 19/02/2024 06:39, Vignesh Raman wrote:

Hi Helen,

On 09/02/24 23:51, Helen Koike wrote:



On 30/01/2024 12:03, Vignesh Raman wrote:

For mediatek mt8173, the GPU driver is powervr and for mediatek
mt8183, the GPU driver is panfrost. So add support in drm-ci to
test panfrost and powervr GPU driver for mediatek SOCs and update
xfails. Powervr driver was merged in linux kernel, but there's no
mediatek support yet. So disable the mt8173-gpu job which uses
powervr driver.

Add panfrost specific tests to testlist and skip KMS tests for
panfrost driver since it is not a not a KMS driver. Also update
the MAINTAINERS file to include xfails for panfrost driver.

Signed-off-by: Vignesh Raman 


Hi Vignesh, thanks for your work.

I'm still wondering about a few things, please check below.


---

v2:
   - Add panfrost and PVR GPU jobs for mediatek SOC with new xfails, 
add xfail

 entry to MAINTAINERS.


Maybe we should review how the xfails failes are named. I think they 
should start with the DRIVER_NAME instead of GPU_VERSION.


For instance, consider the following job:

mediatek:mt8183-gpu:
   extends:
 - .mt8183
   variables:
 GPU_VERSION: mediatek-mt8183-gpu
 DRIVER_NAME: panfrost

And we have mediatek-mt8183-gpu-skips.txt

If there is an error, we want to notify the panfrost driver 
maintainers (and maybe not the mediatek driver maintainers), so 
MAINTAINERS file doesn't correspond to this.


Agree.



How about a naming __ ?

powervr_mediatek-mt8173_gpu-skipts.txt
mediatek_mediatek-mt8173_display-skipts.txt
panfrost_mediatek-mt8183_gpu-skips.txt
mediatek_mediatek-mt8183_display-skips.txt
...

What do you think?


Yes we can keep this naming. In this case do we still need gpu/display 
in the xfails file name?


If you think this split is not required, then I'm fine dropping it.

Regards,
Helen



Regards,
Vignesh


Re: [PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling

2024-02-19 Thread Lazar, Lijo



On 2/19/2024 1:45 PM, Tao Zhou wrote:
> Let kfd interrupt handler process it.
> 
> Signed-off-by: Tao Zhou 
> ---
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 773725a92cf1..70defc394b7b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct 
> amdgpu_device *adev,
>  {
>   bool retry_fault = !!(entry->src_data[1] & 0x80);
>   bool write_fault = !!(entry->src_data[1] & 0x20);
> - uint32_t status = 0, cid = 0, rw = 0;
> + uint32_t status = 0, cid = 0, rw = 0, fed = 0;
>   struct amdgpu_task_info task_info;
>   struct amdgpu_vmhub *hub;
>   const char *mmhub_cid;
> @@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct 
> amdgpu_device *adev,
>   status = RREG32(hub->vm_l2_pro_fault_status);
>   cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);
>   rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
> + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
> +
> + /* for gfx fed error, kfd will handle it, return directly */
> + if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
> + amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2) &&
> + !strcmp(hub_name, "gfxhub0"))
> + return 1;

amdgpu_irq_dispatch() gives the impression that return value of 1 is
treated as handled, hence won't be passed to kfd. The commit description
says it is intended to pass to kfd for handling.

Also, FED status check may be moved up so that it's not misunderstood as
a regular page fault with the extra prints coming to dmesg log.
Otherwise, poison status also needs to be added to dmesg.

Thanks,
Lijo

> +
>   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
>  #ifdef HAVE_STRUCT_XARRAY
>   amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);


Re: [PATCH] drm/amd: Drop abm_level property

2024-02-19 Thread Christian König

Am 16.02.24 um 16:33 schrieb Mario Limonciello:

This vendor specific property has never been used by userspace
software and conflicts with the panel_power_savings sysfs file.
That is a compositor and user could fight over the same data.

Fixes: f97e4303da16 ("drm/amd/display: add panel_power_savings sysfs entry to eDP 
connectors")
Suggested-by: Harry Wentland 
Signed-off-by: Mario Limonciello 


Acked-by: Christian König 


--
Cc: Hamza Mahfooz 
Cc: Sun peng (Leo) Li 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_display.c   |  8 
  drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h  |  2 --
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 14 --
  3 files changed, 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index b8fbe97efe1d..3ecc7ef95172 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -1350,14 +1350,6 @@ int amdgpu_display_modeset_create_props(struct 
amdgpu_device *adev)
 "dither",
 amdgpu_dither_enum_list, sz);
  
-	if (adev->dc_enabled) {

-   adev->mode_info.abm_level_property =
-   drm_property_create_range(adev_to_drm(adev), 0,
- "abm level", 0, 4);
-   if (!adev->mode_info.abm_level_property)
-   return -ENOMEM;
-   }
-
return 0;
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h

index 2e4911050cc5..1fe21a70ddd0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
@@ -324,8 +324,6 @@ struct amdgpu_mode_info {
struct drm_property *audio_property;
/* FMT dithering */
struct drm_property *dither_property;
-   /* Adaptive Backlight Modulation (power feature) */
-   struct drm_property *abm_level_property;
/* hardcoded DFP edid from BIOS */
struct edid *bios_hardcoded_edid;
int bios_hardcoded_edid_size;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index b9ac3d2f8029..e3b32ffba85a 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -6388,9 +6388,6 @@ int amdgpu_dm_connector_atomic_set_property(struct 
drm_connector *connector,
} else if (property == adev->mode_info.underscan_property) {
dm_new_state->underscan_enable = val;
ret = 0;
-   } else if (property == adev->mode_info.abm_level_property) {
-   dm_new_state->abm_level = val ?: ABM_LEVEL_IMMEDIATE_DISABLE;
-   ret = 0;
}
  
  	return ret;

@@ -6433,10 +6430,6 @@ int amdgpu_dm_connector_atomic_get_property(struct 
drm_connector *connector,
} else if (property == adev->mode_info.underscan_property) {
*val = dm_state->underscan_enable;
ret = 0;
-   } else if (property == adev->mode_info.abm_level_property) {
-   *val = (dm_state->abm_level != ABM_LEVEL_IMMEDIATE_DISABLE) ?
-   dm_state->abm_level : 0;
-   ret = 0;
}
  
  	return ret;

@@ -7652,13 +7645,6 @@ void amdgpu_dm_connector_init_helper(struct 
amdgpu_display_manager *dm,
aconnector->base.state->max_bpc = 16;
aconnector->base.state->max_requested_bpc = 
aconnector->base.state->max_bpc;
  
-	if (connector_type == DRM_MODE_CONNECTOR_eDP &&

-   (dc_is_dmcu_initialized(adev->dm.dc) ||
-adev->dm.dc->ctx->dmub_srv) && amdgpu_dm_abm_level < 0) {
-   drm_object_attach_property(&aconnector->base.base,
-   adev->mode_info.abm_level_property, 0);
-   }
-
if (connector_type == DRM_MODE_CONNECTOR_HDMIA) {
/* Content Type is currently only implemented for HDMI. */
drm_connector_attach_content_type_property(&aconnector->base);




[PATCH] drm/amd/display: Fix potential null pointer dereference in dc_dmub_srv

2024-02-19 Thread Srinivasan Shanmugam
Fixes potential null pointer dereference warnings in the
dc_dmub_srv_cmd_list_queue_execute() and dc_dmub_srv_is_hw_pwr_up()
functions.

In both functions, the 'dc_dmub_srv' variable was being dereferenced
before it was checked for null. This could lead to a null pointer
dereference if 'dc_dmub_srv' is null. The fix is to check if
'dc_dmub_srv' is null before dereferencing it.

Thus moving the null checks for 'dc_dmub_srv' to the beginning of the
functions to ensure that 'dc_dmub_srv' is not null when it is
dereferenced.

Found by smatch & thus fixing the below:
drivers/gpu/drm/amd/amdgpu/../display/dc/dc_dmub_srv.c:133 
dc_dmub_srv_cmd_list_queue_execute() warn: variable dereferenced before check 
'dc_dmub_srv' (see line 128)
drivers/gpu/drm/amd/amdgpu/../display/dc/dc_dmub_srv.c:1167 
dc_dmub_srv_is_hw_pwr_up() warn: variable dereferenced before check 
'dc_dmub_srv' (see line 1164)

Fixes: 01fbdc34c687 ("drm/amd/display: decouple dmcub execution to reduce lock 
granularity")
Fixes: 65138eb72e1f ("drm/amd/display: Add DCN35 DMUB")
Cc: JinZe.Xu 
Cc: Hersen Wu 
Cc: Josip Pavic 
Cc: Roman Li 
Cc: Qingqing Zhuo 
Cc: Harry Wentland 
Cc: Rodrigo Siqueira 
Cc: Aurabindo Pillai 
Cc: Tom Chung 
Signed-off-by: Srinivasan Shanmugam 
---
 drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c 
b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
index 0bc32537e2eb..8bc361e0f404 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
+++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
@@ -128,7 +128,7 @@ bool dc_dmub_srv_cmd_list_queue_execute(struct dc_dmub_srv 
*dc_dmub_srv,
unsigned int count,
union dmub_rb_cmd *cmd_list)
 {
-   struct dc_context *dc_ctx = dc_dmub_srv->ctx;
+   struct dc_context *dc_ctx;
struct dmub_srv *dmub;
enum dmub_status status;
int i;
@@ -136,6 +136,7 @@ bool dc_dmub_srv_cmd_list_queue_execute(struct dc_dmub_srv 
*dc_dmub_srv,
if (!dc_dmub_srv || !dc_dmub_srv->dmub)
return false;
 
+   dc_ctx = dc_dmub_srv->ctx;
dmub = dc_dmub_srv->dmub;
 
for (i = 0 ; i < count; i++) {
@@ -1169,12 +1170,14 @@ void dc_dmub_srv_subvp_save_surf_addr(const struct 
dc_dmub_srv *dc_dmub_srv, con
 
 bool dc_dmub_srv_is_hw_pwr_up(struct dc_dmub_srv *dc_dmub_srv, bool wait)
 {
-   struct dc_context *dc_ctx = dc_dmub_srv->ctx;
+   struct dc_context *dc_ctx;
enum dmub_status status;
 
if (!dc_dmub_srv || !dc_dmub_srv->dmub)
return true;
 
+   dc_ctx = dc_dmub_srv->ctx;
+
if (dc_dmub_srv->ctx->dc->debug.dmcub_emulation)
return true;
 
-- 
2.34.1



Re: [PATCH v3 3/9] drm/ci: mediatek: Add job to test panfrost and powervr GPU driver

2024-02-19 Thread Vignesh Raman

Hi Helen,

On 09/02/24 23:51, Helen Koike wrote:



On 30/01/2024 12:03, Vignesh Raman wrote:

For mediatek mt8173, the GPU driver is powervr and for mediatek
mt8183, the GPU driver is panfrost. So add support in drm-ci to
test panfrost and powervr GPU driver for mediatek SOCs and update
xfails. Powervr driver was merged in linux kernel, but there's no
mediatek support yet. So disable the mt8173-gpu job which uses
powervr driver.

Add panfrost specific tests to testlist and skip KMS tests for
panfrost driver since it is not a not a KMS driver. Also update
the MAINTAINERS file to include xfails for panfrost driver.

Signed-off-by: Vignesh Raman 


Hi Vignesh, thanks for your work.

I'm still wondering about a few things, please check below.


---

v2:
   - Add panfrost and PVR GPU jobs for mediatek SOC with new xfails, 
add xfail

 entry to MAINTAINERS.


Maybe we should review how the xfails failes are named. I think they 
should start with the DRIVER_NAME instead of GPU_VERSION.


For instance, consider the following job:

mediatek:mt8183-gpu:
   extends:
     - .mt8183
   variables:
     GPU_VERSION: mediatek-mt8183-gpu
     DRIVER_NAME: panfrost

And we have mediatek-mt8183-gpu-skips.txt

If there is an error, we want to notify the panfrost driver maintainers 
(and maybe not the mediatek driver maintainers), so MAINTAINERS file 
doesn't correspond to this.


Agree.



How about a naming __ ?

powervr_mediatek-mt8173_gpu-skipts.txt
mediatek_mediatek-mt8173_display-skipts.txt
panfrost_mediatek-mt8183_gpu-skips.txt
mediatek_mediatek-mt8183_display-skips.txt
...

What do you think?


Yes we can keep this naming. In this case do we still need gpu/display 
in the xfails file name?


Regards,
Vignesh


Re: [PATCH v2] drm/amd/display: add panel_power_savings sysfs entry to eDP connectors

2024-02-19 Thread Pekka Paalanen
On Fri, 16 Feb 2024 10:32:10 -0600
Mario Limonciello  wrote:

> On 2/16/2024 10:13, Harry Wentland wrote:
> > 
> > 
> > On 2024-02-16 11:11, Harry Wentland wrote:  
> >>
> >>
> >> On 2024-02-16 10:42, Pekka Paalanen wrote:  
> >>> On Fri, 16 Feb 2024 09:33:47 -0500
> >>> Harry Wentland  wrote:
> >>>  
>  On 2024-02-16 03:19, Pekka Paalanen wrote:  
> > On Fri, 2 Feb 2024 10:28:35 -0500
> > Hamza Mahfooz  wrote:
> >  
> >> We want programs besides the compositor to be able to enable or disable
> >> panel power saving features.  
> >
> > Could you also explain why, in the commit message, please?
> >
> > It is unexpected for arbitrary programs to be able to override the KMS
> > client, and certainly new ways to do so should not be added without an
> > excellent justification.
> >
> > Maybe debugfs would be more appropriate if the purpose is only testing
> > rather than production environments?
> >  
> >> However, since they are currently only
> >> configurable through DRM properties, that isn't possible. So, to remedy
> >> that issue introduce a new "panel_power_savings" sysfs attribute.  
> >
> > When the DRM property was added, what was used as the userspace to
> > prove its workings?
> >  
> 
>  I don't think there ever was a userspace implementation and doubt any
>  exists today. Part of that is on me. In hindsight, the KMS prop should
>  have never gone upstream.
> 
>  I suggest we drop the KMS prop entirely.  
> >>>
> >>> Sounds good. What about the sysfs thing? Should it be a debugfs thing
> >>> instead, assuming the below question will be resolved?
> >>>  
> >>
> >>
> >> It's intended to be used by the power profiles daemon (PPD). I don't think
> >> debugfs is the right choice. See
> >> https://gitlab.freedesktop.org/upower/power-profiles-daemon/-/commit/41ed5d33a82b0ceb7b6d473551eb2aa62cade6bc
> >>  
>  As for the color accuracy topic, I think it is important that compositors
>  can have full control over that if needed, while it's also important
>  for HW vendors to optimize for power when absolute color accuracy is not
>  needed. An average end-user writing code or working on their slides
>  would rather have a longer battery life than a perfectly color-accurate
>  display. We should probably think of a solution that can support both
>  use-cases.  
> >>>
> >>> I agree. Maybe this pondering should start from "how would it work from
> >>> end user perspective"?
> >>>
> >>> "Automatically" is probably be most desirable answer. Some kind of  
> >>
> >> I agree
> >>  
> >>> desktop settings with options like "save power at the expense of image
> >>> quality":
> >>> - always
> >>> - not if watching movies/gaming
> >>> - on battery
> >>> - on battery, unless I'm watching movies/gaming
> >>> - never
> >>>  
> >>
> >> It's interesting that you split out movies/gaming, specifically. AMD's
> >> ABM algorithm seems to have considered movies in particular when
> >> evaluating the power/fidelity trade-off.
> >>
> >> I wouldn't think consumer media is very particular about a specific
> >> color fidelity (despite what HDR specs try to make you believe). Where
> >> color fidelity would matter to me is when I'd want to edit pictures or
> >> video.
> >>
> >> The "abm_level" property that we expose is really just that, a setting
> >> for the strength of the power-savings effect, with 0 being off and 4 being
> >> maximum strength and power saving, at the expense of fidelity.
> >>
> >> Mario's work is to let the PPD control it and set the ABM levels based on
> >> the selected power profile:
> >> 0 - Performance
> >> 1 - Balance
> >> 3 - Power
> >>
> >> And I believe we've looked at disabling ABM (setting it to 0) automatically
> >> if we know we're on AC power.
> >>  
> >>> Or maybe there already is something like that, and it only needs to be
> >>> plumbed through?
> >>>
> >>> Which would point towards KMS clients needing to control it, which
> >>> means a generic KMS prop rather than vendor specific?
> >>>
> >>> Or should the desktop compositor be talking to some daemon instead of
> >>> KMS for this? Maybe they already are?
> >>>  
> >>
> >> I think the intention is for the PPD to be that daemon. Mario can 
> >> elaborate.
> >>  
> > 
> > Some more details and screenshots on how the PPD is expected to work and 
> > look:
> > https://linuxconfig.org/how-to-manage-power-profiles-over-d-bus-with-power-profiles-daemon-on-linux
> >   
> 
> Right, thanks!
> 
> The most important point is that the user indicates intent from the GUI.
> The daemon orchestrates the various knobs to get that intent.
> 
> It's intentionally very coarse - 3 power states.  The policy of what to 
> do for those states is managed by the daemon.
> 
> In the case of ABM it will only apply the policy if the daemon detects 
> the system is on battery.
> 

Sounds like sysfs is the best option, and it 

Re: [PATCH v3 9/9] drm/ci: uprev IGT and update testlist

2024-02-19 Thread Vignesh Raman

Hi Maíra,

On 10/02/24 23:50, Maíra Canal wrote:

On 2/10/24 15:17, Maíra Canal wrote:

On 1/30/24 12:03, Vignesh Raman wrote:

Uprev IGT and add amd, v3d, vc4 and vgem specific
tests to testlist. Have testlist.txt per driver
and include a base testlist so that the driver
specific tests will run only on those hardware.

Signed-off-by: Vignesh Raman 
---

v3:
   - New patch in series to uprev IGT and update testlist.

---
  drivers/gpu/drm/ci/gitlab-ci.yml  |   2 +-
  drivers/gpu/drm/ci/igt_runner.sh  |  12 +-
  drivers/gpu/drm/ci/testlist-amdgpu.txt    | 151 ++
  drivers/gpu/drm/ci/testlist-msm.txt   |  50 ++
  drivers/gpu/drm/ci/testlist-panfrost.txt  |  17 ++
  drivers/gpu/drm/ci/testlist-v3d.txt   |  73 +
  drivers/gpu/drm/ci/testlist-vc4.txt   |  49 ++
  drivers/gpu/drm/ci/testlist.txt   | 100 
  .../gpu/drm/ci/xfails/amdgpu-stoney-fails.txt |  24 ++-
  .../drm/ci/xfails/amdgpu-stoney-flakes.txt    |   9 +-
  .../gpu/drm/ci/xfails/amdgpu-stoney-skips.txt |  10 +-
  11 files changed, 427 insertions(+), 70 deletions(-)
  create mode 100644 drivers/gpu/drm/ci/testlist-amdgpu.txt
  create mode 100644 drivers/gpu/drm/ci/testlist-msm.txt
  create mode 100644 drivers/gpu/drm/ci/testlist-panfrost.txt
  create mode 100644 drivers/gpu/drm/ci/testlist-v3d.txt
  create mode 100644 drivers/gpu/drm/ci/testlist-vc4.txt

diff --git a/drivers/gpu/drm/ci/gitlab-ci.yml 
b/drivers/gpu/drm/ci/gitlab-ci.yml

index bc8cb3420476..e2b021616a8e 100644
--- a/drivers/gpu/drm/ci/gitlab-ci.yml
+++ b/drivers/gpu/drm/ci/gitlab-ci.yml
@@ -5,7 +5,7 @@ variables:
    UPSTREAM_REPO: git://anongit.freedesktop.org/drm/drm
    TARGET_BRANCH: drm-next
-  IGT_VERSION: d2af13d9f5be5ce23d996e4afd3e45990f5ab977
+  IGT_VERSION: b0cc8160ebdc87ce08b7fd83bb3c99ff7a4d8610
    DEQP_RUNNER_GIT_URL: 
https://gitlab.freedesktop.org/anholt/deqp-runner.git

    DEQP_RUNNER_GIT_TAG: v0.15.0
diff --git a/drivers/gpu/drm/ci/igt_runner.sh 
b/drivers/gpu/drm/ci/igt_runner.sh

index f001e015d135..2fd09b9b7cf6 100755
--- a/drivers/gpu/drm/ci/igt_runner.sh
+++ b/drivers/gpu/drm/ci/igt_runner.sh
@@ -64,10 +64,20 @@ if ! grep -q "core_getversion" 
/install/testlist.txt; then

  fi
  set +e
+if [ "$DRIVER_NAME" = "amdgpu" ]; then
+    TEST_LIST="/install/testlist-amdgpu.txt"
+elif [ "$DRIVER_NAME" = "msm" ]; then
+    TEST_LIST="/install/testlist-msm.txt"
+elif [ "$DRIVER_NAME" = "panfrost" ]; then
+    TEST_LIST="/install/testlist-panfrost.txt"
+else
+    TEST_LIST="/install/testlist.txt"
+fi
+


Isn't V3D and VC4 testlists missing?


Yes. We need to add ci jobs to test v3d/vc4. The initial idea was just 
to split the testlist per driver and add vc4/v3d tests so that it can be 
used in future. I will add the jobs as part of v4.



It would be nice if you could provide us a link to a working pipeline.


Will provide the working pipeline link in next series.


Also, if possible, I would like to be CCed on the next version of this
patch, as I have interest in the V3D/VC4 tests.


Sure, will do.


Ah, one thing: it would be nice to add the testlists to the MAINTAINERS
file. This way, maintainers can keep track of any changes.


Yes, will add the testlists to MAINTAINERS file.

Thanks for the review and feedback.

Regards,
Vignesh


[PATCH 4/5] amd/amdkfd: get node id for query_utcl2_poison_status

2024-02-19 Thread Tao Zhou
Obtain it from ring entry.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index 9a06c6fb6605..747cb785a7d3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -367,10 +367,11 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
   client_id == SOC15_IH_CLIENTID_UTCL2) {
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+   uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
struct kfd_hsa_memory_exception_data exception_data;
 
if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
-   
amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, 
node_id)) {
event_interrupt_poison_consumption(dev, pasid, 
client_id);
return;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 91dd5e045b51..eb94d967c532 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -413,10 +413,11 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
   client_id == SOC15_IH_CLIENTID_UTCL2) {
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+   uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
struct kfd_hsa_memory_exception_data exception_data;
 
if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
-   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+   amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, 
node_id)) {
event_interrupt_poison_consumption_v9(dev, pasid, 
client_id);
return;
}
-- 
2.34.1



[PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling

2024-02-19 Thread Tao Zhou
Let kfd interrupt handler process it.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 773725a92cf1..70defc394b7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
 {
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
-   uint32_t status = 0, cid = 0, rw = 0;
+   uint32_t status = 0, cid = 0, rw = 0, fed = 0;
struct amdgpu_task_info task_info;
struct amdgpu_vmhub *hub;
const char *mmhub_cid;
@@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
status = RREG32(hub->vm_l2_pro_fault_status);
cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);
rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
+   fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+
+   /* for gfx fed error, kfd will handle it, return directly */
+   if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
+   amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2) &&
+   !strcmp(hub_name, "gfxhub0"))
+   return 1;
+
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 #ifdef HAVE_STRUCT_XARRAY
amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
-- 
2.34.1



[PATCH 3/5] drm/amdgpu: retire gfx ras query_utcl2_poison_status

2024-02-19 Thread Tao Zhou
Replace it with related interface in gfxhub functions.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  7 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  1 -
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c| 12 
 4 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index f759a42def59..bb509b26112d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -776,10 +776,11 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct 
amdgpu_device *adev,
return 0;
 }
 
-bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
+   uint32_t node_id)
 {
-   if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
-   return adev->gfx.ras->query_utcl2_poison_status(adev);
+   if (adev->gfxhub.funcs->query_utcl2_poison_status)
+   return adev->gfxhub.funcs->query_utcl2_poison_status(adev, 
node_id);
else
return false;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index b2990470d1c6..ae1e449e5479 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -404,7 +404,8 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
-bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
+   uint32_t node_id);
 int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id);
 void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 2af2e28952db..d91f83bd7267 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -268,7 +268,6 @@ struct amdgpu_cu_info {
 struct amdgpu_gfx_ras {
struct amdgpu_ras_block_object  ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
-   bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 0d5b4133fdf7..e3ed568eaacc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1909,18 +1909,7 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct 
amdgpu_device *adev)
mutex_unlock(&adev->grbm_idx_mutex);
 }
 
-static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
-{
-   u32 status = 0;
-   struct amdgpu_vmhub *hub;
-
-   hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
-   status = RREG32(hub->vm_l2_pro_fault_status);
-   /* reset page fault status */
-   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
-   return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
-}
 
 struct amdgpu_ras_block_hw_ops  gfx_v9_4_2_ras_ops = {
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1934,5 +1923,4 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
.hw_ops = &gfx_v9_4_2_ras_ops,
},
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
-   .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
 };
-- 
2.34.1



[PATCH 2/5] drm/amdgpu: add utcl2 poison query for gfxhub

2024-02-19 Thread Tao Zhou
Implement it for gfxhub 1.0 and 1.2.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c   | 17 +
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c   | 15 +++
 3 files changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
index c7b44aeb671b..12b131a9cc42 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
@@ -38,6 +38,8 @@ struct amdgpu_gfxhub_funcs {
void (*mode2_save_regs)(struct amdgpu_device *adev);
void (*mode2_restore_regs)(struct amdgpu_device *adev);
void (*halt)(struct amdgpu_device *adev);
+   bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
+   uint32_t node_id);
 };
 
 struct amdgpu_gfxhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
index 22175da0e16a..1c14b1665c9f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
@@ -443,6 +443,22 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev)
mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32;
 }
 
+static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev,
+   uint32_t node_id)
+{
+   u32 status = 0;
+   struct amdgpu_vmhub *hub;
+
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2))
+   return false;
+
+   hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
+   status = RREG32(hub->vm_l2_pro_fault_status);
+   /* reset page fault status */
+   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+
+   return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+}
 
 const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
.get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset,
@@ -452,4 +468,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
.set_fault_enable_default = gfxhub_v1_0_set_fault_enable_default,
.init = gfxhub_v1_0_init,
.get_xgmi_info = gfxhub_v1_1_get_xgmi_info,
+   .query_utcl2_poison_status = gfxhub_v1_0_query_utcl2_poison_status,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
index 49aecdcee006..ebc96739e1c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
@@ -620,6 +620,20 @@ static int gfxhub_v1_2_get_xgmi_info(struct amdgpu_device 
*adev)
return 0;
 }
 
+static bool gfxhub_v1_2_query_utcl2_poison_status(struct amdgpu_device *adev,
+   uint32_t node_id)
+{
+   u32 fed, status;
+
+   status = RREG32_SOC15(GC, node_id, regVM_L2_PROTECTION_FAULT_STATUS);
+   fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+   /* reset page fault status */
+   WREG32_P(SOC15_REG_OFFSET(GC, node_id,
+   regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
+
+   return fed;
+}
+
 const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
.get_mc_fb_offset = gfxhub_v1_2_get_mc_fb_offset,
.setup_vm_pt_regs = gfxhub_v1_2_setup_vm_pt_regs,
@@ -628,6 +642,7 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
.set_fault_enable_default = gfxhub_v1_2_set_fault_enable_default,
.init = gfxhub_v1_2_init,
.get_xgmi_info = gfxhub_v1_2_get_xgmi_info,
+   .query_utcl2_poison_status = gfxhub_v1_2_query_utcl2_poison_status,
 };
 
 static int gfxhub_v1_2_xcp_resume(void *handle, uint32_t inst_mask)
-- 
2.34.1



[PATCH 1/5] drm/amdgpu: add new bit definitions for GC 9.0 PROTECTION_FAULT_STATUS

2024-02-19 Thread Tao Zhou
Add UCE and FED bit definitions.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
index efc16ddf274a..2dfa0e5b1aa3 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h
@@ -6822,6 +6822,8 @@
 #define VM_L2_PROTECTION_FAULT_STATUS__VMID__SHIFT 
   0x14
 #define VM_L2_PROTECTION_FAULT_STATUS__VF__SHIFT   
   0x18
 #define VM_L2_PROTECTION_FAULT_STATUS__VFID__SHIFT 
   0x19
+#define VM_L2_PROTECTION_FAULT_STATUS__UCE__SHIFT  
   0x1d
+#define VM_L2_PROTECTION_FAULT_STATUS__FED__SHIFT  
   0x1e
 #define VM_L2_PROTECTION_FAULT_STATUS__MORE_FAULTS_MASK
   0x0001L
 #define VM_L2_PROTECTION_FAULT_STATUS__WALKER_ERROR_MASK   
   0x000EL
 #define VM_L2_PROTECTION_FAULT_STATUS__PERMISSION_FAULTS_MASK  
   0x00F0L
@@ -6832,6 +6834,8 @@
 #define VM_L2_PROTECTION_FAULT_STATUS__VMID_MASK   
   0x00F0L
 #define VM_L2_PROTECTION_FAULT_STATUS__VF_MASK 
   0x0100L
 #define VM_L2_PROTECTION_FAULT_STATUS__VFID_MASK   
   0x1E00L
+#define VM_L2_PROTECTION_FAULT_STATUS__UCE_MASK
   0x2000L
+#define VM_L2_PROTECTION_FAULT_STATUS__FED_MASK
   0x4000L
 //VM_L2_PROTECTION_FAULT_ADDR_LO32
 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32__SHIFT
   0x0
 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32_MASK  
   0xL
-- 
2.34.1