RE: [PATCH 3/3] drm/amdgpu: Remove pcie bw sys entry
[AMD Official Use Only - General] Series is Reviewed-by: Hawking Zhang Regards, Hawking -Original Message- From: Kamal, Asad Sent: Friday, February 16, 2024 20:59 To: amd-gfx@lists.freedesktop.org Cc: Lazar, Lijo ; Zhang, Hawking ; Ma, Le ; Zhang, Morris ; Kamal, Asad Subject: [PATCH 3/3] drm/amdgpu: Remove pcie bw sys entry Remove pcie bw sys entry for asics not supporting such function Signed-off-by: Asad Kamal Reviewed-by: Lijo Lazar --- drivers/gpu/drm/amd/amdgpu/soc15.c | 1 - drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 7fc55e3262eb..20a4582885cc 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -895,7 +895,6 @@ static const struct amdgpu_asic_funcs aqua_vanjaram_asic_funcs = .get_config_memsize = &soc15_get_config_memsize, .need_full_reset = &soc15_need_full_reset, .init_doorbell_index = &aqua_vanjaram_doorbell_index_init, - .get_pcie_usage = &vega20_get_pcie_usage, .need_reset_on_init = &soc15_need_reset_on_init, .get_pcie_replay_count = &amdgpu_nbio_get_pcie_replay_count, .supports_baco = &soc15_supports_baco, diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 087d57850304..1ff7fc821871 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -2174,7 +2174,8 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_ *states = ATTR_STATE_UNSUPPORTED; } else if (DEVICE_ATTR_IS(pcie_bw)) { /* PCIe Perf counters won't work on APU nodes */ - if (adev->flags & AMD_IS_APU) + if (adev->flags & AMD_IS_APU || + !adev->asic_funcs->get_pcie_usage) *states = ATTR_STATE_UNSUPPORTED; } else if (DEVICE_ATTR_IS(unique_id)) { switch (gc_ver) { -- 2.42.0
Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces
On 2/16/2024 8:43 PM, Alex Deucher wrote: > Use the new reset critical section accessors for debugfs, sysfs, > and the INFO IOCTL to provide proper mutual exclusivity > to hardware with respect the GPU resets. > This looks more like a priority inversion. When the device needs reset, it doesn't make sense for reset handling to wait on these. Instead, reset should be done at the earliest. Thanks, Lijo > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 20 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 + > drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 > drivers/gpu/drm/amd/pm/amdgpu_pm.c | 91 - > 4 files changed, 235 insertions(+), 101 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > index 72eceb7d6667..d0e4a8729703 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > @@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file > *m, void *unused) > } > > /* Avoid accidently unparking the sched thread during GPU reset */ > - r = down_write_killable(&adev->reset_domain->sem); > + r = amdgpu_reset_domain_access_write_start(adev); > if (r) > return r; > > @@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file > *m, void *unused) > kthread_unpark(ring->sched.thread); > } > > - up_write(&adev->reset_domain->sem); > + amdgpu_reset_domain_access_write_end(adev); > > pm_runtime_mark_last_busy(dev->dev); > pm_runtime_put_autosuspend(dev->dev); > @@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 > val) > return -ENOMEM; > > /* Avoid accidently unparking the sched thread during GPU reset */ > - r = down_read_killable(&adev->reset_domain->sem); > + r = amdgpu_reset_domain_access_read_start(adev); > if (r) > goto pro_end; > > @@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 > val) > /* restart the scheduler */ > kthread_unpark(ring->sched.thread); > > - up_read(&adev->reset_domain->sem); > + amdgpu_reset_domain_access_read_end(adev); > > pro_end: > kfree(fences); > @@ -2031,23 +2031,23 @@ static ssize_t > amdgpu_reset_dump_register_list_read(struct file *f, > return 0; > > memset(reg_offset, 0, 12); > - ret = down_read_killable(&adev->reset_domain->sem); > + ret = amdgpu_reset_domain_access_read_start(adev); > if (ret) > return ret; > > for (i = 0; i < adev->reset_info.num_regs; i++) { > sprintf(reg_offset, "0x%x\n", > adev->reset_info.reset_dump_reg_list[i]); > - up_read(&adev->reset_domain->sem); > + amdgpu_reset_domain_access_read_end(adev); > if (copy_to_user(buf + len, reg_offset, strlen(reg_offset))) > return -EFAULT; > > len += strlen(reg_offset); > - ret = down_read_killable(&adev->reset_domain->sem); > + ret = amdgpu_reset_domain_access_read_start(adev); > if (ret) > return ret; > } > > - up_read(&adev->reset_domain->sem); > + amdgpu_reset_domain_access_read_end(adev); > *pos += len; > > return len; > @@ -2089,14 +2089,14 @@ static ssize_t > amdgpu_reset_dump_register_list_write(struct file *f, > ret = -ENOMEM; > goto error_free; > } > - ret = down_write_killable(&adev->reset_domain->sem); > + ret = amdgpu_reset_domain_access_write_start(adev); > if (ret) > goto error_free; > > swap(adev->reset_info.reset_dump_reg_list, tmp); > swap(adev->reset_info.reset_dump_reg_value, new); > adev->reset_info.num_regs = i; > - up_write(&adev->reset_domain->sem); > + amdgpu_reset_domain_access_write_end(adev); > ret = size; > > error_free: > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > index a2df3025a754..4efb44a964ef 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > @@ -43,6 +43,7 @@ > #include "amdgpu_gem.h" > #include "amdgpu_display.h" > #include "amdgpu_ras.h" > +#include "amdgpu_reset.h" > #include "amd_pcie.h" > > void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev) > @@ -704,7 +705,11 @@ int amdgpu_info_ioctl(struct drm_device *dev, void > *data, struct drm_file *filp) > return copy_to_user(out, &count, min(size, 4u)) ? -EFAULT : 0; > } > case AMDGPU_INFO_TIMESTAMP: > + ret = amdgpu_reset_domain_access_read_start(adev); > + if (ret) > + return -EFAULT; > ui64 = amdgpu_gfx_get_gpu_clock_cou
RE: [PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling
[AMD Official Use Only - General] > -Original Message- > From: Lazar, Lijo > Sent: Monday, February 19, 2024 8:40 PM > To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org > Subject: Re: [PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling > > > > On 2/19/2024 1:45 PM, Tao Zhou wrote: > > Let kfd interrupt handler process it. > > > > Signed-off-by: Tao Zhou > > --- > > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +- > > 1 file changed, 9 insertions(+), 1 deletion(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > > index 773725a92cf1..70defc394b7b 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > > @@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct > > amdgpu_device *adev, { > > bool retry_fault = !!(entry->src_data[1] & 0x80); > > bool write_fault = !!(entry->src_data[1] & 0x20); > > - uint32_t status = 0, cid = 0, rw = 0; > > + uint32_t status = 0, cid = 0, rw = 0, fed = 0; > > struct amdgpu_task_info task_info; > > struct amdgpu_vmhub *hub; > > const char *mmhub_cid; > > @@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct > amdgpu_device *adev, > > status = RREG32(hub->vm_l2_pro_fault_status); > > cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID); > > rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW); > > + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, > FED); > > + > > + /* for gfx fed error, kfd will handle it, return directly */ > > + if (fed && amdgpu_ras_is_poison_mode_supported(adev) && > > + amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2) && > > + !strcmp(hub_name, "gfxhub0")) > > + return 1; > > amdgpu_irq_dispatch() gives the impression that return value of 1 is treated > as > handled, hence won't be passed to kfd. The commit description says it is > intended > to pass to kfd for handling. [Tao] good catch, it should return 0 here, will update it in v2, thanks. > > Also, FED status check may be moved up so that it's not misunderstood as a > regular page fault with the extra prints coming to dmesg log. > Otherwise, poison status also needs to be added to dmesg. [Tao] there is poison consumption dmesg log in kfd interrupt handler, no neeed to add extra print here. My intention is to skip " WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1)", moving up the check will make the change a little bit more and I think the page fault log is acceptable. > > Thanks, > Lijo > > > + > > WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); #ifdef > > HAVE_STRUCT_XARRAY > > amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, > vmhub);
[pull] amdgpu, amdkfd, radeon drm-next-6.9
Hi Dave, Sima, More new stuff for 6.9. The following changes since commit d5597444032b2f5c8624918fb5b29be5bba78a3c: drm/amdgpu: Fix HDP flush for VFs on nbio v7.9 (2024-02-07 12:26:24 -0500) are available in the Git repository at: https://gitlab.freedesktop.org/agd5f/linux.git tags/amd-drm-next-6.9-2024-02-19 for you to fetch changes up to 31e0a586f3385134bcad00d8194eb0728cb1a17d: drm/amdgpu: add MMHUB 3.3.1 support (2024-02-19 14:50:46 -0500) amd-drm-next-6.9-2024-02-19: amdgpu: - ATHUB 4.1 support - EEPROM support updates - RAS updates - LSDMA 7.0 support - JPEG DPG support - IH 7.0 support - HDP 7.0 support - VCN 5.0 support - Misc display fixes - Retimer fixes - DCN 3.5 fixes - VCN 4.x fixes - PSR fixes - PSP 14.0 support - VA_RESERVED cleanup - SMU 13.0.6 updates - NBIO 7.11 updates - SDMA 6.1 updates - MMHUB 3.3 updates - Suspend/resume fixes - DMUB updates amdkfd: - Trap handler enhancements - Fix cache size reporting - Relocate the trap handler radeon: - fix typo in print statement Alex Deucher (1): drm/amdgpu/psp: update define to better align with its meaning Aric Cyr (1): drm/amd/display: 3.2.272 Charlene Liu (2): drm/amd/display: enable fgcg by default drm/amd/display: allow psr-su/replay for z8 Dan Carpenter (1): drm/amd/display: Fix && vs || typos Felix Kuehling (2): drm/amdgpu: Reduce VA_RESERVED_BOTTOM to 64KB drm/amdkfd: Relocate TBA/TMA to opposite side of VM hole Gabe Teeger (1): Revert "drm/amd/display: Send DTBCLK disable message on first commit" George Shen (1): Revert "drm/amd/display: Add left edge pixel for YCbCr422/420 + ODM pipe split" Hamza Mahfooz (2): drm/amdgpu: make damage clips support configurable drm/amdgpu: respect the abmlevel module parameter value if it is set Hawking Zhang (8): drm/amdgpu: Add athub v4_1_0 ip headers (v5) drm/amdgpu: Add athub v4_1_0 ip block support drm/amdgpu: Add lsdma v7_0_0 ip headers (v3) drm/amdgpu: Add osssys v7_0_0 ip headers (v4) drm/amdgpu: Add hdp v7_0_0 ip headers (v3) drm/amdgpu: Add vcn v5_0_0 ip headers (v5) drm/amdgpu: Add mp v14_0_2 ip headers (v5) drm/amdgpu: Add psp v14_0 ip block support Jonathan Kim (1): drm/amdkfd: fill in data for control stack header for gfx10 Kent Russell (1): drm/amdkfd: Fix L2 cache size reporting in GFX9.4.3 Laurent Morichetti (1): drm/amdkfd: pass debug exceptions to second-level trap handler Lijo Lazar (1): drm/amd/pm: Allow setting max UCLK on SMU v13.0.6 Likun Gao (16): drm/amd/swsmu: add judgement for vcn jpeg dpm set drm/amdgpu: skip ucode bo reserve for RLC AUTOLOAD drm/amdgpu: support rlc auotload type set drm/amdgpu: Add lsdma v7_0 ip block support drm/amdgpu/discovery: Add lsdma v7_0 ip block drm/amdgpu: Add ih v7_0 ip block support drm/amdgpu/discovery: Add ih v7_0 ip block drm/amdgpu: Add hdp v7_0 ip block support drm/amdgpu/discovery: Add hdp v7_0 ip block drm/amdgpu: use spirom update wait_for helper for psp v14 drm/amdgpu: support psp ip block for psp v14 drm/amdgpu/psp: set autoload support by default drm/amdgpu/psp: handle TMR type via flag drm/amdgpu/psp: set boot_time_tmr flag drm/amdgpu: add psp_timeout to limit PSP related operation drm/amdgpu: support psp ip block discovery for psp v14 Mario Limonciello (3): drm/amd: Stop evicting resources on APUs in suspend Revert "drm/amd: flush any delayed gfxoff on suspend entry" drm/amd: Change `jpeg_v4_0_5_start_dpg_mode()` to void Martin Tsai (1): drm/amd/display: should support dmub hw lock on Replay Michael Strauss (1): drm/amd/display: Update FIXED_VS Retimer HWSS Test Pattern Sequences Nicholas Kazlauskas (2): drm/amd/display: Add shared firmware state for DMUB IPS handshake drm/amd/display: Increase ips2_eval delay for DCN35 Nikita Zhandarovich (2): drm/radeon/ni: Fix wrong firmware size logging in ni_init_microcode() drm/amd/display: fix NULL checks for adev->dm.dc in amdgpu_dm_fini() Rajneesh Bhardwaj (2): drm/amdkfd: update SIMD distribution algo for GFXIP 9.4.2 onwards drm/amdgpu: Fix implicit assumtion in gfx11 debug flags Rodrigo Siqueira (1): drm/amd/display: Remove break after return Roman Li (1): drm/amd/display: Fix array-index-out-of-bounds in dcn35_clkmgr Saleemkhan Jamadar (2): drm/amdgpu: add ucode id for jpeg DPG support drm/amdgpu/jpeg: add support for jpeg DPG mode Sohaib Nadeem (2): Revert "drm/amd/display: increased min_dcfclk_mhz and min_fclk_mhz" drm/amd/display: fixed integer types and null check locations Sonny Jiang (7): drm/amdgpu: add VCN_5_0_0 firmware support drm/amdgpu: add VCN_5_0_0 IP block supp
Re: [PATCH] drm/amdgpu: Drop redundant parameter in amdgpu_gfx_kiq_init_ring
[Public] Reviewed-by: Alex Deucher From: Ma, Jun Sent: Monday, February 19, 2024 1:40 AM To: amd-gfx@lists.freedesktop.org ; Koenig, Christian ; Deucher, Alexander Cc: Ma, Jun Subject: [PATCH] drm/amdgpu: Drop redundant parameter in amdgpu_gfx_kiq_init_ring Drop redundant parameters in function amdgpu_gfx_kiq_init_ring to simplify the code Signed-off-by: Ma Jun --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 +--- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 4 +--- 7 files changed, 13 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index e114694d1131..4835d6d899e7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -304,11 +304,11 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev, return -EINVAL; } -int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev, -struct amdgpu_ring *ring, -struct amdgpu_irq_src *irq, int xcc_id) +int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev, int xcc_id) { struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id]; + struct amdgpu_irq_src *irq = &kiq->irq; + struct amdgpu_ring *ring = &kiq->ring; int r = 0; spin_lock_init(&kiq->ring_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index f23bafec71c5..8fcf889ddce9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -471,9 +471,7 @@ static inline u32 amdgpu_gfx_create_bitmask(u32 bit_width) void amdgpu_gfx_parse_disable_cu(unsigned *mask, unsigned max_se, unsigned max_sh); -int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev, -struct amdgpu_ring *ring, -struct amdgpu_irq_src *irq, int xcc_id); +int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev, int xcc_id); void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index b02d63328f1c..691fa40e4e01 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -4490,7 +4490,7 @@ static int gfx_v10_0_compute_ring_init(struct amdgpu_device *adev, int ring_id, static int gfx_v10_0_sw_init(void *handle) { int i, j, k, r, ring_id = 0; - struct amdgpu_kiq *kiq; + int xcc_id = 0; struct amdgpu_device *adev = (struct amdgpu_device *)handle; switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { @@ -4619,8 +4619,7 @@ static int gfx_v10_0_sw_init(void *handle) return r; } - kiq = &adev->gfx.kiq[0]; - r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq, 0); + r = amdgpu_gfx_kiq_init_ring(adev, xcc_id); if (r) return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 2fb1342d5bd9..9d8ec709cd52 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1329,7 +1329,7 @@ static int gfx_v11_0_rlc_backdoor_autoload_enable(struct amdgpu_device *adev) static int gfx_v11_0_sw_init(void *handle) { int i, j, k, r, ring_id = 0; - struct amdgpu_kiq *kiq; + int xcc_id = 0; struct amdgpu_device *adev = (struct amdgpu_device *)handle; switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { @@ -1454,8 +1454,7 @@ static int gfx_v11_0_sw_init(void *handle) return r; } - kiq = &adev->gfx.kiq[0]; - r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq, 0); + r = amdgpu_gfx_kiq_init_ring(adev, xcc_id); if (r) return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index ea174b76ee70..b97ea62212b6 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -1900,8 +1900,8 @@ static void gfx_v8_0_sq_irq_work_func(struct work_struct *work); static int gfx_v8_0_sw_init(void *handle) { int i, j, k, r, ring_id; + int xcc_id = 0; struct amdgpu_ring *ring; - struct amdgpu_kiq *kiq; struct amdgpu_device *adev = (struct amdgpu_device *)handle; switch (adev->asic_type) { @@ -2022,8 +2022,7 @@ static int gfx_v8_0_sw_init(void *handle) return r; } - k
Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces
Am 16.02.24 um 16:13 schrieb Alex Deucher: Use the new reset critical section accessors for debugfs, sysfs, and the INFO IOCTL to provide proper mutual exclusivity to hardware with respect the GPU resets. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 20 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 + drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 91 - 4 files changed, 235 insertions(+), 101 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 72eceb7d6667..d0e4a8729703 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused) } /* Avoid accidently unparking the sched thread during GPU reset */ - r = down_write_killable(&adev->reset_domain->sem); + r = amdgpu_reset_domain_access_write_start(adev); if (r) return r; @@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused) kthread_unpark(ring->sched.thread); } - up_write(&adev->reset_domain->sem); + amdgpu_reset_domain_access_write_end(adev); pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); @@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) return -ENOMEM; /* Avoid accidently unparking the sched thread during GPU reset */ - r = down_read_killable(&adev->reset_domain->sem); + r = amdgpu_reset_domain_access_read_start(adev); if (r) goto pro_end; @@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) /* restart the scheduler */ kthread_unpark(ring->sched.thread); - up_read(&adev->reset_domain->sem); + amdgpu_reset_domain_access_read_end(adev); pro_end: kfree(fences); @@ -2031,23 +2031,23 @@ static ssize_t amdgpu_reset_dump_register_list_read(struct file *f, return 0; memset(reg_offset, 0, 12); - ret = down_read_killable(&adev->reset_domain->sem); + ret = amdgpu_reset_domain_access_read_start(adev); if (ret) return ret; for (i = 0; i < adev->reset_info.num_regs; i++) { sprintf(reg_offset, "0x%x\n", adev->reset_info.reset_dump_reg_list[i]); - up_read(&adev->reset_domain->sem); + amdgpu_reset_domain_access_read_end(adev); if (copy_to_user(buf + len, reg_offset, strlen(reg_offset))) return -EFAULT; len += strlen(reg_offset); - ret = down_read_killable(&adev->reset_domain->sem); + ret = amdgpu_reset_domain_access_read_start(adev); if (ret) return ret; } - up_read(&adev->reset_domain->sem); + amdgpu_reset_domain_access_read_end(adev); *pos += len; return len; @@ -2089,14 +2089,14 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, ret = -ENOMEM; goto error_free; } - ret = down_write_killable(&adev->reset_domain->sem); + ret = amdgpu_reset_domain_access_write_start(adev); if (ret) goto error_free; swap(adev->reset_info.reset_dump_reg_list, tmp); swap(adev->reset_info.reset_dump_reg_value, new); adev->reset_info.num_regs = i; - up_write(&adev->reset_domain->sem); + amdgpu_reset_domain_access_write_end(adev); ret = size; error_free: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index a2df3025a754..4efb44a964ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -43,6 +43,7 @@ #include "amdgpu_gem.h" #include "amdgpu_display.h" #include "amdgpu_ras.h" +#include "amdgpu_reset.h" #include "amd_pcie.h" void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev) @@ -704,7 +705,11 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) return copy_to_user(out, &count, min(size, 4u)) ? -EFAULT : 0; } case AMDGPU_INFO_TIMESTAMP: + ret = amdgpu_reset_domain_access_read_start(adev); + if (ret) + return -EFAULT; Probably better to return ret here and on other occasions as well. Shouldn't make a practical different at the moment, but should we ever change the killable to interruptible easier to handle. Apart from that looks like a massive cleanup to me. Regards, Christian. ui64 = amdgpu_gfx_get_gpu_clock_counter(adev); + amdgpu_reset_domain_access_read_end(adev
Re: [PATCH 1/2] drm/amdgpu: add new accessors for GPU reset-affected critcal sections
Am 16.02.24 um 16:13 schrieb Alex Deucher: Provide helper functions for code which needs to be mutually exclusive with GPU resets. While we are here, move amdgpu_in_reset in amdgpu_reset.c since that is the more logical location for it and add documentation. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 - .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 1 + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 1 + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 1 + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 5 -- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 86 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 6 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 1 + drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c| 1 + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 1 + drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 1 + drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 1 + drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c| 1 + drivers/gpu/drm/amd/amdgpu/mes_v10_1.c| 1 + drivers/gpu/drm/amd/amdgpu/mes_v11_0.c| 1 + drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 1 + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 1 + .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c| 1 + drivers/gpu/drm/amd/pm/amdgpu_pm.c| 1 + .../drm/amd/pm/powerplay/inc/amd_powerplay.h | 1 + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 1 + .../gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 1 + .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 1 + .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 + 33 files changed, 121 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9246bca0a008..0e365cadcc3f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1602,8 +1602,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } -int amdgpu_in_reset(struct amdgpu_device *adev); - extern const struct attribute_group amdgpu_vram_mgr_attr_group; extern const struct attribute_group amdgpu_gtt_mgr_attr_group; extern const struct attribute_group amdgpu_flash_attr_group; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 69810b3f1c63..78bec0b6c502 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -22,6 +22,7 @@ #include "amdgpu.h" #include "amdgpu_amdkfd.h" #include "amdgpu_amdkfd_gfx_v10.h" +#include "amdgpu_reset.h" #include "gc/gc_10_1_0_offset.h" #include "gc/gc_10_1_0_sh_mask.h" #include "athub/athub_2_0_0_offset.h" diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index ca4a6b82817f..717a6d10b01c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -22,6 +22,7 @@ #include "amdgpu.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_reset.h" #include "cikd.h" #include "cik_sdma.h" #include "gfx_v7_0.h" diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index 0f3e2944edd7..d08e9c308835 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -22,6 +22,7 @@ #include "amdgpu.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_reset.h" #include "gfx_v8_0.h" #include "gca/gfx_8_0_sh_mask.h" #include "gca/gfx_8_0_d.h" diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 5a35a8ca8922..b0ff1065491e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -21,6 +21,7 @@ */ #include "amdgpu.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_reset.h" #include "gc/gc_9_0_offset.h" #include "gc/gc_9_0_sh_mask.h" #include "vega10_enum.h" diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index e2ae9ba147ba..0eed6bb213b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -27,6 +27,7 @@ #include "amdgpu.h" #include "amdgpu_sched.h" #include "amdgpu_ras.h" +#include "amdgpu_reset.h" #in
Re: [PATCH] drm/amd: Only allow one entity to control ABM
On Mon, Feb 19, 2024 at 10:19 AM Christian König wrote: > > Am 16.02.24 um 19:37 schrieb Alex Deucher: > > On Fri, Feb 16, 2024 at 10:42 AM Christian König > > wrote: > >> Am 16.02.24 um 16:12 schrieb Mario Limonciello: > >>> On 2/16/2024 09:05, Harry Wentland wrote: > > On 2024-02-16 09:47, Christian König wrote: > > Am 16.02.24 um 15:42 schrieb Mario Limonciello: > >> On 2/16/2024 08:38, Christian König wrote: > >>> Am 16.02.24 um 15:07 schrieb Mario Limonciello: > By exporting ABM to sysfs it's possible that DRM master and software > controlling the sysfs file fight over the value programmed for ABM. > > Adjust the module parameter behavior to control who control ABM: > -2: DRM > -1: sysfs (IE via software like power-profiles-daemon) > >>> Well that sounds extremely awkward. Why should a > >>> power-profiles-deamon has control over the panel power saving > >>> features? > >>> > >>> I mean we are talking about things like reducing backlight level > >>> when the is inactivity, don't we? > >> We're talking about activating the ABM algorithm when the system is > >> in power saving mode; not from inactivity. This allows the user to > >> squeeze out some extra power "just" in that situation. > >> > >> But given the comments on the other patch, I tend to agree with > >> Harry's proposal instead that we just drop the DRM property > >> entirely as there are no consumers of it. > > Yeah, but even then the design to let this be controlled by an > > userspace deamon is questionable. Stuff like that is handled inside > > the kernel and not exposed to userspace usually. > > > >>> Regarding the "how" and "why" of PPD; besides this panel power savings > >>> sysfs file there are two other things that are nominally changed. > >>> > >>> ACPI platform profile: > >>> https://www.kernel.org/doc/html/latest/userspace-api/sysfs-platform_profile.html > >>> > >>> AMD-Pstate EPP value: > >>> https://www.kernel.org/doc/html//latest/admin-guide/pm/amd-pstate.html > >>> > >>> When a user goes into "power saving" mode both of those are tweaked. > >>> Before we introduced the EPP tweaking in PPD we did discuss a callback > >>> within the kernel so that userspace could change "just" the ACPI > >>> platform profile and everything else would react. There was pushback > >>> on this, and so instead knobs are offered for things that should be > >>> tweaked and the userspace daemon can set up policy for what to do when > >>> a a user uses a userspace client (such as GNOME or KDE) to change the > >>> desired system profile. > >> Ok, well who came up with the idea of the userspace deamon? Cause I > >> think there will be even more push back on this approach. > >> > >> Basically when we go from AC to battery (or whatever) the drivers > >> usually handle that all inside the kernel today. Involving userspace is > >> only done when there is a need for that, e.g. inactivity detection or > >> similar. > > Well, we don't want policy in the kernel unless it's a platform or > > hardware requirement. Kernel should provide the knobs and then > > userspace can set them however they want depending on user preference. > > Well, you not have the policy itself but usually the handling inside the > kernel. > > In other words when I connect/disconnect AC from my laptop I can hear > the fan changing, which is a switch in power state. Only the beep which > comes out of the speakers as conformation is handled in userspace I think. > > And IIRC changing background light is also handled completely inside the > kernel and when I close the lid the display turns off on its own and not > because of some userspace deamon. > > So why is for this suddenly a userspace deamon involved? It's a user preference. Some people won't like ABM, some will. They set the policy from user space. It's similar to the backlight level. Some users always prefer a bright backlight regardless of AC/DC state, others want the backlight to get brighter when on AC power. The kernel provides the knobs to set the ABM level and then user space can specify the level and also device when they want it enabled (never, only on DC, etc.). The kernel driver for the backlight doesn't change the backlight at AC/DC switch, userspace gets an event when the power source changes and it then talks to the kernel to change the backlight level. I think lid is handled the same way. Userspace gets a lid event and it turns off the displays and maybe enters suspend or maybe not depending on what the user wants. Alex > > Christian. > > > > > Alex > > > > > I think we'll need a bit in our kernel docs describing ABM. > Assumptions around what it is and does seem to lead to confusion. > > The thing is that ABM has a visual impact that not all users like. It > would make sense for users to be able to change the ABM level as > desired, and/o
Re: [PATCH] drm/amd: Only allow one entity to control ABM
Am 16.02.24 um 19:37 schrieb Alex Deucher: On Fri, Feb 16, 2024 at 10:42 AM Christian König wrote: Am 16.02.24 um 16:12 schrieb Mario Limonciello: On 2/16/2024 09:05, Harry Wentland wrote: On 2024-02-16 09:47, Christian König wrote: Am 16.02.24 um 15:42 schrieb Mario Limonciello: On 2/16/2024 08:38, Christian König wrote: Am 16.02.24 um 15:07 schrieb Mario Limonciello: By exporting ABM to sysfs it's possible that DRM master and software controlling the sysfs file fight over the value programmed for ABM. Adjust the module parameter behavior to control who control ABM: -2: DRM -1: sysfs (IE via software like power-profiles-daemon) Well that sounds extremely awkward. Why should a power-profiles-deamon has control over the panel power saving features? I mean we are talking about things like reducing backlight level when the is inactivity, don't we? We're talking about activating the ABM algorithm when the system is in power saving mode; not from inactivity. This allows the user to squeeze out some extra power "just" in that situation. But given the comments on the other patch, I tend to agree with Harry's proposal instead that we just drop the DRM property entirely as there are no consumers of it. Yeah, but even then the design to let this be controlled by an userspace deamon is questionable. Stuff like that is handled inside the kernel and not exposed to userspace usually. Regarding the "how" and "why" of PPD; besides this panel power savings sysfs file there are two other things that are nominally changed. ACPI platform profile: https://www.kernel.org/doc/html/latest/userspace-api/sysfs-platform_profile.html AMD-Pstate EPP value: https://www.kernel.org/doc/html//latest/admin-guide/pm/amd-pstate.html When a user goes into "power saving" mode both of those are tweaked. Before we introduced the EPP tweaking in PPD we did discuss a callback within the kernel so that userspace could change "just" the ACPI platform profile and everything else would react. There was pushback on this, and so instead knobs are offered for things that should be tweaked and the userspace daemon can set up policy for what to do when a a user uses a userspace client (such as GNOME or KDE) to change the desired system profile. Ok, well who came up with the idea of the userspace deamon? Cause I think there will be even more push back on this approach. Basically when we go from AC to battery (or whatever) the drivers usually handle that all inside the kernel today. Involving userspace is only done when there is a need for that, e.g. inactivity detection or similar. Well, we don't want policy in the kernel unless it's a platform or hardware requirement. Kernel should provide the knobs and then userspace can set them however they want depending on user preference. Well, you not have the policy itself but usually the handling inside the kernel. In other words when I connect/disconnect AC from my laptop I can hear the fan changing, which is a switch in power state. Only the beep which comes out of the speakers as conformation is handled in userspace I think. And IIRC changing background light is also handled completely inside the kernel and when I close the lid the display turns off on its own and not because of some userspace deamon. So why is for this suddenly a userspace deamon involved? Christian. Alex I think we'll need a bit in our kernel docs describing ABM. Assumptions around what it is and does seem to lead to confusion. The thing is that ABM has a visual impact that not all users like. It would make sense for users to be able to change the ABM level as desired, and/or configure their power profiles to select a certain ABM level. ABM will reduce the backlight and compensate by adjusting brightness and contrast of the image. It has 5 levels: 0, 1, 2, 3, 4. 0 means off. 4 means maximum backlight reduction. IMO, 1 and 2 look okay. 3 and 4 can be quite impactful, both to power and visual fidelity. Aside from this patch Hamza did make some improvements to the kdoc in the recent patches, can you read through again and see if you think it still needs improvements? Well what exactly is the requirement? That we have different ABM settings for AC and battery? If yes providing different DRM properties would sound like the right approach to me. Regards, Christian. Harry Regards, Christian. Regards, Christian. 0-4: User via command line Also introduce a Kconfig option that allows distributions to choose the default policy that is appropriate for them. Fixes: f97e4303da16 ("drm/amd/display: add panel_power_savings sysfs entry to eDP connectors") Signed-off-by: Mario Limonciello --- Cc: Hamza Mahfooz Cc: Harry Wentland Cc: Sun peng (Leo) Li drivers/gpu/drm/amd/amdgpu/Kconfig| 72 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 23 +++--- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 6 +- 3 files changed, 90 insertions(+), 11 deletions(
Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)
Hello everyone, patch by user @fililip was posted there, but not submitted: /"I think I'd have to submit it to the linux kernel mailing list, which I am kinda scared of 😅. It could be better to submit that patch to Arch Linux maintainers; they could include it in their kernel builds."/ Implementation of this patch can be simplified by simply setting: |smu->min_power_limit = amdgpu_ignore_min_pcap ? 0 : whatever_default_smuxx;| and then leave rest of the code unchanged(except defining |amdgpu_ignore_min_pcap |variable of course). Nothing tricky nor need to revert anything should be needed I hope. Please add it to the general kernel as an option, it certainly should not be related to Archlinux only. Roman On 2/19/24 12:15, Linux regression tracking (Thorsten Leemhuis) wrote: On 17.02.24 14:30, Greg KH wrote: On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote: Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 6700XT, mesa, archlinux) and I cannot get power cap as low as before(to 115W), neither with Corectrl, LACT or TuxClocker and /sys have a variable read-only even for root. This is not of above apps issue but of the kernel, I read similar issues from other bug reports of above apps. I downgraded to v6.6.10 kernel and my 115W(under power)cap work again as before. Any chance you can use 'git bisect' to figure out the offending change? For the record and everyone that lands here: the cause is known now (it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min value") [v6.7-rc1]) and the issue afaics tracked here: https://gitlab.freedesktop.org/drm/amd/-/issues/3183 Other mentions: https://gitlab.freedesktop.org/drm/amd/-/issues/3137 https://gitlab.freedesktop.org/drm/amd/-/issues/2992 Haven't seen any statement from the amdgpu developers (now CCed) yet on this there (but might have missed something!). From what I can see I assume this will likely be somewhat tricky to handle, as a revert overall might be a bad idea here. We'll see I guess. Roman posted something that apparently was meant to go to the list, so let me put it here: """ UPDATE: User fililip already posted patch, but it need to be merged, discussion is on gitlab link below. (PS: I hope I am replying correctly to "all" now? - using original addr.) it seems that commit was already found(see user's 'fililip' comment): https://gitlab.freedesktop.org/drm/amd/-/issues/3183 commit 1958946858a62b6b5392ed075aa219d199bcae39 Author: Ma Jun Date: Thu Oct 12 09:33:45 2023 +0800 drm/amd/pm: Support for getting power1_cap_min value Support for getting power1_cap_min value on smu13 and smu11. For other Asics, we still use 0 as the default value. Signed-off-by: Ma Jun Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher However, this is not good as it remove under-powering range too far. I was getting only about 7% less performance but 90W(!) less consumption when set to my 115W before. Also I wonder if we as a OS of options and freedom have to stick to such very high reference for min values without ability to override them through some sys ctrls. Commit was done by amd guy and I wonder if because of maybe this post that I made few months ago(business strategy?): https://www.reddit.com/r/Amd/comments/183gye7/rx_6700xt_from_230w_to_capped_115w_at_only_10/ This is not a dangerous OC upwards where I can understand desire to protect HW, it is downward, having min cap at 190W when card pull on 115W almost same speed is IMO crazy to deny. We don't talk about default or reference values here either, just a move to lower the range of options for whatever reason. I don't know how much power you guys have over them, but please consider either reverting this change, or give us an option to set min_cap through say /sys (right now param is readonly, even for root). Thank you in advance for looking into this, with regards: Romano """ And while at it, let me add this issue to the tracking as well [TLDR: I'm adding this report to the list of tracked Linux kernel regressions; the text you find below is based on a few templates paragraphs you might have encountered already in similar form. See link in footer if these mails annoy you.] Thanks for the report. To be sure the issue doesn't fall through the cracks unnoticed, I'm adding it to regzbot, the Linux kernel regression tracking bot: #regzbot introduced 1958946858a62b / #regzbot title drm: amdgpu: under-powering broke Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat) -- Everything you wanna know about Linux kernel regression tracking: https://linux-regtracking.leemhuis.info/about/#tldr That page also explains what to do if mails like this annoy you.
Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)
Hello everyone, patch by user @fililip was posted there, but not submitted: "I think I'd have to submit it to the linux kernel mailing list, which I am kinda scared of 😅. It could be better to submit that patch to Arch Linux maintainers; they could include it in their kernel builds." Implementation of this patch can be simplified by simply setting: smu->min_power_limit = amdgpu_ignore_min_pcap ? 0 : whatever_default_smuxx; and then leave rest of the code unchanged(except defining amdgpu_ignore_min_pcap variable of course). Nothing tricky nor need to revert anything should be needed I hope. Please add it to the general kernel as an option, it certainly should not be related to Archlinux only. Roman On 2/19/24 12:15, Linux regression tracking (Thorsten Leemhuis) wrote: On 17.02.24 14:30, Greg KH wrote: On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote: Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 6700XT, mesa, archlinux) and I cannot get power cap as low as before(to 115W), neither with Corectrl, LACT or TuxClocker and /sys have a variable read-only even for root. This is not of above apps issue but of the kernel, I read similar issues from other bug reports of above apps. I downgraded to v6.6.10 kernel and my 115W(under power)cap work again as before. Any chance you can use 'git bisect' to figure out the offending change? For the record and everyone that lands here: the cause is known now (it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min value") [v6.7-rc1]) and the issue afaics tracked here: https://gitlab.freedesktop.org/drm/amd/-/issues/3183 Other mentions: https://gitlab.freedesktop.org/drm/amd/-/issues/3137 https://gitlab.freedesktop.org/drm/amd/-/issues/2992 Haven't seen any statement from the amdgpu developers (now CCed) yet on this there (but might have missed something!). From what I can see I assume this will likely be somewhat tricky to handle, as a revert overall might be a bad idea here. We'll see I guess. Roman posted something that apparently was meant to go to the list, so let me put it here: """ UPDATE: User fililip already posted patch, but it need to be merged, discussion is on gitlab link below. (PS: I hope I am replying correctly to "all" now? - using original addr.) it seems that commit was already found(see user's 'fililip' comment): https://gitlab.freedesktop.org/drm/amd/-/issues/3183 commit 1958946858a62b6b5392ed075aa219d199bcae39 Author: Ma Jun Date: Thu Oct 12 09:33:45 2023 +0800 drm/amd/pm: Support for getting power1_cap_min value Support for getting power1_cap_min value on smu13 and smu11. For other Asics, we still use 0 as the default value. Signed-off-by: Ma Jun Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher However, this is not good as it remove under-powering range too far. I was getting only about 7% less performance but 90W(!) less consumption when set to my 115W before. Also I wonder if we as a OS of options and freedom have to stick to such very high reference for min values without ability to override them through some sys ctrls. Commit was done by amd guy and I wonder if because of maybe this post that I made few months ago(business strategy?): https://www.reddit.com/r/Amd/comments/183gye7/rx_6700xt_from_230w_to_capped_115w_at_only_10/ This is not a dangerous OC upwards where I can understand desire to protect HW, it is downward, having min cap at 190W when card pull on 115W almost same speed is IMO crazy to deny. We don't talk about default or reference values here either, just a move to lower the range of options for whatever reason. I don't know how much power you guys have over them, but please consider either reverting this change, or give us an option to set min_cap through say /sys (right now param is readonly, even for root). Thank you in advance for looking into this, with regards: Romano """ And while at it, let me add this issue to the tracking as well [TLDR: I'm adding this report to the list of tracked Linux kernel regressions; the text you find below is based on a few templates paragraphs you might have encountered already in similar form. See link in footer if these mails annoy you.] Thanks for the report. To be sure the issue doesn't fall through the cracks unnoticed, I'm adding it to regzbot, the Linux kernel regression tracking bot: #regzbot introduced 1958946858a62b / #regzbot title drm: amdgpu: under-powering broke Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat) -- Everything you wanna know about Linux kernel regression tracking: https://linux-regtracking.leemhuis.info/about/#tldr That page also explains what to do if mails like this annoy you.
Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)
On 17.02.24 14:30, Greg KH wrote: > On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote: >> Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 6700XT, >> mesa, archlinux) and I cannot get power cap as low as before(to 115W), >> neither with Corectrl, LACT or TuxClocker and /sys have a variable read-only >> even for root. This is not of above apps issue but of the kernel, I read >> similar issues from other bug reports of above apps. I downgraded to v6.6.10 >> kernel and my 115W(under power)cap work again as before. > > Any chance you can use 'git bisect' to figure out the offending change? For the record and everyone that lands here: the cause is known now (it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min value") [v6.7-rc1]) and the issue afaics tracked here: https://gitlab.freedesktop.org/drm/amd/-/issues/3183 Other mentions: https://gitlab.freedesktop.org/drm/amd/-/issues/3137 https://gitlab.freedesktop.org/drm/amd/-/issues/2992 Haven't seen any statement from the amdgpu developers (now CCed) yet on this there (but might have missed something!). From what I can see I assume this will likely be somewhat tricky to handle, as a revert overall might be a bad idea here. We'll see I guess. Roman posted something that apparently was meant to go to the list, so let me put it here: """ UPDATE: User fililip already posted patch, but it need to be merged, discussion is on gitlab link below. (PS: I hope I am replying correctly to "all" now? - using original addr.) > it seems that commit was already found(see user's 'fililip' comment): > > https://gitlab.freedesktop.org/drm/amd/-/issues/3183 > commit 1958946858a62b6b5392ed075aa219d199bcae39 > Author: Ma Jun > Date: Thu Oct 12 09:33:45 2023 +0800 > > drm/amd/pm: Support for getting power1_cap_min value > > Support for getting power1_cap_min value on smu13 and smu11. > For other Asics, we still use 0 as the default value. > > Signed-off-by: Ma Jun > Reviewed-by: Kenneth Feng > Signed-off-by: Alex Deucher > > However, this is not good as it remove under-powering range too far. I was getting only about 7% less performance but 90W(!) less consumption when set to my 115W before. Also I wonder if we as a OS of options and freedom have to stick to such very high reference for min values without ability to override them through some sys ctrls. Commit was done by amd guy and I wonder if because of maybe this post that I made few months ago(business strategy?): > > https://www.reddit.com/r/Amd/comments/183gye7/rx_6700xt_from_230w_to_capped_115w_at_only_10/ > > This is not a dangerous OC upwards where I can understand desire to protect HW, it is downward, having min cap at 190W when card pull on 115W almost same speed is IMO crazy to deny. We don't talk about default or reference values here either, just a move to lower the range of options for whatever reason. > > I don't know how much power you guys have over them, but please consider either reverting this change, or give us an option to set min_cap through say /sys (right now param is readonly, even for root). > > > Thank you in advance for looking into this, with regards: Romano """ And while at it, let me add this issue to the tracking as well [TLDR: I'm adding this report to the list of tracked Linux kernel regressions; the text you find below is based on a few templates paragraphs you might have encountered already in similar form. See link in footer if these mails annoy you.] Thanks for the report. To be sure the issue doesn't fall through the cracks unnoticed, I'm adding it to regzbot, the Linux kernel regression tracking bot: #regzbot introduced 1958946858a62b / #regzbot title drm: amdgpu: under-powering broke Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat) -- Everything you wanna know about Linux kernel regression tracking: https://linux-regtracking.leemhuis.info/about/#tldr That page also explains what to do if mails like this annoy you.
Re: [PATCH v3 3/9] drm/ci: mediatek: Add job to test panfrost and powervr GPU driver
On 19/02/2024 06:39, Vignesh Raman wrote: Hi Helen, On 09/02/24 23:51, Helen Koike wrote: On 30/01/2024 12:03, Vignesh Raman wrote: For mediatek mt8173, the GPU driver is powervr and for mediatek mt8183, the GPU driver is panfrost. So add support in drm-ci to test panfrost and powervr GPU driver for mediatek SOCs and update xfails. Powervr driver was merged in linux kernel, but there's no mediatek support yet. So disable the mt8173-gpu job which uses powervr driver. Add panfrost specific tests to testlist and skip KMS tests for panfrost driver since it is not a not a KMS driver. Also update the MAINTAINERS file to include xfails for panfrost driver. Signed-off-by: Vignesh Raman Hi Vignesh, thanks for your work. I'm still wondering about a few things, please check below. --- v2: - Add panfrost and PVR GPU jobs for mediatek SOC with new xfails, add xfail entry to MAINTAINERS. Maybe we should review how the xfails failes are named. I think they should start with the DRIVER_NAME instead of GPU_VERSION. For instance, consider the following job: mediatek:mt8183-gpu: extends: - .mt8183 variables: GPU_VERSION: mediatek-mt8183-gpu DRIVER_NAME: panfrost And we have mediatek-mt8183-gpu-skips.txt If there is an error, we want to notify the panfrost driver maintainers (and maybe not the mediatek driver maintainers), so MAINTAINERS file doesn't correspond to this. Agree. How about a naming __ ? powervr_mediatek-mt8173_gpu-skipts.txt mediatek_mediatek-mt8173_display-skipts.txt panfrost_mediatek-mt8183_gpu-skips.txt mediatek_mediatek-mt8183_display-skips.txt ... What do you think? Yes we can keep this naming. In this case do we still need gpu/display in the xfails file name? If you think this split is not required, then I'm fine dropping it. Regards, Helen Regards, Vignesh
Re: [PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling
On 2/19/2024 1:45 PM, Tao Zhou wrote: > Let kfd interrupt handler process it. > > Signed-off-by: Tao Zhou > --- > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +- > 1 file changed, 9 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index 773725a92cf1..70defc394b7b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct > amdgpu_device *adev, > { > bool retry_fault = !!(entry->src_data[1] & 0x80); > bool write_fault = !!(entry->src_data[1] & 0x20); > - uint32_t status = 0, cid = 0, rw = 0; > + uint32_t status = 0, cid = 0, rw = 0, fed = 0; > struct amdgpu_task_info task_info; > struct amdgpu_vmhub *hub; > const char *mmhub_cid; > @@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct > amdgpu_device *adev, > status = RREG32(hub->vm_l2_pro_fault_status); > cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID); > rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW); > + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); > + > + /* for gfx fed error, kfd will handle it, return directly */ > + if (fed && amdgpu_ras_is_poison_mode_supported(adev) && > + amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2) && > + !strcmp(hub_name, "gfxhub0")) > + return 1; amdgpu_irq_dispatch() gives the impression that return value of 1 is treated as handled, hence won't be passed to kfd. The commit description says it is intended to pass to kfd for handling. Also, FED status check may be moved up so that it's not misunderstood as a regular page fault with the extra prints coming to dmesg log. Otherwise, poison status also needs to be added to dmesg. Thanks, Lijo > + > WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); > #ifdef HAVE_STRUCT_XARRAY > amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
Re: [PATCH] drm/amd: Drop abm_level property
Am 16.02.24 um 16:33 schrieb Mario Limonciello: This vendor specific property has never been used by userspace software and conflicts with the panel_power_savings sysfs file. That is a compositor and user could fight over the same data. Fixes: f97e4303da16 ("drm/amd/display: add panel_power_savings sysfs entry to eDP connectors") Suggested-by: Harry Wentland Signed-off-by: Mario Limonciello Acked-by: Christian König -- Cc: Hamza Mahfooz Cc: Sun peng (Leo) Li --- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 8 drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h | 2 -- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 14 -- 3 files changed, 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index b8fbe97efe1d..3ecc7ef95172 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -1350,14 +1350,6 @@ int amdgpu_display_modeset_create_props(struct amdgpu_device *adev) "dither", amdgpu_dither_enum_list, sz); - if (adev->dc_enabled) { - adev->mode_info.abm_level_property = - drm_property_create_range(adev_to_drm(adev), 0, - "abm level", 0, 4); - if (!adev->mode_info.abm_level_property) - return -ENOMEM; - } - return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h index 2e4911050cc5..1fe21a70ddd0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h @@ -324,8 +324,6 @@ struct amdgpu_mode_info { struct drm_property *audio_property; /* FMT dithering */ struct drm_property *dither_property; - /* Adaptive Backlight Modulation (power feature) */ - struct drm_property *abm_level_property; /* hardcoded DFP edid from BIOS */ struct edid *bios_hardcoded_edid; int bios_hardcoded_edid_size; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index b9ac3d2f8029..e3b32ffba85a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6388,9 +6388,6 @@ int amdgpu_dm_connector_atomic_set_property(struct drm_connector *connector, } else if (property == adev->mode_info.underscan_property) { dm_new_state->underscan_enable = val; ret = 0; - } else if (property == adev->mode_info.abm_level_property) { - dm_new_state->abm_level = val ?: ABM_LEVEL_IMMEDIATE_DISABLE; - ret = 0; } return ret; @@ -6433,10 +6430,6 @@ int amdgpu_dm_connector_atomic_get_property(struct drm_connector *connector, } else if (property == adev->mode_info.underscan_property) { *val = dm_state->underscan_enable; ret = 0; - } else if (property == adev->mode_info.abm_level_property) { - *val = (dm_state->abm_level != ABM_LEVEL_IMMEDIATE_DISABLE) ? - dm_state->abm_level : 0; - ret = 0; } return ret; @@ -7652,13 +7645,6 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, aconnector->base.state->max_bpc = 16; aconnector->base.state->max_requested_bpc = aconnector->base.state->max_bpc; - if (connector_type == DRM_MODE_CONNECTOR_eDP && - (dc_is_dmcu_initialized(adev->dm.dc) || -adev->dm.dc->ctx->dmub_srv) && amdgpu_dm_abm_level < 0) { - drm_object_attach_property(&aconnector->base.base, - adev->mode_info.abm_level_property, 0); - } - if (connector_type == DRM_MODE_CONNECTOR_HDMIA) { /* Content Type is currently only implemented for HDMI. */ drm_connector_attach_content_type_property(&aconnector->base);
[PATCH] drm/amd/display: Fix potential null pointer dereference in dc_dmub_srv
Fixes potential null pointer dereference warnings in the dc_dmub_srv_cmd_list_queue_execute() and dc_dmub_srv_is_hw_pwr_up() functions. In both functions, the 'dc_dmub_srv' variable was being dereferenced before it was checked for null. This could lead to a null pointer dereference if 'dc_dmub_srv' is null. The fix is to check if 'dc_dmub_srv' is null before dereferencing it. Thus moving the null checks for 'dc_dmub_srv' to the beginning of the functions to ensure that 'dc_dmub_srv' is not null when it is dereferenced. Found by smatch & thus fixing the below: drivers/gpu/drm/amd/amdgpu/../display/dc/dc_dmub_srv.c:133 dc_dmub_srv_cmd_list_queue_execute() warn: variable dereferenced before check 'dc_dmub_srv' (see line 128) drivers/gpu/drm/amd/amdgpu/../display/dc/dc_dmub_srv.c:1167 dc_dmub_srv_is_hw_pwr_up() warn: variable dereferenced before check 'dc_dmub_srv' (see line 1164) Fixes: 01fbdc34c687 ("drm/amd/display: decouple dmcub execution to reduce lock granularity") Fixes: 65138eb72e1f ("drm/amd/display: Add DCN35 DMUB") Cc: JinZe.Xu Cc: Hersen Wu Cc: Josip Pavic Cc: Roman Li Cc: Qingqing Zhuo Cc: Harry Wentland Cc: Rodrigo Siqueira Cc: Aurabindo Pillai Cc: Tom Chung Signed-off-by: Srinivasan Shanmugam --- drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c index 0bc32537e2eb..8bc361e0f404 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c @@ -128,7 +128,7 @@ bool dc_dmub_srv_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_srv, unsigned int count, union dmub_rb_cmd *cmd_list) { - struct dc_context *dc_ctx = dc_dmub_srv->ctx; + struct dc_context *dc_ctx; struct dmub_srv *dmub; enum dmub_status status; int i; @@ -136,6 +136,7 @@ bool dc_dmub_srv_cmd_list_queue_execute(struct dc_dmub_srv *dc_dmub_srv, if (!dc_dmub_srv || !dc_dmub_srv->dmub) return false; + dc_ctx = dc_dmub_srv->ctx; dmub = dc_dmub_srv->dmub; for (i = 0 ; i < count; i++) { @@ -1169,12 +1170,14 @@ void dc_dmub_srv_subvp_save_surf_addr(const struct dc_dmub_srv *dc_dmub_srv, con bool dc_dmub_srv_is_hw_pwr_up(struct dc_dmub_srv *dc_dmub_srv, bool wait) { - struct dc_context *dc_ctx = dc_dmub_srv->ctx; + struct dc_context *dc_ctx; enum dmub_status status; if (!dc_dmub_srv || !dc_dmub_srv->dmub) return true; + dc_ctx = dc_dmub_srv->ctx; + if (dc_dmub_srv->ctx->dc->debug.dmcub_emulation) return true; -- 2.34.1
Re: [PATCH v3 3/9] drm/ci: mediatek: Add job to test panfrost and powervr GPU driver
Hi Helen, On 09/02/24 23:51, Helen Koike wrote: On 30/01/2024 12:03, Vignesh Raman wrote: For mediatek mt8173, the GPU driver is powervr and for mediatek mt8183, the GPU driver is panfrost. So add support in drm-ci to test panfrost and powervr GPU driver for mediatek SOCs and update xfails. Powervr driver was merged in linux kernel, but there's no mediatek support yet. So disable the mt8173-gpu job which uses powervr driver. Add panfrost specific tests to testlist and skip KMS tests for panfrost driver since it is not a not a KMS driver. Also update the MAINTAINERS file to include xfails for panfrost driver. Signed-off-by: Vignesh Raman Hi Vignesh, thanks for your work. I'm still wondering about a few things, please check below. --- v2: - Add panfrost and PVR GPU jobs for mediatek SOC with new xfails, add xfail entry to MAINTAINERS. Maybe we should review how the xfails failes are named. I think they should start with the DRIVER_NAME instead of GPU_VERSION. For instance, consider the following job: mediatek:mt8183-gpu: extends: - .mt8183 variables: GPU_VERSION: mediatek-mt8183-gpu DRIVER_NAME: panfrost And we have mediatek-mt8183-gpu-skips.txt If there is an error, we want to notify the panfrost driver maintainers (and maybe not the mediatek driver maintainers), so MAINTAINERS file doesn't correspond to this. Agree. How about a naming __ ? powervr_mediatek-mt8173_gpu-skipts.txt mediatek_mediatek-mt8173_display-skipts.txt panfrost_mediatek-mt8183_gpu-skips.txt mediatek_mediatek-mt8183_display-skips.txt ... What do you think? Yes we can keep this naming. In this case do we still need gpu/display in the xfails file name? Regards, Vignesh
Re: [PATCH v2] drm/amd/display: add panel_power_savings sysfs entry to eDP connectors
On Fri, 16 Feb 2024 10:32:10 -0600 Mario Limonciello wrote: > On 2/16/2024 10:13, Harry Wentland wrote: > > > > > > On 2024-02-16 11:11, Harry Wentland wrote: > >> > >> > >> On 2024-02-16 10:42, Pekka Paalanen wrote: > >>> On Fri, 16 Feb 2024 09:33:47 -0500 > >>> Harry Wentland wrote: > >>> > On 2024-02-16 03:19, Pekka Paalanen wrote: > > On Fri, 2 Feb 2024 10:28:35 -0500 > > Hamza Mahfooz wrote: > > > >> We want programs besides the compositor to be able to enable or disable > >> panel power saving features. > > > > Could you also explain why, in the commit message, please? > > > > It is unexpected for arbitrary programs to be able to override the KMS > > client, and certainly new ways to do so should not be added without an > > excellent justification. > > > > Maybe debugfs would be more appropriate if the purpose is only testing > > rather than production environments? > > > >> However, since they are currently only > >> configurable through DRM properties, that isn't possible. So, to remedy > >> that issue introduce a new "panel_power_savings" sysfs attribute. > > > > When the DRM property was added, what was used as the userspace to > > prove its workings? > > > > I don't think there ever was a userspace implementation and doubt any > exists today. Part of that is on me. In hindsight, the KMS prop should > have never gone upstream. > > I suggest we drop the KMS prop entirely. > >>> > >>> Sounds good. What about the sysfs thing? Should it be a debugfs thing > >>> instead, assuming the below question will be resolved? > >>> > >> > >> > >> It's intended to be used by the power profiles daemon (PPD). I don't think > >> debugfs is the right choice. See > >> https://gitlab.freedesktop.org/upower/power-profiles-daemon/-/commit/41ed5d33a82b0ceb7b6d473551eb2aa62cade6bc > >> > As for the color accuracy topic, I think it is important that compositors > can have full control over that if needed, while it's also important > for HW vendors to optimize for power when absolute color accuracy is not > needed. An average end-user writing code or working on their slides > would rather have a longer battery life than a perfectly color-accurate > display. We should probably think of a solution that can support both > use-cases. > >>> > >>> I agree. Maybe this pondering should start from "how would it work from > >>> end user perspective"? > >>> > >>> "Automatically" is probably be most desirable answer. Some kind of > >> > >> I agree > >> > >>> desktop settings with options like "save power at the expense of image > >>> quality": > >>> - always > >>> - not if watching movies/gaming > >>> - on battery > >>> - on battery, unless I'm watching movies/gaming > >>> - never > >>> > >> > >> It's interesting that you split out movies/gaming, specifically. AMD's > >> ABM algorithm seems to have considered movies in particular when > >> evaluating the power/fidelity trade-off. > >> > >> I wouldn't think consumer media is very particular about a specific > >> color fidelity (despite what HDR specs try to make you believe). Where > >> color fidelity would matter to me is when I'd want to edit pictures or > >> video. > >> > >> The "abm_level" property that we expose is really just that, a setting > >> for the strength of the power-savings effect, with 0 being off and 4 being > >> maximum strength and power saving, at the expense of fidelity. > >> > >> Mario's work is to let the PPD control it and set the ABM levels based on > >> the selected power profile: > >> 0 - Performance > >> 1 - Balance > >> 3 - Power > >> > >> And I believe we've looked at disabling ABM (setting it to 0) automatically > >> if we know we're on AC power. > >> > >>> Or maybe there already is something like that, and it only needs to be > >>> plumbed through? > >>> > >>> Which would point towards KMS clients needing to control it, which > >>> means a generic KMS prop rather than vendor specific? > >>> > >>> Or should the desktop compositor be talking to some daemon instead of > >>> KMS for this? Maybe they already are? > >>> > >> > >> I think the intention is for the PPD to be that daemon. Mario can > >> elaborate. > >> > > > > Some more details and screenshots on how the PPD is expected to work and > > look: > > https://linuxconfig.org/how-to-manage-power-profiles-over-d-bus-with-power-profiles-daemon-on-linux > > > > Right, thanks! > > The most important point is that the user indicates intent from the GUI. > The daemon orchestrates the various knobs to get that intent. > > It's intentionally very coarse - 3 power states. The policy of what to > do for those states is managed by the daemon. > > In the case of ABM it will only apply the policy if the daemon detects > the system is on battery. > Sounds like sysfs is the best option, and it
Re: [PATCH v3 9/9] drm/ci: uprev IGT and update testlist
Hi Maíra, On 10/02/24 23:50, Maíra Canal wrote: On 2/10/24 15:17, Maíra Canal wrote: On 1/30/24 12:03, Vignesh Raman wrote: Uprev IGT and add amd, v3d, vc4 and vgem specific tests to testlist. Have testlist.txt per driver and include a base testlist so that the driver specific tests will run only on those hardware. Signed-off-by: Vignesh Raman --- v3: - New patch in series to uprev IGT and update testlist. --- drivers/gpu/drm/ci/gitlab-ci.yml | 2 +- drivers/gpu/drm/ci/igt_runner.sh | 12 +- drivers/gpu/drm/ci/testlist-amdgpu.txt | 151 ++ drivers/gpu/drm/ci/testlist-msm.txt | 50 ++ drivers/gpu/drm/ci/testlist-panfrost.txt | 17 ++ drivers/gpu/drm/ci/testlist-v3d.txt | 73 + drivers/gpu/drm/ci/testlist-vc4.txt | 49 ++ drivers/gpu/drm/ci/testlist.txt | 100 .../gpu/drm/ci/xfails/amdgpu-stoney-fails.txt | 24 ++- .../drm/ci/xfails/amdgpu-stoney-flakes.txt | 9 +- .../gpu/drm/ci/xfails/amdgpu-stoney-skips.txt | 10 +- 11 files changed, 427 insertions(+), 70 deletions(-) create mode 100644 drivers/gpu/drm/ci/testlist-amdgpu.txt create mode 100644 drivers/gpu/drm/ci/testlist-msm.txt create mode 100644 drivers/gpu/drm/ci/testlist-panfrost.txt create mode 100644 drivers/gpu/drm/ci/testlist-v3d.txt create mode 100644 drivers/gpu/drm/ci/testlist-vc4.txt diff --git a/drivers/gpu/drm/ci/gitlab-ci.yml b/drivers/gpu/drm/ci/gitlab-ci.yml index bc8cb3420476..e2b021616a8e 100644 --- a/drivers/gpu/drm/ci/gitlab-ci.yml +++ b/drivers/gpu/drm/ci/gitlab-ci.yml @@ -5,7 +5,7 @@ variables: UPSTREAM_REPO: git://anongit.freedesktop.org/drm/drm TARGET_BRANCH: drm-next - IGT_VERSION: d2af13d9f5be5ce23d996e4afd3e45990f5ab977 + IGT_VERSION: b0cc8160ebdc87ce08b7fd83bb3c99ff7a4d8610 DEQP_RUNNER_GIT_URL: https://gitlab.freedesktop.org/anholt/deqp-runner.git DEQP_RUNNER_GIT_TAG: v0.15.0 diff --git a/drivers/gpu/drm/ci/igt_runner.sh b/drivers/gpu/drm/ci/igt_runner.sh index f001e015d135..2fd09b9b7cf6 100755 --- a/drivers/gpu/drm/ci/igt_runner.sh +++ b/drivers/gpu/drm/ci/igt_runner.sh @@ -64,10 +64,20 @@ if ! grep -q "core_getversion" /install/testlist.txt; then fi set +e +if [ "$DRIVER_NAME" = "amdgpu" ]; then + TEST_LIST="/install/testlist-amdgpu.txt" +elif [ "$DRIVER_NAME" = "msm" ]; then + TEST_LIST="/install/testlist-msm.txt" +elif [ "$DRIVER_NAME" = "panfrost" ]; then + TEST_LIST="/install/testlist-panfrost.txt" +else + TEST_LIST="/install/testlist.txt" +fi + Isn't V3D and VC4 testlists missing? Yes. We need to add ci jobs to test v3d/vc4. The initial idea was just to split the testlist per driver and add vc4/v3d tests so that it can be used in future. I will add the jobs as part of v4. It would be nice if you could provide us a link to a working pipeline. Will provide the working pipeline link in next series. Also, if possible, I would like to be CCed on the next version of this patch, as I have interest in the V3D/VC4 tests. Sure, will do. Ah, one thing: it would be nice to add the testlists to the MAINTAINERS file. This way, maintainers can keep track of any changes. Yes, will add the testlists to MAINTAINERS file. Thanks for the review and feedback. Regards, Vignesh
[PATCH 4/5] amd/amdkfd: get node id for query_utcl2_poison_status
Obtain it from ring entry. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 3 ++- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index 9a06c6fb6605..747cb785a7d3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -367,10 +367,11 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_UTCL2) { struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); struct kfd_hsa_memory_exception_data exception_data; if (client_id == SOC15_IH_CLIENTID_UTCL2 && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { + amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, node_id)) { event_interrupt_poison_consumption(dev, pasid, client_id); return; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 91dd5e045b51..eb94d967c532 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -413,10 +413,11 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_UTCL2) { struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); struct kfd_hsa_memory_exception_data exception_data; if (client_id == SOC15_IH_CLIENTID_UTCL2 && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { + amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, node_id)) { event_interrupt_poison_consumption_v9(dev, pasid, client_id); return; } -- 2.34.1
[PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling
Let kfd interrupt handler process it. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 773725a92cf1..70defc394b7b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, { bool retry_fault = !!(entry->src_data[1] & 0x80); bool write_fault = !!(entry->src_data[1] & 0x20); - uint32_t status = 0, cid = 0, rw = 0; + uint32_t status = 0, cid = 0, rw = 0, fed = 0; struct amdgpu_task_info task_info; struct amdgpu_vmhub *hub; const char *mmhub_cid; @@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, status = RREG32(hub->vm_l2_pro_fault_status); cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID); rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW); + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); + + /* for gfx fed error, kfd will handle it, return directly */ + if (fed && amdgpu_ras_is_poison_mode_supported(adev) && + amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2) && + !strcmp(hub_name, "gfxhub0")) + return 1; + WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); #ifdef HAVE_STRUCT_XARRAY amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub); -- 2.34.1
[PATCH 3/5] drm/amdgpu: retire gfx ras query_utcl2_poison_status
Replace it with related interface in gfxhub functions. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 7 --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h| 1 - drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c| 12 4 files changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index f759a42def59..bb509b26112d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -776,10 +776,11 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, return 0; } -bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev) +bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, + uint32_t node_id) { - if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status) - return adev->gfx.ras->query_utcl2_poison_status(adev); + if (adev->gfxhub.funcs->query_utcl2_poison_status) + return adev->gfxhub.funcs->query_utcl2_poison_status(adev, node_id); else return false; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index b2990470d1c6..ae1e449e5479 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -404,7 +404,8 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p); -bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev); +bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, + uint32_t node_id); int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag, int8_t xcp_id); void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 2af2e28952db..d91f83bd7267 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -268,7 +268,6 @@ struct amdgpu_cu_info { struct amdgpu_gfx_ras { struct amdgpu_ras_block_object ras_block; void (*enable_watchdog_timer)(struct amdgpu_device *adev); - bool (*query_utcl2_poison_status)(struct amdgpu_device *adev); int (*rlc_gc_fed_irq)(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index 0d5b4133fdf7..e3ed568eaacc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -1909,18 +1909,7 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev) mutex_unlock(&adev->grbm_idx_mutex); } -static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev) -{ - u32 status = 0; - struct amdgpu_vmhub *hub; - - hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; - status = RREG32(hub->vm_l2_pro_fault_status); - /* reset page fault status */ - WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); - return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); -} struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops = { .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count, @@ -1934,5 +1923,4 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = { .hw_ops = &gfx_v9_4_2_ras_ops, }, .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer, - .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status, }; -- 2.34.1
[PATCH 2/5] drm/amdgpu: add utcl2 poison query for gfxhub
Implement it for gfxhub 1.0 and 1.2. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h | 2 ++ drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c | 17 + drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 15 +++ 3 files changed, 34 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h index c7b44aeb671b..12b131a9cc42 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h @@ -38,6 +38,8 @@ struct amdgpu_gfxhub_funcs { void (*mode2_save_regs)(struct amdgpu_device *adev); void (*mode2_restore_regs)(struct amdgpu_device *adev); void (*halt)(struct amdgpu_device *adev); + bool (*query_utcl2_poison_status)(struct amdgpu_device *adev, + uint32_t node_id); }; struct amdgpu_gfxhub { diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c index 22175da0e16a..1c14b1665c9f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c @@ -443,6 +443,22 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev) mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32; } +static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev, + uint32_t node_id) +{ + u32 status = 0; + struct amdgpu_vmhub *hub; + + if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2)) + return false; + + hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; + status = RREG32(hub->vm_l2_pro_fault_status); + /* reset page fault status */ + WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); + + return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); +} const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = { .get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset, @@ -452,4 +468,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = { .set_fault_enable_default = gfxhub_v1_0_set_fault_enable_default, .init = gfxhub_v1_0_init, .get_xgmi_info = gfxhub_v1_1_get_xgmi_info, + .query_utcl2_poison_status = gfxhub_v1_0_query_utcl2_poison_status, }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c index 49aecdcee006..ebc96739e1c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c @@ -620,6 +620,20 @@ static int gfxhub_v1_2_get_xgmi_info(struct amdgpu_device *adev) return 0; } +static bool gfxhub_v1_2_query_utcl2_poison_status(struct amdgpu_device *adev, + uint32_t node_id) +{ + u32 fed, status; + + status = RREG32_SOC15(GC, node_id, regVM_L2_PROTECTION_FAULT_STATUS); + fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); + /* reset page fault status */ + WREG32_P(SOC15_REG_OFFSET(GC, node_id, + regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1); + + return fed; +} + const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = { .get_mc_fb_offset = gfxhub_v1_2_get_mc_fb_offset, .setup_vm_pt_regs = gfxhub_v1_2_setup_vm_pt_regs, @@ -628,6 +642,7 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = { .set_fault_enable_default = gfxhub_v1_2_set_fault_enable_default, .init = gfxhub_v1_2_init, .get_xgmi_info = gfxhub_v1_2_get_xgmi_info, + .query_utcl2_poison_status = gfxhub_v1_2_query_utcl2_poison_status, }; static int gfxhub_v1_2_xcp_resume(void *handle, uint32_t inst_mask) -- 2.34.1
[PATCH 1/5] drm/amdgpu: add new bit definitions for GC 9.0 PROTECTION_FAULT_STATUS
Add UCE and FED bit definitions. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h | 4 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h index efc16ddf274a..2dfa0e5b1aa3 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_0_sh_mask.h @@ -6822,6 +6822,8 @@ #define VM_L2_PROTECTION_FAULT_STATUS__VMID__SHIFT 0x14 #define VM_L2_PROTECTION_FAULT_STATUS__VF__SHIFT 0x18 #define VM_L2_PROTECTION_FAULT_STATUS__VFID__SHIFT 0x19 +#define VM_L2_PROTECTION_FAULT_STATUS__UCE__SHIFT 0x1d +#define VM_L2_PROTECTION_FAULT_STATUS__FED__SHIFT 0x1e #define VM_L2_PROTECTION_FAULT_STATUS__MORE_FAULTS_MASK 0x0001L #define VM_L2_PROTECTION_FAULT_STATUS__WALKER_ERROR_MASK 0x000EL #define VM_L2_PROTECTION_FAULT_STATUS__PERMISSION_FAULTS_MASK 0x00F0L @@ -6832,6 +6834,8 @@ #define VM_L2_PROTECTION_FAULT_STATUS__VMID_MASK 0x00F0L #define VM_L2_PROTECTION_FAULT_STATUS__VF_MASK 0x0100L #define VM_L2_PROTECTION_FAULT_STATUS__VFID_MASK 0x1E00L +#define VM_L2_PROTECTION_FAULT_STATUS__UCE_MASK 0x2000L +#define VM_L2_PROTECTION_FAULT_STATUS__FED_MASK 0x4000L //VM_L2_PROTECTION_FAULT_ADDR_LO32 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32__SHIFT 0x0 #define VM_L2_PROTECTION_FAULT_ADDR_LO32__LOGICAL_PAGE_ADDR_LO32_MASK 0xL -- 2.34.1