date:20220314

[PATCH] drm/amdgpu: only allow secure submission on rings which support that

2022-03-14 Thread Lang Yu

Only GFX ring, SDMA ring and VCN decode ring support secure submission
at the moment.

Suggested-by: Christian König 
Signed-off-by: Lang Yu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c   | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c| 1 +
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c   | 1 +
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c   | 1 +
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 4 
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 1 +
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c   | 1 +
 drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c| 1 +
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c| 1 +
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c| 2 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c| 2 ++
 13 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index bc1297dcdf97..d583766ea392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -166,8 +166,8 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned 
num_ibs,
}
 
if ((ib->flags & AMDGPU_IB_FLAGS_SECURE) &&
-   (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)) {
-   dev_err(adev->dev, "secure submissions not supported on compute 
rings\n");
+   (!ring->funcs->secure_submission_supported)) {
+   dev_err(adev->dev, "secure submissions not supported on ring 
<%s>\n", ring->name);
return -EINVAL;
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index a8bed1b47899..5320bb0883d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -155,6 +155,7 @@ struct amdgpu_ring_funcs {
u32 nop;
boolsupport_64bit_ptrs;
boolno_user_fence;
+   boolsecure_submission_supported;
unsignedvmhub;
unsignedextra_dw;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 713d39d89e30..f4c6accd3226 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -9377,6 +9377,7 @@ static const struct amdgpu_ring_funcs 
gfx_v10_0_ring_funcs_gfx = {
.align_mask = 0xff,
.nop = PACKET3(PACKET3_NOP, 0x3FFF),
.support_64bit_ptrs = true,
+   .secure_submission_supported = true,
.vmhub = AMDGPU_GFXHUB_0,
.get_rptr = gfx_v10_0_ring_get_rptr_gfx,
.get_wptr = gfx_v10_0_ring_get_wptr_gfx,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 8def7f630d4c..46d4bf27ebbb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6865,6 +6865,7 @@ static const struct amdgpu_ring_funcs 
gfx_v9_0_ring_funcs_gfx = {
.align_mask = 0xff,
.nop = PACKET3(PACKET3_NOP, 0x3FFF),
.support_64bit_ptrs = true,
+   .secure_submission_supported = true,
.vmhub = AMDGPU_GFXHUB_0,
.get_rptr = gfx_v9_0_ring_get_rptr_gfx,
.get_wptr = gfx_v9_0_ring_get_wptr_gfx,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index 4509bd4cce2d..1d8bbcbd7a37 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -1142,6 +1142,7 @@ static const struct amdgpu_ring_funcs 
sdma_v2_4_ring_funcs = {
.align_mask = 0xf,
.nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP),
.support_64bit_ptrs = false,
+   .secure_submission_supported = true,
.get_rptr = sdma_v2_4_ring_get_rptr,
.get_wptr = sdma_v2_4_ring_get_wptr,
.set_wptr = sdma_v2_4_ring_set_wptr,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 135727b59c41..4ef4feff5649 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -1580,6 +1580,7 @@ static const struct amdgpu_ring_funcs 
sdma_v3_0_ring_funcs = {
.align_mask = 0xf,
.nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP),
.support_64bit_ptrs = false,
+   .secure_submission_supported = true,
.get_rptr = sdma_v3_0_ring_get_rptr,
.get_wptr = sdma_v3_0_ring_get_wptr,
.set_wptr = sdma_v3_0_ring_set_wptr,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 01b385568c14..d7e8f7232364 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -2414,6 +2414,7 @@ static const struct amdgpu_ring_funcs 
sdma_v4_0_ring_funcs = {
.align_mask = 0xf,
.nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP),
.support_64bit_ptrs = true,
+   .secure_submission_supported = true,

[PATCH] drm/amdgpu/vcn: fix vcn ring test failure in igt reload test

2022-03-14 Thread Tianci Yin

From: "Tianci.Yin" 

[why]
On Renoir, vcn ring test failed on the second time insmod in the reload
test. After invetigation, it proves that vcn only can disable dpg under
dpg unpause mode (dpg unpause mode is default for dec only, dpg pause
mode is for dec/enc).

[how]
unpause dpg in dpg stopping procedure.

Change-Id: If6ec3af694e1d6b63ebce386a563f03ca6d291c1
Signed-off-by: Tianci.Yin 
---
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index 319ac8ea434b..6e0972cd1f2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -1098,8 +1098,10 @@ static int vcn_v2_0_start(struct amdgpu_device *adev)
 
 static int vcn_v2_0_stop_dpg_mode(struct amdgpu_device *adev)
 {
+   struct dpg_pause_state state = {.fw_based = VCN_DPG_STATE__UNPAUSE};
uint32_t tmp;
 
+   vcn_v2_0_pause_dpg_mode(adev, 0, );
/* Wait for power status to be 1 */
SOC15_WAIT_ON_RREG(UVD, 0, mmUVD_POWER_STATUS, 1,
UVD_POWER_STATUS__UVD_POWER_STATUS_MASK);
-- 
2.25.1

Re: [PATCH 00/22] drm: Review of mode copies

2022-03-14 Thread Ville Syrjälä

On Fri, Feb 18, 2022 at 12:03:41PM +0200, Ville Syrjala wrote:
>   drm: Add drm_mode_init()
>   drm/bridge: Use drm_mode_copy()
>   drm/imx: Use drm_mode_duplicate()
>   drm/panel: Use drm_mode_duplicate()
>   drm/vc4: Use drm_mode_copy()
These have been pushed to drm-misc-next.

>   drm/amdgpu: Remove pointless on stack mode copies
>   drm/amdgpu: Use drm_mode_init() for on-stack modes
>   drm/amdgpu: Use drm_mode_copy()
amdgpu ones are reviewed, but I'll leave them for the
AMD folks to push to whichever tree they prefer.


The rest are still in need of review:
>   drm/radeon: Use drm_mode_copy()
>   drm/gma500: Use drm_mode_copy()
>   drm/hisilicon: Use drm_mode_init() for on-stack modes
>   drm/msm: Nuke weird on stack mode copy
>   drm/msm: Use drm_mode_init() for on-stack modes
>   drm/msm: Use drm_mode_copy()
>   drm/mtk: Use drm_mode_init() for on-stack modes
>   drm/rockchip: Use drm_mode_copy()
>   drm/sti: Use drm_mode_copy()
>   drm/tilcdc: Use drm_mode_copy()
>   drm/i915: Use drm_mode_init() for on-stack modes
>   drm/i915: Use drm_mode_copy()
>   drm: Use drm_mode_init() for on-stack modes
>   drm: Use drm_mode_copy()

-- 
Ville Syrjälä
Intel

Re: [PATCH 2/5] dyndbg: add class_id field and query support

2022-03-14 Thread Jason Baron




On 3/11/22 20:06, jim.cro...@gmail.com wrote:
> On Fri, Mar 11, 2022 at 12:06 PM Jason Baron  wrote:
>>
>>
>>
>> On 3/10/22 23:47, Jim Cromie wrote:
>>> DRM defines/uses 10 enum drm_debug_category's to create exclusive
>>> classes of debug messages.  To support this directly in dynamic-debug,
>>> add the following:
>>>
>>> - struct _ddebug.class_id:4 - 4 bits is enough
>>> - define _DPRINTK_SITE_UNCLASSED 15 - see below
>>>
>>> and the query support:
>>> - struct _ddebug_query.class_id
>>> - ddebug_parse_query  - looks for "class" keyword, then calls..
>>> - parse_class - accepts uint: 0-15, sets query.class_id and marker
>>> - vpr_info_dq - displays new field
>>> - ddebug_proc_show- append column with "cls:%d" if !UNCLASSED
>>>
>>> With the patch, one can enable current/unclassed callsites by:
>>>
>>>   #> echo drm class 15 +p > /proc/dynamic_debug/control
>>>
>>
>> To me, this is hard to read, what the heck is '15'? I have to go look it
>> up in the control file and it's not descriptive. I think that using
>> classes/categories makes sense but I'm wondering if it can be a bit more
>> user friendly? Perhaps, we can pass an array of strings that is indexed
>> by the class id to each pr_debug() site that wants to use class. So
>> something like:
>>
> 
> Im not at all averse to nice names, but as something added on.
> 
> 1st, the interface to make friendlier is DRM's
> 
> echo 0x04 > /sys/module/drm/parameters/debug   # which category ?
> 
> parm:   debug:Enable debug output, where each bit enables a
> debug category.
> Bit 0 (0x01)  will enable CORE messages (drm core code)
> Bit 1 (0x02)  will enable DRIVER messages (drm controller code)
> Bit 2 (0x04)  will enable KMS messages (modesetting code)
> 
> echo DRM_UT_DRIVER,DRM_UT_KMS > /sys/module/drm/parameters/debug   #
> now its pretty clear
> 
> If that works, its cuz DEFINE_DYNAMIC_DEBUG_CLASSBITS()
> plucks class symbols out of its __VA_ARGS__, and #stringifes them.
> So that macro could then build the 1-per-module static constant string array
> and (only) the callbacks would be able to use it.
> 
> from there, it shouldnt be hard to ref that from the module's ddebug_table,
> so parse_query could validate class args against the module given (or
> fail if none given)
> 
> Speaking strictly, theres a gap:
>echo module * class DRM_UT_KMS +p > control
> is nonsense for * other than drm + drivers,
> but its fair/safe to just disallow wildcards on modname for this purpose.
> 
> it does however imply that module foo must exist for FOO_CAT_1 to be usable.
> thats not currently the case:
> bash-5.1# echo module foo +p > /proc/dynamic_debug/control
> [   15.403749] dyndbg: read 14 bytes from userspace
> [   15.405413] dyndbg: query 0: "module foo +p" mod:*
> [   15.406486] dyndbg: split into words: "module" "foo" "+p"
> [   15.407070] dyndbg: op='+'
> [   15.407388] dyndbg: flags=0x1
> [   15.407809] dyndbg: *flagsp=0x1 *maskp=0x
> [   15.408300] dyndbg: parsed: func="" file="" module="foo" format=""
> lineno=0-0 class=15
> [   15.409151] dyndbg: no matches for query
> [   15.409591] dyndbg: no-match: func="" file="" module="foo"
> format="" lineno=0-0 class=15
> [   15.410524] dyndbg: processed 1 queries, with 0 matches, 0 errs
> bash-5.1#
> 
> ISTM we can keep that "0 errs" response for that case, but still reject this:
> 
>echo module foo class FOO_NOT_HERE +p > /proc/dynamic_debug/control
> 
> 

Ok, yeah so I guess I'm thinking about the 'class' more as global namespace,
so that for example, it could span modules, or be specific to certain modules.
I'm also thinking of it as a 'string' which is maybe hierarchical, so that it's
clear what sub-system, or sub-sub-system it belongs to. So for DRM for example,
it could be a string like "DRM:CORE". The index num I think is still helpful for
implementation so we don't have to store a pointer size, but I don't think it's
really exposed (except perhaps in module params b/c drm is doing that already?).


>> enum levels {
>> LOW,
>> MEDIUM,
>> HIGH
>> };
> 
> I want to steer clear of "level" anything,
> since 2>1 implies non independence of the categories
> 
> 

Agreed, that was a bad example on my part.

I've put together a rough patch on top of your series, to make my thinking
hopefully clear. I also think that the module param callback thing could be
implemented in this 'global' space with the 0-14 low numbers, by adding
some sort of offset into the table. IE you would add the low number +
the offset to get the 'string' to add to the ddebug query. I commented it
out in my patch below b/c I didn't implement that part.

Thoughts?


diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index 664bb83778d2..b0bc1b536d54 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -8,6 +8,14 @@

 #include 

+enum ddebug_categories {
+FOO_BAR = 0,
+DRM_A = 1,
+DRM_B = 2,
+DRM_C =

Re: [PATCH] drm/amdgpu: Add stolen reserved memory for MI25 SRIOV.

2022-03-14 Thread Alex Deucher

On Mon, Mar 14, 2022 at 5:10 PM Yongqiang Sun  wrote:
>
> MI25 SRIOV guest driver loading failed due to allocate
> memory overlaps with private memory area.

maybe instead of "private memory area", say something like "firmware
reserved area".

> Add below change to fix the issue:
> 1. Allocate stolen reserved memory for MI25 SRIOV specifically to avoid
> the memory overlap.
> 2. Move allocate reserve allocation to vbios allocation since both the
> two functions are doing similar asic type check and no need to have
> separate functions.

These could be split into two patches, one to merge
amdgpu_gmc_get_reserved_allocation() into
amdgpu_gmc_get_vbios_allocations(), and one to add the
stolen_reserved_* for vega10.

>
> Signed-off-by: Yongqiang Sun 
> Change-Id: I142127513047a3e81573eb983c510d763b548a24

With the above changes addressed:
Reviewed-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 38 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 -
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  1 -
>  3 files changed, 19 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 7c2a9555b7cc..7e4d298e8df8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -626,6 +626,13 @@ void amdgpu_gmc_get_vbios_allocations(struct 
> amdgpu_device *adev)
>  {
> unsigned size;
>
> +   /*
> +* Some ASICs need to reserve a region of video memory to avoid access
> +* from driver
> +*/
> +   adev->mman.stolen_reserved_offset = 0;
> +   adev->mman.stolen_reserved_size = 0;
> +
> /*
>  * TODO:
>  * Currently there is a bug where some memory client outside
> @@ -636,10 +643,22 @@ void amdgpu_gmc_get_vbios_allocations(struct 
> amdgpu_device *adev)
>  */
> switch (adev->asic_type) {
> case CHIP_VEGA10:
> +   adev->mman.keep_stolen_vga_memory = true;
> +   if (amdgpu_sriov_vf(adev)) {
> +   adev->mman.stolen_reserved_offset = 0x10;
> +   adev->mman.stolen_reserved_size = 0x60;
> +   }
> +   break;
> case CHIP_RAVEN:
> case CHIP_RENOIR:
> adev->mman.keep_stolen_vga_memory = true;
> break;
> +   case CHIP_YELLOW_CARP:
> +   if (amdgpu_discovery == 0) {
> +   adev->mman.stolen_reserved_offset = 0x1ffb;
> +   adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
> +   }
> +   break;
> default:
> adev->mman.keep_stolen_vga_memory = false;
> break;
> @@ -760,25 +779,6 @@ uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device 
> *adev, struct amdgpu_bo *bo
> return amdgpu_bo_gpu_offset(bo) - adev->gmc.vram_start + 
> adev->gmc.aper_base;
>  }
>
> -void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev)
> -{
> -   /* Some ASICs need to reserve a region of video memory to avoid access
> -* from driver */
> -   adev->mman.stolen_reserved_offset = 0;
> -   adev->mman.stolen_reserved_size = 0;
> -
> -   switch (adev->asic_type) {
> -   case CHIP_YELLOW_CARP:
> -   if (amdgpu_discovery == 0) {
> -   adev->mman.stolen_reserved_offset = 0x1ffb;
> -   adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
> -   }
> -   break;
> -   default:
> -   break;
> -   }
> -}
> -
>  int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
>  {
> struct amdgpu_bo *vram_bo = NULL;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 93505bb0a36c..032b0313f277 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -331,7 +331,6 @@ amdgpu_gmc_set_vm_fault_masks(struct amdgpu_device *adev, 
> int hub_type,
>   bool enable);
>
>  void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev);
> -void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev);
>
>  void amdgpu_gmc_init_pdb0(struct amdgpu_device *adev);
>  uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index f60b7bd4dbf5..3c1d440824a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -948,7 +948,6 @@ static int gmc_v10_0_sw_init(void *handle)
> return r;
>
> amdgpu_gmc_get_vbios_allocations(adev);
> -   amdgpu_gmc_get_reserved_allocation(adev);
>
> /* Memory manager */
> r = amdgpu_bo_init(adev);
> --
> 2.25.1
>

[PATCH] drm/amdgpu: Add stolen reserved memory for MI25 SRIOV.

2022-03-14 Thread Yongqiang Sun

MI25 SRIOV guest driver loading failed due to allocate
memory overlaps with private memory area.
Add below change to fix the issue:
1. Allocate stolen reserved memory for MI25 SRIOV specifically to avoid
the memory overlap.
2. Move allocate reserve allocation to vbios allocation since both the
two functions are doing similar asic type check and no need to have
separate functions.

Signed-off-by: Yongqiang Sun 
Change-Id: I142127513047a3e81573eb983c510d763b548a24
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 38 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 -
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  1 -
 3 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 7c2a9555b7cc..7e4d298e8df8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -626,6 +626,13 @@ void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device 
*adev)
 {
unsigned size;
 
+   /*
+* Some ASICs need to reserve a region of video memory to avoid access
+* from driver
+*/
+   adev->mman.stolen_reserved_offset = 0;
+   adev->mman.stolen_reserved_size = 0;
+
/*
 * TODO:
 * Currently there is a bug where some memory client outside
@@ -636,10 +643,22 @@ void amdgpu_gmc_get_vbios_allocations(struct 
amdgpu_device *adev)
 */
switch (adev->asic_type) {
case CHIP_VEGA10:
+   adev->mman.keep_stolen_vga_memory = true;
+   if (amdgpu_sriov_vf(adev)) {
+   adev->mman.stolen_reserved_offset = 0x10;
+   adev->mman.stolen_reserved_size = 0x60;
+   }
+   break;
case CHIP_RAVEN:
case CHIP_RENOIR:
adev->mman.keep_stolen_vga_memory = true;
break;
+   case CHIP_YELLOW_CARP:
+   if (amdgpu_discovery == 0) {
+   adev->mman.stolen_reserved_offset = 0x1ffb;
+   adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
+   }
+   break;
default:
adev->mman.keep_stolen_vga_memory = false;
break;
@@ -760,25 +779,6 @@ uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device 
*adev, struct amdgpu_bo *bo
return amdgpu_bo_gpu_offset(bo) - adev->gmc.vram_start + 
adev->gmc.aper_base;
 }
 
-void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev)
-{
-   /* Some ASICs need to reserve a region of video memory to avoid access
-* from driver */
-   adev->mman.stolen_reserved_offset = 0;
-   adev->mman.stolen_reserved_size = 0;
-
-   switch (adev->asic_type) {
-   case CHIP_YELLOW_CARP:
-   if (amdgpu_discovery == 0) {
-   adev->mman.stolen_reserved_offset = 0x1ffb;
-   adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
-   }
-   break;
-   default:
-   break;
-   }
-}
-
 int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
 {
struct amdgpu_bo *vram_bo = NULL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 93505bb0a36c..032b0313f277 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -331,7 +331,6 @@ amdgpu_gmc_set_vm_fault_masks(struct amdgpu_device *adev, 
int hub_type,
  bool enable);
 
 void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev);
-void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev);
 
 void amdgpu_gmc_init_pdb0(struct amdgpu_device *adev);
 uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index f60b7bd4dbf5..3c1d440824a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -948,7 +948,6 @@ static int gmc_v10_0_sw_init(void *handle)
return r;
 
amdgpu_gmc_get_vbios_allocations(adev);
-   amdgpu_gmc_get_reserved_allocation(adev);
 
/* Memory manager */
r = amdgpu_bo_init(adev);
-- 
2.25.1

Re: [PATCH 1/2] drm/amdgpu: Use fw vram offset when allocating stolen vga memory.

2022-03-14 Thread Alex Deucher

On Mon, Mar 14, 2022 at 3:35 PM Christian König
 wrote:
>
> Am 14.03.22 um 19:54 schrieb Yongqiang Sun:
> > [Why]
> > Memory access violation will happen in case of allocate stolen vga
> > memory with size isn't 0.
> >
> > [How]
> > when allocating stolen vga memory, use fw vram offset as the start point
> > instead of hard code value 0.
>
> Please stop separating commit message into [Why] and [How], that is not
> well received everywhere.
>
> Apart from that the patch is a certain NAK, you are messing things quite
> up here.
>
> >
> > Signed-off-by: Yongqiang Sun 
> > Change-Id: I7c555a6c1fb4b3fa8685753b4bdcbe215f89ea1e
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 31 +++--
> >   1 file changed, 24 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > index 41d6f604813d..1f635fdb0395 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > @@ -1708,6 +1708,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
> >   uint64_t gtt_size;
> >   int r;
> >   u64 vis_vram_limit;
> > + u64 memory_offset = adev->mman.fw_vram_usage_start_offset + 
> > adev->mman.fw_vram_usage_size;
> >
> >   mutex_init(>mman.gtt_window_lock);
> >
> > @@ -1774,24 +1775,40 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
> >* This is used for VGA emulation and pre-OS scanout buffers to
> >* avoid display artifacts while transitioning between pre-OS
> >* and driver.  */
> > - r = amdgpu_bo_create_kernel_at(adev, 0, adev->mman.stolen_vga_size,
> > + r = amdgpu_bo_create_kernel_at(adev,
> > +memory_offset,
> > +adev->mman.stolen_vga_size,
>
> That is certainly incorrect. See function amdgpu_ttm_fw_reserve_vram_init().
>
> The stolen VGA buffer always started at offset 0 and is independent of
> the range defined by fw_vram_usage_start_offset and fw_vram_usage_size.
>

Yeah these are separate ranges.  I just had a chat with Yongqiang and
there was some confusion around this, but this patch can be dropped.

Alex


> Regards,
> Christian.
>
> >  AMDGPU_GEM_DOMAIN_VRAM,
> >  >mman.stolen_vga_memory,
> >  NULL);
> >   if (r)
> >   return r;
> > - r = amdgpu_bo_create_kernel_at(adev, adev->mman.stolen_vga_size,
> > +
> > + memory_offset += adev->mman.stolen_vga_size;
> > +
> > + r = amdgpu_bo_create_kernel_at(adev,
> > +memory_offset,
> >  adev->mman.stolen_extended_size,
> >  AMDGPU_GEM_DOMAIN_VRAM,
> >  >mman.stolen_extended_memory,
> >  NULL);
> >   if (r)
> >   return r;
> > - r = amdgpu_bo_create_kernel_at(adev, 
> > adev->mman.stolen_reserved_offset,
> > -adev->mman.stolen_reserved_size,
> > -AMDGPU_GEM_DOMAIN_VRAM,
> > ->mman.stolen_reserved_memory,
> > -NULL);
> > +
> > + memory_offset += adev->mman.stolen_extended_size;
> > +
> > + if (adev->mman.stolen_reserved_offset > memory_offset)
> > + r = amdgpu_bo_create_kernel_at(adev, 
> > adev->mman.stolen_reserved_offset,
> > +
> > adev->mman.stolen_reserved_size,
> > +AMDGPU_GEM_DOMAIN_VRAM,
> > +
> > >mman.stolen_reserved_memory,
> > +NULL);
> > + else if (adev->mman.stolen_reserved_offset + 
> > adev->mman.stolen_reserved_size > memory_offset)
> > + r = amdgpu_bo_create_kernel_at(adev, memory_offset,
> > + 
> > adev->mman.stolen_reserved_offset + adev->mman.stolen_reserved_size - 
> > memory_offset,
> > +AMDGPU_GEM_DOMAIN_VRAM,
> > +
> > >mman.stolen_reserved_memory,
> > +NULL);
> >   if (r)
> >   return r;
> >
>

Re: [PATCH 2/2] drm/amdgpu: Add stolen reserved memory for MI25 SRIOV.

2022-03-14 Thread Alex Deucher

On Mon, Mar 14, 2022 at 3:41 PM Christian König
 wrote:
>
> Am 14.03.22 um 19:54 schrieb Yongqiang Sun:
> > [Why]
> > MI25 SRIOV guest driver loading failed due to allocate
> > memory overlaps with private memory area.
> >
> > [How]
> > 1. Allocate stolen reserved memory for MI25 SRIOV specifically to avoid
> > the memory overlap.
> > 2. Move allocate reserve allocation to vbios allocation since both the
> > two functions are doing similar asic type check and no need to have
> > seperate functions.
> >
> > Signed-off-by: Yongqiang Sun 
> > Change-Id: I142127513047a3e81573eb983c510d763b548a24
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 37 -
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 -
> >   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  1 -
> >   3 files changed, 18 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> > index 7c2a9555b7cc..f7f4f00dd2b2 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> > @@ -626,6 +626,11 @@ void amdgpu_gmc_get_vbios_allocations(struct 
> > amdgpu_device *adev)
> >   {
> >   unsigned size;
> >
> > + /* Some ASICs need to reserve a region of video memory to avoid access
> > +  * from driver */
> > + adev->mman.stolen_reserved_offset = 0;
> > + adev->mman.stolen_reserved_size = 0;
> > +
> >   /*
> >* TODO:
> >* Currently there is a bug where some memory client outside
> > @@ -635,11 +640,24 @@ void amdgpu_gmc_get_vbios_allocations(struct 
> > amdgpu_device *adev)
> >* Keep the stolen memory reservation until the while this is not 
> > solved.
> >*/
> >   switch (adev->asic_type) {
> > +
> >   case CHIP_VEGA10:
>
> Please don't add empty lines between switch and case. Good practice is
> to check your patches with checkpatch.pl before sending it out.
>
> > + adev->mman.keep_stolen_vga_memory = true;
> > + if (amdgpu_sriov_vf(adev)) {
> > + adev->mman.stolen_reserved_offset = 0x10;
> > + adev->mman.stolen_reserved_size = 0x60;
> > + }
> > + break;
> >   case CHIP_RAVEN:
> >   case CHIP_RENOIR:
> >   adev->mman.keep_stolen_vga_memory = true;
> >   break;
> > + case CHIP_YELLOW_CARP:
> > + if (amdgpu_discovery == 0) {
> > + adev->mman.stolen_reserved_offset = 0x1ffb;
> > + adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
> > + }
> > + break;
>
> That looks like this is somehow mixed up. The stolen memory is for VGA
> emulation, but under SRIOV we should not have VGA emulation as far as I
> know.
>
> Alex, what's going on here?

I suggested calling amdgpu_gmc_get_reserved_allocation() from
amdgpu_gmc_get_vbios_allocations() rather calling
amdgpu_gmc_get_reserved_allocation() in every gmc file since it
handles additional reserved areas used allocated by firmware, etc.  My
understanding is that there is some additional reserved area set up in
the hypervisor that we want reserved in the guest.  The idea was to
just use stolen_reserved_offset for that similar to what we do for the
bring up case for yellow carp.

Alex


>
> Regards,
> Christian.
>
> >   default:
> >   adev->mman.keep_stolen_vga_memory = false;
> >   break;
> > @@ -760,25 +778,6 @@ uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device 
> > *adev, struct amdgpu_bo *bo
> >   return amdgpu_bo_gpu_offset(bo) - adev->gmc.vram_start + 
> > adev->gmc.aper_base;
> >   }
> >
> > -void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev)
> > -{
> > - /* Some ASICs need to reserve a region of video memory to avoid access
> > -  * from driver */
> > - adev->mman.stolen_reserved_offset = 0;
> > - adev->mman.stolen_reserved_size = 0;
> > -
> > - switch (adev->asic_type) {
> > - case CHIP_YELLOW_CARP:
> > - if (amdgpu_discovery == 0) {
> > - adev->mman.stolen_reserved_offset = 0x1ffb;
> > - adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
> > - }
> > - break;
> > - default:
> > - break;
> > - }
> > -}
> > -
> >   int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
> >   {
> >   struct amdgpu_bo *vram_bo = NULL;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> > index 93505bb0a36c..032b0313f277 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> > @@ -331,7 +331,6 @@ amdgpu_gmc_set_vm_fault_masks(struct amdgpu_device 
> > *adev, int hub_type,
> > bool enable);
> >
> >   void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev);
> > -void

Re: [PATCH 00/30] fix typos in comments

2022-03-14 Thread patchwork-bot+netdevbpf

Hello:

This series was applied to netdev/net-next.git (master)
by Jakub Kicinski :

On Mon, 14 Mar 2022 12:53:24 +0100 you wrote:
> Various spelling mistakes in comments.
> Detected with the help of Coccinelle.
> 
> ---
> 
>  drivers/base/devres.c   |4 ++--
>  drivers/clk/qcom/gcc-sm6125.c   |2 +-
>  drivers/clk/ti/clkctrl.c|2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |4 ++--
>  drivers/gpu/drm/amd/display/dc/bios/command_table.c |6 +++---
>  drivers/gpu/drm/amd/pm/amdgpu_pm.c  |2 +-
>  drivers/gpu/drm/bridge/analogix/analogix_dp_core.c  |4 ++--
>  drivers/gpu/drm/sti/sti_gdp.c   |2 +-
>  drivers/infiniband/hw/qib/qib_iba7220.c |4 ++--
>  drivers/leds/leds-pca963x.c |2 +-
>  drivers/media/i2c/ov5695.c  |2 +-
>  drivers/mfd/rohm-bd9576.c   |2 +-
>  drivers/mtd/ubi/block.c |2 +-
>  drivers/net/can/usb/ucan.c  |4 ++--
>  drivers/net/ethernet/packetengines/yellowfin.c  |2 +-
>  drivers/net/wireless/ath/ath6kl/htc_mbox.c  |2 +-
>  drivers/net/wireless/cisco/airo.c   |2 +-
>  drivers/net/wireless/mediatek/mt76/mt7915/init.c|2 +-
>  drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c |6 +++---
>  drivers/platform/x86/uv_sysfs.c |2 +-
>  drivers/s390/crypto/pkey_api.c  |2 +-
>  drivers/scsi/aic7xxx/aicasm/aicasm.c|2 +-
>  drivers/scsi/elx/libefc_sli/sli4.c  |2 +-
>  drivers/scsi/lpfc/lpfc_mbox.c   |2 +-
>  drivers/scsi/qla2xxx/qla_gs.c   |2 +-
>  drivers/spi/spi-sun4i.c |2 +-
>  drivers/staging/rtl8723bs/core/rtw_mlme.c   |2 +-
>  drivers/usb/gadget/udc/snps_udc_core.c  |2 +-
>  fs/kernfs/file.c|2 +-
>  kernel/events/core.c|2 +-
>  30 files changed, 39 insertions(+), 39 deletions(-)

Here is the summary with links:
  - [03/30] ath6kl: fix typos in comments
(no matching commit)
  - [10/30] mt76: mt7915: fix typos in comments
(no matching commit)
  - [12/30] drivers: net: packetengines: fix typos in comments
https://git.kernel.org/netdev/net-next/c/ebc0b8b5374e
  - [19/30] rtlwifi: rtl8821ae: fix typos in comments
(no matching commit)
  - [20/30] airo: fix typos in comments
(no matching commit)
  - [27/30] can: ucan: fix typos in comments
(no matching commit)

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html

Re: [PATCH 1/1] drm/amdkfd: evict svm bo worker handle error

2022-03-14 Thread Felix Kuehling


Am 2022-03-14 um 10:50 schrieb Philip Yang:

Migrate vram to ram may fail to find the vma if process is exiting
and vma is removed, evict svm bo worker sets prange->svm_bo to NULL
and warn svm_bo ref count != 1 only if migrating vram to ram
successfully.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 27 +++-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 20 ++
  2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 7f689094be5a..c8aef2fe459b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -638,6 +638,22 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
return r;
  }
  
+/**

+ * svm_migrate_vma_to_ram - migrate range inside one vma from device to system
+ *
+ * @adev: amdgpu device to migrate from
+ * @prange: svm range structure
+ * @vma: vm_area_struct that range [start, end] belongs to
+ * @start: range start virtual address in pages
+ * @end: range end virtual address in pages
+ *
+ * Context: Process context, caller hold mmap read lock, svms lock, prange lock


I believe this is not quite correct. In svm_range_evict_svm_bo_worker 
we're not holding the smvs->lock.


Regards,
  Felix



+ *
+ * Return:
+ *   0 - success with all pages migrated
+ *   negative values - indicate error
+ *   positive values - partial migration, number of pages not migrated
+ */
  static long
  svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
   struct vm_area_struct *vma, uint64_t start, uint64_t end)
@@ -709,8 +725,6 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
pdd = svm_range_get_pdd_by_adev(prange, adev);
if (pdd)
WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);
-
-   return upages;
}
return r ? r : upages;
  }
@@ -759,13 +773,16 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, 
struct mm_struct *mm)
unsigned long next;
  
  		vma = find_vma(mm, addr);

-   if (!vma || addr < vma->vm_start)
+   if (!vma || addr < vma->vm_start) {
+   pr_debug("failed to find vma for prange %p\n", prange);
+   r = -EFAULT;
break;
+   }
  
  		next = min(vma->vm_end, end);

r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next);
if (r < 0) {
-   pr_debug("failed %ld to migrate\n", r);
+   pr_debug("failed %ld to migrate prange %p\n", r, 
prange);
break;
} else {
upages += r;
@@ -773,7 +790,7 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, 
struct mm_struct *mm)
addr = next;
}
  
-	if (!upages) {

+   if (r >= 0 && !upages) {
svm_range_vram_node_free(prange);
prange->actual_loc = 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 509d915cbe69..215943424c06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3155,6 +3155,7 @@ static void svm_range_evict_svm_bo_worker(struct 
work_struct *work)
struct svm_range_bo *svm_bo;
struct kfd_process *p;
struct mm_struct *mm;
+   int r = 0;
  
  	svm_bo = container_of(work, struct svm_range_bo, eviction_work);

if (!svm_bo_ref_unless_zero(svm_bo))
@@ -3170,7 +3171,7 @@ static void svm_range_evict_svm_bo_worker(struct 
work_struct *work)
  
  	mmap_read_lock(mm);

spin_lock(_bo->list_lock);
-   while (!list_empty(_bo->range_list)) {
+   while (!list_empty(_bo->range_list) && !r) {
struct svm_range *prange =
list_first_entry(_bo->range_list,
struct svm_range, svm_bo_list);
@@ -3184,15 +3185,15 @@ static void svm_range_evict_svm_bo_worker(struct 
work_struct *work)
  
  		mutex_lock(>migrate_mutex);

do {
-   svm_migrate_vram_to_ram(prange,
+   r = svm_migrate_vram_to_ram(prange,
svm_bo->eviction_fence->mm);
-   } while (prange->actual_loc && --retries);
-   WARN(prange->actual_loc, "Migration failed during eviction");
-
-   mutex_lock(>lock);
-   prange->svm_bo = NULL;
-   mutex_unlock(>lock);
+   } while (!r && prange->actual_loc && --retries);
  
+		if (!prange->actual_loc) {

+   mutex_lock(>lock);
+   prange->svm_bo = NULL;
+   mutex_unlock(>lock);
+   }

Re: [PATCH 1/1] drm/amdkfd: evict svm bo worker handle error

2022-03-14 Thread Felix Kuehling


Am 2022-03-14 um 10:50 schrieb Philip Yang:

Migrate vram to ram may fail to find the vma if process is exiting
and vma is removed, evict svm bo worker sets prange->svm_bo to NULL
and warn svm_bo ref count != 1 only if migrating vram to ram
successfully.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 27 +++-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 20 ++
  2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 7f689094be5a..c8aef2fe459b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -638,6 +638,22 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
return r;
  }
  
+/**

+ * svm_migrate_vma_to_ram - migrate range inside one vma from device to system
+ *
+ * @adev: amdgpu device to migrate from
+ * @prange: svm range structure
+ * @vma: vm_area_struct that range [start, end] belongs to
+ * @start: range start virtual address in pages
+ * @end: range end virtual address in pages
+ *
+ * Context: Process context, caller hold mmap read lock, svms lock, prange lock
+ *
+ * Return:
+ *   0 - success with all pages migrated
+ *   negative values - indicate error
+ *   positive values - partial migration, number of pages not migrated
+ */
  static long
  svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
   struct vm_area_struct *vma, uint64_t start, uint64_t end)
@@ -709,8 +725,6 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
pdd = svm_range_get_pdd_by_adev(prange, adev);
if (pdd)
WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);
-
-   return upages;
}
return r ? r : upages;
  }
@@ -759,13 +773,16 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, 
struct mm_struct *mm)
unsigned long next;
  
  		vma = find_vma(mm, addr);

-   if (!vma || addr < vma->vm_start)
+   if (!vma || addr < vma->vm_start) {
+   pr_debug("failed to find vma for prange %p\n", prange);
+   r = -EFAULT;
break;
+   }
  
  		next = min(vma->vm_end, end);

r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next);
if (r < 0) {
-   pr_debug("failed %ld to migrate\n", r);
+   pr_debug("failed %ld to migrate prange %p\n", r, 
prange);
break;
} else {
upages += r;
@@ -773,7 +790,7 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, 
struct mm_struct *mm)
addr = next;
}
  
-	if (!upages) {

+   if (r >= 0 && !upages) {
svm_range_vram_node_free(prange);
prange->actual_loc = 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 509d915cbe69..215943424c06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3155,6 +3155,7 @@ static void svm_range_evict_svm_bo_worker(struct 
work_struct *work)
struct svm_range_bo *svm_bo;
struct kfd_process *p;
struct mm_struct *mm;
+   int r = 0;
  
  	svm_bo = container_of(work, struct svm_range_bo, eviction_work);

if (!svm_bo_ref_unless_zero(svm_bo))
@@ -3170,7 +3171,7 @@ static void svm_range_evict_svm_bo_worker(struct 
work_struct *work)
  
  	mmap_read_lock(mm);

spin_lock(_bo->list_lock);
-   while (!list_empty(_bo->range_list)) {
+   while (!list_empty(_bo->range_list) && !r) {
struct svm_range *prange =
list_first_entry(_bo->range_list,
struct svm_range, svm_bo_list);
@@ -3184,15 +3185,15 @@ static void svm_range_evict_svm_bo_worker(struct 
work_struct *work)
  
  		mutex_lock(>migrate_mutex);

do {
-   svm_migrate_vram_to_ram(prange,
+   r = svm_migrate_vram_to_ram(prange,
svm_bo->eviction_fence->mm);
-   } while (prange->actual_loc && --retries);
-   WARN(prange->actual_loc, "Migration failed during eviction");
-
-   mutex_lock(>lock);
-   prange->svm_bo = NULL;
-   mutex_unlock(>lock);
+   } while (!r && prange->actual_loc && --retries);


I think it would still be good to have a WARN here for other cases than 
process termination. Is there a way to distinguish the process 
termination case from the error code? Maybe we could even avoid the 
retry in this case.


Other than that, this patch is a good improvement on the error handling.

Regards,

Re: [PATCH 2/2] drm/amdgpu: Add stolen reserved memory for MI25 SRIOV.

2022-03-14 Thread Christian König


Am 14.03.22 um 19:54 schrieb Yongqiang Sun:

[Why]
MI25 SRIOV guest driver loading failed due to allocate
memory overlaps with private memory area.

[How]
1. Allocate stolen reserved memory for MI25 SRIOV specifically to avoid
the memory overlap.
2. Move allocate reserve allocation to vbios allocation since both the
two functions are doing similar asic type check and no need to have
seperate functions.

Signed-off-by: Yongqiang Sun 
Change-Id: I142127513047a3e81573eb983c510d763b548a24
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 37 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 -
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  1 -
  3 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 7c2a9555b7cc..f7f4f00dd2b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -626,6 +626,11 @@ void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device 
*adev)
  {
unsigned size;
  
+	/* Some ASICs need to reserve a region of video memory to avoid access

+* from driver */
+   adev->mman.stolen_reserved_offset = 0;
+   adev->mman.stolen_reserved_size = 0;
+
/*
 * TODO:
 * Currently there is a bug where some memory client outside
@@ -635,11 +640,24 @@ void amdgpu_gmc_get_vbios_allocations(struct 
amdgpu_device *adev)
 * Keep the stolen memory reservation until the while this is not 
solved.
 */
switch (adev->asic_type) {
+
case CHIP_VEGA10:


Please don't add empty lines between switch and case. Good practice is 
to check your patches with checkpatch.pl before sending it out.



+   adev->mman.keep_stolen_vga_memory = true;
+   if (amdgpu_sriov_vf(adev)) {
+   adev->mman.stolen_reserved_offset = 0x10;
+   adev->mman.stolen_reserved_size = 0x60;
+   }
+   break;
case CHIP_RAVEN:
case CHIP_RENOIR:
adev->mman.keep_stolen_vga_memory = true;
break;
+   case CHIP_YELLOW_CARP:
+   if (amdgpu_discovery == 0) {
+   adev->mman.stolen_reserved_offset = 0x1ffb;
+   adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
+   }
+   break;


That looks like this is somehow mixed up. The stolen memory is for VGA 
emulation, but under SRIOV we should not have VGA emulation as far as I 
know.


Alex, what's going on here?

Regards,
Christian.


default:
adev->mman.keep_stolen_vga_memory = false;
break;
@@ -760,25 +778,6 @@ uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device 
*adev, struct amdgpu_bo *bo
return amdgpu_bo_gpu_offset(bo) - adev->gmc.vram_start + 
adev->gmc.aper_base;
  }
  
-void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev)

-{
-   /* Some ASICs need to reserve a region of video memory to avoid access
-* from driver */
-   adev->mman.stolen_reserved_offset = 0;
-   adev->mman.stolen_reserved_size = 0;
-
-   switch (adev->asic_type) {
-   case CHIP_YELLOW_CARP:
-   if (amdgpu_discovery == 0) {
-   adev->mman.stolen_reserved_offset = 0x1ffb;
-   adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
-   }
-   break;
-   default:
-   break;
-   }
-}
-
  int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
  {
struct amdgpu_bo *vram_bo = NULL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 93505bb0a36c..032b0313f277 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -331,7 +331,6 @@ amdgpu_gmc_set_vm_fault_masks(struct amdgpu_device *adev, 
int hub_type,
  bool enable);
  
  void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev);

-void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev);
  
  void amdgpu_gmc_init_pdb0(struct amdgpu_device *adev);

  uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index f60b7bd4dbf5..3c1d440824a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -948,7 +948,6 @@ static int gmc_v10_0_sw_init(void *handle)
return r;
  
  	amdgpu_gmc_get_vbios_allocations(adev);

-   amdgpu_gmc_get_reserved_allocation(adev);
  
  	/* Memory manager */

r = amdgpu_bo_init(adev);

[PATCH] drm: Fix a infinite loop condition when order becomes 0

2022-03-14 Thread Arunpravin

handle a situation in the condition order-- == min_order,
when order = 0, leading to order = -1, it now won't exit
the loop. To avoid this problem, added a order check in
the same condition, (i.e) when order is 0, we return
-ENOSPC

Signed-off-by: Arunpravin 
---
 drivers/gpu/drm/drm_buddy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 72f52f293249..5ab66aaf2bbd 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -685,7 +685,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
if (!IS_ERR(block))
break;
 
-   if (order-- == min_order) {
+   if (!order || order-- == min_order) {
err = -ENOSPC;
goto err_free;
}
-- 
2.25.1

Re: [PATCH 1/2] drm/amdgpu: Use fw vram offset when allocating stolen vga memory.

2022-03-14 Thread Christian König


Am 14.03.22 um 19:54 schrieb Yongqiang Sun:

[Why]
Memory access violation will happen in case of allocate stolen vga
memory with size isn't 0.

[How]
when allocating stolen vga memory, use fw vram offset as the start point
instead of hard code value 0.


Please stop separating commit message into [Why] and [How], that is not 
well received everywhere.


Apart from that the patch is a certain NAK, you are messing things quite 
up here.




Signed-off-by: Yongqiang Sun 
Change-Id: I7c555a6c1fb4b3fa8685753b4bdcbe215f89ea1e
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 31 +++--
  1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 41d6f604813d..1f635fdb0395 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1708,6 +1708,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
uint64_t gtt_size;
int r;
u64 vis_vram_limit;
+   u64 memory_offset = adev->mman.fw_vram_usage_start_offset + 
adev->mman.fw_vram_usage_size;
  
  	mutex_init(>mman.gtt_window_lock);
  
@@ -1774,24 +1775,40 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)

 * This is used for VGA emulation and pre-OS scanout buffers to
 * avoid display artifacts while transitioning between pre-OS
 * and driver.  */
-   r = amdgpu_bo_create_kernel_at(adev, 0, adev->mman.stolen_vga_size,
+   r = amdgpu_bo_create_kernel_at(adev,
+  memory_offset,
+  adev->mman.stolen_vga_size,


That is certainly incorrect. See function amdgpu_ttm_fw_reserve_vram_init().

The stolen VGA buffer always started at offset 0 and is independent of 
the range defined by fw_vram_usage_start_offset and fw_vram_usage_size.


Regards,
Christian.


   AMDGPU_GEM_DOMAIN_VRAM,
   >mman.stolen_vga_memory,
   NULL);
if (r)
return r;
-   r = amdgpu_bo_create_kernel_at(adev, adev->mman.stolen_vga_size,
+
+   memory_offset += adev->mman.stolen_vga_size;
+
+   r = amdgpu_bo_create_kernel_at(adev,
+  memory_offset,
   adev->mman.stolen_extended_size,
   AMDGPU_GEM_DOMAIN_VRAM,
   >mman.stolen_extended_memory,
   NULL);
if (r)
return r;
-   r = amdgpu_bo_create_kernel_at(adev, adev->mman.stolen_reserved_offset,
-  adev->mman.stolen_reserved_size,
-  AMDGPU_GEM_DOMAIN_VRAM,
-  >mman.stolen_reserved_memory,
-  NULL);
+
+   memory_offset += adev->mman.stolen_extended_size;
+
+   if (adev->mman.stolen_reserved_offset > memory_offset)
+   r = amdgpu_bo_create_kernel_at(adev, 
adev->mman.stolen_reserved_offset,
+  
adev->mman.stolen_reserved_size,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  
>mman.stolen_reserved_memory,
+  NULL);
+   else if (adev->mman.stolen_reserved_offset + 
adev->mman.stolen_reserved_size > memory_offset)
+   r = amdgpu_bo_create_kernel_at(adev, memory_offset,
+   adev->mman.stolen_reserved_offset 
+ adev->mman.stolen_reserved_size - memory_offset,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  
>mman.stolen_reserved_memory,
+  NULL);
if (r)
return r;

[PATCH] drm/i915: round_up the size to the alignment value

2022-03-14 Thread Arunpravin

handle instances when size is not aligned with the min_page_size.
Unigine Heaven has allocation requests for example required pages
are 161 and alignment request is 128. To allocate the left over
33 pages, continues the iteration to find the order value which
is 5 and 0 and when it compares with min_order = 7, triggers the
BUG_ON((order < min_order). To avoid this problem, round_up the
size to the alignment and enable the is_contiguous variable and
the block trimmed to the original size.

Signed-off-by: Arunpravin 
---
 drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c 
b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c
index 129f668f21ff..318aa731de5b 100644
--- a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c
+++ b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c
@@ -40,6 +40,7 @@ static int i915_ttm_buddy_man_alloc(struct 
ttm_resource_manager *man,
struct i915_ttm_buddy_resource *bman_res;
struct drm_buddy *mm = >mm;
unsigned long n_pages, lpfn;
+   bool is_contiguous = 0;
u64 min_page_size;
u64 size;
int err;
@@ -48,6 +49,9 @@ static int i915_ttm_buddy_man_alloc(struct 
ttm_resource_manager *man,
if (!lpfn)
lpfn = man->size;
 
+   if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+   is_contiguous = 1;
+
bman_res = kzalloc(sizeof(*bman_res), GFP_KERNEL);
if (!bman_res)
return -ENOMEM;
@@ -71,7 +75,12 @@ static int i915_ttm_buddy_man_alloc(struct 
ttm_resource_manager *man,
 
GEM_BUG_ON(min_page_size < mm->chunk_size);
 
-   if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+   if (!is_contiguous && !IS_ALIGNED(size, min_page_size)) {
+   size = round_up(size, min_page_size);
+   is_contiguous = 1;
+   }
+
+   if (is_contiguous) {
unsigned long pages;
 
size = roundup_pow_of_two(size);
@@ -106,7 +115,7 @@ static int i915_ttm_buddy_man_alloc(struct 
ttm_resource_manager *man,
if (unlikely(err))
goto err_free_blocks;
 
-   if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+   if (is_contiguous) {
u64 original_size = (u64)bman_res->base.num_pages << PAGE_SHIFT;
 
mutex_lock(>lock);

base-commit: b37605de46fef48555bf0cf463cccf355c51fac0
-- 
2.25.1

Re: [PATCH] drm: remove min_order BUG_ON check

2022-03-14 Thread Arunpravin




On 10/03/22 8:59 pm, Matthew Auld wrote:
> On 10/03/2022 14:47, Arunpravin wrote:
>>
>>
>> On 08/03/22 10:31 pm, Matthew Auld wrote:
>>> On 08/03/2022 13:59, Arunpravin wrote:


 On 07/03/22 10:11 pm, Matthew Auld wrote:
> On 07/03/2022 14:37, Arunpravin wrote:
>> place BUG_ON(order < min_order) outside do..while
>> loop as it fails Unigine Heaven benchmark.
>>
>> Unigine Heaven has buffer allocation requests for
>> example required pages are 161 and alignment request
>> is 128. To allocate the remaining 33 pages, continues
>> the iteration to find the order value which is 5 and
>> when it compares with min_order = 7, enables the
>> BUG_ON(). To avoid this problem, placed the BUG_ON
>> check outside of do..while loop.
>>
>> Signed-off-by: Arunpravin 
>> ---
>> drivers/gpu/drm/drm_buddy.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
>> index 72f52f293249..ed94c56b720f 100644
>> --- a/drivers/gpu/drm/drm_buddy.c
>> +++ b/drivers/gpu/drm/drm_buddy.c
>> @@ -669,10 +669,11 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
>>  order = fls(pages) - 1;
>>  min_order = ilog2(min_page_size) - ilog2(mm->chunk_size);
>> 
>> +BUG_ON(order < min_order);
>
> Isn't the issue that we are allowing a size that is not aligned to the
> requested min_page_size? Should we not fix the caller(and throw a normal
> error here), or perhaps add the round_up() here instead?
>
 CASE 1:
 when size is not aligned to the requested min_page_size, for instance,
 required size = 161 pages, min_page_size = 128 pages, here we have 3
 possible options,
 a. AFAIK,This kind of situation is common in any workload,the first
 allocation (i.e) 128 pages is aligned to min_page_size, Should we just
 allocate the left over 33 pages (2 pow 5, 2 pow 0) since the caller does
 know the left over pages are not in min_page_size alignment?
>>>
>>> So IIUC looking at amdgpu_gem_create_ioctl(), userspace can specify some
>>> arbitrary physical alignment for an object? Is that not meant to apply
>>> to every page/chunk? The above example would only have the correct
>>> physical alignment guaranteed for the first chunk, or so, is this the
>>> expected ABI behaviour?
>>>
>> I gone through the function amdgpu_gem_create_ioctl(), it reads the
>> physical alignment in bytes from userspace, does i915 round up the size
>> value to the alignment or does i915 fails the allocation request if size
>> is not aligned with min_page_size? If not, I think running unigine
>> heaven or similar benchmark triggers BUG_ON() on current version of drm
>> buddy
> 
> i915 will always round_up the obj->base.size as per the 
> default_page_size. But in our case the default_page_size is selected by 
> the kernel, which is always either PAGE_SIZE, or 64K on some platforms, 
> due to the HW having some minimum GPU page-size for mapping VRAM pages. 
> We don't currently have anything similar to 
> amdgpu_gem_create_in.alignment, where userspace can request some 
> arbitrary physical alignment.
> 
>>> Also looking at this some more, the other related bug here is the
>>> order-- == min_order check, since it now won't bail when order == 0,
>>> leading to order = -1, if we are unlucky...
>> will add a fix
>>>
>>> Originally, if asking for min_page_size > chunk_size, then the
>>> allocation was meant to fail if it can't fill the resource request with
>>> pages of at least that size(and also alignment). Or at least that was
>>> the original meaning in i915 IIRC.
>> we can follow the same here too, failing the allocation request if size
>> is not aligned with min_page_size?
> 
> Yeah, seems reasonable to me.
I had internal discussion with Christian and he suggested to round_up
the size to the alignment and trim the block to the required original
size. I have sent the patch, please review.

Thanks,
Arun
> 
>>
>> I added a debug print for requested num_pages from userspace and its
>> alignment request and executed unigine heaven, I see many such instances
>> where min_page_size is not aligned to the size, how i915 handles such
>> requests?
>>>

 b. There are many such instances in unigine heaven workload (there would
 be many such workloads), throwing a normal error would lower the FPS? is
 it possible to fix at caller application?

 c. adding the round_up() is possible, but in every such instances we end
 up allocating extra unused memory. For example, if required pages = 1028
 and min_page_size = 1024 pages, we end up round up of left over 4 pages
 to the min_page_size, so the total size would be 2048 pages.

> i.e if someone does:
>
> alloc_blocks(mm, 0, end, 4096, 1<<16, , flags);
 CASE 2:
 I think this case should be detected (i.e) when

[PATCH 2/2] drm/amdgpu: Add stolen reserved memory for MI25 SRIOV.

2022-03-14 Thread Yongqiang Sun

[Why]
MI25 SRIOV guest driver loading failed due to allocate
memory overlaps with private memory area.

[How]
1. Allocate stolen reserved memory for MI25 SRIOV specifically to avoid
the memory overlap.
2. Move allocate reserve allocation to vbios allocation since both the
two functions are doing similar asic type check and no need to have
seperate functions.

Signed-off-by: Yongqiang Sun 
Change-Id: I142127513047a3e81573eb983c510d763b548a24
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 37 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 -
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  1 -
 3 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 7c2a9555b7cc..f7f4f00dd2b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -626,6 +626,11 @@ void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device 
*adev)
 {
unsigned size;
 
+   /* Some ASICs need to reserve a region of video memory to avoid access
+* from driver */
+   adev->mman.stolen_reserved_offset = 0;
+   adev->mman.stolen_reserved_size = 0;
+
/*
 * TODO:
 * Currently there is a bug where some memory client outside
@@ -635,11 +640,24 @@ void amdgpu_gmc_get_vbios_allocations(struct 
amdgpu_device *adev)
 * Keep the stolen memory reservation until the while this is not 
solved.
 */
switch (adev->asic_type) {
+
case CHIP_VEGA10:
+   adev->mman.keep_stolen_vga_memory = true;
+   if (amdgpu_sriov_vf(adev)) {
+   adev->mman.stolen_reserved_offset = 0x10;
+   adev->mman.stolen_reserved_size = 0x60;
+   }
+   break;
case CHIP_RAVEN:
case CHIP_RENOIR:
adev->mman.keep_stolen_vga_memory = true;
break;
+   case CHIP_YELLOW_CARP:
+   if (amdgpu_discovery == 0) {
+   adev->mman.stolen_reserved_offset = 0x1ffb;
+   adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
+   }
+   break;
default:
adev->mman.keep_stolen_vga_memory = false;
break;
@@ -760,25 +778,6 @@ uint64_t amdgpu_gmc_vram_cpu_pa(struct amdgpu_device 
*adev, struct amdgpu_bo *bo
return amdgpu_bo_gpu_offset(bo) - adev->gmc.vram_start + 
adev->gmc.aper_base;
 }
 
-void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev)
-{
-   /* Some ASICs need to reserve a region of video memory to avoid access
-* from driver */
-   adev->mman.stolen_reserved_offset = 0;
-   adev->mman.stolen_reserved_size = 0;
-
-   switch (adev->asic_type) {
-   case CHIP_YELLOW_CARP:
-   if (amdgpu_discovery == 0) {
-   adev->mman.stolen_reserved_offset = 0x1ffb;
-   adev->mman.stolen_reserved_size = 64 * PAGE_SIZE;
-   }
-   break;
-   default:
-   break;
-   }
-}
-
 int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
 {
struct amdgpu_bo *vram_bo = NULL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 93505bb0a36c..032b0313f277 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -331,7 +331,6 @@ amdgpu_gmc_set_vm_fault_masks(struct amdgpu_device *adev, 
int hub_type,
  bool enable);
 
 void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev);
-void amdgpu_gmc_get_reserved_allocation(struct amdgpu_device *adev);
 
 void amdgpu_gmc_init_pdb0(struct amdgpu_device *adev);
 uint64_t amdgpu_gmc_vram_mc2pa(struct amdgpu_device *adev, uint64_t mc_addr);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index f60b7bd4dbf5..3c1d440824a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -948,7 +948,6 @@ static int gmc_v10_0_sw_init(void *handle)
return r;
 
amdgpu_gmc_get_vbios_allocations(adev);
-   amdgpu_gmc_get_reserved_allocation(adev);
 
/* Memory manager */
r = amdgpu_bo_init(adev);
-- 
2.25.1

[PATCH 1/2] drm/amdgpu: Use fw vram offset when allocating stolen vga memory.

2022-03-14 Thread Yongqiang Sun

[Why]
Memory access violation will happen in case of allocate stolen vga
memory with size isn't 0.

[How]
when allocating stolen vga memory, use fw vram offset as the start point
instead of hard code value 0.

Signed-off-by: Yongqiang Sun 
Change-Id: I7c555a6c1fb4b3fa8685753b4bdcbe215f89ea1e
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 31 +++--
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 41d6f604813d..1f635fdb0395 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1708,6 +1708,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
uint64_t gtt_size;
int r;
u64 vis_vram_limit;
+   u64 memory_offset = adev->mman.fw_vram_usage_start_offset + 
adev->mman.fw_vram_usage_size;
 
mutex_init(>mman.gtt_window_lock);
 
@@ -1774,24 +1775,40 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 * This is used for VGA emulation and pre-OS scanout buffers to
 * avoid display artifacts while transitioning between pre-OS
 * and driver.  */
-   r = amdgpu_bo_create_kernel_at(adev, 0, adev->mman.stolen_vga_size,
+   r = amdgpu_bo_create_kernel_at(adev,
+  memory_offset,
+  adev->mman.stolen_vga_size,
   AMDGPU_GEM_DOMAIN_VRAM,
   >mman.stolen_vga_memory,
   NULL);
if (r)
return r;
-   r = amdgpu_bo_create_kernel_at(adev, adev->mman.stolen_vga_size,
+
+   memory_offset += adev->mman.stolen_vga_size;
+
+   r = amdgpu_bo_create_kernel_at(adev,
+  memory_offset,
   adev->mman.stolen_extended_size,
   AMDGPU_GEM_DOMAIN_VRAM,
   >mman.stolen_extended_memory,
   NULL);
if (r)
return r;
-   r = amdgpu_bo_create_kernel_at(adev, adev->mman.stolen_reserved_offset,
-  adev->mman.stolen_reserved_size,
-  AMDGPU_GEM_DOMAIN_VRAM,
-  >mman.stolen_reserved_memory,
-  NULL);
+
+   memory_offset += adev->mman.stolen_extended_size;
+
+   if (adev->mman.stolen_reserved_offset > memory_offset)
+   r = amdgpu_bo_create_kernel_at(adev, 
adev->mman.stolen_reserved_offset,
+  
adev->mman.stolen_reserved_size,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  
>mman.stolen_reserved_memory,
+  NULL);
+   else if (adev->mman.stolen_reserved_offset + 
adev->mman.stolen_reserved_size > memory_offset)
+   r = amdgpu_bo_create_kernel_at(adev, memory_offset,
+   
adev->mman.stolen_reserved_offset + adev->mman.stolen_reserved_size - 
memory_offset,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  
>mman.stolen_reserved_memory,
+  NULL);
if (r)
return r;
 
-- 
2.25.1

Re: [PATCH 3/3] drm/amdkfd: add RAS poison consumption support for utcl2

2022-03-14 Thread Felix Kuehling

I'm not sure I understand this change. It looks like you will check the 
RAS poison status on every UTCL2 VM fault? Is that because there is no 
dedicated interrupt source or client ID to distinguish UTCL2 poison 
consumption from VM faults?


Why is kfd_signal_poison_consumed_event not done for UTCL2 poison 
consumption? The comment says, because it's signaled to user mode as a 
VM fault. But a VM fault and a poison consumption event are different. 
You're basically sending the wrong event to user mode. Maybe it doesn't 
make a big difference in practice at the moment. But that could change 
in the future.


Two more comments inline ...


Am 2022-03-14 um 03:03 schrieb Tao Zhou:

Do RAS page retirement and use gpu reset as fallback in utcl2
fault handler.

Signed-off-by: Tao Zhou 
---
  .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 23 ---
  1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index f7def0bf0730..3991f71d865b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,11 +93,12 @@ enum SQ_INTERRUPT_ERROR_TYPE {
  static void event_interrupt_poison_consumption(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
  {
-   uint16_t source_id, pasid;
+   uint16_t source_id, client_id, pasid;
int ret = -EINVAL;
struct kfd_process *p;
  
  	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);

+   client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
  
  	p = kfd_lookup_process_by_pasid(pasid);

@@ -110,6 +111,7 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
return;
}
  
+	pr_debug("RAS poison consumption handling\n");

atomic_set(>poison, 1);
kfd_unref_process(p);
  
@@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,

break;
case SOC15_INTSRC_SDMA_ECC:
default:
+   if (client_id == SOC15_IH_CLIENTID_UTCL2)
+   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
break;
}
  
-	kfd_signal_poison_consumed_event(dev, pasid);

+   /* utcl2 page fault has its own vm fault event */
+   if (client_id != SOC15_IH_CLIENTID_UTCL2)
+   kfd_signal_poison_consumed_event(dev, pasid);
  
  	/* resetting queue passes, do page retirement without gpu reset

 * resetting queue fails, fallback to gpu reset solution
@@ -314,7 +320,18 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
info.prot_write = ring_id & 0x20;
  
  		kfd_smi_event_update_vmfault(dev, pasid);

-   kfd_dqm_evict_pasid(dev->dqm, pasid);
+
+   if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+   dev->kfd2kgd->is_ras_utcl2_poison &&
+   dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) {
+   event_interrupt_poison_consumption(dev, ih_ring_entry);
+
+   if (dev->kfd2kgd->utcl2_fault_clear)
+   dev->kfd2kgd->utcl2_fault_clear(dev->adev);
+   }
+   else


Else should be on the same line as }. Please run checkpatch.pl, it would 
catch this. It's also good practice to use {}-braces around the 
else-branch if you have braces around the if-branch.




+   kfd_dqm_evict_pasid(dev->dqm, pasid);
+
kfd_signal_vm_fault_event(dev, pasid, );


If you move kfd_signal_vm_fault_event into the else-branch, you can 
unconditionally send the correct poison consumption event to user mode 
in event_interrupt_poison_consumption.


Regards,
  Felix



}
  }

Re: [PATCH 1/3] drm/amdkfd: update parameter for event_interrupt_poison_consumption

2022-03-14 Thread Felix Kuehling


Am 2022-03-14 um 03:03 schrieb Tao Zhou:

Other parameters can be gotten from ih_ring_entry, so only inputting
ih_ring_entry is enough.


I'm not sure what's the reason for this change. You remove one 
parameter, but end up duplicating the SOC15_..._FROM_IH_RING_ENTRY 
translations. It doesn't look like a net improvement to me.


Looking at this function a bit more, this code looks problematic:

    if (atomic_read(>poison)) {
    kfd_unref_process(p);
    return;
    }

    atomic_set(>poison, 1);
    kfd_unref_process(p);

Doing the read and set as two separate operations is not atomic. You 
should use atomic_cmpxchg here to make sure the poison-consumption is 
handled only once:


old_poison = atomic_cmpxchg(>poison, 0, 1);
kfd_unref_process(p);
if (old_poison)
return;
/* handle poison consumption */

Alternatively you could use atomic_inc_return and do the poison handling 
only if that returns exactly 1.


Regards,
  Felix




Signed-off-by: Tao Zhou 
---
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 13 +
  1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 7eedbcd14828..f7def0bf0730 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -91,11 +91,16 @@ enum SQ_INTERRUPT_ERROR_TYPE {
  #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
  
  static void event_interrupt_poison_consumption(struct kfd_dev *dev,

-   uint16_t pasid, uint16_t source_id)
+   const uint32_t *ih_ring_entry)
  {
+   uint16_t source_id, pasid;
int ret = -EINVAL;
-   struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+   struct kfd_process *p;
  
+	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);

+   pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+
+   p = kfd_lookup_process_by_pasid(pasid);
if (!p)
return;
  
@@ -270,7 +275,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,

sq_intr_err);
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST 
&&
sq_intr_err != 
SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-   event_interrupt_poison_consumption(dev, 
pasid, source_id);
+   event_interrupt_poison_consumption(dev, 
ih_ring_entry);
return;
}
break;
@@ -291,7 +296,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
kfd_signal_event_interrupt(pasid, context_id0 & 
0xfff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-   event_interrupt_poison_consumption(dev, pasid, 
source_id);
+   event_interrupt_poison_consumption(dev, ih_ring_entry);
return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||

Re: [PATCH] drm/amd/pm: fix indenting in __smu_cmn_reg_print_error()

2022-03-14 Thread Luben Tuikov

Thanks!

Reviewed-by: Luben Tuikov 

Regards,
Luben

On 2022-03-14 10:04, Dan Carpenter wrote:
> Smatch complains that the dev_err_ratelimited() is indented one tab more
> than the surrounding lines.
> 
>   drivers/gpu/drm/amd/amdgpu/../pm/swsmu/smu_cmn.c:174
>   __smu_cmn_reg_print_error() warn: inconsistent indenting
> 
> It looks like it's not a bug, just that the indenting needs to be cleaned
> up.
> 
> Signed-off-by: Dan Carpenter 
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 16 
>  1 file changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index ae64d1980f10..b8d0c70ff668 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -164,17 +164,17 @@ static void __smu_cmn_reg_print_error(struct 
> smu_context *smu,
>  
>   switch (reg_c2pmsg_90) {
>   case SMU_RESP_NONE: {
> - if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 5)) {
> - msg_idx = RREG32_SOC15(MP1, 0, mmMP1_C2PMSG_2);
> - prm = RREG32_SOC15(MP1, 0, mmMP1_C2PMSG_34);
> - } else {
> - msg_idx = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66);
> - prm = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
> - }
> + if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 5)) {
> + msg_idx = RREG32_SOC15(MP1, 0, mmMP1_C2PMSG_2);
> + prm = RREG32_SOC15(MP1, 0, mmMP1_C2PMSG_34);
> + } else {
> + msg_idx = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66);
> + prm = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
> + }
>   dev_err_ratelimited(adev->dev,
>   "SMU: I'm not done with your previous 
> command: SMN_C2PMSG_66:0x%08X SMN_C2PMSG_82:0x%08X",
>   msg_idx, prm);
> - }
> + }
>   break;
>   case SMU_RESP_OK:
>   /* The SMU executed the command. It completed with a

Regards,
-- 
Luben

Re: [PATCH v2 1/2] drm: Add GPU reset sysfs event

2022-03-14 Thread Pekka Paalanen

On Mon, 14 Mar 2022 10:23:27 -0400
Alex Deucher  wrote:

> On Fri, Mar 11, 2022 at 3:30 AM Pekka Paalanen  wrote:
> >
> > On Thu, 10 Mar 2022 11:56:41 -0800
> > Rob Clark  wrote:
> >  
> > > For something like just notifying a compositor that a gpu crash
> > > happened, perhaps drm_event is more suitable.  See
> > > virtio_gpu_fence_event_create() for an example of adding new event
> > > types.  Although maybe you want it to be an event which is not device
> > > specific.  This isn't so much of a debugging use-case as simply
> > > notification.  
> >
> > Hi,
> >
> > for this particular use case, are we now talking about the display
> > device (KMS) crashing or the rendering device (OpenGL/Vulkan) crashing?
> >
> > If the former, I wasn't aware that display device crashes are a thing.
> > How should a userspace display server react to those?
> >
> > If the latter, don't we have EGL extensions or Vulkan API already to
> > deliver that?
> >
> > The above would be about device crashes that directly affect the
> > display server. Is that the use case in mind here, or is it instead
> > about notifying the display server that some application has caused a
> > driver/hardware crash? If the latter, how should a display server react
> > to that? Disconnect the application?
> >
> > Shashank, what is the actual use case you are developing this for?
> >
> > I've read all the emails here so far, and I don't recall seeing it
> > explained.
> >  
> 
> The idea is that a support daemon or compositor would listen for GPU
> reset notifications and do something useful with them (kill the guilty
> app, restart the desktop environment, etc.).  Today when the GPU
> resets, most applications just continue assuming nothing is wrong,
> meanwhile the GPU has stopped accepting work until the apps re-init
> their context so all of their command submissions just get rejected.
> 
> > Btw. somewhat relatedly, there has been work aiming to allow
> > graceful hot-unplug of DRM devices. There is a kernel doc outlining how
> > the various APIs should react towards userspace when a DRM device
> > suddenly disappears. That seems to have some overlap here IMO.
> >
> > See 
> > https://www.kernel.org/doc/html/latest/gpu/drm-uapi.html#device-hot-unplug
> > which also has a couple pointers to EGL and Vulkan APIs.  
> 
> The problem is most applications don't use the GL or VK robustness
> APIs.

Hi,

how would this new event help with that?

I mean, yeah, there could be a daemon that kills those GPU users, but
then what? You still lose any unsaved work, and may need to manually
restart them.

Is the idea that it is better to have the app crash and disappear than
to look like it froze while it otherwise still runs?

If some daemon or compositor goes killing apps that trigger GPU resets,
then how do we stop that for an app that actually does use the
appropriate EGL or Vulkan APIs to detect and remedy that situation
itself?

>  You could use something like that in the compositor, but those
> APIs tend to be focused more on the application itself rather than the
> GPU in general.  E.g., Is my context lost.  Which is fine for
> restarting your context, but doesn't really help if you want to try
> and do something with another application (i.e., the likely guilty
> app).  Also, on dGPU at least, when you reset the GPU, vram is usually
> lost (either due to the memory controller being reset, or vram being
> zero'd on init due to ECC support), so even if you are not the guilty
> process, in that case you'd need to re-init your context anyway.

Why should something like a compositor listen for this and kill apps
that triggered GPU resets, instead of e.g. Mesa noticing that in the app
and killing itself? Mesa in the app would know if robustness API is
being used.

Would be really nice to have the answers to all these questions to be
collected and reiterated in the next version of this proposal.


Thanks,
pq


pgpzIPyqNs8Ys.pgp
Description: OpenPGP digital signature

[PATCH 1/1] drm/amdkfd: evict svm bo worker handle error

2022-03-14 Thread Philip Yang

Migrate vram to ram may fail to find the vma if process is exiting
and vma is removed, evict svm bo worker sets prange->svm_bo to NULL
and warn svm_bo ref count != 1 only if migrating vram to ram
successfully.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 27 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 20 ++
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 7f689094be5a..c8aef2fe459b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -638,6 +638,22 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
return r;
 }
 
+/**
+ * svm_migrate_vma_to_ram - migrate range inside one vma from device to system
+ *
+ * @adev: amdgpu device to migrate from
+ * @prange: svm range structure
+ * @vma: vm_area_struct that range [start, end] belongs to
+ * @start: range start virtual address in pages
+ * @end: range end virtual address in pages
+ *
+ * Context: Process context, caller hold mmap read lock, svms lock, prange lock
+ *
+ * Return:
+ *   0 - success with all pages migrated
+ *   negative values - indicate error
+ *   positive values - partial migration, number of pages not migrated
+ */
 static long
 svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
   struct vm_area_struct *vma, uint64_t start, uint64_t end)
@@ -709,8 +725,6 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
pdd = svm_range_get_pdd_by_adev(prange, adev);
if (pdd)
WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);
-
-   return upages;
}
return r ? r : upages;
 }
@@ -759,13 +773,16 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, 
struct mm_struct *mm)
unsigned long next;
 
vma = find_vma(mm, addr);
-   if (!vma || addr < vma->vm_start)
+   if (!vma || addr < vma->vm_start) {
+   pr_debug("failed to find vma for prange %p\n", prange);
+   r = -EFAULT;
break;
+   }
 
next = min(vma->vm_end, end);
r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next);
if (r < 0) {
-   pr_debug("failed %ld to migrate\n", r);
+   pr_debug("failed %ld to migrate prange %p\n", r, 
prange);
break;
} else {
upages += r;
@@ -773,7 +790,7 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, 
struct mm_struct *mm)
addr = next;
}
 
-   if (!upages) {
+   if (r >= 0 && !upages) {
svm_range_vram_node_free(prange);
prange->actual_loc = 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 509d915cbe69..215943424c06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3155,6 +3155,7 @@ static void svm_range_evict_svm_bo_worker(struct 
work_struct *work)
struct svm_range_bo *svm_bo;
struct kfd_process *p;
struct mm_struct *mm;
+   int r = 0;
 
svm_bo = container_of(work, struct svm_range_bo, eviction_work);
if (!svm_bo_ref_unless_zero(svm_bo))
@@ -3170,7 +3171,7 @@ static void svm_range_evict_svm_bo_worker(struct 
work_struct *work)
 
mmap_read_lock(mm);
spin_lock(_bo->list_lock);
-   while (!list_empty(_bo->range_list)) {
+   while (!list_empty(_bo->range_list) && !r) {
struct svm_range *prange =
list_first_entry(_bo->range_list,
struct svm_range, svm_bo_list);
@@ -3184,15 +3185,15 @@ static void svm_range_evict_svm_bo_worker(struct 
work_struct *work)
 
mutex_lock(>migrate_mutex);
do {
-   svm_migrate_vram_to_ram(prange,
+   r = svm_migrate_vram_to_ram(prange,
svm_bo->eviction_fence->mm);
-   } while (prange->actual_loc && --retries);
-   WARN(prange->actual_loc, "Migration failed during eviction");
-
-   mutex_lock(>lock);
-   prange->svm_bo = NULL;
-   mutex_unlock(>lock);
+   } while (!r && prange->actual_loc && --retries);
 
+   if (!prange->actual_loc) {
+   mutex_lock(>lock);
+   prange->svm_bo = NULL;
+   mutex_unlock(>lock);
+   }
mutex_unlock(>migrate_mutex);
 
spin_lock(_bo->list_lock);
@@ -3201,10 +3202,11 @@ static void

Re: [PATCH v2 1/2] drm: Add GPU reset sysfs event

2022-03-14 Thread Alex Deucher

On Fri, Mar 11, 2022 at 3:30 AM Pekka Paalanen  wrote:
>
> On Thu, 10 Mar 2022 11:56:41 -0800
> Rob Clark  wrote:
>
> > For something like just notifying a compositor that a gpu crash
> > happened, perhaps drm_event is more suitable.  See
> > virtio_gpu_fence_event_create() for an example of adding new event
> > types.  Although maybe you want it to be an event which is not device
> > specific.  This isn't so much of a debugging use-case as simply
> > notification.
>
> Hi,
>
> for this particular use case, are we now talking about the display
> device (KMS) crashing or the rendering device (OpenGL/Vulkan) crashing?
>
> If the former, I wasn't aware that display device crashes are a thing.
> How should a userspace display server react to those?
>
> If the latter, don't we have EGL extensions or Vulkan API already to
> deliver that?
>
> The above would be about device crashes that directly affect the
> display server. Is that the use case in mind here, or is it instead
> about notifying the display server that some application has caused a
> driver/hardware crash? If the latter, how should a display server react
> to that? Disconnect the application?
>
> Shashank, what is the actual use case you are developing this for?
>
> I've read all the emails here so far, and I don't recall seeing it
> explained.
>

The idea is that a support daemon or compositor would listen for GPU
reset notifications and do something useful with them (kill the guilty
app, restart the desktop environment, etc.).  Today when the GPU
resets, most applications just continue assuming nothing is wrong,
meanwhile the GPU has stopped accepting work until the apps re-init
their context so all of their command submissions just get rejected.

> Btw. somewhat relatedly, there has been work aiming to allow
> graceful hot-unplug of DRM devices. There is a kernel doc outlining how
> the various APIs should react towards userspace when a DRM device
> suddenly disappears. That seems to have some overlap here IMO.
>
> See https://www.kernel.org/doc/html/latest/gpu/drm-uapi.html#device-hot-unplug
> which also has a couple pointers to EGL and Vulkan APIs.

The problem is most applications don't use the GL or VK robustness
APIs.  You could use something like that in the compositor, but those
APIs tend to be focused more on the application itself rather than the
GPU in general.  E.g., Is my context lost.  Which is fine for
restarting your context, but doesn't really help if you want to try
and do something with another application (i.e., the likely guilty
app).  Also, on dGPU at least, when you reset the GPU, vram is usually
lost (either due to the memory controller being reset, or vram being
zero'd on init due to ECC support), so even if you are not the guilty
process, in that case you'd need to re-init your context anyway.

Alex

>
>
> Thanks,
> pq

Re: [PATCH v2] drm/amdgpu: add workarounds for Raven VCN TMZ issue

2022-03-14 Thread Alex Deucher

On Thu, Mar 10, 2022 at 9:32 PM Lang Yu  wrote:
>
> On 03/10/ , Christian König wrote:
> > Ok, thanks.
> >
> > Lang is that case your patch should work fine.
> >
> > Just add another patch with a check for the encode case to reject any CS
> > with TMZ buffers in it.
>
> Only VCN decode ring is cared in this patch. For encode ring
> (AMDGPU_HW_IP_VCN_ENC and AMDGPU_HW_IP_VCN_JPEG), is it fine
> we just reject secure IBs in amdgpu_ib_schedule like compute ring?
>

Yes,  correct.

Alex

> Regards,
> Lang
>
> > Thanks,
> > Christian.
> >
> > Am 10.03.22 um 15:17 schrieb Leo Liu:
> > > No need for encode. Encrypting uses TEE/TA to convert clear bitstream to
> > > encrypted bitstream, and has nothing to do with VCN encode and tmz.
> > >
> > > Regards,
> > >
> > > Leo
> > >
> > >
> > > On 2022-03-10 04:53, Christian König wrote:
> > > > Leo you didn't answered the question if we need TMZ for encode as well.
> > > >
> > > > Regards,
> > > > Christian.
> > > >
> > > > Am 10.03.22 um 09:45 schrieb Lang Yu:
> > > > > Ping.
> > > > >
> > > > > On 03/08/ , Leo Liu wrote:
> > > > > > On 2022-03-08 11:18, Leo Liu wrote:
> > > > > > > On 2022-03-08 04:16, Christian König wrote:
> > > > > > > > Am 08.03.22 um 09:06 schrieb Lang Yu:
> > > > > > > > > On 03/08/ , Christian König wrote:
> > > > > > > > > > Am 08.03.22 um 08:33 schrieb Lang Yu:
> > > > > > > > > > > On 03/08/ , Christian König wrote:
> > > > > > > > > > > > Am 08.03.22 um 04:39 schrieb Lang Yu:
> > > > > > > > > > > > > It is a hardware issue that VCN can't handle a GTT
> > > > > > > > > > > > > backing stored TMZ buffer on Raven.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Move such a TMZ buffer to VRAM domain before command
> > > > > > > > > > > > > submission.
> > > > > > > > > > > > >
> > > > > > > > > > > > > v2:
> > > > > > > > > > > > >  - Use patch_cs_in_place callback.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Suggested-by: Christian König 
> > > > > > > > > > > > > 
> > > > > > > > > > > > > Signed-off-by: Lang Yu 
> > > > > > > > > > > > > ---
> > > > > > > > > > > > >  drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c | 68
> > > > > > > > > > > > > +++
> > > > > > > > > > > > >  1 file changed, 68 insertions(+)
> > > > > > > > > > > > >
> > > > > > > > > > > > > diff --git
> > > > > > > > > > > > > a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
> > > > > > > > > > > > > b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
> > > > > > > > > > > > > index 7bbb9ba6b80b..810932abd3af 100644
> > > > > > > > > > > > > --- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
> > > > > > > > > > > > > +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
> > > > > > > > > > > > > @@ -24,6 +24,7 @@
> > > > > > > > > > > > >  #include 
> > > > > > > > > > > > >  #include "amdgpu.h"
> > > > > > > > > > > > > +#include "amdgpu_cs.h"
> > > > > > > > > > > > >  #include "amdgpu_vcn.h"
> > > > > > > > > > > > >  #include "amdgpu_pm.h"
> > > > > > > > > > > > >  #include "soc15.h"
> > > > > > > > > > > > > @@ -1905,6 +1906,72 @@ static const struct
> > > > > > > > > > > > > amd_ip_funcs vcn_v1_0_ip_funcs = {
> > > > > > > > > > > > >  .set_powergating_state
> > > > > > > > > > > > > =
> > > > > > > > > > > > > vcn_v1_0_set_powergating_state,
> > > > > > > > > > > > >  };
> > > > > > > > > > > > > +/**
> > > > > > > > > > > > > + * It is a hardware issue that Raven VCN can't
> > > > > > > > > > > > > handle a GTT TMZ buffer.
> > > > > > > > > > > > > + * Move such a GTT TMZ buffer to VRAM domain
> > > > > > > > > > > > > before command submission.
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > +static int
> > > > > > > > > > > > > vcn_v1_0_validate_bo(struct
> > > > > > > > > > > > > amdgpu_cs_parser *parser,
> > > > > > > > > > > > > +struct amdgpu_job *job,
> > > > > > > > > > > > > +uint64_t addr)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +struct ttm_operation_ctx ctx = { false, false };
> > > > > > > > > > > > > +struct amdgpu_fpriv *fpriv = 
> > > > > > > > > > > > > parser->filp->driver_priv;
> > > > > > > > > > > > > +struct amdgpu_vm *vm = >vm;
> > > > > > > > > > > > > +struct amdgpu_bo_va_mapping *mapping;
> > > > > > > > > > > > > +struct amdgpu_bo *bo;
> > > > > > > > > > > > > +int r;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +addr &= AMDGPU_GMC_HOLE_MASK;
> > > > > > > > > > > > > +if (addr & 0x7) {
> > > > > > > > > > > > > +DRM_ERROR("VCN messages must be 8 byte 
> > > > > > > > > > > > > aligned!\n");
> > > > > > > > > > > > > +return -EINVAL;
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +mapping = amdgpu_vm_bo_lookup_mapping(vm,
> > > > > > > > > > > > > addr/AMDGPU_GPU_PAGE_SIZE);
> > > > > > > > > > > > > +if (!mapping ||
> > > > > > > > > > > > > !mapping->bo_va ||
> > > > > > > > > > > > > !mapping->bo_va->base.bo)
> > > > > > > > > > > > > +

[PATCH] drm/amd/pm: fix indenting in __smu_cmn_reg_print_error()

2022-03-14 Thread Dan Carpenter

Smatch complains that the dev_err_ratelimited() is indented one tab more
than the surrounding lines.

drivers/gpu/drm/amd/amdgpu/../pm/swsmu/smu_cmn.c:174
__smu_cmn_reg_print_error() warn: inconsistent indenting

It looks like it's not a bug, just that the indenting needs to be cleaned
up.

Signed-off-by: Dan Carpenter 
---
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index ae64d1980f10..b8d0c70ff668 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -164,17 +164,17 @@ static void __smu_cmn_reg_print_error(struct smu_context 
*smu,
 
switch (reg_c2pmsg_90) {
case SMU_RESP_NONE: {
-   if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 5)) {
-   msg_idx = RREG32_SOC15(MP1, 0, mmMP1_C2PMSG_2);
-   prm = RREG32_SOC15(MP1, 0, mmMP1_C2PMSG_34);
-   } else {
-   msg_idx = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66);
-   prm = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
-   }
+   if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 5)) {
+   msg_idx = RREG32_SOC15(MP1, 0, mmMP1_C2PMSG_2);
+   prm = RREG32_SOC15(MP1, 0, mmMP1_C2PMSG_34);
+   } else {
+   msg_idx = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66);
+   prm = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
+   }
dev_err_ratelimited(adev->dev,
"SMU: I'm not done with your previous 
command: SMN_C2PMSG_66:0x%08X SMN_C2PMSG_82:0x%08X",
msg_idx, prm);
-   }
+   }
break;
case SMU_RESP_OK:
/* The SMU executed the command. It completed with a
-- 
2.20.1

RE: [PATCH 00/12] DC Patches March 10, 2022

2022-03-14 Thread Wheeler, Daniel

[Public]

Hi all,
 
This week this patchset was tested on the following systems:
 
HP Envy 360, with Ryzen 5 4500U, with the following display types: eDP 1080p 
60hz, 4k 60hz  (via USB-C to DP/HDMI), 1440p 144hz (via USB-C to DP/HDMI), 
1680*1050 60hz (via USB-C to DP and then DP to DVI/VGA)
 
Lenovo Thinkpad T14s Gen2 with AMD Ryzen 5 5650U, with the following display 
types: eDP 1080p 60hz, 4k 60hz  (via USB-C to DP/HDMI), 1440p 144hz (via USB-C 
to DP/HDMI), 1680*1050 60hz (via USB-C to DP and then DP to DVI/VGA)
 
Sapphire Pulse RX5700XT with the following display types:
4k 60hz  (via DP/HDMI), 1440p 144hz (via DP/HDMI), 1680*1050 60hz (via DP to 
DVI/VGA)
 
Reference AMD RX6800 with the following display types:
4k 60hz  (via DP/HDMI and USB-C to DP/HDMI), 1440p 144hz (via USB-C to DP/HDMI 
and USB-C to DP/HDMI), 1680*1050 60hz (via DP to DVI/VGA)
 
Included testing using a Startech DP 1.4 MST hub at 2x 4k 60hz, and 3x 1080p 
60hz on all systems. Also tested DSC via USB-C to DP DSC Hub with 3x 4k 60hz on 
Ryzen 9 5900h and Ryzen 5 4500u.
 
Tested on Ubuntu 20.04.3 with Kernel Version 5.16
 
Tested-by: Daniel Wheeler 
 
 
Thank you,
 
Dan Wheeler
Technologist  |  AMD
SW Display
--
1 Commerce Valley Dr E, Thornhill, ON L3T 7X6
Facebook |  Twitter |  amd.com  

-Original Message-
From: amd-gfx  On Behalf Of Agustin 
Gutierrez
Sent: March 10, 2022 5:21 PM
To: amd-gfx@lists.freedesktop.org
Cc: Wang, Chao-kai (Stylon) ; Li, Sun peng (Leo) 
; Wentland, Harry ; Zhuo, Qingqing 
(Lillian) ; Siqueira, Rodrigo 
; Li, Roman ; Chiu, Solomon 
; Pillai, Aurabindo ; Lin, 
Wayne ; Lakha, Bhawanpreet ; 
Gutierrez, Agustin ; Kotarac, Pavle 

Subject: [PATCH 00/12] DC Patches March 10, 2022

This DC patchset brings improvements in multiple areas. In summary:

  * Fixes on lane status, zstate, engine ddc, debugfx entry.
  * Enhancements for Pollock, EDID status.
  * Amongst other.

Anthony Koo (1):
  drm/amd/display: [FW Promotion] Release 0.0.108.0

Aric Cyr (1):
  drm/amd/display: 3.2.177

Becle Lee (1):
  drm/amd/display: Wait for hubp read line for Pollock

Charlene Liu (1):
  drm/amd/display: Add save/restore PANEL_PWRSEQ_REF_DIV2

Dale Zhao (1):
  drm/amd/display: Add new enum for EDID status

Eric Yang (1):
  drm/amd/display: Block zstate when more than one plane enabled

JinZe.Xu (1):
  drm/amd/display: Add I2C escape to support query device exist.

Jing Zhou (2):
  drm/amd/display: Update engine ddc
  drm/amd/display: Add null pointer filter

Leo (Hanghong) Ma (1):
  drm/amd/display: Add function to get the pipe from the stream context

Wayne Lin (2):
  drm/amd/display: Fix a few parts in debugfs entry
  drm/amd/display: Retry when fail reading lane status during LT

 .../amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 10 +-
 drivers/gpu/drm/amd/display/dc/core/dc.c  | 13 +++
 drivers/gpu/drm/amd/display/dc/core/dc_link.c |  3 +  
.../gpu/drm/amd/display/dc/core/dc_link_dp.c  | 30 +++---
 .../gpu/drm/amd/display/dc/core/dc_stream.c   | 14 +++
 drivers/gpu/drm/amd/display/dc/dc.h   |  2 +-
 drivers/gpu/drm/amd/display/dc/dc_link.h  |  5 +
 drivers/gpu/drm/amd/display/dc/dc_stream.h|  2 +
 drivers/gpu/drm/amd/display/dc/dc_types.h |  1 +
 drivers/gpu/drm/amd/display/dc/dce/dce_aux.c  |  1 +  
drivers/gpu/drm/amd/display/dc/dce/dce_i2c.c  | 26 ++  
drivers/gpu/drm/amd/display/dc/dce/dce_i2c.h  |  6 ++  
.../gpu/drm/amd/display/dc/dcn10/dcn10_hubp.c | 15 +++  
.../gpu/drm/amd/display/dc/dcn10/dcn10_hubp.h |  4 +  
.../drm/amd/display/dc/dcn10/dcn10_resource.c | 15 +++
 .../drm/amd/display/dc/dcn20/dcn20_hwseq.c| 10 ++
 .../amd/display/dc/dcn31/dcn31_panel_cntl.c   |  5 +-
 .../drm/amd/display/dc/dml/dcn20/dcn20_fpu.c  | 16 ++--
 .../drm/amd/display/dc/gpio/gpio_service.c|  6 +-
 drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h  |  1 +
 .../drm/amd/display/dc/inc/hw/panel_cntl.h|  1 +
 .../amd/display/dc/inc/hw_sequencer_private.h |  1 +
 .../gpu/drm/amd/display/dmub/inc/dmub_cmd.h   | 92 ++-
 23 files changed, 253 insertions(+), 26 deletions(-)

--
2.35.1

[PATCH 23/30] drm/amdgpu/dc: fix typos in comments

2022-03-14 Thread Julia Lawall

Various spelling mistakes in comments.
Detected with the help of Coccinelle.

Signed-off-by: Julia Lawall 

---
 drivers/gpu/drm/amd/display/dc/bios/command_table.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/bios/command_table.c 
b/drivers/gpu/drm/amd/display/dc/bios/command_table.c
index ad13e4e36d77..0e36cd800fc9 100644
--- a/drivers/gpu/drm/amd/display/dc/bios/command_table.c
+++ b/drivers/gpu/drm/amd/display/dc/bios/command_table.c
@@ -456,7 +456,7 @@ static enum bp_result transmitter_control_v2(
if ((CONNECTOR_ID_DUAL_LINK_DVII == connector_id) ||
(CONNECTOR_ID_DUAL_LINK_DVID == connector_id))
/* on INIT this bit should be set according to the
-* phisycal connector
+* physical connector
 * Bit0: dual link connector flag
 * =0 connector is single link connector
 * =1 connector is dual link connector
@@ -468,7 +468,7 @@ static enum bp_result transmitter_control_v2(
cpu_to_le16((uint8_t)cntl->connector_obj_id.id);
break;
case TRANSMITTER_CONTROL_SET_VOLTAGE_AND_PREEMPASIS:
-   /* votage swing and pre-emphsis */
+   /* voltage swing and pre-emphsis */
params.asMode.ucLaneSel = (uint8_t)cntl->lane_select;
params.asMode.ucLaneSet = (uint8_t)cntl->lane_settings;
break;
@@ -2120,7 +2120,7 @@ static enum bp_result program_clock_v5(
memset(, 0, sizeof(params));
if (!bp->cmd_helper->clock_source_id_to_atom(
bp_params->pll_id, _pll_id)) {
-   BREAK_TO_DEBUGGER(); /* Invalid Inpute!! */
+   BREAK_TO_DEBUGGER(); /* Invalid Input!! */
return BP_RESULT_BADINPUT;
}

[PATCH 01/30] drm/amd/pm: fix typos in comments

2022-03-14 Thread Julia Lawall

Various spelling mistakes in comments.
Detected with the help of Coccinelle.

Signed-off-by: Julia Lawall 

---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index cbbbd4079249..5cd67ddf8495 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -1870,7 +1870,7 @@ static ssize_t amdgpu_set_smartshift_bias(struct device 
*dev,
amdgpu_smartshift_bias = bias;
r = count;
 
-   /* TODO: upadte bias level with SMU message */
+   /* TODO: update bias level with SMU message */
 
 out:
pm_runtime_mark_last_busy(ddev->dev);

Re: [REGRESSION] Too-low frequency limit for AMD GPU PCI-passed-through to Windows VM

2022-03-14 Thread James Turner

Hi all,

I've confirmed that changing the `amdgpu_atif_pci_probe_handle` function
to do nothing does make the GPU work properly in the VM. I started with
f9b7f3703ff9 ("drm/amdgpu/acpi: make ATPX/ATCS structures global (v2)")
and changed the function implementation to:

static bool amdgpu_atif_pci_probe_handle(struct pci_dev *pdev)
{
DRM_DEBUG_DRIVER("Entered amdgpu_atif_pci_probe_handle");
return false;
}

With that change, the GPU works properly in the VM.

I'm not sure where to go from here. This issue isn't much of a concern
for me anymore, since blacklisting `amdgpu` works for my machine. At
this point, my understanding is that the root problem needs to be fixed
in AMD's Windows GPU driver or Dell's firmware, not the Linux kernel. If
any of the AMD developers on this thread would like to forward it to the
AMD Windows driver team, I'd be happy to work with AMD to fix the issue
properly.

I've added a mention of this issue and workaround to the [Arch Wiki][1]
to make it more discoverable. If anyone has a better place to document
this, please let me know.

Thank you all for your help on this.

[1]: 
https://wiki.archlinux.org/title/PCI_passthrough_via_OVMF#Too-low_frequency_limit_for_AMD_GPU_passed-through_to_virtual_machine

James

Re: [PATCH v2] drm/amdgpu: Add support for drm_privacy_screen

2022-03-14 Thread Hans de Goede

Hi All,

On 3/9/22 18:53, Rajat Jain wrote:
> On Wed, Mar 9, 2022 at 7:06 AM Sean Paul  wrote:
>>
>> From: Sean Paul 
>>
>> This patch adds the necessary hooks to make amdgpu aware of privacy
>> screens. On devices with privacy screen drivers (such as thinkpad-acpi),
>> the amdgpu driver will defer probe until it's ready and then sync the sw
>> and hw state on each commit the connector is involved and enabled.
>>
>> Changes in v2:
>> -Tweaked the drm_privacy_screen_get() error check to avoid logging
>>  errors when privacy screen is absent (Hans)
>>
>> Signed-off-by: Sean Paul 
>> Link: https://patchwork.freedesktop.org/patch/477640/ #v1
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  9 +
>>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c|  3 +++
>>  .../amd/display/amdgpu_dm/amdgpu_dm_mst_types.c  | 16 +++-
>>  3 files changed, 27 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 2ab675123ae3..e2cfae56c020 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -26,6 +26,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>  #include "amdgpu_drv.h"
>> @@ -1988,6 +1989,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
>>  {
>> struct drm_device *ddev;
>> struct amdgpu_device *adev;
>> +   struct drm_privacy_screen *privacy_screen;
>> unsigned long flags = ent->driver_data;
>> int ret, retry = 0, i;
>> bool supports_atomic = false;
>> @@ -2063,6 +2065,13 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
>> size = pci_resource_len(pdev, 0);
>> is_fw_fb = amdgpu_is_fw_framebuffer(base, size);
>>
>> +   /* If the LCD panel has a privacy screen, defer probe until its 
>> ready */
>> +   privacy_screen = drm_privacy_screen_get(>dev, NULL);
>> +   if (IS_ERR(privacy_screen) && PTR_ERR(privacy_screen) == 
>> -EPROBE_DEFER)
>> +   return -EPROBE_DEFER;
>> +
>> +   drm_privacy_screen_put(privacy_screen);
>> +
>> /* Get rid of things like offb */
>> ret = drm_aperture_remove_conflicting_pci_framebuffers(pdev, 
>> _kms_driver);
>> if (ret)
>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> index e1d3db3fe8de..9e2bb6523add 100644
>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> @@ -9781,6 +9781,9 @@ static void amdgpu_dm_atomic_commit_tail(struct 
>> drm_atomic_state *state)
>> if (acrtc) {
>> new_crtc_state = 
>> drm_atomic_get_new_crtc_state(state, >base);
>> old_crtc_state = 
>> drm_atomic_get_old_crtc_state(state, >base);
>> +
>> +   /* Sync the privacy screen state between hw and sw */
>> +   drm_connector_update_privacy_screen(new_con_state);
>> }
>>
>> /* Skip any modesets/resets */
>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c 
>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
>> index 740435ae3997..594a8002975a 100644
>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
>> @@ -27,6 +27,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include "dm_services.h"
>>  #include "amdgpu.h"
>>  #include "amdgpu_dm.h"
>> @@ -506,6 +507,7 @@ void amdgpu_dm_initialize_dp_connector(struct 
>> amdgpu_display_manager *dm,
>>struct amdgpu_dm_connector 
>> *aconnector,
>>int link_index)
>>  {
>> +   struct drm_device *dev = dm->ddev;
>> struct dc_link_settings max_link_enc_cap = {0};
>>
>> aconnector->dm_dp_aux.aux.name =
>> @@ -519,8 +521,20 @@ void amdgpu_dm_initialize_dp_connector(struct 
>> amdgpu_display_manager *dm,
>> drm_dp_cec_register_connector(>dm_dp_aux.aux,
>>   >base);
>>
>> -   if (aconnector->base.connector_type == DRM_MODE_CONNECTOR_eDP)
>> +   if (aconnector->base.connector_type == DRM_MODE_CONNECTOR_eDP) {
>> +   struct drm_privacy_screen *privacy_screen)
>> +
>> +   /* Reference given up in drm_connector_cleanup() */
>> +   privacy_screen = drm_privacy_screen_get(dev->dev, NULL);
> 
> Can we try to be more specific when looking up for privacy screen, e.g.:
> 
> privacy_screen = drm_privacy_screen_get(dev->dev,  "eDP-1");
> (and then also making the corresponding change in arch_init_data[] in
> drm_privacy_screen_x86.c"

So I just checked and yes I think we can be more specific at least
for the thinkpad_acpi supported models. See the attached patch
which has been

Re: [PATCH 2/5] dyndbg: add class_id field and query support

2022-03-14 Thread Jason Baron




On 3/10/22 23:47, Jim Cromie wrote:
> DRM defines/uses 10 enum drm_debug_category's to create exclusive
> classes of debug messages.  To support this directly in dynamic-debug,
> add the following:
> 
> - struct _ddebug.class_id:4 - 4 bits is enough
> - define _DPRINTK_SITE_UNCLASSED 15 - see below
> 
> and the query support:
> - struct _ddebug_query.class_id
> - ddebug_parse_query  - looks for "class" keyword, then calls..
> - parse_class - accepts uint: 0-15, sets query.class_id and marker
> - vpr_info_dq - displays new field
> - ddebug_proc_show- append column with "cls:%d" if !UNCLASSED
> 
> With the patch, one can enable current/unclassed callsites by:
> 
>   #> echo drm class 15 +p > /proc/dynamic_debug/control
> 

To me, this is hard to read, what the heck is '15'? I have to go look it
up in the control file and it's not descriptive. I think that using
classes/categories makes sense but I'm wondering if it can be a bit more
user friendly? Perhaps, we can pass an array of strings that is indexed
by the class id to each pr_debug() site that wants to use class. So
something like:

enum levels {
LOW,
MEDIUM,
HIGH
};

static const char * const level_to_strings[] = {
[LOW] = "low",
[MEDIUM] = "medium",
[HIGH] = "high",
};

And then you'd have a wrapper macros in your driver:

#define module_foo_pr_debug_class(level, fmt, args...)
pr_debug_class(level, level_to_strings, fmt, args);

Such that call sites look like:

module_foo_pr_debug_class(LOW, fmt, args...);

Such that you're not always passing the strings array around. Now, this
does mean another pointer for struct _ddebug and most wouldn't have it.
Maybe we could just add another linker section for these so as to save
space.

> parse_class() accepts 0 .. _DPRINTK_SITE_UNCLASSED, which allows the
>> control interface to explicitly manipulate unclassed callsites.
> 
> After parsing keywords, ddebug_parse_query() sets .class_id=15, iff it
> wasnt explicitly set.  This allows future classed/categorized
> callsites to be untouched by legacy (class unaware) queries.
> 
> DEFINE_DYNAMIC_DEBUG_METADATA gets _CLS(cls,) suffix and arg, and
> initializes the new .class_id=cls field.  The old name gets the default.
> 
> Then, these _CLS(cls,...) modifications are repeated up through the
> stack of *dynamic_func_call* macros that use the METADATA initializer,
> so as to actually supply the category into it.
> 
> NOTES:
> 
> _DPRINTK_SITE_UNCLASSED: this symbol is used to initialize all
> existing/unclassed pr-debug callsites.  Normally, the default would be
> zero, but DRM_UT_CORE "uses" that value, in the sense that 0 is
> exposed as a bit position in drm.debug.  Using 15 allows identity
> mapping from category to class, avoiding fiddly offsets.
> 
> Default .class_id = 15 means that ``echo +p > control`` no longer
> toggles ALL the callsites, only the unclassed ones.  This was only
> useful for static-branch toggle load testing anyway.
> 

I think that # echo +p > control should continue to work as is, why
should the introduction of classes change that ?

> RFC:
> 
> The new _CLS macro flavor gets a warning from DRM/dri-devel's CI,
> but not from checkpatch (on this subject).
> 
> a8f6c71f283e dyndbg: add class_id field and query support
> -:141: CHECK:MACRO_ARG_REUSE: Macro argument reuse 'id' - possible 
> side-effects?
> +#define __dynamic_func_call_cls(id, cls, fmt, func, ...) do {\
> + DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt);\
> + if (DYNAMIC_DEBUG_BRANCH(id))   \
> + func(, ##__VA_ARGS__);   \
>  } while (0)
> 
> I couldn't fix it with a ``typeof(id) _id = id`` construct.  I haven't
> seen the warning myself, on the _CLS extended macro, nor the original.
> 
> CC: Rasmus Villemoes 
> Signed-off-by: Jim Cromie 
> ---
>  .../admin-guide/dynamic-debug-howto.rst   |  7 +++
>  include/linux/dynamic_debug.h | 54 ++-
>  lib/dynamic_debug.c   | 48 ++---
>  3 files changed, 88 insertions(+), 21 deletions(-)
> 
> diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst 
> b/Documentation/admin-guide/dynamic-debug-howto.rst
> index a89cfa083155..8ef8d7dcd140 100644
> --- a/Documentation/admin-guide/dynamic-debug-howto.rst
> +++ b/Documentation/admin-guide/dynamic-debug-howto.rst
> @@ -35,6 +35,7 @@ Dynamic debug has even more useful features:
> - line number (including ranges of line numbers)
> - module name
> - format string
> +   - class number:0-15
>  
>   * Provides a debugfs control file: ``/dynamic_debug/control``
> which can be read to display the complete list of known debug
> @@ -143,6 +144,7 @@ against.  Possible keywords are:::
>'module' string |
>'format' string |
>'line' line-range
> +  'class' integer:[0-15]
>  
>line-range ::= lineno

Re: [PATCH v1 1/3] mm: split vm_normal_pages for LRU and non-LRU handling

2022-03-14 Thread David Hildenbrand

On 10.03.22 18:26, Alex Sierra wrote:
> DEVICE_COHERENT pages introduce a subtle distinction in the way
> "normal" pages can be used by various callers throughout the kernel.
> They behave like normal pages for purposes of mapping in CPU page
> tables, and for COW. But they do not support LRU lists, NUMA
> migration or THP. Therefore we split vm_normal_page into two
> functions vm_normal_any_page and vm_normal_lru_page. The latter will
> only return pages that can be put on an LRU list and that support
> NUMA migration, KSM and THP.
> 
> We also introduced a FOLL_LRU flag that adds the same behaviour to
> follow_page and related APIs, to allow callers to specify that they
> expect to put pages on an LRU list.
> 

I still don't see the need for s/vm_normal_page/vm_normal_any_page/. And
as this patch is dominated by that change, I'd suggest (again) to just
drop it as I don't see any value of that renaming. No specifier implies any.

The general idea of this change LGTM.

I wonder how this interacts with the actual DEVICE_COHERENT coherent
series. Is this a preparation? Should it be part of the DEVICE_COHERENT
series?

IOW, should this patch start with

"With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
device-managed anonymous pages that are not LRU pages. Although they
behave like normal pages for purposes of mapping in CPU page, and for
COW, they do not support LRU lists, NUMA migration or THP. [...]"

But then, I'm confused by patch 2 and 3, because it feels more like we'd
already have DEVICE_COHERENT then ("hmm_is_coherent_type").

-- 
Thanks,

David / dhildenb

[PATCH 00/30] fix typos in comments

2022-03-14 Thread Julia Lawall

Various spelling mistakes in comments.
Detected with the help of Coccinelle.

---

 drivers/base/devres.c   |4 ++--
 drivers/clk/qcom/gcc-sm6125.c   |2 +-
 drivers/clk/ti/clkctrl.c|2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |4 ++--
 drivers/gpu/drm/amd/display/dc/bios/command_table.c |6 +++---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c  |2 +-
 drivers/gpu/drm/bridge/analogix/analogix_dp_core.c  |4 ++--
 drivers/gpu/drm/sti/sti_gdp.c   |2 +-
 drivers/infiniband/hw/qib/qib_iba7220.c |4 ++--
 drivers/leds/leds-pca963x.c |2 +-
 drivers/media/i2c/ov5695.c  |2 +-
 drivers/mfd/rohm-bd9576.c   |2 +-
 drivers/mtd/ubi/block.c |2 +-
 drivers/net/can/usb/ucan.c  |4 ++--
 drivers/net/ethernet/packetengines/yellowfin.c  |2 +-
 drivers/net/wireless/ath/ath6kl/htc_mbox.c  |2 +-
 drivers/net/wireless/cisco/airo.c   |2 +-
 drivers/net/wireless/mediatek/mt76/mt7915/init.c|2 +-
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c |6 +++---
 drivers/platform/x86/uv_sysfs.c |2 +-
 drivers/s390/crypto/pkey_api.c  |2 +-
 drivers/scsi/aic7xxx/aicasm/aicasm.c|2 +-
 drivers/scsi/elx/libefc_sli/sli4.c  |2 +-
 drivers/scsi/lpfc/lpfc_mbox.c   |2 +-
 drivers/scsi/qla2xxx/qla_gs.c   |2 +-
 drivers/spi/spi-sun4i.c |2 +-
 drivers/staging/rtl8723bs/core/rtw_mlme.c   |2 +-
 drivers/usb/gadget/udc/snps_udc_core.c  |2 +-
 fs/kernfs/file.c|2 +-
 kernel/events/core.c|2 +-
 30 files changed, 39 insertions(+), 39 deletions(-)

[PATCH 29/30] drm/amdgpu: fix typos in comments

2022-03-14 Thread Julia Lawall

Various spelling mistakes in comments.
Detected with the help of Coccinelle.

Signed-off-by: Julia Lawall 

---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index fe660a8e150f..970b065e9a6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -340,7 +340,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
s64 min_us;
 
-   /* Be more aggresive on dGPUs. Try to fill a portion of free
+   /* Be more aggressive on dGPUs. Try to fill a portion of free
 * VRAM now.
 */
if (!(adev->flags & AMD_IS_APU))
@@ -1280,7 +1280,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
continue;
 
/*
-* Work around dma_resv shortcommings by wrapping up the
+* Work around dma_resv shortcomings by wrapping up the
 * submission in a dma_fence_chain and add it as exclusive
 * fence.
 */

Re: [PATCH 1/5] dyndbg: fix static_branch manipulation

2022-03-14 Thread Jason Baron




On 3/10/22 23:47, Jim Cromie wrote:
> In 
> https://urldefense.com/v3/__https://lore.kernel.org/lkml/20211209150910.ga23...@axis.com/__;!!GjvTz_vk!HGKKoni4RVdEBgv_V0zPSNSX428bpf02zkCy2WbeQkBdVtp1QJqGX-lJYlRDGg$
>  
> 
> Vincent's patch commented on, and worked around, a bug toggling
> static_branch's, when a 2nd PRINTK-ish flag was added.  The bug
> results in a premature static_branch_disable when the 1st of 2 flags
> was disabled.
> 
> The cited commit computed newflags, but then in the JUMP_LABEL block,
> failed to use that result, instead using just one of the terms in it.
> Using newflags instead made the code work properly.
> 
> This is Vincents test-case, reduced.  It needs the 2nd flag to work
> properly, but it's explanatory here.
> 
> pt_test() {
> echo 5 > /sys/module/dynamic_debug/verbose
> 
> site="module tcp" # just one callsite
> echo " $site =_ " > /proc/dynamic_debug/control # clear it
> 
> # A B ~A ~B
> for flg in +T +p "-T #broke here" -p; do
>   echo " $site $flg " > /proc/dynamic_debug/control
> done;
> 
> # A B ~B ~A
> for flg in +T +p "-p #broke here" -T; do
>   echo " $site $flg " > /proc/dynamic_debug/control
> done
> }
> pt_test
> 
> Fixes: 84da83a6ffc0 dyndbg: combine flags & mask into a struct, simplify with 
> it
> CC: vincent.whitchu...@axis.com
> Signed-off-by: Jim Cromie 
> 
> --
> .drop @stable, no exposed bug.
> ---
>  lib/dynamic_debug.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
> index dd7f56af9aed..a56c1286ffa4 100644
> --- a/lib/dynamic_debug.c
> +++ b/lib/dynamic_debug.c
> @@ -211,10 +211,11 @@ static int ddebug_change(const struct ddebug_query 
> *query,
>   continue;
>  #ifdef CONFIG_JUMP_LABEL
>   if (dp->flags & _DPRINTK_FLAGS_PRINT) {
> - if (!(modifiers->flags & _DPRINTK_FLAGS_PRINT))
> + if (!(newflags & _DPRINTK_FLAGS_PRINT))
>   
> static_branch_disable(>key.dd_key_true);
> - } else if (modifiers->flags & _DPRINTK_FLAGS_PRINT)
> + } else if (newflags & _DPRINTK_FLAGS_PRINT) {
>   static_branch_enable(>key.dd_key_true);
> + }
>  #endif
>   dp->flags = newflags;
>   v4pr_info("changed %s:%d [%s]%s =%s\n",



Hi Jim,

If iiuc this is currently a bug but could be if we add a second 'print' bit
such as for printing to the tracing logs. That said I agree that using 
'newflags'
here makes the code more straightforward/readable. So this one is fine with
me.

Acked-by: Jason Baron 

Thanks,

-Jason

RE: [PATCH 3/3] drm/amdkfd: add RAS poison consumption support for utcl2

2022-03-14 Thread Zhang, Hawking

Never mind Tao. I guess you just want to leverage client_id to differentiate 
sdma int source from the default, right? Then might consider to explicitly call 
out the UTCL2_FAULT source.

Regards,
Hawking

-Original Message-
From: amd-gfx  On Behalf Of Zhang, 
Hawking
Sent: Monday, March 14, 2022 18:40
To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org; Yang, 
Stanley ; Chai, Thomas ; Kuehling, 
Felix 
Subject: RE: [PATCH 3/3] drm/amdkfd: add RAS poison consumption support for 
utcl2

[AMD Official Use Only]

Copy Felix

@@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
break;
case SOC15_INTSRC_SDMA_ECC:
default:
+   if (client_id == SOC15_IH_CLIENTID_UTCL2)
+   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
break;
}

This will break SDMA - We haven't enabled optimized poison consumption handling 
for sdma yet.

I'd suggest we explicitly call out the interrupt source id UTCL2_FAULT as a 
case, even it is the same as VM_FAULT. And it should be fine to start 
evict_queue directly after that because in ISR it already guarantee this is 
from UTCL2 client, right?

+   if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+   dev->kfd2kgd->is_ras_utcl2_poison &&
+   dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) {
+   event_interrupt_poison_consumption(dev, ih_ring_entry);


In addition, is_ras_utcl2_poison can be renamed to query_utcl2_ras_status or 
poison_status, while utcl2_fault_clear to reset_utlc2_poison_status to align 
with naming style of ras hw op.

Thinking about this more, it's better we add this in gfx ras op, and expose to 
KFD. Thoughts?

Regards,
Hawking

-Original Message-
From: Zhou1, Tao  
Sent: Monday, March 14, 2022 15:03
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Yang, Stanley ; Chai, Thomas 
Cc: Zhou1, Tao 
Subject: [PATCH 3/3] drm/amdkfd: add RAS poison consumption support for utcl2

Do RAS page retirement and use gpu reset as fallback in utcl2 fault handler.

Signed-off-by: Tao Zhou 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 23 ---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index f7def0bf0730..3991f71d865b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,11 +93,12 @@ enum SQ_INTERRUPT_ERROR_TYPE {  static void 
event_interrupt_poison_consumption(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
 {
-   uint16_t source_id, pasid;
+   uint16_t source_id, client_id, pasid;
int ret = -EINVAL;
struct kfd_process *p;
 
source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+   client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
 
p = kfd_lookup_process_by_pasid(pasid);
@@ -110,6 +111,7 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
return;
}
 
+   pr_debug("RAS poison consumption handling\n");
atomic_set(>poison, 1);
kfd_unref_process(p);
 
@@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
break;
case SOC15_INTSRC_SDMA_ECC:
default:
+   if (client_id == SOC15_IH_CLIENTID_UTCL2)
+   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
break;
}
 
-   kfd_signal_poison_consumed_event(dev, pasid);
+   /* utcl2 page fault has its own vm fault event */
+   if (client_id != SOC15_IH_CLIENTID_UTCL2)
+   kfd_signal_poison_consumed_event(dev, pasid);
 
/* resetting queue passes, do page retirement without gpu reset
 * resetting queue fails, fallback to gpu reset solution @@ -314,7 
+320,18 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
info.prot_write = ring_id & 0x20;
 
kfd_smi_event_update_vmfault(dev, pasid);
-   kfd_dqm_evict_pasid(dev->dqm, pasid);
+
+   if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+   dev->kfd2kgd->is_ras_utcl2_poison &&
+   dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) {
+   event_interrupt_poison_consumption(dev, ih_ring_entry);
+
+   if (dev->kfd2kgd->utcl2_fault_clear)
+   dev->kfd2kgd->utcl2_fault_clear(dev->adev);
+   }
+   else
+   kfd_dqm_evict_pasid(dev->dqm, pasid);
+
kfd_signal_vm_fault_event(dev, pasid, );
}
 }
--
2.35.1

RE: [PATCH 3/3] drm/amdkfd: add RAS poison consumption support for utcl2

2022-03-14 Thread Zhang, Hawking

[AMD Official Use Only]

Copy Felix

@@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
break;
case SOC15_INTSRC_SDMA_ECC:
default:
+   if (client_id == SOC15_IH_CLIENTID_UTCL2)
+   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
break;
}

This will break SDMA - We haven't enabled optimized poison consumption handling 
for sdma yet.

I'd suggest we explicitly call out the interrupt source id UTCL2_FAULT as a 
case, even it is the same as VM_FAULT. And it should be fine to start 
evict_queue directly after that because in ISR it already guarantee this is 
from UTCL2 client, right?

+   if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+   dev->kfd2kgd->is_ras_utcl2_poison &&
+   dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) {
+   event_interrupt_poison_consumption(dev, ih_ring_entry);


In addition, is_ras_utcl2_poison can be renamed to query_utcl2_ras_status or 
poison_status, while utcl2_fault_clear to reset_utlc2_poison_status to align 
with naming style of ras hw op.

Thinking about this more, it's better we add this in gfx ras op, and expose to 
KFD. Thoughts?

Regards,
Hawking

-Original Message-
From: Zhou1, Tao  
Sent: Monday, March 14, 2022 15:03
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Yang, Stanley ; Chai, Thomas 
Cc: Zhou1, Tao 
Subject: [PATCH 3/3] drm/amdkfd: add RAS poison consumption support for utcl2

Do RAS page retirement and use gpu reset as fallback in utcl2 fault handler.

Signed-off-by: Tao Zhou 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 23 ---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index f7def0bf0730..3991f71d865b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,11 +93,12 @@ enum SQ_INTERRUPT_ERROR_TYPE {  static void 
event_interrupt_poison_consumption(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
 {
-   uint16_t source_id, pasid;
+   uint16_t source_id, client_id, pasid;
int ret = -EINVAL;
struct kfd_process *p;
 
source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+   client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
 
p = kfd_lookup_process_by_pasid(pasid);
@@ -110,6 +111,7 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
return;
}
 
+   pr_debug("RAS poison consumption handling\n");
atomic_set(>poison, 1);
kfd_unref_process(p);
 
@@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
break;
case SOC15_INTSRC_SDMA_ECC:
default:
+   if (client_id == SOC15_IH_CLIENTID_UTCL2)
+   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
break;
}
 
-   kfd_signal_poison_consumed_event(dev, pasid);
+   /* utcl2 page fault has its own vm fault event */
+   if (client_id != SOC15_IH_CLIENTID_UTCL2)
+   kfd_signal_poison_consumed_event(dev, pasid);
 
/* resetting queue passes, do page retirement without gpu reset
 * resetting queue fails, fallback to gpu reset solution @@ -314,7 
+320,18 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
info.prot_write = ring_id & 0x20;
 
kfd_smi_event_update_vmfault(dev, pasid);
-   kfd_dqm_evict_pasid(dev->dqm, pasid);
+
+   if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+   dev->kfd2kgd->is_ras_utcl2_poison &&
+   dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) {
+   event_interrupt_poison_consumption(dev, ih_ring_entry);
+
+   if (dev->kfd2kgd->utcl2_fault_clear)
+   dev->kfd2kgd->utcl2_fault_clear(dev->adev);
+   }
+   else
+   kfd_dqm_evict_pasid(dev->dqm, pasid);
+
kfd_signal_vm_fault_event(dev, pasid, );
}
 }
--
2.35.1

Re: [PATCH 3/3] drm/amdkfd: add RAS poison consumption support for utcl2

2022-03-14 Thread Lazar, Lijo





On 3/14/2022 12:33 PM, Tao Zhou wrote:

Do RAS page retirement and use gpu reset as fallback in utcl2
fault handler.

Signed-off-by: Tao Zhou 
---
  .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 23 ---
  1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index f7def0bf0730..3991f71d865b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,11 +93,12 @@ enum SQ_INTERRUPT_ERROR_TYPE {
  static void event_interrupt_poison_consumption(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
  {
-   uint16_t source_id, pasid;
+   uint16_t source_id, client_id, pasid;
int ret = -EINVAL;
struct kfd_process *p;
  
  	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);

+   client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
  
  	p = kfd_lookup_process_by_pasid(pasid);

@@ -110,6 +111,7 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
return;
}
  
+	pr_debug("RAS poison consumption handling\n");


dev is available through kfd_dev.


atomic_set(>poison, 1);
kfd_unref_process(p);
  
@@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,

break;
case SOC15_INTSRC_SDMA_ECC:
default:
+   if (client_id == SOC15_IH_CLIENTID_UTCL2)
+   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);


Since this doesn't logically belong to the switch condition, better to 
keep it outside of switch.



break;
}
  
-	kfd_signal_poison_consumed_event(dev, pasid);

+   /* utcl2 page fault has its own vm fault event */
+   if (client_id != SOC15_IH_CLIENTID_UTCL2)
+   kfd_signal_poison_consumed_event(dev, pasid);
  
  	/* resetting queue passes, do page retirement without gpu reset

 * resetting queue fails, fallback to gpu reset solution
@@ -314,7 +320,18 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
info.prot_write = ring_id & 0x20;
  
  		kfd_smi_event_update_vmfault(dev, pasid);

-   kfd_dqm_evict_pasid(dev->dqm, pasid);
+
+   if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+   dev->kfd2kgd->is_ras_utcl2_poison &&
+   dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) {
+   event_interrupt_poison_consumption(dev, ih_ring_entry);
+
Is it expected that no other interrupt would come until this FED error 
is cleared? Otherwise subsequent ones could also be treated as poison.


Basically, whether to do this or not?
1) Clear FED
2) Handle poison consumption


Thanks,
Lijo


+   if (dev->kfd2kgd->utcl2_fault_clear)
+   dev->kfd2kgd->utcl2_fault_clear(dev->adev);
+   }
+   else
+   kfd_dqm_evict_pasid(dev->dqm, pasid);
+
kfd_signal_vm_fault_event(dev, pasid, );
}
  }

Re: [PATCH 2/3] drm/amdgpu: add utcl2 RAS poison functions for Aldebaran

2022-03-14 Thread Lazar, Lijo





On 3/14/2022 12:33 PM, Tao Zhou wrote:

Add help functions to check and clear RAS utcl2 poison status.

Signed-off-by: Tao Zhou 
---
  .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 28 ++-
  .../gpu/drm/amd/include/kgd_kfd_interface.h   |  3 ++
  2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index c8935d718207..ebd7d36d099b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -23,6 +23,30 @@
  #include "amdgpu_amdkfd.h"
  #include "amdgpu_amdkfd_arcturus.h"
  #include "amdgpu_amdkfd_gfx_v9.h"
+#include "soc15.h"
+#include "gc/gc_9_4_2_sh_mask.h"
+
+static bool kgd_aldebaran_is_ras_utcl2_poison(struct amdgpu_device *adev,
+   uint16_t client_id)
+{
+   uint32_t status = 0;
+   struct amdgpu_vmhub *hub;
+
+   if (client_id != SOC15_IH_CLIENTID_UTCL2)
+   return false;
+


Status check is not related to interrupt. Is IH client id needed here?

Thanks,
Lijo


+   hub = >vmhub[AMDGPU_GFXHUB_0];
+   status = RREG32(hub->vm_l2_pro_fault_status);
+   return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+}
+
+static void kgd_aldebaran_utcl2_fault_clear(struct amdgpu_device *adev)
+{
+   struct amdgpu_vmhub *hub = >vmhub[AMDGPU_GFXHUB_0];
+
+   hub = >vmhub[AMDGPU_GFXHUB_0];
+   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+}
  
  const struct kfd2kgd_calls aldebaran_kfd2kgd = {

.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
@@ -41,5 +65,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_atc_vmid_pasid_mapping_info =
kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
.set_vm_context_page_table_base = 
kgd_gfx_v9_set_vm_context_page_table_base,
-   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
+   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
+   .is_ras_utcl2_poison = kgd_aldebaran_is_ras_utcl2_poison,
+   .utcl2_fault_clear = kgd_aldebaran_utcl2_fault_clear
  };
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h 
b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 2f60cf35a444..78400479193e 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -291,6 +291,9 @@ struct kfd2kgd_calls {
int *wave_cnt, int *max_waves_per_cu);
void (*program_trap_handler_settings)(struct amdgpu_device *adev,
uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
+   bool (*is_ras_utcl2_poison)(struct amdgpu_device *adev,
+   uint16_t client_id);
+   void (*utcl2_fault_clear)(struct amdgpu_device *adev);
  };
  
  #endif	/* KGD_KFD_INTERFACE_H_INCLUDED */

Re: [PATCH] drm/amdgpu: reject secure submission on rings which don't support it

2022-03-14 Thread Christian König


Am 14.03.22 um 03:46 schrieb Lang Yu:

Only ring GFX, SDMA and VCN_DEC support secure submission at the moment.

Signed-off-by: Lang Yu 


it would be nicer if we have that as flag in ring->funcs, but that way 
works for now as well.


Reviewed-by: Christian König 

Thanks,
Christian.


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c   | 4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 8 
  2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index bc1297dcdf97..840304691b92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -166,8 +166,8 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned 
num_ibs,
}
  
  	if ((ib->flags & AMDGPU_IB_FLAGS_SECURE) &&

-   (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)) {
-   dev_err(adev->dev, "secure submissions not supported on compute 
rings\n");
+   !amdgpu_ring_secure_submission_supported(ring)) {
+   dev_err(adev->dev, "secure submissions not supported on ring 
<%s>\n", ring->name);
return -EINVAL;
}
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index a8bed1b47899..3afe3d60e194 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -363,6 +363,14 @@ static inline void amdgpu_ring_write_multiple(struct 
amdgpu_ring *ring,
ring->count_dw -= count_dw;
  }
  
+static inline

+bool amdgpu_ring_secure_submission_supported(struct amdgpu_ring *ring)
+{
+   return (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+   ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
+   ring->funcs->type == AMDGPU_RING_TYPE_VCN_DEC);
+}
+
  int amdgpu_ring_test_helper(struct amdgpu_ring *ring);
  
  void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,

Re: [PATCH v3] drm/amdgpu: add workarounds for VCN TMZ issue on CHIP_RAVEN

2022-03-14 Thread Christian König


Am 14.03.22 um 03:45 schrieb Lang Yu:

It is a hardware issue that VCN can't handle a GTT
backing stored TMZ buffer on CHIP_RAVEN series ASIC.

Move such a TMZ buffer to VRAM domain before command
submission as a wrokaround.

v2:
  - Use patch_cs_in_place callback.

v3:
  - Bail out early if unsecure IBs.

Suggested-by: Christian König 
Signed-off-by: Lang Yu 


Reviewed-by: Christian König 

But I would ask Leo to give his ok as well.

Thanks,
Christian.


---
  drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c | 71 +++
  1 file changed, 71 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
index 7bbb9ba6b80b..1ac9e06d3a4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
@@ -24,6 +24,7 @@
  #include 
  
  #include "amdgpu.h"

+#include "amdgpu_cs.h"
  #include "amdgpu_vcn.h"
  #include "amdgpu_pm.h"
  #include "soc15.h"
@@ -1905,6 +1906,75 @@ static const struct amd_ip_funcs vcn_v1_0_ip_funcs = {
.set_powergating_state = vcn_v1_0_set_powergating_state,
  };
  
+/*

+ * It is a hardware issue that VCN can't handle a GTT TMZ buffer on
+ * CHIP_RAVEN series ASIC. Move such a GTT TMZ buffer to VRAM domain
+ * before command submission as a workaround.
+ */
+static int vcn_v1_0_validate_bo(struct amdgpu_cs_parser *parser,
+   struct amdgpu_job *job,
+   uint64_t addr)
+{
+   struct ttm_operation_ctx ctx = { false, false };
+   struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
+   struct amdgpu_vm *vm = >vm;
+   struct amdgpu_bo_va_mapping *mapping;
+   struct amdgpu_bo *bo;
+   int r;
+
+   addr &= AMDGPU_GMC_HOLE_MASK;
+   if (addr & 0x7) {
+   DRM_ERROR("VCN messages must be 8 byte aligned!\n");
+   return -EINVAL;
+   }
+
+   mapping = amdgpu_vm_bo_lookup_mapping(vm, addr/AMDGPU_GPU_PAGE_SIZE);
+   if (!mapping || !mapping->bo_va || !mapping->bo_va->base.bo)
+   return -EINVAL;
+
+   bo = mapping->bo_va->base.bo;
+   if (!(bo->flags & AMDGPU_GEM_CREATE_ENCRYPTED))
+   return 0;
+
+   amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);
+   r = ttm_bo_validate(>tbo, >placement, );
+   if (r) {
+   DRM_ERROR("Failed validating the VCN message BO (%d)!\n", r);
+   return r;
+   }
+
+   return r;
+}
+
+static int vcn_v1_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p,
+  struct amdgpu_job *job,
+  struct amdgpu_ib *ib)
+{
+   uint32_t msg_lo = 0, msg_hi = 0;
+   int i, r;
+
+   if (!(ib->flags & AMDGPU_IB_FLAGS_SECURE))
+   return 0;
+
+   for (i = 0; i < ib->length_dw; i += 2) {
+   uint32_t reg = amdgpu_ib_get_value(ib, i);
+   uint32_t val = amdgpu_ib_get_value(ib, i + 1);
+
+   if (reg == PACKET0(p->adev->vcn.internal.data0, 0)) {
+   msg_lo = val;
+   } else if (reg == PACKET0(p->adev->vcn.internal.data1, 0)) {
+   msg_hi = val;
+   } else if (reg == PACKET0(p->adev->vcn.internal.cmd, 0)) {
+   r = vcn_v1_0_validate_bo(p, job,
+((u64)msg_hi) << 32 | msg_lo);
+   if (r)
+   return r;
+   }
+   }
+
+   return 0;
+}
+
  static const struct amdgpu_ring_funcs vcn_v1_0_dec_ring_vm_funcs = {
.type = AMDGPU_RING_TYPE_VCN_DEC,
.align_mask = 0xf,
@@ -1914,6 +1984,7 @@ static const struct amdgpu_ring_funcs 
vcn_v1_0_dec_ring_vm_funcs = {
.get_rptr = vcn_v1_0_dec_ring_get_rptr,
.get_wptr = vcn_v1_0_dec_ring_get_wptr,
.set_wptr = vcn_v1_0_dec_ring_set_wptr,
+   .patch_cs_in_place = vcn_v1_0_ring_patch_cs_in_place,
.emit_frame_size =
6 + 6 + /* hdp invalidate / flush */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 6 +

[PATCH 2/3] drm/amdgpu: add utcl2 RAS poison functions for Aldebaran

2022-03-14 Thread Tao Zhou

Add help functions to check and clear RAS utcl2 poison status.

Signed-off-by: Tao Zhou 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 28 ++-
 .../gpu/drm/amd/include/kgd_kfd_interface.h   |  3 ++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index c8935d718207..ebd7d36d099b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -23,6 +23,30 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
 #include "amdgpu_amdkfd_gfx_v9.h"
+#include "soc15.h"
+#include "gc/gc_9_4_2_sh_mask.h"
+
+static bool kgd_aldebaran_is_ras_utcl2_poison(struct amdgpu_device *adev,
+   uint16_t client_id)
+{
+   uint32_t status = 0;
+   struct amdgpu_vmhub *hub;
+
+   if (client_id != SOC15_IH_CLIENTID_UTCL2)
+   return false;
+
+   hub = >vmhub[AMDGPU_GFXHUB_0];
+   status = RREG32(hub->vm_l2_pro_fault_status);
+   return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+}
+
+static void kgd_aldebaran_utcl2_fault_clear(struct amdgpu_device *adev)
+{
+   struct amdgpu_vmhub *hub = >vmhub[AMDGPU_GFXHUB_0];
+
+   hub = >vmhub[AMDGPU_GFXHUB_0];
+   WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+}
 
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
@@ -41,5 +65,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_atc_vmid_pasid_mapping_info =
kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
.set_vm_context_page_table_base = 
kgd_gfx_v9_set_vm_context_page_table_base,
-   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
+   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
+   .is_ras_utcl2_poison = kgd_aldebaran_is_ras_utcl2_poison,
+   .utcl2_fault_clear = kgd_aldebaran_utcl2_fault_clear
 };
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h 
b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 2f60cf35a444..78400479193e 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -291,6 +291,9 @@ struct kfd2kgd_calls {
int *wave_cnt, int *max_waves_per_cu);
void (*program_trap_handler_settings)(struct amdgpu_device *adev,
uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
+   bool (*is_ras_utcl2_poison)(struct amdgpu_device *adev,
+   uint16_t client_id);
+   void (*utcl2_fault_clear)(struct amdgpu_device *adev);
 };
 
 #endif /* KGD_KFD_INTERFACE_H_INCLUDED */
-- 
2.35.1

[PATCH 3/3] drm/amdkfd: add RAS poison consumption support for utcl2

2022-03-14 Thread Tao Zhou

Do RAS page retirement and use gpu reset as fallback in utcl2
fault handler.

Signed-off-by: Tao Zhou 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 23 ---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index f7def0bf0730..3991f71d865b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,11 +93,12 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 static void event_interrupt_poison_consumption(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
 {
-   uint16_t source_id, pasid;
+   uint16_t source_id, client_id, pasid;
int ret = -EINVAL;
struct kfd_process *p;
 
source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+   client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
 
p = kfd_lookup_process_by_pasid(pasid);
@@ -110,6 +111,7 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
return;
}
 
+   pr_debug("RAS poison consumption handling\n");
atomic_set(>poison, 1);
kfd_unref_process(p);
 
@@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct 
kfd_dev *dev,
break;
case SOC15_INTSRC_SDMA_ECC:
default:
+   if (client_id == SOC15_IH_CLIENTID_UTCL2)
+   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
break;
}
 
-   kfd_signal_poison_consumed_event(dev, pasid);
+   /* utcl2 page fault has its own vm fault event */
+   if (client_id != SOC15_IH_CLIENTID_UTCL2)
+   kfd_signal_poison_consumed_event(dev, pasid);
 
/* resetting queue passes, do page retirement without gpu reset
 * resetting queue fails, fallback to gpu reset solution
@@ -314,7 +320,18 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
info.prot_write = ring_id & 0x20;
 
kfd_smi_event_update_vmfault(dev, pasid);
-   kfd_dqm_evict_pasid(dev->dqm, pasid);
+
+   if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+   dev->kfd2kgd->is_ras_utcl2_poison &&
+   dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) {
+   event_interrupt_poison_consumption(dev, ih_ring_entry);
+
+   if (dev->kfd2kgd->utcl2_fault_clear)
+   dev->kfd2kgd->utcl2_fault_clear(dev->adev);
+   }
+   else
+   kfd_dqm_evict_pasid(dev->dqm, pasid);
+
kfd_signal_vm_fault_event(dev, pasid, );
}
 }
-- 
2.35.1

[PATCH 1/3] drm/amdkfd: update parameter for event_interrupt_poison_consumption

2022-03-14 Thread Tao Zhou

Other parameters can be gotten from ih_ring_entry, so only inputting
ih_ring_entry is enough.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 7eedbcd14828..f7def0bf0730 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -91,11 +91,16 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
 
 static void event_interrupt_poison_consumption(struct kfd_dev *dev,
-   uint16_t pasid, uint16_t source_id)
+   const uint32_t *ih_ring_entry)
 {
+   uint16_t source_id, pasid;
int ret = -EINVAL;
-   struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+   struct kfd_process *p;
 
+   source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+   pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+
+   p = kfd_lookup_process_by_pasid(pasid);
if (!p)
return;
 
@@ -270,7 +275,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
sq_intr_err);
if (sq_intr_err != 
SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != 
SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-   event_interrupt_poison_consumption(dev, 
pasid, source_id);
+   event_interrupt_poison_consumption(dev, 
ih_ring_entry);
return;
}
break;
@@ -291,7 +296,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
kfd_signal_event_interrupt(pasid, context_id0 & 
0xfff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-   event_interrupt_poison_consumption(dev, pasid, 
source_id);
+   event_interrupt_poison_consumption(dev, ih_ring_entry);
return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||
-- 
2.35.1

Re: [PATCH] amd/display: set backlight only if required

2022-03-14 Thread S, Shirish




On 3/11/2022 9:11 PM, Harry Wentland wrote:


On 3/11/22 10:33, Shirish S wrote:

[Why]
comparing pwm bl values (coverted) with user brightness(converted)
levels in commit_tail leads to continuous setting of backlight via dmub
as they don't to match.

Why do the values not match?


Here is a sample of values:

dmub_abm_get_current_backlight() reads backlight value as 11526 => 
convert_to_user() as 45.


user_brightness value to be set at this point is 159 => 
convert_from_user() gives 40863.


Now, we are continuously comparing 45 (current backlight) with 159 (to 
be set from user space) in every commit tail till any actual changes 
happen to brightness.


Ideally, current brightness/backlight value read from pwm register, when 
converted should yield 159 but it returns 45.


Hence, I believe, there's a bug either in conversion back and forth of 
user space levels or pwm register is not the right way to arrive at 
current brightness values.



  It looks like the value mismatch
is our root cause.
Yes, apparently I could not find any other register read that could bail 
us out here and provide actual/proper values, hence this patch.

I remember a while back looking at an issue
where we the readback was from DMCU while we were setting BL
directly via PWM. I wonder if the opposite is happening now.

See this for the previous fix:
2bf3d62dabcc drm/amd/display: Get backlight from PWM if DMCU is not initialized


The sample values mentioned above are with this patch applied.

Is there a better way of reading current backlight levels, that reflect 
user space ones?




This leads overdrive in queuing of commands to DMCU that sometimes lead
to depending on load on DMCU fw:

"[drm:dc_dmub_srv_wait_idle] *ERROR* Error waiting for DMUB idle: status=3"

[How]
Store last successfully set backlight value and compare with it instead
of pwm reads which is not what we should compare with.


Does BL work reliably after S3 or S4 with your change? I wonder if
there are use-cases that might break because we're no longer comparing
against the actual BL value but against a stored variable.
I've verified this patch for boot, S0i3 and GUI method of changing 
brightness on ChromeOS



Signed-off-by: Shirish S 
---
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 ---
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 6 ++
  2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index df0980ff9a63..2b8337e47861 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -3972,7 +3972,7 @@ static u32 convert_brightness_to_user(const struct 
amdgpu_dm_backlight_caps *cap
 max - min);
  }
  
-static int amdgpu_dm_backlight_set_level(struct amdgpu_display_manager *dm,

+static void amdgpu_dm_backlight_set_level(struct amdgpu_display_manager *dm,
 int bl_idx,
 u32 user_brightness)
  {
@@ -4003,7 +4003,8 @@ static int amdgpu_dm_backlight_set_level(struct 
amdgpu_display_manager *dm,
DRM_DEBUG("DM: Failed to update backlight on 
eDP[%d]\n", bl_idx);
}
  
-	return rc ? 0 : 1;

+   if (rc)
+   dm->actual_brightness[bl_idx] = user_brightness;
  }
  
  static int amdgpu_dm_backlight_update_status(struct backlight_device *bd)

@@ -9944,7 +9945,7 @@ static void amdgpu_dm_atomic_commit_tail(struct 
drm_atomic_state *state)
/* restore the backlight level */
for (i = 0; i < dm->num_of_edps; i++) {
if (dm->backlight_dev[i] &&
-   (amdgpu_dm_backlight_get_level(dm, i) != dm->brightness[i]))
+   (dm->actual_brightness[i] != dm->brightness[i]))
amdgpu_dm_backlight_set_level(dm, i, dm->brightness[i]);
}
  #endif
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 372f9adf091a..321279bc877b 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -540,6 +540,12 @@ struct amdgpu_display_manager {
 * cached backlight values.
 */
u32 brightness[AMDGPU_DM_MAX_NUM_EDP];
+   /**
+* @actual_brightness:

"actual" seems misleading here. We might want to call this
"last" or something along those lines.

But let's first see if we can fix the mismatch of BL reads
and writes.


Yes, lets thoroughly evaluate if there is any other way.

Regards,

Shirish S



Harry


+*
+* last successfully applied backlight values.
+*/
+   u32 actual_brightness[AMDGPU_DM_MAX_NUM_EDP];
  };
  
  enum dsc_clock_force_state {

RE: [PATCH] drm/amdgpu: fixed the warnings reported by kernel test robot

2022-03-14 Thread Zhou1, Tao

[AMD Official Use Only]

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Chai, Thomas 
> Sent: Monday, March 14, 2022 1:52 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking
> ; Zhou1, Tao ; Clements,
> John ; Chai, Thomas 
> Subject: [PATCH] drm/amdgpu: fixed the warnings reported by kernel test robot
> 
> The reported warnings are as follows:
>   1.warning:no-previous-prototype-for-amdgpu_hdp_ras_fini.
>   2.warning:no-previous-prototype-for-amdgpu_mmhub_ras_fini.
> 
> Amdgpu_hdp_ras_fini and amdgpu_mmhub_ras_fini are unused in the code,
> they are the only functions in amdgpu_hdp.c and amdgpu_mmhub.c. After
> removing these two functions, both amdgpu_hdp.c and amdgpu_mmhub.c are
> empty, so these two files can be deleted to fix the warning.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile   |  4 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c   | 30 ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c | 30 ---
>  3 files changed, 2 insertions(+), 62 deletions(-)  delete mode 100644
> drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
>  delete mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 5dfe08cb045e..40e2c6e2df79 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -53,11 +53,11 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
>   amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
>   amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o
> amdgpu_virt.o \
>   amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
> - amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o \
> + amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o \
>   amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
>   amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o
> amdgpu_nbio.o \
>   amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o
> amdgpu_rap.o \
> - amdgpu_fw_attestation.o amdgpu_securedisplay.o amdgpu_hdp.o \
> + amdgpu_fw_attestation.o amdgpu_securedisplay.o \
>   amdgpu_eeprom.o amdgpu_mca.o
> 
>  amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
> deleted file mode 100644
> index 3f3d92e16c2e..
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
> +++ /dev/null
> @@ -1,30 +0,0 @@
> -/*
> - * Copyright 2021 Advanced Micro Devices, Inc.
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice shall be included in
> - * all copies or substantial portions of the Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> USE OR
> - * OTHER DEALINGS IN THE SOFTWARE.
> - *
> - */
> -
> -#include "amdgpu.h"
> -#include "amdgpu_ras.h"
> -
> -void amdgpu_hdp_ras_fini(struct amdgpu_device *adev, struct ras_common_if
> *ras_block) -{
> -
> -}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
> deleted file mode 100644
> index 8f2fa247d605..
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
> +++ /dev/null
> @@ -1,30 +0,0 @@
> -/*
> - * Copyright 2019 Advanced Micro Devices, Inc.
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice shall be included in
> - * all copies or substantial portions of the Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> - *

47 matches

Mail list logo