Re: [PATCH] drm/i915/guc: Fix missing enable of Wa_14019159160 on ARL
On 8/9/2024 2:06 AM, john.c.harri...@intel.com wrote: From: John Harrison The previous update to enable the workaround on ARL only changed two out of three places where the w/a needs to be enabled. That meant the GuC side was operational but not the KMD side. And as the KMD side is the trigger, it meant the w/a was not actually active. So fix that. Fixes: 104bcfae57d8 ("drm/i915/arl: Enable Wa_14019159160 for ARL") Cc: John Harrison Cc: Vinay Belgaumkar Cc: Daniele Ceraolo Spurio Cc: Andi Shyti Cc: Lucas De Marchi Cc: Rodrigo Vivi Cc: Matt Roper Cc: Jonathan Cavitt Cc: Nirmoy Das Cc: Shuicheng Lin Signed-off-by: John Harrison Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 9400d0eb682b2..3e1c3bc56daf2 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -4506,7 +4506,7 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine) /* Wa_16019325821 */ /* Wa_14019159160 */ if ((engine->class == COMPUTE_CLASS || engine->class == RENDER_CLASS) && - IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 71))) + IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74))) engine->flags |= I915_ENGINE_USES_WA_HOLD_SWITCHOUT; /*
Re: [PATCH 1/2] drm/i915/gem: Do not look for the exact address in node
On 8/7/2024 12:05 PM, Andi Shyti wrote: In preparation for the upcoming partial memory mapping feature, we want to make sure that when looking for a node we consider also the offset and not just the starting address of the virtual memory node. Signed-off-by: Andi Shyti Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index cac6d4184506..d3ee8ef7ea2f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -1071,9 +1071,9 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) rcu_read_lock(); drm_vma_offset_lock_lookup(dev->vma_offset_manager); - node = drm_vma_offset_exact_lookup_locked(dev->vma_offset_manager, - vma->vm_pgoff, - vma_pages(vma)); + node = drm_vma_offset_lookup_locked(dev->vma_offset_manager, + vma->vm_pgoff, + vma_pages(vma)); if (node && drm_vma_node_is_allowed(node, priv)) { /* * Skip 0-refcnted objects as it is in the process of being
Re: [PATCH 2/2] drm/i915/gem: Calculate object page offset for partial memory mapping
On 8/7/2024 12:05 PM, Andi Shyti wrote: To enable partial memory mapping of GPU virtual memory, it's necessary to introduce an offset to the object's memory (obj->mm.pages) scatterlist. This adjustment compensates for instances when userspace mappings do not start from the beginning of the object. Based on a patch by Chris Wilson. Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Lionel Landwerlin Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 +++- drivers/gpu/drm/i915/i915_mm.c | 12 +++- drivers/gpu/drm/i915/i915_mm.h | 3 ++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index d3ee8ef7ea2f..bb00af317d59 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -252,6 +252,7 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf) struct vm_area_struct *area = vmf->vma; struct i915_mmap_offset *mmo = area->vm_private_data; struct drm_i915_gem_object *obj = mmo->obj; + unsigned long obj_offset; resource_size_t iomap; int err; @@ -273,10 +274,11 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf) iomap -= obj->mm.region->region.start; } + obj_offset = area->vm_pgoff - drm_vma_node_start(&mmo->vma_node); /* PTEs are revoked in obj->ops->put_pages() */ err = remap_io_sg(area, area->vm_start, area->vm_end - area->vm_start, - obj->mm.pages->sgl, iomap); + obj->mm.pages->sgl, obj_offset, iomap); if (area->vm_flags & VM_WRITE) { GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c index 7998bc74ab49..f5c97a620962 100644 --- a/drivers/gpu/drm/i915/i915_mm.c +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -122,13 +122,15 @@ int remap_io_mapping(struct vm_area_struct *vma, * @addr: target user address to start at * @size: size of map area * @sgl: Start sg entry + * @offset: offset from the start of the page * @iobase: Use stored dma 
address offset by this address or pfn if -1 * * Note: this is only safe if the mm semaphore is held when called. */ int remap_io_sg(struct vm_area_struct *vma, unsigned long addr, unsigned long size, - struct scatterlist *sgl, resource_size_t iobase) + struct scatterlist *sgl, unsigned long offset, + resource_size_t iobase) { struct remap_pfn r = { .mm = vma->vm_mm, @@ -141,6 +143,14 @@ int remap_io_sg(struct vm_area_struct *vma, /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ GEM_BUG_ON((vma->vm_flags & EXPECTED_FLAGS) != EXPECTED_FLAGS); + while (offset >= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT) { + offset -= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT; + r.sgt = __sgt_iter(__sg_next(r.sgt.sgp), use_dma(iobase)); + if (!r.sgt.sgp) + return -EINVAL; + } + r.sgt.curr = offset << PAGE_SHIFT; + if (!use_dma(iobase)) flush_cache_range(vma, addr, size); diff --git a/drivers/gpu/drm/i915/i915_mm.h b/drivers/gpu/drm/i915/i915_mm.h index 04c8974d822b..69f9351b1a1c 100644 --- a/drivers/gpu/drm/i915/i915_mm.h +++ b/drivers/gpu/drm/i915/i915_mm.h @@ -30,6 +30,7 @@ int remap_io_mapping(struct vm_area_struct *vma, int remap_io_sg(struct vm_area_struct *vma, unsigned long addr, unsigned long size, - struct scatterlist *sgl, resource_size_t iobase); + struct scatterlist *sgl, unsigned long offset, + resource_size_t iobase); #endif /* __I915_MM_H__ */
Re: [PATCH] drm/i915: Allow NULL memory region
On 7/12/2024 11:41 PM, Jonathan Cavitt wrote: Prevent a NULL pointer access in intel_memory_regions_hw_probe. Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning") Reported-by: Dan Carpenter Signed-off-by: Jonathan Cavitt Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11704 Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/intel_memory_region.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 172dfa7c3588b..d40ee1b42110a 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private *i915) goto out_cleanup; } - mem->id = i; - i915->mm.regions[i] = mem; + if (mem) { /* Skip on non-fatal errors */ + mem->id = i; + i915->mm.regions[i] = mem; + } } for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) {
Re: [PATCH] drm/i915: Allow NULL memory region
On 7/17/2024 5:25 PM, Dan Carpenter wrote: On Wed, Jul 17, 2024 at 05:05:55PM +0200, Nirmoy Das wrote: On 7/12/2024 11:41 PM, Jonathan Cavitt wrote: Prevent a NULL pointer access in intel_memory_regions_hw_probe. Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning") Reported-by: Dan Carpenter Signed-off-by: Jonathan Cavitt --- drivers/gpu/drm/i915/intel_memory_region.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 172dfa7c3588b..d40ee1b42110a 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private *i915) goto out_cleanup; } - mem->id = i; - i915->mm.regions[i] = mem; There is a check for mem just before that. You could use IS_ERR_OR_NULL(mem) instead of IS_ERR(). An error pointer return is normally completely different from a NULL return in how it's handled. intel_memory_regions_driver_release() skipped my eyes in the cleanup path. Here NULL is a special kind of success. I wrote a blog about this. https://staticthinking.wordpress.com/2022/08/01/mixing-error-pointers-and-null/ I am the perfect target audience for this blog post :) Thanks, Nirmoy regards, dan carpenter
Re: [PATCH] drm/i915: Allow NULL memory region
On 7/17/2024 5:30 PM, Cavitt, Jonathan wrote: -Original Message- From: Nirmoy Das Sent: Wednesday, July 17, 2024 8:22 AM To: Cavitt, Jonathan ; intel-gfx@lists.freedesktop.org Cc: Gupta, saurabhg ; dan.carpen...@linaro.org; chris.p.wil...@linux.intel.com; Andi Shyti Subject: Re: [PATCH] drm/i915: Allow NULL memory region On 7/17/2024 5:11 PM, Cavitt, Jonathan wrote: -Original Message- From: Nirmoy Das Sent: Wednesday, July 17, 2024 8:06 AM To: Cavitt, Jonathan ; intel-gfx@lists.freedesktop.org Cc: Gupta, saurabhg ; dan.carpen...@linaro.org; chris.p.wil...@linux.intel.com; Andi Shyti Subject: Re: [PATCH] drm/i915: Allow NULL memory region On 7/12/2024 11:41 PM, Jonathan Cavitt wrote: Prevent a NULL pointer access in intel_memory_regions_hw_probe. Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning") Reported-by: Dan Carpenter Signed-off-by: Jonathan Cavitt --- drivers/gpu/drm/i915/intel_memory_region.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 172dfa7c3588b..d40ee1b42110a 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private *i915) goto out_cleanup; } - mem->id = i; - i915->mm.regions[i] = mem; There is a check for mem just before that. You could use IS_ERR_OR_NULL(mem) instead of IS_ERR(). I think you're referring to the "goto out_cleanup" path? Yes. mem being NULL is a valid use case, so we shouldn't take the error path when it's observed. Not an error path if you return expected/correct value. intel_memory_regions_driver_release releases all previously grabbed memory regions in the out_cleanup path. Ah, yes. Isn't so simple as I thought. Never mind ignore my previous comment. 
-Jonathan Cavitt You could do diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 172dfa7c3588..41ef7fdfa69b 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -362,9 +362,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private *i915) if (IS_ERR(mem)) { err = PTR_ERR(mem); - drm_err(&i915->drm, - "Failed to setup region(%d) type=%d\n", - err, type); + if (err) + drm_err(&i915->drm, + "Failed to setup region(%d) type=%d\n", + err, type); goto out_cleanup; } PTR_ERR(NULL) should be 0, I think, and we could even add an info message saying that we are skipping the setup of that region. Regards, Nirmoy -Jonathan Cavitt Regards, Nirmoy + if (mem) { /* Skip on non-fatal errors */ + mem->id = i; + i915->mm.regions[i] = mem; + } } for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) {
Re: [PATCH] drm/i915: Allow NULL memory region
On 7/17/2024 5:11 PM, Cavitt, Jonathan wrote: -Original Message- From: Nirmoy Das Sent: Wednesday, July 17, 2024 8:06 AM To: Cavitt, Jonathan ; intel-gfx@lists.freedesktop.org Cc: Gupta, saurabhg ; dan.carpen...@linaro.org; chris.p.wil...@linux.intel.com; Andi Shyti Subject: Re: [PATCH] drm/i915: Allow NULL memory region On 7/12/2024 11:41 PM, Jonathan Cavitt wrote: Prevent a NULL pointer access in intel_memory_regions_hw_probe. Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning") Reported-by: Dan Carpenter Signed-off-by: Jonathan Cavitt --- drivers/gpu/drm/i915/intel_memory_region.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 172dfa7c3588b..d40ee1b42110a 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private *i915) goto out_cleanup; } - mem->id = i; - i915->mm.regions[i] = mem; There is a check for mem just before that. You could use IS_ERR_OR_NULL(mem) instead of IS_ERR(). I think you're referring to the "goto out_cleanup" path? Yes. mem being NULL is a valid use case, so we shouldn't take the error path when it's observed. Not an error path if you return expected/correct value. 
You could do diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 172dfa7c3588..41ef7fdfa69b 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -362,9 +362,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private *i915) if (IS_ERR(mem)) { err = PTR_ERR(mem); - drm_err(&i915->drm, - "Failed to setup region(%d) type=%d\n", - err, type); + if (err) + drm_err(&i915->drm, + "Failed to setup region(%d) type=%d\n", + err, type); goto out_cleanup; } PTR_ERR(NULL) should be 0, I think, and we could even add an info message saying that we are skipping the setup of that region. Regards, Nirmoy -Jonathan Cavitt Regards, Nirmoy + if (mem) { /* Skip on non-fatal errors */ + mem->id = i; + i915->mm.regions[i] = mem; + } } for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) {
Re: [PATCH] drm/i915: Allow NULL memory region
On 7/12/2024 11:41 PM, Jonathan Cavitt wrote: Prevent a NULL pointer access in intel_memory_regions_hw_probe. Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning") Reported-by: Dan Carpenter Signed-off-by: Jonathan Cavitt --- drivers/gpu/drm/i915/intel_memory_region.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 172dfa7c3588b..d40ee1b42110a 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private *i915) goto out_cleanup; } - mem->id = i; - i915->mm.regions[i] = mem; There is a check for mem just before that. You could use IS_ERR_OR_NULL(mem) instead of IS_ERR(). Regards, Nirmoy + if (mem) { /* Skip on non-fatal errors */ + mem->id = i; + i915->mm.regions[i] = mem; + } } for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) {
Re: [PATCH] drm/i915/gem: Suppress oom warning in favour of ENOMEM to userspace
Hi Andi, On 6/27/2024 12:04 PM, Andi Shyti wrote: Hi Nirmoy, On Wed, Jun 26, 2024 at 04:33:18PM +0200, Nirmoy Das wrote: We report object allocation failures to userspace with ENOMEM so add __GFP_NOWARN to remove superfluous oom warnings. I think this should be the default behavior. Yes, when drivers handle the ENOMEM situation, which is the case for i915/gem code. ENOMEM doesn't necessarily mean that there is a kernel failure. Most of the time we just run out of memory, deal with it :-) Reviewed-by: Andi Shyti Thanks! Thanks, Andi
Re: [PATCH] drm/i915/gem: Suppress oom warning in favour of ENOMEM to userspace
Hi Rodrigo, On 6/26/2024 5:50 PM, Rodrigo Vivi wrote: On Wed, Jun 26, 2024 at 05:36:43PM +0200, Nirmoy Das wrote: Hi Rodrigo, On 6/26/2024 5:24 PM, Rodrigo Vivi wrote: On Wed, Jun 26, 2024 at 04:33:18PM +0200, Nirmoy Das wrote: >We report object allocation failures to userspace with ENOMEM >so add __GFP_NOWARN to remove superfluous oom warnings. >Closes: [1]https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/4936 >Cc: Andi Shyti [2] >Signed-off-by: Nirmoy Das [3] >--- > drivers/gpu/drm/i915/i915_scatterlist.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) >diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c >index e93d2538f298..4d830740946d 100644 >--- a/drivers/gpu/drm/i915/i915_scatterlist.c >+++ b/drivers/gpu/drm/i915/i915_scatterlist.c >@@ -90,7 +90,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, > >GEM_BUG_ON(!max_segment); > >- rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL); >+ rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN); >if (!rsgt) >return ERR_PTR(-ENOMEM); is it really safe? I don't believe we can guarantee a good fallback plan here if allocation fails. __i915_refct_sgt_init might end up in a null dereference, no?! Kernel is now returning ENOMEM and also throwing a oom warning stack. With __GFP_NOWARN the oom warning stack won't be there in the dmesg but userspace will still get ENOMEM as expected. doh! I had missunderstand the flag. Thanks for the confirmation. Reviewed-by: Rodrigo Vivi BTW, what email clients are you using recently? Using the same client, Thunderbird. it is hard to parse your responses lately. Please check if it is really sending/replying as text-only mode. Thanks for notifying me. May be recent update changed some settings. I will check. Nirmoy Let me know if got your question correctly. 
Regards, Nirmoy > >@@ -104,7 +104,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, >} > >if (sg_alloc_table(st, DIV_ROUND_UP_ULL(node->size, segment_pages), >- GFP_KERNEL)) { >+ GFP_KERNEL | __GFP_NOWARN)) { >i915_refct_sgt_put(rsgt); >return ERR_PTR(-ENOMEM); >} >@@ -178,7 +178,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, >GEM_BUG_ON(list_empty(blocks)); >GEM_BUG_ON(!max_segment); > >- rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL); >+ rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN); >if (!rsgt) >return ERR_PTR(-ENOMEM); > >@@ -190,7 +190,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, >return ERR_PTR(-E2BIG); >} > >- if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL)) { >+ if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL | __GFP_NOWARN)) { >i915_refct_sgt_put(rsgt); >return ERR_PTR(-ENOMEM); >} >-- >2.42.0 References Visible links 1. https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/4936 2. mailto:andi.sh...@linux.intel.com 3. mailto:nirmoy@intel.com
Re: [PATCH] drm/i915/gem: Suppress oom warning in favour of ENOMEM to userspace
Hi Rodrigo, On 6/26/2024 5:24 PM, Rodrigo Vivi wrote: On Wed, Jun 26, 2024 at 04:33:18PM +0200, Nirmoy Das wrote: We report object allocation failures to userspace with ENOMEM so add __GFP_NOWARN to remove superfluous oom warnings. Closes:https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/4936 Cc: Andi Shyti Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/i915_scatterlist.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c index e93d2538f298..4d830740946d 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.c +++ b/drivers/gpu/drm/i915/i915_scatterlist.c @@ -90,7 +90,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, GEM_BUG_ON(!max_segment); - rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL); + rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN); if (!rsgt) return ERR_PTR(-ENOMEM); is it really safe? I don't believe we can guarantee a good fallback plan here if allocation fails. __i915_refct_sgt_init might end up in a null dereference, no?! Kernel is now returning ENOMEM and also throwing a oom warning stack. With __GFP_NOWARN the oom warning stack won't be there in the dmesg but userspace will still get ENOMEM as expected. Let me know if got your question correctly. 
Regards, Nirmoy @@ -104,7 +104,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, } if (sg_alloc_table(st, DIV_ROUND_UP_ULL(node->size, segment_pages), - GFP_KERNEL)) { + GFP_KERNEL | __GFP_NOWARN)) { i915_refct_sgt_put(rsgt); return ERR_PTR(-ENOMEM); } @@ -178,7 +178,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, GEM_BUG_ON(list_empty(blocks)); GEM_BUG_ON(!max_segment); - rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL); + rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN); if (!rsgt) return ERR_PTR(-ENOMEM); @@ -190,7 +190,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, return ERR_PTR(-E2BIG); } - if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL)) { + if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL | __GFP_NOWARN)) { i915_refct_sgt_put(rsgt); return ERR_PTR(-ENOMEM); } -- 2.42.0
[PATCH] drm/i915/gem: Suppress oom warning in favour of ENOMEM to userspace
We report object allocation failures to userspace with ENOMEM so add __GFP_NOWARN to remove superfluous oom warnings. Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/4936 Cc: Andi Shyti Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/i915_scatterlist.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c index e93d2538f298..4d830740946d 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.c +++ b/drivers/gpu/drm/i915/i915_scatterlist.c @@ -90,7 +90,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, GEM_BUG_ON(!max_segment); - rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL); + rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN); if (!rsgt) return ERR_PTR(-ENOMEM); @@ -104,7 +104,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, } if (sg_alloc_table(st, DIV_ROUND_UP_ULL(node->size, segment_pages), - GFP_KERNEL)) { + GFP_KERNEL | __GFP_NOWARN)) { i915_refct_sgt_put(rsgt); return ERR_PTR(-ENOMEM); } @@ -178,7 +178,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, GEM_BUG_ON(list_empty(blocks)); GEM_BUG_ON(!max_segment); - rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL); + rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN); if (!rsgt) return ERR_PTR(-ENOMEM); @@ -190,7 +190,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, return ERR_PTR(-E2BIG); } - if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL)) { + if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL | __GFP_NOWARN)) { i915_refct_sgt_put(rsgt); return ERR_PTR(-ENOMEM); } -- 2.42.0
Re: [PATCH] drm/i915/gt: debugfs: Evaluate forcewake usage within locks
On 6/11/2024 3:58 PM, Tvrtko Ursulin wrote: On 10/06/2024 10:24, Nirmoy Das wrote: Hi Andi, On 6/7/2024 4:51 PM, Andi Shyti wrote: The forcewake count and domains listing is multi process critical and the uncore provides a spinlock for such cases. Lock the forcewake evaluation section in the fw_domains_show() debugfs interface. Signed-off-by: Andi Shyti Needs a Fixes tag, below seems to be correct one. Fixes: 9dd4b065446a ("drm/i915/gt: Move pm debug files into a gt aware debugfs") Cc: # v5.6+ Reviewed-by: Nirmoy Das What is the back story here and why would it need backporting? IGT cares about the atomic view of user_forcewake_count and individual domains or what? There is no serious back story. This came from a static code analyzer report. I keep forgetting debugfs isn't mounted on production systems so we don't have to backport this patch. Thanks, Nirmoy Regards, Tvrtko Regards, Nirmoy --- drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c index 4fcba42cfe34..0437fd8217e0 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c @@ -71,6 +71,8 @@ static int fw_domains_show(struct seq_file *m, void *data) struct intel_uncore_forcewake_domain *fw_domain; unsigned int tmp; + spin_lock_irq(&uncore->lock); + seq_printf(m, "user.bypass_count = %u\n", uncore->user_forcewake_count); @@ -79,6 +81,8 @@ static int fw_domains_show(struct seq_file *m, void *data) intel_uncore_forcewake_domain_to_str(fw_domain->id), READ_ONCE(fw_domain->wake_count)); + spin_unlock_irq(&uncore->lock); + return 0; } DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(fw_domains);
Re: [PATCH] drm/i915/gt: debugfs: Evaluate forcewake usage within locks
Hi Andi, On 6/7/2024 4:51 PM, Andi Shyti wrote: The forcewake count and domains listing is multi process critical and the uncore provides a spinlock for such cases. Lock the forcewake evaluation section in the fw_domains_show() debugfs interface. Signed-off-by: Andi Shyti Needs a Fixes tag; the one below seems to be the correct one. Fixes: 9dd4b065446a ("drm/i915/gt: Move pm debug files into a gt aware debugfs") Cc: stable@vger.kernel.org # v5.6+ Reviewed-by: Nirmoy Das Regards, Nirmoy --- drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c index 4fcba42cfe34..0437fd8217e0 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c @@ -71,6 +71,8 @@ static int fw_domains_show(struct seq_file *m, void *data) struct intel_uncore_forcewake_domain *fw_domain; unsigned int tmp; + spin_lock_irq(&uncore->lock); + seq_printf(m, "user.bypass_count = %u\n", uncore->user_forcewake_count); @@ -79,6 +81,8 @@ static int fw_domains_show(struct seq_file *m, void *data) intel_uncore_forcewake_domain_to_str(fw_domain->id), READ_ONCE(fw_domain->wake_count)); + spin_unlock_irq(&uncore->lock); + return 0; } DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(fw_domains);
Re: [PATCH v2] drm/i915: Increase FLR timeout from 3s to 9s
On 5/24/2024 1:58 AM, Andi Shyti wrote: Following the guidelines it takes 3 seconds to perform an FLR reset. Let's give it a bit more slack because this time can change depending on the platform and on the firmware Signed-off-by: Andi Shyti Reviewed-by: Nirmoy Das --- Hi, In this second version I removed patch 2 that was ignoring the FLR reset timeouts, until we develop a proper patch. This first patch is basically the same as v1. Thanks Nirmoy for your review. Andi drivers/gpu/drm/i915/intel_uncore.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index 729409a4bada..2eba289d88ad 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -2614,11 +2614,18 @@ void intel_uncore_prune_engine_fw_domains(struct intel_uncore *uncore, static void driver_initiated_flr(struct intel_uncore *uncore) { struct drm_i915_private *i915 = uncore->i915; - const unsigned int flr_timeout_ms = 3000; /* specs recommend a 3s wait */ + unsigned int flr_timeout_ms; int ret; drm_dbg(&i915->drm, "Triggering Driver-FLR\n"); + /* +* The specification recommends a 3 seconds FLR reset timeout. To be +* cautious, we will extend this to 9 seconds, three times the specified +* timeout. +*/ + flr_timeout_ms = 9000; + /* * Make sure any pending FLR requests have cleared by waiting for the * FLR trigger bit to go to zero. Also clear GU_DEBUG's DRIVERFLR_STATUS
Re: [PATCH 2/2] drm/i915: Don't treat FLR resets as errors
Hi Andi, On 5/21/2024 12:56 PM, Andi Shyti wrote: Hi Nirmoy, On Fri, May 17, 2024 at 10:13:37PM +0200, Nirmoy Das wrote: Hi Andi, On 5/17/2024 9:34 PM, Andi Shyti wrote: Hi Nirmoy, On Fri, May 17, 2024 at 04:00:02PM +0200, Nirmoy Das wrote: On 5/17/2024 1:25 PM, Andi Shyti wrote: If we timeout while waiting for an FLR reset, there is nothing we can do and i915 doesn't have any control on it. In any case the system is still perfectly usable If a FLR reset fails then we will have a dead GPU, I don't think the GPU is usable without a cold reboot. fact is that the GPU keeps going and even though the timeout has expired, the system moves to the next phase. The current test might look like it is has passed, but if you look into the subsequent tests you can see a dead GPU: <7>[ 369.168121] pci :00:02.0: [drm:intel_uncore_fini_mmio [i915]] Triggering Driver-FLR <3>[ 372.170189] pci :00:02.0: [drm] *ERROR* Driver-FLR-teardown wait completion failed! -110 <7>[ 372.437630] [IGT] i915_selftest: finished subtest requests, SUCCESS <7>[ 372.438356] [IGT] i915_selftest: starting dynamic subtest migrate <5>[ 373.110580] Setting dangerous option live_selftests - tainting kernel <3>[ 373.183499] i915 :00:02.0: Unable to change power state from D0 to D0, device inaccessible <3>[ 373.246921] i915 :00:02.0: [drm] *ERROR* Unrecognized display IP version 1023.255; disabling display. 
<7>[ 373.247130] i915 :00:02.0: [drm:intel_step_init [i915]] Using future steppings <7>[ 373.247716] i915 :00:02.0: [drm:intel_step_init [i915]] Using future steppings <7>[ 373.248263] i915 :00:02.0: [drm:intel_step_init [i915]] Using future display steppings <7>[ 373.251843] i915 :00:02.0: [drm:intel_gt_common_init_early [i915]] WOPCM: 2048K <7>[ 373.252505] i915 :00:02.0: [drm:intel_uc_init_early [i915]] GT0: enable_guc=3 (guc:yes submission:yes huc:no slpc:yes) <7>[ 373.253140] i915 :00:02.0: [drm:intel_gt_probe_all [i915]] GT0: Setting up Primary GT <7>[ 373.253556] i915 :00:02.0: [drm:intel_gt_probe_all [i915]] GT1: Setting up Standalone Media GT <7>[ 373.253941] i915 :00:02.0: [drm:intel_gt_common_init_early [i915]] WOPCM: 2048K <7>[ 373.254365] i915 :00:02.0: [drm:intel_uc_init_early [i915]] GT1: enable_guc=3 (guc:yes submission:yes huc:yes slpc:yes) <3>[ 375.256235] i915 :00:02.0: [drm] *ERROR* Device is non-operational; MMIO access returns 0x! <3>[ 375.259089] i915 :00:02.0: Device initialization failed (-5) <3>[ 375.260521] i915 :00:02.0: probe with driver i915 failed with error -5 <7>[ 375.392209] [IGT] i915_selftest: finished subtest migrate, FAIL https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_14724/bat-arls-3/dmesg0.txt Are we sure this is dependent on the FLR reset? Yes, while on FLR read into memory will return either 0/F. There are cases when the FLR reset doesn't make any difference and in any case this error is completely ignored by the driver. This happens at very late with no recovery possible and hope is module reload works. Perhaps we can change it to a warning? I think it should be error. CI will still complain even on warning. This is a serious issue and should be report as an error. I think we need to create a HW ticket to understand why is FLR reset fails. Maybe it takes longer and longer to reset. We've been sending several patches in the latest years to fix the timings. 
HW spec says 3 sec but we can try increasing it a bit higher to try it out. We could go, then, with just patch 1 and see if it improves. Does it help? If it helps then we can go ahead with the increased timeout. Also because the FLR reset might also depend on the firmware. Possible. In that case, should we wait for a firmware fix? Regards, Nirmoy Thanks, Nirmoy, Andi
Re: [PATCH 2/2] drm/i915: Don't treat FLR resets as errors
Hi Andi, On 5/17/2024 9:34 PM, Andi Shyti wrote: Hi Nirmoy, On Fri, May 17, 2024 at 04:00:02PM +0200, Nirmoy Das wrote: On 5/17/2024 1:25 PM, Andi Shyti wrote: If we timeout while waiting for an FLR reset, there is nothing we can do and i915 doesn't have any control on it. In any case the system is still perfectly usable If a FLR reset fails then we will have a dead GPU, I don't think the GPU is usable without a cold reboot. fact is that the GPU keeps going and even though the timeout has expired, the system moves to the next phase. The current test might look like it is has passed, but if you look into the subsequent tests you can see a dead GPU: <7>[ 369.168121] pci :00:02.0: [drm:intel_uncore_fini_mmio [i915]] Triggering Driver-FLR *<3>[ 372.170189] pci :00:02.0: [drm] *ERROR* Driver-FLR-teardown wait completion failed! -110* *<7>[ 372.437630] [IGT] i915_selftest: finished subtest requests, SUCCESS* <7>[ 372.438356] [IGT] i915_selftest: starting dynamic subtest migrate <5>[ 373.110580] Setting dangerous option live_selftests - tainting kernel <3>[ 373.183499] i915 :00:02.0: Unable to change power state from D0 to D0, device inaccessible <3>[ 373.246921] i915 :00:02.0: [drm] *ERROR* Unrecognized display IP version 1023.255; disabling display. 
<7>[ 373.247130] i915 :00:02.0: [drm:intel_step_init [i915]] Using future steppings <7>[ 373.247716] i915 :00:02.0: [drm:intel_step_init [i915]] Using future steppings <7>[ 373.248263] i915 :00:02.0: [drm:intel_step_init [i915]] Using future display steppings <7>[ 373.251843] i915 :00:02.0: [drm:intel_gt_common_init_early [i915]] WOPCM: 2048K <7>[ 373.252505] i915 :00:02.0: [drm:intel_uc_init_early [i915]] GT0: enable_guc=3 (guc:yes submission:yes huc:no slpc:yes) <7>[ 373.253140] i915 :00:02.0: [drm:intel_gt_probe_all [i915]] GT0: Setting up Primary GT <7>[ 373.253556] i915 :00:02.0: [drm:intel_gt_probe_all [i915]] GT1: Setting up Standalone Media GT <7>[ 373.253941] i915 :00:02.0: [drm:intel_gt_common_init_early [i915]] WOPCM: 2048K <7>[ 373.254365] i915 :00:02.0: [drm:intel_uc_init_early [i915]] GT1: enable_guc=3 (guc:yes submission:yes huc:yes slpc:yes) *<3>[ 375.256235] i915 :00:02.0: [drm] *ERROR* Device is non-operational; MMIO access returns 0x!* <3>[ 375.259089] i915 :00:02.0: Device initialization failed (-5) <3>[ 375.260521] i915 :00:02.0: probe with driver i915 failed with error -5 <7>[ 375.392209] [IGT] i915_selftest: finished subtest migrate, FAIL https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_14724/bat-arls-3/dmesg0.txt This is a serious issue and should be report as an error. I think we need to create a HW ticket to understand why is FLR reset fails. Maybe it takes longer and longer to reset. We've been sending several patches in the latest years to fix the timings. HW spec says 3 sec but we can try increasing it bit higher to try it out. Regards, Nirmoy Andi
Re: [PATCH 2/2] drm/i915: Don't treat FLR resets as errors
Hi Andi, On 5/17/2024 1:25 PM, Andi Shyti wrote: If we timeout while waiting for an FLR reset, there is nothing we can do and i915 doesn't have any control on it. In any case the system is still perfectly usable If an FLR reset fails then we will have a dead GPU; I don't think the GPU is usable without a cold reboot. This is a serious issue and should be reported as an error. I think we need to create a HW ticket to understand why the FLR reset fails. Regards, Nirmoy and the function returns void. We don't need to be alarmed, therefore, print the timeout expiration as a debug message instead of an error. Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/10955 Signed-off-by: Andi Shyti --- drivers/gpu/drm/i915/intel_uncore.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index 2eba289d88ad..a3fa2ed91aae 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -2637,7 +2637,7 @@ static void driver_initiated_flr(struct intel_uncore *uncore) */ ret = intel_wait_for_register_fw(uncore, GU_CNTL, DRIVERFLR, 0, flr_timeout_ms); if (ret) { - drm_err(&i915->drm, + drm_dbg(&i915->drm, "Failed to wait for Driver-FLR bit to clear! %d\n", ret); return; @@ -2652,7 +2652,7 @@ static void driver_initiated_flr(struct intel_uncore *uncore) DRIVERFLR, 0, flr_timeout_ms); if (ret) { - drm_err(&i915->drm, "Driver-FLR-teardown wait completion failed! %d\n", ret); + drm_dbg(&i915->drm, "Driver-FLR-teardown wait completion failed! %d\n", ret); return; } @@ -2661,7 +2661,7 @@ static void driver_initiated_flr(struct intel_uncore *uncore) DRIVERFLR_STATUS, DRIVERFLR_STATUS, flr_timeout_ms); if (ret) { - drm_err(&i915->drm, "Driver-FLR-reinit wait completion failed! %d\n", ret); + drm_dbg(&i915->drm, "Driver-FLR-reinit wait completion failed! %d\n", ret); return; }
Re: [PATCH] drm/i915/selftests: Set always_coherent to false when reading from CPU
On 5/17/2024 1:53 PM, Jani Nikula wrote: On Fri, 17 May 2024, Nirmoy Das wrote: Hi Jani, On 5/17/2024 9:39 AM, Jani Nikula wrote: On Thu, 16 May 2024, Nirmoy Das wrote: The previous commit 'commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick "previous commit" is a fairly vague reference once this gets committed. It's not going to be "previous" in any meaningful sense. Please just start with: Commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.") was not complete... Will do that. And probably add: Fixes: 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.") Do we need Fixes for selftest ? I always assumed it is not required as this code is for debug/CI Maybe not for stuff that's already in stable, but we do run CI on drm-next and -rc kernels, and if this causes issues there, why not have them fixed? Not sure a commit with Fixes flows from drm-intel-next to drm-next/-rc but I see no issue adding Fixes without CC-ing to stable. Pushed it to drm-intel-next with above modifications. b4-shazam picked Fixes as well which was nice. Thanks, Nirmoy BR, Jani. Thanks, Nirmoy BR, Jani. correct caching mode.")' was not complete as for non LLC sharing platforms cpu read can happen from LLC which probably doesn't have the latest changes made by GPU. 
Cc: Andi Shyti Cc: Janusz Krzysztofik Cc: Jonathan Cavitt Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c index 65a931ea80e9..3527b8f446fe 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c @@ -196,7 +196,7 @@ static int verify_access(struct drm_i915_private *i915, if (err) goto out_file; - mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, true); + mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, false); vaddr = i915_gem_object_pin_map_unlocked(native_obj, mode); if (IS_ERR(vaddr)) { err = PTR_ERR(vaddr);
Re: [PATCH] drm/i915/selftests: Set always_coherent to false when reading from CPU
Hi Jani, On 5/17/2024 9:39 AM, Jani Nikula wrote: On Thu, 16 May 2024, Nirmoy Das wrote: The previous commit 'commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick "previous commit" is a fairly vague reference once this gets committed. It's not going to be "previous" in any meaningful sense. Please just start with: Commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.") was not complete... Will do that. And probably add: Fixes: 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.") Do we need Fixes for selftest ? I always assumed it is not required as this code is for debug/CI Thanks, Nirmoy BR, Jani. correct caching mode.")' was not complete as for non LLC sharing platforms cpu read can happen from LLC which probably doesn't have the latest changes made by GPU. Cc: Andi Shyti Cc: Janusz Krzysztofik Cc: Jonathan Cavitt Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c index 65a931ea80e9..3527b8f446fe 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c @@ -196,7 +196,7 @@ static int verify_access(struct drm_i915_private *i915, if (err) goto out_file; - mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, true); + mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, false); vaddr = i915_gem_object_pin_map_unlocked(native_obj, mode); if (IS_ERR(vaddr)) { err = PTR_ERR(vaddr);
[PATCH] drm/i915/selftests: Set always_coherent to false when reading from CPU
The previous commit 'commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.")' was not complete as for non LLC sharing platforms cpu read can happen from LLC which probably doesn't have the latest changes made by GPU. Cc: Andi Shyti Cc: Janusz Krzysztofik Cc: Jonathan Cavitt Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c index 65a931ea80e9..3527b8f446fe 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c @@ -196,7 +196,7 @@ static int verify_access(struct drm_i915_private *i915, if (err) goto out_file; - mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, true); + mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, false); vaddr = i915_gem_object_pin_map_unlocked(native_obj, mode); if (IS_ERR(vaddr)) { err = PTR_ERR(vaddr); -- 2.42.0
[PATCH] drm/i915: Use for_each_child instead of manual for-loop
Simplify child iteration using for_each_child macro instead of using manual for loop. There is no functional change. Cc: John Harrison Cc: Tvrtko Ursulin Signed-off-by: Nirmoy Das --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 64 ++- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 0eaa1064242c..7e88d90e935b 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1800,14 +1800,37 @@ __unwind_incomplete_requests(struct intel_context *ce) spin_unlock_irqrestore(&sched_engine->lock, flags); } -static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t stalled) +static void guc_reset_context_state(struct intel_context *ce, intel_engine_mask_t stalled) { - bool guilty; struct i915_request *rq; - unsigned long flags; + bool guilty = false; u32 head; - int i, number_children = ce->parallel.number_children; - struct intel_context *parent = ce; + + if (!intel_context_is_pinned(ce)) + return; + + rq = intel_context_get_active_request(ce); + if (!rq) { + head = ce->ring->tail; + goto out_replay; + } + + if (i915_request_started(rq)) + guilty = stalled & ce->engine->mask; + + GEM_BUG_ON(i915_active_is_idle(&ce->active)); + head = intel_ring_wrap(ce->ring, rq->head); + + __i915_request_reset(rq, guilty); + i915_request_put(rq); +out_replay: + guc_reset_state(ce, head, guilty); +} + +static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t stalled) +{ + struct intel_context *child; + unsigned long flags; GEM_BUG_ON(intel_context_is_child(ce)); @@ -1826,34 +1849,13 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st * For each context in the relationship find the hanging request * resetting each context / request as needed */ - for (i = 0; i < number_children + 1; ++i) { - if (!intel_context_is_pinned(ce)) - goto 
next_context; - - guilty = false; - rq = intel_context_get_active_request(ce); - if (!rq) { - head = ce->ring->tail; - goto out_replay; - } - - if (i915_request_started(rq)) - guilty = stalled & ce->engine->mask; - - GEM_BUG_ON(i915_active_is_idle(&ce->active)); - head = intel_ring_wrap(ce->ring, rq->head); - - __i915_request_reset(rq, guilty); - i915_request_put(rq); -out_replay: - guc_reset_state(ce, head, guilty); -next_context: - if (i != number_children) - ce = list_next_entry(ce, parallel.child_link); + guc_reset_context_state(ce, stalled); + for_each_child(ce, child) { + guc_reset_context_state(child, stalled); } - __unwind_incomplete_requests(parent); - intel_context_put(parent); + __unwind_incomplete_requests(ce); + intel_context_put(ce); } void wake_up_all_tlb_invalidate(struct intel_guc *guc) -- 2.42.0
Re: [PATCH] drm/i915: Correct error handler
On 5/11/2024 5:48 PM, Jiasheng Jiang wrote: Replace "slab_priorities" with "slab_dependencies" in the error handler to avoid a memory leak. Nice catch. I would make the subject more like: drm/i915: Fix memory leak by correcting cache object name in error handler Fixes: 32eb6bcfdda9 ("drm/i915: Make request allocation caches global") Also need Cc: <stable@vger.kernel.org> # v5.2+ With those: Reviewed-by: Nirmoy Das Nirmoy Signed-off-by: Jiasheng Jiang --- drivers/gpu/drm/i915/i915_scheduler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index 762127dd56c5..70a854557e6e 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -506,6 +506,6 @@ int __init i915_scheduler_module_init(void) return 0; err_priorities: - kmem_cache_destroy(slab_priorities); + kmem_cache_destroy(slab_dependencies); return -ENOMEM; }
Re: [PATCH] Revert "drm/i915: Remove extra multi-gt pm-references"
On 5/7/2024 7:10 PM, Rodrigo Vivi wrote: On Tue, May 07, 2024 at 10:54:11AM +0200, Janusz Krzysztofik wrote: On Tuesday, 7 May 2024 09:30:15 GMT+2 Nirmoy Das wrote: Hi Janusz, Just realized we need Fixes tag for this. Fixes: 1f33dc0c1189 ("drm/i915: Remove extra multi-gt pm-references") Whoever is going to push this patch, please feel free to add this tag. dim b4-shazam gets that automagically, now it was sent in reply ;) Nice! I just pushed the patch. thanks for the patch and reviews. Thanks, Nirmoy Thanks, Janusz Regards, Nirmoy On 5/6/2024 8:02 PM, Janusz Krzysztofik wrote: This reverts commit 1f33dc0c1189efb9ae19c6fc22b64dd3e26261fb. There was a patch supposed to fix an issue of illegal attempts to free a still active i915 VMA object when parking a GT believed to be idle, reported by CI on 2-GT Meteor Lake. As a solution, an extra wakeref for a Primary GT was acquired from i915_gem_do_execbuffer() -- see commit f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform"). However, that fix occurred insufficient -- the issue was still reported by CI. That wakeref was released on exit from i915_gem_do_execbuffer(), then potentially before completion of the request and deactivation of its associated VMAs. Moreover, CI reports indicated that single-GT platforms also suffered sporadically from the same race. Since that issue was fixed by another commit f3c71b2ded5c ("drm/i915/vma: Fix UAF on destroy against retire race"), the changes introduced by that insufficient fix were dropped as no longer useful. However, that series resulted in another VMA UAF scenario now being triggered in CI. <4> [260.290809] [ cut here ] <4> [260.290988] list_del corruption. prev->next should be 888118c5d990, but was 888118c5a510. (prev=888118c5a510) <4> [260.291004] WARNING: CPU: 2 PID: 1143 at lib/list_debug.c:62 __list_del_entry_valid_or_report+0xb7/0xe0 .. 
<4> [260.291055] CPU: 2 PID: 1143 Comm: kms_plane Not tainted 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1 <4> [260.291058] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [260.291060] RIP: 0010:__list_del_entry_valid_or_report+0xb7/0xe0 ... <4> [260.291087] Call Trace: <4> [260.291089] <4> [260.291124] i915_vma_reopen+0x43/0x80 [i915] <4> [260.291298] eb_lookup_vmas+0x9cb/0xcc0 [i915] <4> [260.291579] i915_gem_do_execbuffer+0xc9a/0x26d0 [i915] <4> [260.291883] i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915] ... <4> [260.292301] ... <4> [260.292506] ---[ end trace ]--- <4> [260.292782] general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6ca3: [#1] PREEMPT SMP NOPTI <4> [260.303575] CPU: 2 PID: 1143 Comm: kms_plane Tainted: GW 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1 <4> [260.313851] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [260.326359] RIP: 0010:eb_validate_vmas+0x114/0xd80 [i915] ... <4> [260.428756] Call Trace: <4> [260.431192] <4> [639.283393] i915_gem_do_execbuffer+0xd05/0x26d0 [i915] <4> [639.305245] i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915] ... <4> [639.411134] ... <4> [639.449979] ---[ end trace ]--- We defer actually closing, unbinding and destroying a VMA until next idle point, or until the object is freed in the meantime. By postponing the unbind, we allow for the VMA to be reopened by the client, avoiding the work required to rebind the VMA. Starting from commit b0647a5e79b1 ("drm/i915: Avoid live-lock with i915_vma_parked()"), we assume that as long as a GT is held idle, no VMA would be reopened while we destroy them. That assumption is no longer true in multi-GT configurations, where a VMA we reopen may be handled by a GT different from the one that we already keep active via its engine while we set up an execbuf request. 
Restoring the extra GT0 PM wakeref removed from i915_gem_do_execbuffer() processing path seems to fix this issue. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10608 Signed-off-by: Janusz Krzysztofik Cc: Rodrigo Vivi Cc: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 42619fc05de48..090724fa766c9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -255,6 +255,7 @@ struct i915_execbuffer { struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context *
Re: [PATCH] Revert "drm/i915: Remove extra multi-gt pm-references"
Hi Janusz, Just realized we need Fixes tag for this. Fixes: 1f33dc0c1189 ("drm/i915: Remove extra multi-gt pm-references") Regards, Nirmoy On 5/6/2024 8:02 PM, Janusz Krzysztofik wrote: This reverts commit 1f33dc0c1189efb9ae19c6fc22b64dd3e26261fb. There was a patch supposed to fix an issue of illegal attempts to free a still active i915 VMA object when parking a GT believed to be idle, reported by CI on 2-GT Meteor Lake. As a solution, an extra wakeref for a Primary GT was acquired from i915_gem_do_execbuffer() -- see commit f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform"). However, that fix occurred insufficient -- the issue was still reported by CI. That wakeref was released on exit from i915_gem_do_execbuffer(), then potentially before completion of the request and deactivation of its associated VMAs. Moreover, CI reports indicated that single-GT platforms also suffered sporadically from the same race. Since that issue was fixed by another commit f3c71b2ded5c ("drm/i915/vma: Fix UAF on destroy against retire race"), the changes introduced by that insufficient fix were dropped as no longer useful. However, that series resulted in another VMA UAF scenario now being triggered in CI. <4> [260.290809] [ cut here ] <4> [260.290988] list_del corruption. prev->next should be 888118c5d990, but was 888118c5a510. (prev=888118c5a510) <4> [260.291004] WARNING: CPU: 2 PID: 1143 at lib/list_debug.c:62 __list_del_entry_valid_or_report+0xb7/0xe0 .. <4> [260.291055] CPU: 2 PID: 1143 Comm: kms_plane Not tainted 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1 <4> [260.291058] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [260.291060] RIP: 0010:__list_del_entry_valid_or_report+0xb7/0xe0 ... 
<4> [260.291087] Call Trace: <4> [260.291089] <4> [260.291124] i915_vma_reopen+0x43/0x80 [i915] <4> [260.291298] eb_lookup_vmas+0x9cb/0xcc0 [i915] <4> [260.291579] i915_gem_do_execbuffer+0xc9a/0x26d0 [i915] <4> [260.291883] i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915] ... <4> [260.292301] ... <4> [260.292506] ---[ end trace ]--- <4> [260.292782] general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6ca3: [#1] PREEMPT SMP NOPTI <4> [260.303575] CPU: 2 PID: 1143 Comm: kms_plane Tainted: GW 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1 <4> [260.313851] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [260.326359] RIP: 0010:eb_validate_vmas+0x114/0xd80 [i915] ... <4> [260.428756] Call Trace: <4> [260.431192] <4> [639.283393] i915_gem_do_execbuffer+0xd05/0x26d0 [i915] <4> [639.305245] i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915] ... <4> [639.411134] ... <4> [639.449979] ---[ end trace ]--- We defer actually closing, unbinding and destroying a VMA until next idle point, or until the object is freed in the meantime. By postponing the unbind, we allow for the VMA to be reopened by the client, avoiding the work required to rebind the VMA. Starting from commit b0647a5e79b1 ("drm/i915: Avoid live-lock with i915_vma_parked()"), we assume that as long as a GT is held idle, no VMA would be reopened while we destroy them. That assumption is no longer true in multi-GT configurations, where a VMA we reopen may be handled by a GT different from the one that we already keep active via its engine while we set up an execbuf request. Restoring the extra GT0 PM wakeref removed from i915_gem_do_execbuffer() processing path seems to fix this issue. 
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10608 Signed-off-by: Janusz Krzysztofik Cc: Rodrigo Vivi Cc: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 42619fc05de48..090724fa766c9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -255,6 +255,7 @@ struct i915_execbuffer { struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */ intel_wakeref_t wakeref; + intel_wakeref_t wakeref_gt0; /** our requests to build */ struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; @@ -2685,6 +2686,7 @@ static int eb_select_engine(struct i915_execbuffer *eb) { struct intel_context *ce, *child; + struct intel_gt *gt; unsigned int idx; int err; @@ -2708,10 +2710,17 @@ e
Re: [PATCH] Revert "drm/i915: Remove extra multi-gt pm-references"
On 5/6/2024 8:02 PM, Janusz Krzysztofik wrote: This reverts commit 1f33dc0c1189efb9ae19c6fc22b64dd3e26261fb. There was a patch supposed to fix an issue of illegal attempts to free a still active i915 VMA object when parking a GT believed to be idle, reported by CI on 2-GT Meteor Lake. As a solution, an extra wakeref for a Primary GT was acquired from i915_gem_do_execbuffer() -- see commit f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform"). However, that fix occurred insufficient -- the issue was still reported by CI. That wakeref was released on exit from i915_gem_do_execbuffer(), then potentially before completion of the request and deactivation of its associated VMAs. Moreover, CI reports indicated that single-GT platforms also suffered sporadically from the same race. Since that issue was fixed by another commit f3c71b2ded5c ("drm/i915/vma: Fix UAF on destroy against retire race"), the changes introduced by that insufficient fix were dropped as no longer useful. However, that series resulted in another VMA UAF scenario now being triggered in CI. <4> [260.290809] [ cut here ] <4> [260.290988] list_del corruption. prev->next should be 888118c5d990, but was 888118c5a510. (prev=888118c5a510) <4> [260.291004] WARNING: CPU: 2 PID: 1143 at lib/list_debug.c:62 __list_del_entry_valid_or_report+0xb7/0xe0 .. <4> [260.291055] CPU: 2 PID: 1143 Comm: kms_plane Not tainted 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1 <4> [260.291058] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [260.291060] RIP: 0010:__list_del_entry_valid_or_report+0xb7/0xe0 ... <4> [260.291087] Call Trace: <4> [260.291089] <4> [260.291124] i915_vma_reopen+0x43/0x80 [i915] <4> [260.291298] eb_lookup_vmas+0x9cb/0xcc0 [i915] <4> [260.291579] i915_gem_do_execbuffer+0xc9a/0x26d0 [i915] <4> [260.291883] i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915] ... <4> [260.292301] ... 
<4> [260.292506] ---[ end trace ]--- <4> [260.292782] general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6ca3: [#1] PREEMPT SMP NOPTI <4> [260.303575] CPU: 2 PID: 1143 Comm: kms_plane Tainted: GW 6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1 <4> [260.313851] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [260.326359] RIP: 0010:eb_validate_vmas+0x114/0xd80 [i915] ... <4> [260.428756] Call Trace: <4> [260.431192] <4> [639.283393] i915_gem_do_execbuffer+0xd05/0x26d0 [i915] <4> [639.305245] i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915] ... <4> [639.411134] ... <4> [639.449979] ---[ end trace ]--- We defer actually closing, unbinding and destroying a VMA until next idle point, or until the object is freed in the meantime. By postponing the unbind, we allow for the VMA to be reopened by the client, avoiding the work required to rebind the VMA. Starting from commit b0647a5e79b1 ("drm/i915: Avoid live-lock with i915_vma_parked()"), we assume that as long as a GT is held idle, no VMA would be reopened while we destroy them. That assumption is no longer true in multi-GT configurations, where a VMA we reopen may be handled by a GT different from the one that we already keep active via its engine while we set up an execbuf request. Restoring the extra GT0 PM wakeref removed from i915_gem_do_execbuffer() processing path seems to fix this issue. 
Closes:https://gitlab.freedesktop.org/drm/intel/-/issues/10608 Signed-off-by: Janusz Krzysztofik Cc: Rodrigo Vivi Cc: Nirmoy Das Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 42619fc05de48..090724fa766c9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -255,6 +255,7 @@ struct i915_execbuffer { struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */ intel_wakeref_t wakeref; + intel_wakeref_t wakeref_gt0; /** our requests to build */ struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; @@ -2685,6 +2686,7 @@ static int eb_select_engine(struct i915_execbuffer *eb) { struct intel_context *ce, *child; + struct intel_gt *gt; unsigned int idx; int err; @@ -2708,10 +2710,17 @@ eb_select_engine(struct i915_execbuffer *eb) } } eb->num_batches = ce->parallel.number_children + 1; + gt = ce-&g
Re: [PATCH v3] drm/i915/vma: Fix UAF on reopen vs destroy race
may help with this one, which started appearing after I reverted that workaround. However, its effectiveness is limited to MTL topology. perhaps the safer path for this case indeed. something that could be really limited to a single platform would be better. I agree with Rodrigo here. It would be safer to revert the mentioned patch now and think about a more robust solution later on, as the issue is affecting current users. Regards, Nirmoy But I confess that I don't have other better suggestions. If we need to go with this patch as a quick solution, it is apparently better than leaving the bug there as is. +Thomas. any good thoughts there or advices? Thanks, Rodrigo. Thanks, Janusz Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10608 Signed-off-by: Janusz Krzysztofik Cc: Chris Wilson Cc: Tvrtko Ursulin Cc: sta...@vger.kernel.org # v6.0+ --- .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 10 -- drivers/gpu/drm/i915/i915_vma.c | 32 +++ drivers/gpu/drm/i915/i915_vma.h | 2 +- drivers/gpu/drm/i915/i915_vma_types.h | 3 ++ 4 files changed, 37 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 42619fc05de48..97e014f94002e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -847,9 +847,12 @@ static int __eb_add_lut(struct i915_execbuffer *eb, if (unlikely(!lut)) return -ENOMEM; + if (!i915_vma_open(vma)) { + err = -EEXIST; /* let eb_vma_lookup() retry */ + goto err_lut_free; + } + i915_vma_get(vma); - if (!atomic_fetch_inc(&vma->open_count)) - i915_vma_reopen(vma); lut->handle = handle; lut->ctx = ctx; @@ -880,8 +883,9 @@ static int __eb_add_lut(struct i915_execbuffer *eb, return 0; err: - i915_vma_close(vma); i915_vma_put(vma); + i915_vma_close(vma); +err_lut_free: i915_lut_handle_free(lut); return err; } diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index d2f064d2525cc..4435c76f28c8c 
100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -1735,14 +1735,33 @@ static void __i915_vma_remove_closed(struct i915_vma *vma) list_del_init(&vma->closed_link); } -void i915_vma_reopen(struct i915_vma *vma) +static struct i915_vma *i915_vma_reopen(struct i915_vma *vma) +{ + if (atomic_read(&vma->flags) & I915_VMA_PARKED) + return NULL; + + __i915_vma_remove_closed(vma); + return vma; +} + +struct i915_vma *i915_vma_open(struct i915_vma *vma) { struct intel_gt *gt = vma->vm->gt; + if (atomic_inc_not_zero(&vma->open_count)) + return vma; + spin_lock_irq(&gt->closed_lock); - if (i915_vma_is_closed(vma)) - __i915_vma_remove_closed(vma); + if (!atomic_inc_not_zero(&vma->open_count)) { + if (i915_vma_is_closed(vma)) + vma = i915_vma_reopen(vma); + + if (vma) + atomic_inc(&vma->open_count); + } spin_unlock_irq(&gt->closed_lock); + + return vma; } static void force_unbind(struct i915_vma *vma) @@ -1770,7 +1789,8 @@ static void release_references(struct i915_vma *vma, struct intel_gt *gt, spin_unlock(&obj->vma.lock); spin_lock_irq(&gt->closed_lock); - __i915_vma_remove_closed(vma); + if (!(atomic_read(&vma->flags) & I915_VMA_PARKED)) + __i915_vma_remove_closed(vma); spin_unlock_irq(&gt->closed_lock); if (vm_ddestroy) @@ -1854,22 +1874,22 @@ void i915_vma_parked(struct intel_gt *gt) } list_move(&vma->closed_link, &closed); + atomic_or(I915_VMA_PARKED, &vma->flags); } spin_unlock_irq(&gt->closed_lock); - /* As the GT is held idle, no vma can be reopened as we destroy them */ list_for_each_entry_safe(vma, next, &closed, closed_link) { struct drm_i915_gem_object *obj = vma->obj; struct i915_address_space *vm = vma->vm; if (i915_gem_object_trylock(obj, NULL)) { - INIT_LIST_HEAD(&vma->closed_link); i915_vma_destroy(vma); i915_gem_object_unlock(obj); } else { /* back you go.. */ spin_lock_irq(&gt->closed_lock); list_add(&vma->closed_link, &gt->closed_vma); + atomic_andnot(I915_VMA_PARKED, &vma->flags); spin_unlock_irq(&gt->closed_lock);
Re: [PATCH] drm/i915/gt: Disarm breadcrumbs if engines are already idle
On 4/23/2024 6:23 PM, Janusz Krzysztofik wrote: From: Chris Wilson The breadcrumbs use a GT wakeref for guarding the interrupt, but are disarmed during release of the engine wakeref. This leaves a hole where we may attach a breadcrumb just as the engine is parking (after it has parked its breadcrumbs), execute the irq worker with some signalers still attached, but never be woken again. That issue manifests itself in CI with IGT runner timeouts while tests are waiting indefinitely for release of all GT wakerefs. <6> [209.151778] i915: Running live_engine_pm_selftests/live_engine_busy_stats <7> [209.231628] i915 :00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_5 <7> [209.231816] i915 :00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_4 <7> [209.231944] i915 :00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_3 <7> [209.232056] i915 :00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_2 <7> [209.232166] i915 :00:02.0: [drm:intel_power_well_disable [i915]] disabling DC_off <7> [209.232270] i915 :00:02.0: [drm:skl_enable_dc6 [i915]] Enabling DC6 <7> [209.232368] i915 :00:02.0: [drm:gen9_set_dc_state.part.0 [i915]] Setting DC state from 00 to 02 <4> [299.356116] [IGT] Inactivity timeout exceeded. Killing the current test with SIGQUIT. ... <6> [299.356526] sysrq: Show State ... <6> [299.373964] task:i915_selftest state:D stack:11784 pid:5578 tgid:5578 ppid:873flags:0x4002 <6> [299.373967] Call Trace: <6> [299.373968] <6> [299.373970] __schedule+0x3bb/0xda0 <6> [299.373974] schedule+0x41/0x110 <6> [299.373976] intel_wakeref_wait_for_idle+0x82/0x100 [i915] <6> [299.374083] ? __pfx_var_wake_function+0x10/0x10 <6> [299.374087] live_engine_busy_stats+0x9b/0x500 [i915] <6> [299.374173] __i915_subtests+0xbe/0x240 [i915] <6> [299.374277] ? __pfx___intel_gt_live_setup+0x10/0x10 [i915] <6> [299.374369] ? 
__pfx___intel_gt_live_teardown+0x10/0x10 [i915] <6> [299.374456] intel_engine_live_selftests+0x1c/0x30 [i915] <6> [299.374547] __run_selftests+0xbb/0x190 [i915] <6> [299.374635] i915_live_selftests+0x4b/0x90 [i915] <6> [299.374717] i915_pci_probe+0x10d/0x210 [i915] At the end of the interrupt worker, if there are no more engines awake, disarm the breadcrumb and go to sleep. Fixes: 9d5612ca165a ("drm/i915/gt: Defer enabling the breadcrumb interrupt to after submission") Closes: https://gitlab.freedesktop.org/drm/intel/issues/10026 Signed-off-by: Chris Wilson Cc: Andrzej Hajda Cc: # v5.12+ Signed-off-by: Janusz Krzysztofik Acked-by: Nirmoy Das I will let others/Andrzej r-b this as I am not very familiar with the code. Thanks, Nirmoy --- drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 15 +++ 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index d650beb8ed22f..20b9b04ec1e0b 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -263,8 +263,13 @@ static void signal_irq_work(struct irq_work *work) i915_request_put(rq); } + /* Lazy irq enabling after HW submission */ if (!READ_ONCE(b->irq_armed) && !list_empty(&b->signalers)) intel_breadcrumbs_arm_irq(b); + + /* And confirm that we still want irqs enabled before we yield */ + if (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) + intel_breadcrumbs_disarm_irq(b); } struct intel_breadcrumbs * @@ -315,13 +320,7 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) return; /* Kick the work once more to drain the signalers, and disarm the irq */ - irq_work_sync(&b->irq_work); - while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { - local_irq_disable(); - signal_irq_work(&b->irq_work); - local_irq_enable(); - cond_resched(); - } + irq_work_queue(&b->irq_work); } void intel_breadcrumbs_free(struct kref *kref) @@ -404,7 +403,7 @@ static void 
insert_breadcrumb(struct i915_request *rq) * the request as it may have completed and raised the interrupt as * we were attaching it into the lists. */ - if (!b->irq_armed || __i915_request_is_complete(rq)) + if (!READ_ONCE(b->irq_armed) || __i915_request_is_complete(rq)) irq_work_queue(&b->irq_work); }
Re: ✗ Fi.CI.IGT: failure for series starting with [v2,1/2] drm/i915: Refactor confusing __intel_gt_reset() (rev2)
Thanks a lot! -- Intel Deutschland GmbH Registered Address: Am Campeon 10, 85579 Neubiberg, Germany Tel: +49 89 99 8853-0, www.intel.de<http://www.intel.de> <http://www.intel.de><http://www.intel.de> Managing Directors: Christin Eisenschmid, Sharon Heck, Tiffany Doon Silva Chairperson of the Supervisory Board: Nicole Lau Registered Office: Munich Commercial Register: Amtsgericht Muenchen HRB 186928 From: Andi Shyti Sent: Wednesday, April 24, 2024 7:06 PM To: Nirmoy Das Cc: intel-gfx@lists.freedesktop.org ; Patchwork ; Das, Nirmoy ; Andi Shyti Subject: Re: ✗ Fi.CI.IGT: failure for series starting with [v2,1/2] drm/i915: Refactor confusing __intel_gt_reset() (rev2) Hi Nirmoy, On Wed, Apr 24, 2024 at 10:56:36AM +0200, Nirmoy Das wrote: > > On 4/24/2024 10:16 AM, Patchwork wrote: > > Patch Details > > Series: series starting with [v2,1/2] drm/i915: Refactor confusing > __intel_gt_reset() (rev2) > URL: https://patchwork.freedesktop.org/series/132731/ > State: failure > Details: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_132731v2/ > index.html > > CI Bug Log - changes from CI_DRM_14633_full -> Patchwork_132731v2_full > > Summary > > FAILURE > > Serious unknown changes coming with Patchwork_132731v2_full absolutely > need > to be > verified manually. > > If you think the reported changes have nothing to do with the changes > introduced in Patchwork_132731v2_full, please notify your bug team (' > i915-ci-in...@lists.freedesktop.org') to allow them > to document this new failure mode, which will reduce false positives in > CI. 
> > External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_132731v2/ > index.html > > Participating hosts (9 -> 8) > > Missing (1): shard-dg2-set2 > > Possible new issues > > Here are the unknown changes that may have been introduced in > Patchwork_132731v2_full: > > IGT changes > > Possible regressions > > □ igt@gem_exec_await@wide-all: > > ☆ shard-dg1: NOTRUN -> INCOMPLETE > □ igt@gem_exec_gttfill@engines@ccs0: > > ☆ shard-dg2: NOTRUN -> INCOMPLETE > > These are unrelated as the change only affects the case where GuC submission is disabled. > > Andi, could you please help me merge this one. My dev machine is still broken. merged into drm-intel-gt-next. Thanks, Andi
Re: [PATCH v2 2/2] drm/i915: Fix gt reset with GuC submission is disabled
Hi Andi, On 4/23/2024 11:32 AM, Andi Shyti wrote: Hi Nirmoy, On Mon, Apr 22, 2024 at 10:19:51PM +0200, Nirmoy Das wrote: Currently intel_gt_reset() kills the GuC and then resets requested engines. This is problematic because there is a dedicated CSB FIFO which only GuC can access and if that FIFO fills up, the hardware will block on the next context switch until there is space that means the system is effectively hung. If an engine is reset whilst actively executing a context, a CSB entry will be sent to say that the context has gone idle. Thus if reset happens on a very busy system then killing GuC before killing the engines will lead to deadlock because of filled up CSB FIFO. Is this a fix? I went quite far back in the commit logs, and it appears to me that we've always been using the current reset flow. I believe we don't perform a GT reset immediately after sending a number of requests, which is what the current failed test is doing. So, I don't think there will be any visible impact on the user with the current flow. To address this issue, the GuC should be killed only after resetting the requested engines and before calling intel_gt_init_hw(). v2: Improve commit message(John) Cc: John Harrison Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_reset.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index b1393863ca9b..6161f7a3ff70 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -879,8 +879,17 @@ static intel_engine_mask_t reset_prepare(struct intel_gt *gt) intel_engine_mask_t awake = 0; enum intel_engine_id id; - /* For GuC mode, ensure submission is disabled before stopping ring */ - intel_uc_reset_prepare(&gt->uc); + /** +* For GuC mode with submission enabled, ensure submission +* is disabled before stopping ring. 
nit: "stopping *the* ring" Will fix it while merging if I don't have to resend this again. +* +* For GuC mode with submission disabled, ensure that GuC is not +* sanitized, do that after engine reset. reset_prepare() +* is followed by engine reset which in this mode requires GuC to +* process any CSB FIFO entries generated by the resets. +*/ + if (intel_uc_uses_guc_submission(&gt->uc)) + intel_uc_reset_prepare(&gt->uc); for_each_engine(engine, gt, id) { if (intel_engine_pm_get_if_awake(engine)) @@ -1227,6 +1236,9 @@ void intel_gt_reset(struct intel_gt *gt, intel_overlay_reset(gt->i915); + /* sanitize uC after engine reset */ + if (!intel_uc_uses_guc_submission(&gt->uc)) + intel_uc_reset_prepare(&gt->uc); Reviewed-by: Andi Shyti Thanks, Nirmoy Thanks, Andi
Re: [PATCH v3 1/3] drm/i915/gem: Increment vma offset when mapping fb objects
Hi Andi, On 4/12/2024 2:48 AM, Andi Shyti wrote: Until now the "vm_pgoff" was not used and there has been no need to set its offset. But now, because we want to support partial mappings with a given offset, we need it to be set. Suggested-by: Chris Wilson Signed-off-by: Andi Shyti Do we have an IGT test for partial FB mmap? It would be nice to have one, but this patch looks good to me. Reviewed-by: Nirmoy Das Regards, Nirmoy --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index a2195e28b625..ce10dd259812 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -1084,6 +1084,8 @@ int i915_gem_fb_mmap(struct drm_i915_gem_object *obj, struct vm_area_struct *vma mmo = mmap_offset_attach(obj, mmap_type, NULL); if (IS_ERR(mmo)) return PTR_ERR(mmo); + + vma->vm_pgoff += drm_vma_node_start(&mmo->vma_node); } /*
Re: [PATCH] drm/i915/gt: Refactor uabi engine class/instance list creation
Hi Andi, On 4/17/2024 12:49 AM, Andi Shyti wrote: For the upcoming changes we need a cleaner way to build the list of uabi engines. Suggested-by: Tvrtko Ursulin Signed-off-by: Andi Shyti --- Hi, just sending this patch to unburden the coming series from this single patch inherited from a previously sent series. Andi drivers/gpu/drm/i915/gt/intel_engine_user.c | 29 - 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c index 833987015b8b..11cc06c0c785 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c @@ -203,7 +203,7 @@ static void engine_rename(struct intel_engine_cs *engine, const char *name, u16 void intel_engines_driver_register(struct drm_i915_private *i915) { - u16 name_instance, other_instance = 0; + u16 class_instance[I915_LAST_UABI_ENGINE_CLASS + 2] = { }; +2 is confusing here. I think we need a better macro. struct legacy_ring ring = {}; struct list_head *it, *next; struct rb_node **p, *prev; @@ -214,6 +214,8 @@ void intel_engines_driver_register(struct drm_i915_private *i915) prev = NULL; p = &i915->uabi_engines.rb_node; list_for_each_safe(it, next, &engines) { + u16 uabi_class; + struct intel_engine_cs *engine = container_of(it, typeof(*engine), uabi_list); @@ -222,15 +224,14 @@ void intel_engines_driver_register(struct drm_i915_private *i915) GEM_BUG_ON(engine->class >= ARRAY_SIZE(uabi_classes)); engine->uabi_class = uabi_classes[engine->class]; - if (engine->uabi_class == I915_NO_UABI_CLASS) { - name_instance = other_instance++; - } else { - GEM_BUG_ON(engine->uabi_class >= - ARRAY_SIZE(i915->engine_uabi_class_count)); - name_instance = - i915->engine_uabi_class_count[engine->uabi_class]++; - } - engine->uabi_instance = name_instance; + + if (engine->uabi_class == I915_NO_UABI_CLASS) + uabi_class = I915_LAST_UABI_ENGINE_CLASS + 1; + else + uabi_class = engine->uabi_class; + + GEM_BUG_ON(uabi_class 
>= ARRAY_SIZE(class_instance)); + engine->uabi_instance = class_instance[uabi_class]++; /* * Replace the internal name with the final user and log facing @@ -238,11 +239,15 @@ void intel_engines_driver_register(struct drm_i915_private *i915) */ engine_rename(engine, intel_engine_class_repr(engine->class), - name_instance); + engine->uabi_instance); - if (engine->uabi_class == I915_NO_UABI_CLASS) + if (uabi_class > I915_LAST_UABI_ENGINE_CLASS) continue; + GEM_BUG_ON(uabi_class >= + ARRAY_SIZE(i915->engine_uabi_class_count)); + i915->engine_uabi_class_count[uabi_class]++; Shouldn't this be i915->engine_uabi_class_count[uabi_class] = class_instance[uabi_class]; ? What I see is that this patch is mainly adding this class_instance array and the rest looks the same. Maybe it makes sense to add the other upcoming patches to better understand why we need this patch. Regards, Nirmoy + rb_link_node(&engine->uabi_node, prev, p); rb_insert_color(&engine->uabi_node, &i915->uabi_engines);
[PATCH v2 2/2] drm/i915: Fix gt reset with GuC submission is disabled
Currently intel_gt_reset() kills the GuC and then resets requested engines. This is problematic because there is a dedicated CSB FIFO which only GuC can access and if that FIFO fills up, the hardware will block on the next context switch until there is space, which means the system is effectively hung. If an engine is reset whilst actively executing a context, a CSB entry will be sent to say that the context has gone idle. Thus if reset happens on a very busy system then killing GuC before killing the engines will lead to deadlock because the CSB FIFO fills up. To address this issue, the GuC should be killed only after resetting the requested engines and before calling intel_gt_init_hw(). v2: Improve commit message (John) Cc: John Harrison Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_reset.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index b1393863ca9b..6161f7a3ff70 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -879,8 +879,17 @@ static intel_engine_mask_t reset_prepare(struct intel_gt *gt) intel_engine_mask_t awake = 0; enum intel_engine_id id; - /* For GuC mode, ensure submission is disabled before stopping ring */ - intel_uc_reset_prepare(&gt->uc); + /** +* For GuC mode with submission enabled, ensure submission +* is disabled before stopping ring. +* +* For GuC mode with submission disabled, ensure that GuC is not +* sanitized, do that after engine reset. reset_prepare() +* is followed by engine reset which in this mode requires GuC to +* process any CSB FIFO entries generated by the resets. 
+*/ + if (intel_uc_uses_guc_submission(&gt->uc)) + intel_uc_reset_prepare(&gt->uc); for_each_engine(engine, gt, id) { if (intel_engine_pm_get_if_awake(engine)) @@ -1227,6 +1236,9 @@ void intel_gt_reset(struct intel_gt *gt, intel_overlay_reset(gt->i915); + /* sanitize uC after engine reset */ + if (!intel_uc_uses_guc_submission(&gt->uc)) + intel_uc_reset_prepare(&gt->uc); /* * Next we need to restore the context, but we don't use those * yet either... -- 2.42.0
[PATCH v2 1/2] drm/i915: Refactor confusing __intel_gt_reset()
__intel_gt_reset() is really for resetting engines though the name might suggest something else. So add a helper function to remove the confusion, with no functional changes. v2: Move intel_gt_reset_all_engines() next to intel_gt_reset_engine() to make the diff simple (John) Cc: John Harrison Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- .../drm/i915/gt/intel_execlists_submission.c | 2 +- drivers/gpu/drm/i915/gt/intel_gt.c | 2 +- drivers/gpu/drm/i915/gt/intel_gt_pm.c | 2 +- drivers/gpu/drm/i915/gt/intel_reset.c | 35 +++ drivers/gpu/drm/i915/gt/intel_reset.h | 3 +- drivers/gpu/drm/i915/gt/selftest_reset.c | 2 +- drivers/gpu/drm/i915/i915_driver.c | 2 +- 8 files changed, 37 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 8c44af1c3451..5c8e9ee3b008 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -678,7 +678,7 @@ void intel_engines_release(struct intel_gt *gt) */ GEM_BUG_ON(intel_gt_pm_is_awake(gt)); if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) - __intel_gt_reset(gt, ALL_ENGINES); + intel_gt_reset_all_engines(gt); /* Decouple the backend; but keep the layout for late GPU resets */ for_each_engine(engine, gt, id) { diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 355aab5b38ba..21829439e686 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -2898,7 +2898,7 @@ static void enable_error_interrupt(struct intel_engine_cs *engine) drm_err(&engine->i915->drm, "engine '%s' resumed still in error: %08x\n", engine->name, status); - __intel_gt_reset(engine->gt, engine->mask); + intel_gt_reset_engine(engine); } /* diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index 580b5141ce1e..626b166e67ef 100644 --- 
a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -832,7 +832,7 @@ void intel_gt_driver_unregister(struct intel_gt *gt) /* Scrub all HW state upon release */ with_intel_runtime_pm(gt->uncore->rpm, wakeref) - __intel_gt_reset(gt, ALL_ENGINES); + intel_gt_reset_all_engines(gt); } void intel_gt_driver_release(struct intel_gt *gt) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c index 220ac4f92edf..c08fdb65cc69 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c @@ -159,7 +159,7 @@ static bool reset_engines(struct intel_gt *gt) if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) return false; - return __intel_gt_reset(gt, ALL_ENGINES) == 0; + return intel_gt_reset_all_engines(gt) == 0; } static void gt_sanitize(struct intel_gt *gt, bool force) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index c8e9aa41fdea..b1393863ca9b 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -764,7 +764,7 @@ wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t engine_mask) HECI_H_GS1_ER_PREP, 0); } -int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) +static int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) { const int retries = engine_mask == ALL_ENGINES ? 
RESET_MAX_RETRIES : 1; reset_func reset; @@ -978,7 +978,7 @@ static void __intel_gt_set_wedged(struct intel_gt *gt) /* Even if the GPU reset fails, it should still stop the engines */ if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) - __intel_gt_reset(gt, ALL_ENGINES); + intel_gt_reset_all_engines(gt); for_each_engine(engine, gt, id) engine->submit_request = nop_submit_request; @@ -1089,7 +1089,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt) /* We must reset pending GPU events before restoring our submission */ ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */ if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) - ok = __intel_gt_reset(gt, ALL_ENGINES) == 0; + ok = intel_gt_reset_all_engines(gt) == 0; if (!ok) { /* * Warn CI about the unrecoverable wedged condition. @@ -1133,10 +1133,10 @@ static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) {
Re: [PATCH 3/3] drm/i915: Fix gt reset with GuC submission disabled
Hi John, On 4/19/2024 1:38 AM, John Harrison wrote: On 4/18/2024 10:10, Nirmoy Das wrote: Currently intel_gt_reset() happens as follows: reset_prepare() ---> Sends GDRST to GuC, GuC is in GS_MIA_IN_RESET do_reset() intel_gt_reset_all_engines() *_engine_reset_prepare() -->RESET_CTL expects running GuC Not technically correct. There is no direct connection between RESET_CTL and GuC. *_reset_engines() intel_gt_init_hw() --> GuC comes out of GS_MIA_IN_RESET with FW loaded. Fix the issue by sanitizing the GuC only after resetting requested engines and before intel_gt_init_hw(). You never actually state what the issue is. The problem is that there is a dedicated CSB FIFO going to GuC (and nothing else has access to it). If that FIFO fills up, the hardware will block on the next context switch until there is space. If no-one (i.e. GuC) is draining it, that means the system is effectively hung. If an engine is reset whilst actively executing a context, a CSB entry will be sent to say that the context has gone idle. Thus if you reset a very busy system and start with killing GuC before killing the engines and only then re-enabling GuC, you run the risk of generating more CSB entries than will fit in the FIFO and deadlocking. Whereas, if the system is idle then you can reset the engines as much as you like while GuC is dead and it won't be a problem. I wasn't sure if I could talk about internal details so kept it to minimal. I will borrow above explanation and resend :) Note intel_uc_reset_finish() and intel_uc_reset() are nop when guc submission is disabled. 
Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_reset.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 6504e8ba9c58..bd166f5aca4b 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -907,8 +907,17 @@ static intel_engine_mask_t reset_prepare(struct intel_gt *gt) intel_engine_mask_t awake = 0; enum intel_engine_id id; - /* For GuC mode, ensure submission is disabled before stopping ring */ - intel_uc_reset_prepare(&gt->uc); + /** + * For GuC mode with submission enabled, ensure submission + * is disabled before stopping ring. + * + * For GuC mode with submission disabled, ensure that GuC is not + * sanitized, do that at the end in reset_finish(). reset_prepare() + * is followed by engine reset which in this mode requires GuC to + * be functional to process engine reset events. -> to process any CSB FIFO entries generated by the resets. I will add this. Thanks, Nirmoy John. + */ + if (intel_uc_uses_guc_submission(&gt->uc)) + intel_uc_reset_prepare(&gt->uc); for_each_engine(engine, gt, id) { if (intel_engine_pm_get_if_awake(engine)) @@ -1255,6 +1264,9 @@ void intel_gt_reset(struct intel_gt *gt, intel_overlay_reset(gt->i915); + /* sanitize uC after engine reset */ + if (!intel_uc_uses_guc_submission(&gt->uc)) + intel_uc_reset_prepare(&gt->uc); /* * Next we need to restore the context, but we don't use those * yet either...
Re: [PATCH 2/3] drm/i915 Rename intel_engine_reset to intel_gt_engine_recover
Hi John, On 4/19/2024 1:27 AM, John Harrison wrote: On 4/18/2024 10:10, Nirmoy Das wrote: intel_engine_reset() not only reset a engine but also tries to recover it so give it a proper name without any functional changes. Not seeing what the difference is. If this was a super low level function (with an __ prefix for example) then one might expect it to literally just poke the reset register and leave the engine in a dead state. But as a high level function, I think it is reasonable to expect a reset function to 'recover' the entity being reset. Also, many of the callers are tests that are explicitly testing reset. So now the tests all talk about attempting resets, resets failing, etc. but around a call to 'recover' instead of 'reset', which seems confusing. Didn't think about it, I will drop it. Thanks, Nirmoy John. Signed-off-by: Nirmoy Das --- .../drm/i915/gem/selftests/i915_gem_context.c | 2 +- .../drm/i915/gt/intel_execlists_submission.c | 2 +- drivers/gpu/drm/i915/gt/intel_reset.c | 4 ++-- drivers/gpu/drm/i915/gt/intel_reset.h | 4 ++-- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 20 +-- drivers/gpu/drm/i915/gt/selftest_mocs.c | 4 ++-- drivers/gpu/drm/i915/gt/selftest_reset.c | 2 +- .../gpu/drm/i915/gt/selftest_workarounds.c | 6 +++--- 8 files changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c index 89d4dc8b60c6..4f4cde55f621 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c @@ -1171,7 +1171,7 @@ __sseu_finish(const char *name, int ret = 0; if (flags & TEST_RESET) { - ret = intel_engine_reset(ce->engine, "sseu"); + ret = intel_gt_engine_recover(ce->engine, "sseu"); if (ret) goto out; } diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 21829439e686..9485a622a704 100644 --- 
a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -2404,7 +2404,7 @@ static void execlists_reset(struct intel_engine_cs *engine, const char *msg) ring_set_paused(engine, 1); /* Freeze the current request in place */ execlists_capture(engine); - intel_engine_reset(engine, msg); + intel_gt_engine_recover(engine, msg); tasklet_enable(&engine->sched_engine->tasklet); clear_and_wake_up_bit(bit, lock); diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index b825daace58e..6504e8ba9c58 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -1348,7 +1348,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg) } /** - * intel_engine_reset - reset GPU engine to recover from a hang + * intel_gt_engine_recover - reset GPU engine to recover from a hang * @engine: engine to reset * @msg: reason for GPU reset; or NULL for no drm_notice() * @@ -1360,7 +1360,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg) * - reset engine (which will force the engine to idle) * - re-init/configure engine */ -int intel_engine_reset(struct intel_engine_cs *engine, const char *msg) +int intel_gt_engine_recover(struct intel_engine_cs *engine, const char *msg) { int err; diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h index c00de353075c..be984357bf27 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.h +++ b/drivers/gpu/drm/i915/gt/intel_reset.h @@ -31,8 +31,8 @@ void intel_gt_handle_error(struct intel_gt *gt, void intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask, const char *reason); -int intel_engine_reset(struct intel_engine_cs *engine, - const char *reason); +int intel_gt_engine_recover(struct intel_engine_cs *engine, + const char *reason); int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *reason); diff --git 
a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c index 9ce8ff1c04fe..9bfda3f2bd24 100644 --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c @@ -495,9 +495,9 @@ static int igt_reset_nop_engine(void *arg) i915_request_add(rq); } - err = intel_engine_reset(engine, NULL); + err = intel_gt_engine_recover(engine, NULL); if (err) { - pr_err("intel_engine_reset(%s) failed, err:%d\n", + pr_err("intel_gt_engine_recover(%
Re: [PATCH 1/3] drm/i915: Refactor confusing __intel_gt_reset()
Hi John. On 4/19/2024 1:27 AM, John Harrison wrote: On 4/18/2024 10:10, Nirmoy Das wrote: __intel_gt_reset() is really for resetting engines though the name might suggest something else. So add two helper functions to remove confusions with no functional changes. Technically you only added one and just moved the other :). It already existed, it just wasn't being used everywhere that it could be! I did have one more helper functions but I removed it in favor of intel_gt_reset_engine() but didn't change the commit message :/ Thanks for catching it. I will fix it. Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- .../drm/i915/gt/intel_execlists_submission.c | 2 +- drivers/gpu/drm/i915/gt/intel_gt.c | 2 +- drivers/gpu/drm/i915/gt/intel_gt_pm.c | 2 +- drivers/gpu/drm/i915/gt/intel_reset.c | 43 ++- drivers/gpu/drm/i915/gt/intel_reset.h | 3 +- drivers/gpu/drm/i915/gt/selftest_reset.c | 2 +- drivers/gpu/drm/i915/i915_driver.c | 2 +- 8 files changed, 41 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 8c44af1c3451..5c8e9ee3b008 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -678,7 +678,7 @@ void intel_engines_release(struct intel_gt *gt) */ GEM_BUG_ON(intel_gt_pm_is_awake(gt)); if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) - __intel_gt_reset(gt, ALL_ENGINES); + intel_gt_reset_all_engines(gt); /* Decouple the backend; but keep the layout for late GPU resets */ for_each_engine(engine, gt, id) { diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 355aab5b38ba..21829439e686 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -2898,7 +2898,7 @@ static void enable_error_interrupt(struct intel_engine_cs *engine) drm_err(&engine->i915->drm, "engine '%s' 
resumed still in error: %08x\n", engine->name, status); - __intel_gt_reset(engine->gt, engine->mask); + intel_gt_reset_engine(engine); } /* diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index 580b5141ce1e..626b166e67ef 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -832,7 +832,7 @@ void intel_gt_driver_unregister(struct intel_gt *gt) /* Scrub all HW state upon release */ with_intel_runtime_pm(gt->uncore->rpm, wakeref) - __intel_gt_reset(gt, ALL_ENGINES); + intel_gt_reset_all_engines(gt); } void intel_gt_driver_release(struct intel_gt *gt) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c index 220ac4f92edf..c08fdb65cc69 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c @@ -159,7 +159,7 @@ static bool reset_engines(struct intel_gt *gt) if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) return false; - return __intel_gt_reset(gt, ALL_ENGINES) == 0; + return intel_gt_reset_all_engines(gt) == 0; } static void gt_sanitize(struct intel_gt *gt, bool force) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index c8e9aa41fdea..b825daace58e 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -764,7 +764,7 @@ wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t engine_mask) HECI_H_GS1_ER_PREP, 0); } -int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) +static int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) { const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; reset_func reset; @@ -795,6 +795,34 @@ int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) return ret; } +/** + * intel_gt_reset_all_engines() - Reset all engines in the given gt. + * @gt: the GT to reset all engines for. 
+ * + * This function resets all engines within the given gt. + * + * Returns: + * Zero on success, negative error code on failure. + */ +int intel_gt_reset_all_engines(struct intel_gt *gt) +{ + return __intel_gt_reset(gt, ALL_ENGINES); +} + +/** + * intel_gt_reset_engine() - Reset a specific engine within a gt. + * @engine: engine to be reset. + * + * This function resets the specified engine within a gt. + * + * Returns: + * Zero on success, negative error code on failure. + */ +int intel_gt_reset_engine(struct intel_engine_cs *engine) +{ + return __intel_gt_reset(engine->gt, engine->mask); +} +
[PATCH 3/3] drm/i915: Fix gt reset with GuC submission disabled
Currently intel_gt_reset() happens as follows: reset_prepare() ---> Sends GDRST to GuC, GuC is in GS_MIA_IN_RESET do_reset() intel_gt_reset_all_engines() *_engine_reset_prepare() -->RESET_CTL expects running GuC *_reset_engines() intel_gt_init_hw() --> GuC comes out of GS_MIA_IN_RESET with FW loaded. Fix the issue by sanitizing the GuC only after resetting requested engines and before intel_gt_init_hw(). Note intel_uc_reset_finish() and intel_uc_reset() are no-ops when GuC submission is disabled. Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_reset.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 6504e8ba9c58..bd166f5aca4b 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -907,8 +907,17 @@ static intel_engine_mask_t reset_prepare(struct intel_gt *gt) intel_engine_mask_t awake = 0; enum intel_engine_id id; - /* For GuC mode, ensure submission is disabled before stopping ring */ - intel_uc_reset_prepare(&gt->uc); + /** +* For GuC mode with submission enabled, ensure submission +* is disabled before stopping ring. +* +* For GuC mode with submission disabled, ensure that GuC is not +* sanitized, do that at the end in reset_finish(). reset_prepare() +* is followed by engine reset which in this mode requires GuC to +* be functional to process engine reset events. +*/ + if (intel_uc_uses_guc_submission(&gt->uc)) + intel_uc_reset_prepare(&gt->uc); for_each_engine(engine, gt, id) { if (intel_engine_pm_get_if_awake(engine)) @@ -1255,6 +1264,9 @@ void intel_gt_reset(struct intel_gt *gt, intel_overlay_reset(gt->i915); + /* sanitize uC after engine reset */ + if (!intel_uc_uses_guc_submission(&gt->uc)) + intel_uc_reset_prepare(&gt->uc); /* * Next we need to restore the context, but we don't use those * yet either... -- 2.42.0
[PATCH 2/3] drm/i915: Rename intel_engine_reset to intel_gt_engine_recover
intel_engine_reset() not only resets an engine but also tries to recover it, so give it a proper name without any functional changes. Signed-off-by: Nirmoy Das --- .../drm/i915/gem/selftests/i915_gem_context.c | 2 +- .../drm/i915/gt/intel_execlists_submission.c | 2 +- drivers/gpu/drm/i915/gt/intel_reset.c | 4 ++-- drivers/gpu/drm/i915/gt/intel_reset.h | 4 ++-- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 20 +-- drivers/gpu/drm/i915/gt/selftest_mocs.c | 4 ++-- drivers/gpu/drm/i915/gt/selftest_reset.c | 2 +- .../gpu/drm/i915/gt/selftest_workarounds.c| 6 +++--- 8 files changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c index 89d4dc8b60c6..4f4cde55f621 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c @@ -1171,7 +1171,7 @@ __sseu_finish(const char *name, int ret = 0; if (flags & TEST_RESET) { - ret = intel_engine_reset(ce->engine, "sseu"); + ret = intel_gt_engine_recover(ce->engine, "sseu"); if (ret) goto out; } diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 21829439e686..9485a622a704 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -2404,7 +2404,7 @@ static void execlists_reset(struct intel_engine_cs *engine, const char *msg) ring_set_paused(engine, 1); /* Freeze the current request in place */ execlists_capture(engine); - intel_engine_reset(engine, msg); + intel_gt_engine_recover(engine, msg); tasklet_enable(&engine->sched_engine->tasklet); clear_and_wake_up_bit(bit, lock); diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index b825daace58e..6504e8ba9c58 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -1348,7 +1348,7 @@ int 
__intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg) } /** - * intel_engine_reset - reset GPU engine to recover from a hang + * intel_gt_engine_recover - reset GPU engine to recover from a hang * @engine: engine to reset * @msg: reason for GPU reset; or NULL for no drm_notice() * @@ -1360,7 +1360,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg) * - reset engine (which will force the engine to idle) * - re-init/configure engine */ -int intel_engine_reset(struct intel_engine_cs *engine, const char *msg) +int intel_gt_engine_recover(struct intel_engine_cs *engine, const char *msg) { int err; diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h index c00de353075c..be984357bf27 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.h +++ b/drivers/gpu/drm/i915/gt/intel_reset.h @@ -31,8 +31,8 @@ void intel_gt_handle_error(struct intel_gt *gt, void intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask, const char *reason); -int intel_engine_reset(struct intel_engine_cs *engine, - const char *reason); +int intel_gt_engine_recover(struct intel_engine_cs *engine, + const char *reason); int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *reason); diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c index 9ce8ff1c04fe..9bfda3f2bd24 100644 --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c @@ -495,9 +495,9 @@ static int igt_reset_nop_engine(void *arg) i915_request_add(rq); } - err = intel_engine_reset(engine, NULL); + err = intel_gt_engine_recover(engine, NULL); if (err) { - pr_err("intel_engine_reset(%s) failed, err:%d\n", + pr_err("intel_gt_engine_recover(%s) failed, err:%d\n", engine->name, err); break; } @@ -574,7 +574,7 @@ static int igt_reset_fail_engine(void *arg) >->reset.flags)); force_reset_timeout(engine); - err = intel_engine_reset(engine, NULL); 
+ err = intel_gt_engine_recover(engine, NULL); cancel_reset_timeout(engine); if (err == 0) /* timeouts only generated on gen8+ */ goto
[PATCH 1/3] drm/i915: Refactor confusing __intel_gt_reset()
__intel_gt_reset() is really for resetting engines though the name might suggest something else. So add two helper functions to remove confusions with no functional changes. Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- .../drm/i915/gt/intel_execlists_submission.c | 2 +- drivers/gpu/drm/i915/gt/intel_gt.c| 2 +- drivers/gpu/drm/i915/gt/intel_gt_pm.c | 2 +- drivers/gpu/drm/i915/gt/intel_reset.c | 43 ++- drivers/gpu/drm/i915/gt/intel_reset.h | 3 +- drivers/gpu/drm/i915/gt/selftest_reset.c | 2 +- drivers/gpu/drm/i915/i915_driver.c| 2 +- 8 files changed, 41 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 8c44af1c3451..5c8e9ee3b008 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -678,7 +678,7 @@ void intel_engines_release(struct intel_gt *gt) */ GEM_BUG_ON(intel_gt_pm_is_awake(gt)); if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) - __intel_gt_reset(gt, ALL_ENGINES); + intel_gt_reset_all_engines(gt); /* Decouple the backend; but keep the layout for late GPU resets */ for_each_engine(engine, gt, id) { diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 355aab5b38ba..21829439e686 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -2898,7 +2898,7 @@ static void enable_error_interrupt(struct intel_engine_cs *engine) drm_err(&engine->i915->drm, "engine '%s' resumed still in error: %08x\n", engine->name, status); - __intel_gt_reset(engine->gt, engine->mask); + intel_gt_reset_engine(engine); } /* diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index 580b5141ce1e..626b166e67ef 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -832,7 +832,7 @@ void 
intel_gt_driver_unregister(struct intel_gt *gt) /* Scrub all HW state upon release */ with_intel_runtime_pm(gt->uncore->rpm, wakeref) - __intel_gt_reset(gt, ALL_ENGINES); + intel_gt_reset_all_engines(gt); } void intel_gt_driver_release(struct intel_gt *gt) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c index 220ac4f92edf..c08fdb65cc69 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c @@ -159,7 +159,7 @@ static bool reset_engines(struct intel_gt *gt) if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) return false; - return __intel_gt_reset(gt, ALL_ENGINES) == 0; + return intel_gt_reset_all_engines(gt) == 0; } static void gt_sanitize(struct intel_gt *gt, bool force) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index c8e9aa41fdea..b825daace58e 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -764,7 +764,7 @@ wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t engine_mask) HECI_H_GS1_ER_PREP, 0); } -int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) +static int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) { const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; reset_func reset; @@ -795,6 +795,34 @@ int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) return ret; } +/** + * intel_gt_reset_all_engines() - Reset all engines in the given gt. + * @gt: the GT to reset all engines for. + * + * This function resets all engines within the given gt. + * + * Returns: + * Zero on success, negative error code on failure. + */ +int intel_gt_reset_all_engines(struct intel_gt *gt) +{ + return __intel_gt_reset(gt, ALL_ENGINES); +} + +/** + * intel_gt_reset_engine() - Reset a specific engine within a gt. + * @engine: engine to be reset. + * + * This function resets the specified engine within a gt. 
+ * + * Returns: + * Zero on success, negative error code on failure. + */ +int intel_gt_reset_engine(struct intel_engine_cs *engine) +{ + return __intel_gt_reset(engine->gt, engine->mask); +} + bool intel_has_gpu_reset(const struct intel_gt *gt) { if (!gt->i915->params.reset) @@ -978,7 +1006,7 @@ static void __intel_gt_set_wedged(struct intel_gt *gt) /* Even if the GPU reset fails, it should still stop the engines */ if (!INTEL_INFO(gt->i915)->g
Re: [RFC PATCH] drm/i915: Don't reset GuC before engine reset on full GT reset
Hi John, On 4/17/2024 2:37 AM, John Harrison wrote: On 4/15/2024 09:44, Nirmoy Das wrote: Currently intel_gt_reset() happens as follows: reset_prepare() ---> Sends GDRST to GuC, GuC is in GS_MIA_IN_RESET do_reset() __intel_gt_reset() *_engine_reset_prepare() -->RESET_CTL expects running GuC *_reset_engines() intel_gt_init_hw() --> GuC FW loading happens, GuC comes out of GS_MIA_IN_RESET. Fix the above flow so that GuC reset happens after all the engines reset is done. Cc: John Harrison Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_reset.c | 9 -- drivers/gpu/drm/i915/gt/uc/intel_uc.c | 42 +-- drivers/gpu/drm/i915/gt/uc/intel_uc.h | 1 + 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index c8e9aa41fdea..9ebd68ce0c22 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -879,8 +879,11 @@ static intel_engine_mask_t reset_prepare(struct intel_gt *gt) intel_engine_mask_t awake = 0; enum intel_engine_id id; - /* For GuC mode, ensure submission is disabled before stopping ring */ - intel_uc_reset_prepare(>->uc); + /* + * For GuC mode, ensure submission is disabled before stopping ring. + * Don't reset the GuC a engine reset requires GuC to be running. These two lines appear to be mutually exclusive unless there is a test for GuC submission being enabled, which I am not seeing. Note that "ensure submission is disabled" means "reset the GuC". + */ + intel_uc_reset_prepare_without_guc_reset(>->uc); for_each_engine(engine, gt, id) { if (intel_engine_pm_get_if_awake(engine)) @@ -1227,6 +1230,8 @@ void intel_gt_reset(struct intel_gt *gt, intel_overlay_reset(gt->i915); + /* Now that all engines are clean, Reset the GuC */ + intel_uc_reset_prepare(>->uc); /* * Next we need to restore the context, but we don't use those * yet either... 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c index 7a63abf8f644..5feee4db2ccc 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c @@ -345,7 +345,7 @@ static void __uc_fini(struct intel_uc *uc) intel_guc_fini(&uc->guc); } -static int __uc_sanitize(struct intel_uc *uc) +static void __uc_sanitize_without_guc_reset(struct intel_uc *uc) { struct intel_guc *guc = &uc->guc; struct intel_huc *huc = &uc->huc; @@ -354,7 +354,11 @@ static int __uc_sanitize(struct intel_uc *uc) intel_huc_sanitize(huc); intel_guc_sanitize(guc); +} This seems like an extremely bad idea. You are wiping out all the GuC communication structures on the host side while the GuC itself is still executing and using those same structures. Is the failure when doing individual engine resets or when doing a full GT reset? The failed test is doing "intel_gt_reset(gt, ALL_ENGINES, NULL)" so a full GT reset. If the former, I think a better approach would be to just not reset GuC at all (or indeed any UC) if not using GuC submission. Although, looking at the code, I'm not seeing an engine only reset path that does nuke the UC layers? Yes, intel_engine_reset() doesn't touch UC layer. If it is the latter, This is the case here. then how/why are individual engine resets happening in the middle of a full GT reset? Don't we just splat everything all at once? It seems we use __intel_gt_reset(engine->gt, engine_mask) to reset all or some engines. Either way, it would be safer to split at the GT reset code layer rather than inside the UC layer. That is, when not using GuC submission, do the entire prepare/reset/init sequence of the UC layers as one 'atomic' operation either before the GT/engine reset or after it (or potentially both before and after?). I think this should work. Let me try it out Thanks, Nirmoy John. 
+static int __uc_sanitize(struct intel_uc *uc) +{ + __uc_sanitize_without_guc_reset(uc); return __intel_uc_reset_hw(uc); } @@ -593,13 +597,7 @@ static void __uc_fini_hw(struct intel_uc *uc) __uc_sanitize(uc); } -/** - * intel_uc_reset_prepare - Prepare for reset - * @uc: the intel_uc structure - * - * Preparing for full gpu reset. - */ -void intel_uc_reset_prepare(struct intel_uc *uc) +static void __intel_uc_reset_prepare(struct intel_uc *uc, bool reset_guc) { struct intel_guc *guc = &uc->guc; @@ -617,9 +615,35 @@ void intel_uc_reset_prepare(struct intel_uc *uc) intel_guc_submission_reset_prepare(guc); sanitize: - __uc_sanitize(uc); + if (reset_guc) + __uc_sanitize(uc); + else + __uc_sanitize_without_guc_reset(uc); } +/** +
Re: [PATCH i-g-t] i915/gem_mmap_offset: Partial mmap and munmap
On 4/12/2024 2:42 AM, Andi Shyti wrote: From: Chris Wilson Based on a test case developed by Lionel Landwerlin, this exercises creation of partial mmaps using both direct methods of a partial mmap() (where the mmap() only covers a portion of the object) and munmap() to do the same. Signed-off-by: Chris Wilson Signed-off-by: Andi Shyti --- tests/intel/gem_mmap_offset.c | 84 +++ 1 file changed, 84 insertions(+) diff --git a/tests/intel/gem_mmap_offset.c b/tests/intel/gem_mmap_offset.c index 95d2158ca88f..0ba2f9591f85 100644 --- a/tests/intel/gem_mmap_offset.c +++ b/tests/intel/gem_mmap_offset.c @@ -56,6 +56,8 @@ * SUBTEST: isolation * SUBTEST: oob-read * SUBTEST: open-flood + * SUBTEST: partial-mmap + * SUBTEST: partial-unmap * SUBTEST: perf * SUBTEST: pf-nonblock * SUBTEST: ptrace @@ -874,6 +876,83 @@ static void blt_coherency(int i915) igt_assert_f(compare_ok, "Problem with coherency, flush is too late\n"); } +static void partial_mmap(int i915) +{ + uint32_t handle; + + handle = gem_create(i915, SZ_2M); + + for_each_mmap_offset_type(i915, t) { + struct drm_i915_gem_mmap_offset arg = { + .handle = handle, + .flags = t->type, + }; + uint32_t *ptr; + + if (mmap_offset_ioctl(i915, &arg)) + continue; + + ptr = mmap(0, SZ_4K, PROT_WRITE, MAP_SHARED, i915, arg.offset); + if (ptr == MAP_FAILED) + continue; + + memset(ptr, 0xcc, SZ_4K); + munmap(ptr, SZ_4K); + + ptr = mmap(0, SZ_4K, PROT_READ, MAP_SHARED, i915, arg.offset + SZ_2M - SZ_4K); + igt_assert(ptr != MAP_FAILED); + + for (uint32_t i = 0; i < SZ_4K / sizeof(uint32_t); i++) + igt_assert_eq_u32(ptr[i], 0); + + munmap(ptr, SZ_4K); + } + + gem_close(i915, handle); +} + +static void partial_unmap(int i915) +{ + uint32_t handle; + + handle = gem_create(i915, SZ_2M); + + for_each_mmap_offset_type(i915, t) { + uint8_t *ptr_a, *ptr_b; + + /* mmap the same GEM BO twice */ + ptr_a = __mmap_offset(i915, handle, 0, SZ_2M, + PROT_READ | PROT_WRITE, + t->type); + if (!ptr_a) + continue; + + ptr_b = __mmap_offset(i915, handle, 0, 
SZ_2M, + PROT_READ | PROT_WRITE, + t->type); + if (!ptr_b) + continue; + + /* unmap the first mapping but the last 4k */ + munmap(ptr_a, SZ_2M - SZ_4K); + + /* memset that remaining 4k with 0xcc */ + memset(ptr_a + SZ_2M - SZ_4K, 0xcc, SZ_4K); + + /* memset the first page of the 2Mb with 0xdd */ + memset(ptr_b, 0xdd, SZ_4K); + + for (uint32_t i = 0; i < SZ_4K; i++) + igt_assert_eq_u32(ptr_a[SZ_2M - SZ_4K + i], 0xcc); + + munmap(ptr_a + SZ_2M - SZ_4K, SZ_4K); + memset(ptr_b, 0, SZ_2M); Do we need this extra memset() ? Otherwise Reviewed-by: Nirmoy Das + munmap(ptr_b, SZ_2M); + } + + gem_close(i915, handle); +} + static int mmap_gtt_version(int i915) { int gtt_version = -1; @@ -931,6 +1010,11 @@ igt_main igt_subtest_f("open-flood") open_flood(i915, 20); + igt_subtest_f("partial-mmap") + partial_mmap(i915); + igt_subtest_f("partial-unmap") + partial_unmap(i915); + igt_subtest_with_dynamic("clear") { for_each_memory_region(r, i915) { igt_dynamic_f("%s", r->name)
Re: [PATCH v3 20/21] drm/i915/display: perform transient flush
Hi Matt, On 4/15/2024 7:07 PM, Matt Roper wrote: On Mon, Apr 15, 2024 at 01:44:22PM +0530, Balasubramani Vivekanandan wrote: From: Matthew Auld Perform manual transient cache flush prior to flip and at the end of frontbuffer_flush. This is needed to ensure display engine doesn't see garbage if the surface is L3:XD dirty. Testcase: igt@xe-pat@display-vs-wb-transient Has the IGT patch for this been sent yet? Yes, the test seems to be available https://gitlab.freedesktop.org/drm/igt-gpu-tools/-/blob/master/tests/intel/xe_pat.c#L728 Regards, Nirmoy If not, we should probably make sure that happens soon, and then use the CI Test-with: thing if there winds up being another revision of this series so that this will be included in the CI results. Anyway, the changes here look good to me, Reviewed-by: Matt Roper Signed-off-by: Matthew Auld Signed-off-by: Balasubramani Vivekanandan Acked-by: Nirmoy Das --- drivers/gpu/drm/i915/display/intel_display.c | 3 +++ .../gpu/drm/i915/display/intel_frontbuffer.c | 2 ++ drivers/gpu/drm/i915/display/intel_tdf.h | 25 +++ drivers/gpu/drm/xe/Makefile | 3 ++- drivers/gpu/drm/xe/display/xe_tdf.c | 13 ++ 5 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/i915/display/intel_tdf.h create mode 100644 drivers/gpu/drm/xe/display/xe_tdf.c diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 67697d9a559c..4fc46edcb4ad 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -110,6 +110,7 @@ #include "intel_sdvo.h" #include "intel_snps_phy.h" #include "intel_tc.h" +#include "intel_tdf.h" #include "intel_tv.h" #include "intel_vblank.h" #include "intel_vdsc.h" @@ -7242,6 +7243,8 @@ static void intel_atomic_commit_tail(struct intel_atomic_state *state) intel_atomic_commit_fence_wait(state); + intel_td_flush(dev_priv); + drm_atomic_helper_wait_for_dependencies(&state->base); 
drm_dp_mst_atomic_wait_for_dependencies(&state->base); intel_atomic_global_state_wait_for_dependencies(state); diff --git a/drivers/gpu/drm/i915/display/intel_frontbuffer.c b/drivers/gpu/drm/i915/display/intel_frontbuffer.c index 2ea37c0414a9..4923c340a0b6 100644 --- a/drivers/gpu/drm/i915/display/intel_frontbuffer.c +++ b/drivers/gpu/drm/i915/display/intel_frontbuffer.c @@ -65,6 +65,7 @@ #include "intel_fbc.h" #include "intel_frontbuffer.h" #include "intel_psr.h" +#include "intel_tdf.h" /** * frontbuffer_flush - flush frontbuffer @@ -93,6 +94,7 @@ static void frontbuffer_flush(struct drm_i915_private *i915, trace_intel_frontbuffer_flush(i915, frontbuffer_bits, origin); might_sleep(); + intel_td_flush(i915); intel_drrs_flush(i915, frontbuffer_bits); intel_psr_flush(i915, frontbuffer_bits, origin); intel_fbc_flush(i915, frontbuffer_bits, origin); diff --git a/drivers/gpu/drm/i915/display/intel_tdf.h b/drivers/gpu/drm/i915/display/intel_tdf.h new file mode 100644 index ..353cde21f6c2 --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_tdf.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2024 Intel Corporation + */ + +#ifndef __INTEL_TDF_H__ +#define __INTEL_TDF_H__ + +/* + * TDF (Transient-Data-Flush) is needed for Xe2+ where special L3:XD caching can + * be enabled through various PAT index modes. Idea is to use this caching mode + * when for example rendering onto the display surface, with the promise that + * KMD will ensure transient cache entries are always flushed by the time we do + * the display flip, since display engine is never coherent with CPU/GPU caches. 
+ */ + +struct drm_i915_private; + +#ifdef I915 +static inline void intel_td_flush(struct drm_i915_private *i915) {} +#else +void intel_td_flush(struct drm_i915_private *i915); +#endif + +#endif diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 6015c9e41f24..97a8674cdd76 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -198,7 +198,8 @@ xe-$(CONFIG_DRM_XE_DISPLAY) += \ display/xe_dsb_buffer.o \ display/xe_fb_pin.o \ display/xe_hdcp_gsc.o \ - display/xe_plane_initial.o + display/xe_plane_initial.o \ + display/xe_tdf.o # SOC code shared with i915 xe-$(CONFIG_DRM_XE_DISPLAY) += \ diff --git a/drivers/gpu/drm/xe/display/xe_tdf.c b/drivers/gpu/drm/xe/display/xe_tdf.c new file mode 100644 index ..2c0d4e144e09 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_tdf.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2024 Intel Corporation + */ + +#include "xe_device.h" +#include "intel_display_types.h" +#include "intel_tdf.h" + +void intel_td_flush(struct drm_i915_private *i915) +{ + xe_device_td_flush(i915); +} -- 2.25.1
[RFC PATCH] drm/i915: Don't reset GuC before engine reset on full GT reset
Currently intel_gt_reset() happens as follows: reset_prepare() ---> Sends GDRST to GuC, GuC is in GS_MIA_IN_RESET do_reset() __intel_gt_reset() *_engine_reset_prepare() -->RESET_CTL expects running GuC *_reset_engines() intel_gt_init_hw() --> GuC FW loading happens, GuC comes out of GS_MIA_IN_RESET. Fix the above flow so that the GuC reset happens after all the engine resets are done. Cc: John Harrison Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_reset.c | 9 -- drivers/gpu/drm/i915/gt/uc/intel_uc.c | 42 +-- drivers/gpu/drm/i915/gt/uc/intel_uc.h | 1 + 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index c8e9aa41fdea..9ebd68ce0c22 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -879,8 +879,11 @@ static intel_engine_mask_t reset_prepare(struct intel_gt *gt) intel_engine_mask_t awake = 0; enum intel_engine_id id; - /* For GuC mode, ensure submission is disabled before stopping ring */ - intel_uc_reset_prepare(&gt->uc); + /* +* For GuC mode, ensure submission is disabled before stopping ring. +* Don't reset the GuC as an engine reset requires GuC to be running. +*/ + intel_uc_reset_prepare_without_guc_reset(&gt->uc); for_each_engine(engine, gt, id) { if (intel_engine_pm_get_if_awake(engine)) @@ -1227,6 +1230,8 @@ void intel_gt_reset(struct intel_gt *gt, intel_overlay_reset(gt->i915); + /* Now that all engines are clean, Reset the GuC */ + intel_uc_reset_prepare(&gt->uc); /* * Next we need to restore the context, but we don't use those * yet either... 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c index 7a63abf8f644..5feee4db2ccc 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c @@ -345,7 +345,7 @@ static void __uc_fini(struct intel_uc *uc) intel_guc_fini(&uc->guc); } -static int __uc_sanitize(struct intel_uc *uc) +static void __uc_sanitize_without_guc_reset(struct intel_uc *uc) { struct intel_guc *guc = &uc->guc; struct intel_huc *huc = &uc->huc; @@ -354,7 +354,11 @@ static int __uc_sanitize(struct intel_uc *uc) intel_huc_sanitize(huc); intel_guc_sanitize(guc); +} +static int __uc_sanitize(struct intel_uc *uc) +{ + __uc_sanitize_without_guc_reset(uc); return __intel_uc_reset_hw(uc); } @@ -593,13 +597,7 @@ static void __uc_fini_hw(struct intel_uc *uc) __uc_sanitize(uc); } -/** - * intel_uc_reset_prepare - Prepare for reset - * @uc: the intel_uc structure - * - * Preparing for full gpu reset. - */ -void intel_uc_reset_prepare(struct intel_uc *uc) +static void __intel_uc_reset_prepare(struct intel_uc *uc, bool reset_guc) { struct intel_guc *guc = &uc->guc; @@ -617,9 +615,35 @@ void intel_uc_reset_prepare(struct intel_uc *uc) intel_guc_submission_reset_prepare(guc); sanitize: - __uc_sanitize(uc); + if (reset_guc) + __uc_sanitize(uc); + else + __uc_sanitize_without_guc_reset(uc); } +/** + * intel_uc_reset_prepare - Prepare for reset + * @uc: the intel_uc structure + * + * Preparing for full gpu reset. + */ +void intel_uc_reset_prepare(struct intel_uc *uc) +{ + __intel_uc_reset_prepare(uc, true); +} +/** + * intel_uc_reset_prepare_without_guc_reset - Prepare for reset but don't reset + * the GuC + * @uc: the intel_uc structure + * + * Preparing for full gpu reset. 
+ */ +void intel_uc_reset_prepare_without_guc_reset(struct intel_uc *uc) +{ + __intel_uc_reset_prepare(uc, false); +} + + void intel_uc_reset(struct intel_uc *uc, intel_engine_mask_t stalled) { struct intel_guc *guc = &uc->guc; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.h b/drivers/gpu/drm/i915/gt/uc/intel_uc.h index 014bb7d83689..9d6191ece498 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.h @@ -46,6 +46,7 @@ void intel_uc_driver_late_release(struct intel_uc *uc); void intel_uc_driver_remove(struct intel_uc *uc); void intel_uc_init_mmio(struct intel_uc *uc); void intel_uc_reset_prepare(struct intel_uc *uc); +void intel_uc_reset_prepare_without_guc_reset(struct intel_uc *uc); void intel_uc_reset(struct intel_uc *uc, intel_engine_mask_t stalled); void intel_uc_reset_finish(struct intel_uc *uc); void intel_uc_cancel_requests(struct intel_uc *uc); -- 2.42.0
Re: [PATCH v2 2/2] drm/i915/gem: Calculate object page offset for partial memory mapping
Hi Andi, On 3/29/2024 5:39 PM, Andi Shyti wrote: To enable partial memory mapping of GPU virtual memory, it's necessary to introduce an offset to the object's memory (obj->mm.pages) scatterlist. This adjustment compensates for instances when userspace mappings do not start from the beginning of the object. I quickly tried https://gitlab.freedesktop.org/llandwerlin/igt-gpu-tools/-/tree/wip/gem_mmap_offset-partial-unmap?ref_type=heads that didn't work for GTT. Please make sure a proper IGT test is available for this as this looks very risky change. Regards, Nirmoy Based on a patch by Chris Wilson. Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Lionel Landwerlin --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 10 +++--- drivers/gpu/drm/i915/i915_mm.c | 12 +++- drivers/gpu/drm/i915/i915_mm.h | 3 ++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index ce10dd259812..9bd2b4c2e501 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -252,6 +252,7 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf) struct vm_area_struct *area = vmf->vma; struct i915_mmap_offset *mmo = area->vm_private_data; struct drm_i915_gem_object *obj = mmo->obj; + unsigned long obj_offset; resource_size_t iomap; int err; @@ -273,10 +274,11 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf) iomap -= obj->mm.region->region.start; } + obj_offset = area->vm_pgoff - drm_vma_node_start(&mmo->vma_node); /* PTEs are revoked in obj->ops->put_pages() */ err = remap_io_sg(area, area->vm_start, area->vm_end - area->vm_start, - obj->mm.pages->sgl, iomap); + obj->mm.pages->sgl, obj_offset, iomap); if (area->vm_flags & VM_WRITE) { GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); @@ -302,14 +304,16 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) struct i915_ggtt *ggtt = to_gt(i915)->ggtt; bool write = area->vm_flags & VM_WRITE; struct i915_gem_ww_ctx 
ww; + unsigned long obj_offset; intel_wakeref_t wakeref; struct i915_vma *vma; pgoff_t page_offset; int srcu; int ret; - /* We don't use vmf->pgoff since that has the fake offset */ + obj_offset = area->vm_pgoff - drm_vma_node_start(&mmo->vma_node); page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT; + page_offset += obj_offset; trace_i915_gem_object_fault(obj, page_offset, true, write); @@ -404,7 +408,7 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) /* Finally, remap it using the new GTT offset */ ret = remap_io_mapping(area, - area->vm_start + (vma->gtt_view.partial.offset << PAGE_SHIFT), + area->vm_start + ((vma->gtt_view.partial.offset - obj_offset) << PAGE_SHIFT), (ggtt->gmadr.start + i915_ggtt_offset(vma)) >> PAGE_SHIFT, min_t(u64, vma->size, area->vm_end - area->vm_start), &ggtt->iomap); diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c index 7998bc74ab49..f5c97a620962 100644 --- a/drivers/gpu/drm/i915/i915_mm.c +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -122,13 +122,15 @@ int remap_io_mapping(struct vm_area_struct *vma, * @addr: target user address to start at * @size: size of map area * @sgl: Start sg entry + * @offset: offset from the start of the page * @iobase: Use stored dma address offset by this address or pfn if -1 * * Note: this is only safe if the mm semaphore is held when called. */ int remap_io_sg(struct vm_area_struct *vma, unsigned long addr, unsigned long size, - struct scatterlist *sgl, resource_size_t iobase) + struct scatterlist *sgl, unsigned long offset, + resource_size_t iobase) { struct remap_pfn r = { .mm = vma->vm_mm, @@ -141,6 +143,14 @@ int remap_io_sg(struct vm_area_struct *vma, /* We rely on prevalidation of the io-mapping to skip track_pfn(). 
*/ GEM_BUG_ON((vma->vm_flags & EXPECTED_FLAGS) != EXPECTED_FLAGS); + while (offset >= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT) { + offset -= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT; + r.sgt = __sgt_iter(__sg_next(r.sgt.sgp), use_dma(iobase)); + if (!r.sgt.sgp) + return -EINVAL; + } + r.sgt.curr = offset << PAGE_SHIFT; +
Re: [PATCH v2 23/25] drm/xe/device: implement transient flush
Hi Bala, On 4/3/2024 1:22 PM, Balasubramani Vivekanandan wrote: From: Nirmoy Das Display surfaces can be tagged as transient by mapping it using one of the various L3:XD PAT index modes on Xe2. The expectation is that KMD needs to request transient data flush at the start of flip sequence to ensure all transient data in L3 cache is flushed to memory. Add a routine for this which we can then call from the display code. Signed-off-by: Nirmoy Das Co-developed-by: Matthew Auld Signed-off-by: Matthew Auld Signed-off-by: Balasubramani Vivekanandan --- drivers/gpu/drm/xe/regs/xe_gt_regs.h | 3 ++ drivers/gpu/drm/xe/xe_device.c | 52 drivers/gpu/drm/xe/xe_device.h | 2 ++ 3 files changed, 57 insertions(+) diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h index 6617c86a096b..7afe810b3441 100644 --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h @@ -306,6 +306,9 @@ #define XE2LPM_L3SQCREG5 XE_REG_MCR(0xb658) +#define XE2_TDF_CTRLXE_REG(0xb418) +#define TRANSIENT_FLUSH_REQUEST REG_BIT(0) + #define XEHP_MERT_MOD_CTRLXE_REG_MCR(0xcf28) #define RENDER_MOD_CTRL XE_REG_MCR(0xcf2c) #define COMP_MOD_CTRL XE_REG_MCR(0xcf30) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 01bd5ccf05ca..0c9769fe04f6 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -641,6 +641,58 @@ void xe_device_wmb(struct xe_device *xe) xe_mmio_write32(gt, SOFTWARE_FLAGS_SPR33, 0); } +/** + * xe_device_td_flush() - Flush transient L3 cache entries + * @xe: The device + * + * Display engine has direct access to memory and is never coherent with L3/L4 + * caches (or CPU caches), however KMD is responsible for specifically flushing + * transient L3 GPU cache entries prior to the flip sequence to ensure scanout + * can happen from such a surface without seeing corruption. 
+ * + * Display surfaces can be tagged as transient by mapping it using one of the + * various L3:XD PAT index modes on Xe2. + * + * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed + * at the end of each submission via PIPE_CONTROL for compute/render, since SA + * Media is not coherent with L3 and we want to support render-vs-media + * usecases. For other engines like copy/blt the HW internally forces uncached + * behaviour, hence why we can skip the TDF on such platforms. + */ +void xe_device_td_flush(struct xe_device *xe) +{ + struct xe_gt *gt; + int err; + u8 id; + + if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20) + return; + + for_each_gt(gt, xe, id) { + if (xe_gt_is_media_type(gt)) + continue; + + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + return; This can be if (xe_force_wake_get()..) without needing the err variable. Sorry, this was my oversight from this morning. Regards, Nirmoy + + xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST); + /* +* FIXME: We can likely do better here with our choice of +* timeout. Currently we just assume the worst case, but really +* we should make this dependent on how much actual L3 there is +* for this system. Recommendation is to allow ~64us in the worst +* case for 8M of L3 (assumes all entries are transient and need +* to be flushed). +*/ + if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0, + 150, NULL, false)) + xe_gt_err_once(gt, "TD flush timeout\n"); + + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); + } +} + u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size) { return xe_device_has_flat_ccs(xe) ? 
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index d413bc2c6be5..d3430f4b820a 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -176,4 +176,6 @@ void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p); u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address); u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address); +void xe_device_td_flush(struct xe_device *xe); + #endif
Re: [PATCH v2 24/25] drm/i915/display: perform transient flush
+Jouni On 4/3/2024 1:22 PM, Balasubramani Vivekanandan wrote: From: Matthew Auld Perform manual transient cache flush prior to flip and at the end of frontbuffer_flush. This is needed to ensure display engine doesn't see garbage if the surface is L3:XD dirty. Testcase: igt@xe-pat@display-vs-wb-transient Signed-off-by: Matthew Auld Signed-off-by: Balasubramani Vivekanandan Acked-by: Nirmoy Das --- drivers/gpu/drm/i915/display/intel_display.c | 3 +++ .../gpu/drm/i915/display/intel_frontbuffer.c | 2 ++ drivers/gpu/drm/i915/display/intel_tdf.h | 25 +++ drivers/gpu/drm/xe/Makefile | 3 ++- drivers/gpu/drm/xe/display/xe_tdf.c | 13 ++ 5 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/i915/display/intel_tdf.h create mode 100644 drivers/gpu/drm/xe/display/xe_tdf.c diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index aed25890b6f5..0a720e9d12a7 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -110,6 +110,7 @@ #include "intel_sdvo.h" #include "intel_snps_phy.h" #include "intel_tc.h" +#include "intel_tdf.h" #include "intel_tv.h" #include "intel_vblank.h" #include "intel_vdsc.h" @@ -7095,6 +7096,8 @@ static void intel_atomic_commit_tail(struct intel_atomic_state *state) intel_atomic_commit_fence_wait(state); + intel_td_flush(dev_priv); + drm_atomic_helper_wait_for_dependencies(&state->base); drm_dp_mst_atomic_wait_for_dependencies(&state->base); intel_atomic_global_state_wait_for_dependencies(state); diff --git a/drivers/gpu/drm/i915/display/intel_frontbuffer.c b/drivers/gpu/drm/i915/display/intel_frontbuffer.c index 2ea37c0414a9..4923c340a0b6 100644 --- a/drivers/gpu/drm/i915/display/intel_frontbuffer.c +++ b/drivers/gpu/drm/i915/display/intel_frontbuffer.c @@ -65,6 +65,7 @@ #include "intel_fbc.h" #include "intel_frontbuffer.h" #include "intel_psr.h" +#include "intel_tdf.h" /** * frontbuffer_flush - flush frontbuffer @@ -93,6 
+94,7 @@ static void frontbuffer_flush(struct drm_i915_private *i915, trace_intel_frontbuffer_flush(i915, frontbuffer_bits, origin); might_sleep(); + intel_td_flush(i915); intel_drrs_flush(i915, frontbuffer_bits); intel_psr_flush(i915, frontbuffer_bits, origin); intel_fbc_flush(i915, frontbuffer_bits, origin); diff --git a/drivers/gpu/drm/i915/display/intel_tdf.h b/drivers/gpu/drm/i915/display/intel_tdf.h new file mode 100644 index ..353cde21f6c2 --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_tdf.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2024 Intel Corporation + */ + +#ifndef __INTEL_TDF_H__ +#define __INTEL_TDF_H__ + +/* + * TDF (Transient-Data-Flush) is needed for Xe2+ where special L3:XD caching can + * be enabled through various PAT index modes. Idea is to use this caching mode + * when for example rendering onto the display surface, with the promise that + * KMD will ensure transient cache entries are always flushed by the time we do + * the display flip, since display engine is never coherent with CPU/GPU caches. 
+ */ + +struct drm_i915_private; + +#ifdef I915 +static inline void intel_td_flush(struct drm_i915_private *i915) {} +#else +void intel_td_flush(struct drm_i915_private *i915); +#endif + +#endif diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index e5b1715f721e..401a4492c625 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -196,7 +196,8 @@ xe-$(CONFIG_DRM_XE_DISPLAY) += \ display/xe_dsb_buffer.o \ display/xe_fb_pin.o \ display/xe_hdcp_gsc.o \ - display/xe_plane_initial.o + display/xe_plane_initial.o \ + display/xe_tdf.o # SOC code shared with i915 xe-$(CONFIG_DRM_XE_DISPLAY) += \ diff --git a/drivers/gpu/drm/xe/display/xe_tdf.c b/drivers/gpu/drm/xe/display/xe_tdf.c new file mode 100644 index ..2c0d4e144e09 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_tdf.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2024 Intel Corporation + */ + +#include "xe_device.h" +#include "intel_display_types.h" +#include "intel_tdf.h" + +void intel_td_flush(struct drm_i915_private *i915) +{ + xe_device_td_flush(i915); +}
Re: [PATCH v2 22/25] drm/xe/gt_print: add xe_gt_err_once()
On 4/3/2024 1:22 PM, Balasubramani Vivekanandan wrote: From: Matthew Auld Needed in an upcoming patch, where we want GT level print, but only wish to trigger it once to avoid flooding dmesg. Signed-off-by: Matthew Auld Signed-off-by: Balasubramani Vivekanandan Reviewed-by: Nirmoy Das --- drivers/gpu/drm/xe/xe_gt_printk.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_gt_printk.h b/drivers/gpu/drm/xe/xe_gt_printk.h index c2b004d3f48e..d6228baaff1e 100644 --- a/drivers/gpu/drm/xe/xe_gt_printk.h +++ b/drivers/gpu/drm/xe/xe_gt_printk.h @@ -13,6 +13,9 @@ #define xe_gt_printk(_gt, _level, _fmt, ...) \ drm_##_level(&gt_to_xe(_gt)->drm, "GT%u: " _fmt, (_gt)->info.id, ##__VA_ARGS__) +#define xe_gt_err_once(_gt, _fmt, ...) \ + xe_gt_printk((_gt), err_once, _fmt, ##__VA_ARGS__) + #define xe_gt_err(_gt, _fmt, ...) \ xe_gt_printk((_gt), err, _fmt, ##__VA_ARGS__)
Re: [PATCH 23/25] drm/xe/device: implement transient flush
There is new fixup patch(PR#630) which modifies this patch. Could you please bring that in as well. Regards, Nirmoy On 4/3/2024 12:51 PM, Balasubramani Vivekanandan wrote: From: Nirmoy Das Display surfaces can be tagged as transient by mapping it using one of the various L3:XD PAT index modes on Xe2. The expectation is that KMD needs to request transient data flush at the start of flip sequence to ensure all transient data in L3 cache is flushed to memory. Add a routine for this which we can then call from the display code. Signed-off-by: Nirmoy Das Co-developed-by: Matthew Auld Signed-off-by: Matthew Auld Signed-off-by: Balasubramani Vivekanandan --- drivers/gpu/drm/xe/regs/xe_gt_regs.h | 3 ++ drivers/gpu/drm/xe/xe_device.c | 49 drivers/gpu/drm/xe/xe_device.h | 2 ++ 3 files changed, 54 insertions(+) diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h index d5b21f03beaa..9c6549830e24 100644 --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h @@ -305,6 +305,9 @@ #define XE2LPM_L3SQCREG5 XE_REG_MCR(0xb658) +#define XE2_TDF_CTRLXE_REG(0xb418) +#define TRANSIENT_FLUSH_REQUEST REG_BIT(0) + #define XEHP_MERT_MOD_CTRLXE_REG_MCR(0xcf28) #define RENDER_MOD_CTRL XE_REG_MCR(0xcf2c) #define COMP_MOD_CTRL XE_REG_MCR(0xcf30) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 01bd5ccf05ca..66182220e663 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -641,6 +641,55 @@ void xe_device_wmb(struct xe_device *xe) xe_mmio_write32(gt, SOFTWARE_FLAGS_SPR33, 0); } +/** + * xe_device_td_flush() - Flush transient L3 cache entries + * @xe: The device + * + * Display engine has direct access to memory and is never coherent with L3/L4 + * caches (or CPU caches), however KMD is responsible for specifically flushing + * transient L3 GPU cache entries prior to the flip sequence to ensure scanout + * can happen from such a surface without seeing corruption. 
+ * + * Display surfaces can be tagged as transient by mapping it using one of the + * various L3:XD PAT index modes on Xe2. + * + * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed + * at the end of each submission via PIPE_CONTROL for compute/render, since SA + * Media is not coherent with L3 and we want to support render-vs-media + * usescases. For other engines like copy/blt the HW internally forces uncached + * behaviour, hence why we can skip the TDF on such platforms. + */ +void xe_device_td_flush(struct xe_device *xe) +{ + struct xe_gt *gt; + u8 id; + + if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20) + return; + + for_each_gt(gt, xe, id) { + if (xe_gt_is_media_type(gt)) + continue; + + xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + + xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST); + /* +* FIXME: We can likely do better here with our choice of +* timeout. Currently we just assume the worst case, but really +* we should make this dependent on how much actual L3 there is +* for this system. Recomendation is to allow ~64us in the worst +* case for 8M of L3 (assumes all entries are transient and need +* to be flushed). +*/ + if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0, + 150, NULL, false)) + xe_gt_err_once(gt, "TD flush timeout\n"); + + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); + } +} + u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size) { return xe_device_has_flat_ccs(xe) ? diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index d413bc2c6be5..d3430f4b820a 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -176,4 +176,6 @@ void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p); u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address); u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address); +void xe_device_td_flush(struct xe_device *xe); + #endif
Re: [PATCH] drm/i915/guc: Fix the fix for reset lock confusion
On 3/30/2024 12:53 AM, john.c.harri...@intel.com wrote: From: John Harrison The previous fix for the circular lock splat about the busyness worker wasn't quite complete. Even though the reset-in-progress flag is cleared at the start of intel_uc_reset_finish, the entire function is still inside the reset mutex lock. Not sure why the patch appeared to fix the issue both locally and in CI. However, it is now back again. There is a further complication: the wedge code path within intel_gt_reset() jumps around so much that it results in nested reset_prepare/_finish calls. That is, the call sequence is: intel_gt_reset | reset_prepare | __intel_gt_set_wedged | | reset_prepare | | reset_finish | reset_finish The nested finish means that even if the clear of the in-progress flag was moved to the end of _finish, it would still be clear for the entire second call. Surprisingly, this does not seem to be causing any other problems at present. As an aside, a wedge on fini does not call the finish functions at all. The reset_in_progress flag is left set (twice). So instead of trying to cancel the worker anywhere at all in the reset path, just add a cancel to intel_guc_submission_fini instead. Note that it is not a problem if the worker is still active during a reset. Either it will run before the reset path starts locking things and will simply block the reset code for a tiny amount of time. Or it will run after the locks have been acquired and will early exit due to the try-lock. Also, do not use the reset-in-progress flag to decide whether a synchronous cancel is safe (from a lockdep perspective) or not. Instead, use the actual reset mutex state (both the genuine one and the custom rolled BACKOFF one). 
Fixes: 0e00a8814eec ("drm/i915/guc: Avoid circular locking issue on busyness flush") Signed-off-by: John Harrison Cc: Zhanjun Dong Cc: John Harrison Cc: Andi Shyti Cc: Daniel Vetter Cc: Daniel Vetter Cc: Rodrigo Vivi Cc: Nirmoy Das Cc: Tvrtko Ursulin Cc: Umesh Nerlige Ramappa Cc: Andrzej Hajda Cc: Matt Roper Cc: Jonathan Cavitt Cc: Prathap Kumar Valsan Cc: Alan Previn Cc: Madhumitha Tolakanahalli Pradeep Cc: Daniele Ceraolo Spurio Cc: Ashutosh Dixit Cc: Dnyaneshwar Bhadane Thanks for the details, looks good to me: Reviewed-by: Nirmoy Das --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 23 --- drivers/gpu/drm/i915/gt/uc/intel_uc.c | 4 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 16640d6dd0589..00757d6333e88 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1403,14 +1403,17 @@ static void guc_cancel_busyness_worker(struct intel_guc *guc) * Trying to pass a 'need_sync' or 'in_reset' flag all the way down through * every possible call stack is unfeasible. It would be too intrusive to many * areas that really don't care about the GuC backend. However, there is the -* 'reset_in_progress' flag available, so just use that. +* I915_RESET_BACKOFF flag and the gt->reset.mutex can be tested for is_locked. +* So just use those. Note that testing both is required due to the hideously +* complex nature of the i915 driver's reset code paths. * * And note that in the case of a reset occurring during driver unload -* (wedge_on_fini), skipping the cancel in _prepare (when the reset flag is set -* is fine because there is another cancel in _finish (when the reset flag is -* not). 
+* (wedged_on_fini), skipping the cancel in reset_prepare/reset_fini (when the +* reset flag/mutex are set) is fine because there is another explicit cancel in +* intel_guc_submission_fini (when the reset flag/mutex are not). */ - if (guc_to_gt(guc)->uc.reset_in_progress) + if (mutex_is_locked(&guc_to_gt(guc)->reset.mutex) || + test_bit(I915_RESET_BACKOFF, &guc_to_gt(guc)->reset.flags)) cancel_delayed_work(&guc->timestamp.work); else cancel_delayed_work_sync(&guc->timestamp.work); @@ -1424,8 +1427,6 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) unsigned long flags; ktime_t unused; - guc_cancel_busyness_worker(guc); - spin_lock_irqsave(&guc->timestamp.lock, flags); guc_update_pm_timestamp(guc, &unused); @@ -2004,13 +2005,6 @@ void intel_guc_submission_cancel_requests(struct intel_guc *guc) void intel_guc_submission_reset_finish(struct intel_guc *guc) { - /* -* Ensure the busyness worker gets cancelled even on a fatal wedge. -* Note that reset_prepare is
Re: [PATCH] drm/i915/gt: Limit the reserved VM space to only the platforms that need it
Hi Andi, On 3/27/2024 9:05 PM, Andi Shyti wrote: Commit 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm") reduces the available VM space by one page in order to apply Wa_16018031267 and Wa_16018063123. This page was reserved indiscriminately on all platforms even when not needed. Limit it to DG2 onwards. I would use "Limit it to platforms that need WAs" as those WAs are only needed up to 12.71, otherwise Reviewed-by: Nirmoy Das Fixes: 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm") Signed-off-by: Andi Shyti Cc: Andrzej Hajda Cc: Chris Wilson Cc: Jonathan Cavitt Cc: Nirmoy Das --- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 3 +++ drivers/gpu/drm/i915/gt/intel_gt.c | 6 ++ drivers/gpu/drm/i915/gt/intel_gt.h | 9 + 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index 1bd0e041e15c..398d60a66410 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -961,6 +961,9 @@ static int gen8_init_rsvd(struct i915_address_space *vm) struct i915_vma *vma; int ret; + if (!intel_gt_needs_wa_16018031267(vm->gt)) + return 0; + /* The memory will be used only by GPU. 
*/ obj = i915_gem_object_create_lmem(i915, PAGE_SIZE, I915_BO_ALLOC_VOLATILE | diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index 2c6d31b8fc1a..580b5141ce1e 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -1024,6 +1024,12 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt, return I915_MAP_WC; } +bool intel_gt_needs_wa_16018031267(struct intel_gt *gt) +{ + /* Wa_16018031267, Wa_16018063123 */ + return IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 55), IP_VER(12, 71)); +} + bool intel_gt_needs_wa_22016122933(struct intel_gt *gt) { return MEDIA_VER_FULL(gt->i915) == IP_VER(13, 0) && gt->type == GT_MEDIA; diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index 6e7cab60834c..b5e114d284ad 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -82,17 +82,18 @@ struct drm_printer; ##__VA_ARGS__); \ } while (0) -#define NEEDS_FASTCOLOR_BLT_WABB(engine) ( \ - IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 55), IP_VER(12, 71)) && \ - engine->class == COPY_ENGINE_CLASS && engine->instance == 0) - static inline bool gt_is_root(struct intel_gt *gt) { return !gt->info.id; } +bool intel_gt_needs_wa_16018031267(struct intel_gt *gt); bool intel_gt_needs_wa_22016122933(struct intel_gt *gt); +#define NEEDS_FASTCOLOR_BLT_WABB(engine) ( \ + intel_gt_needs_wa_16018031267(engine->gt) && \ + engine->class == COPY_ENGINE_CLASS && engine->instance == 0) + static inline struct intel_gt *uc_to_gt(struct intel_uc *uc) { return container_of(uc, struct intel_gt, uc);
Re: [PATCH] drm/i915/gem: Calculate object page offset for partial memory mapping
Hi Andi, On 3/26/2024 12:12 PM, Andi Shyti wrote: Hi Nirmoy, ... diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index a2195e28b625..57a2dda2c3cc 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -276,7 +276,7 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf) /* PTEs are revoked in obj->ops->put_pages() */ err = remap_io_sg(area, area->vm_start, area->vm_end - area->vm_start, - obj->mm.pages->sgl, iomap); + obj->mm.pages->sgl, 0, iomap); Why don't we need partial mmap for CPU but only for GTT? As far as I understood we don't. I have a version with the CPU offset as well in trybot[*] But without support for segmented buffer objects, I don't know how much effect this has. You confused me more :) Why is a segmented buffer object needed for partial CPU mmap but not for GTT? From a high level, GTT and CPU should both support partial mmap unless I am missing something here. Sounds like this also needs to be covered by an IGT test. Yes, it does need some igt work, working on it. Don't we need a "Fixes" tag for this? Why should we? I'm not fixing anything here, If userspace expects partial mmap to work then this is a bug/gap in i915 so we need to backport this as far as possible. Could you share some information about why we suddenly need this patch? Regards, Nirmoy I'm just recalculating the mapping not starting from the beginning of the scatter page. Andi [*] https://patchwork.freedesktop.org/patch/584474/?series=131539&rev=2
Re: [PATCH] drm/i915/gem: Calculate object page offset for partial memory mapping
Hi Andi, I have too many questions :) I think the patch makes sense but need more context, see below: On 3/25/2024 2:40 PM, Andi Shyti wrote: To enable partial memory mapping of GPU virtual memory, it's necessary to introduce an offset to the object's memory (obj->mm.pages) scatterlist. This adjustment compensates for instances when userspace mappings do not start from the beginning of the object. Based on a patch by Chris Wilson . Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Lionel Landwerlin --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 8 +--- drivers/gpu/drm/i915/i915_mm.c | 12 +++- drivers/gpu/drm/i915/i915_mm.h | 3 ++- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index a2195e28b625..57a2dda2c3cc 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -276,7 +276,7 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf) /* PTEs are revoked in obj->ops->put_pages() */ err = remap_io_sg(area, area->vm_start, area->vm_end - area->vm_start, - obj->mm.pages->sgl, iomap); + obj->mm.pages->sgl, 0, iomap); Why don't we need partial mmap for CPU but only for GTT ? Sounds like this also need to be cover by a IGT tests. Don't we need "Fixes" tag for this? 
Regards, Nirmoy if (area->vm_flags & VM_WRITE) { GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); @@ -302,14 +302,16 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) struct i915_ggtt *ggtt = to_gt(i915)->ggtt; bool write = area->vm_flags & VM_WRITE; struct i915_gem_ww_ctx ww; + unsigned long obj_offset; intel_wakeref_t wakeref; struct i915_vma *vma; pgoff_t page_offset; int srcu; int ret; - /* We don't use vmf->pgoff since that has the fake offset */ + obj_offset = area->vm_pgoff - drm_vma_node_start(&mmo->vma_node); page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT; + page_offset += obj_offset; trace_i915_gem_object_fault(obj, page_offset, true, write); @@ -404,7 +406,7 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) /* Finally, remap it using the new GTT offset */ ret = remap_io_mapping(area, - area->vm_start + (vma->gtt_view.partial.offset << PAGE_SHIFT), + area->vm_start + ((vma->gtt_view.partial.offset - obj_offset) << PAGE_SHIFT), (ggtt->gmadr.start + i915_ggtt_offset(vma)) >> PAGE_SHIFT, min_t(u64, vma->size, area->vm_end - area->vm_start), &ggtt->iomap); diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c index 7998bc74ab49..f5c97a620962 100644 --- a/drivers/gpu/drm/i915/i915_mm.c +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -122,13 +122,15 @@ int remap_io_mapping(struct vm_area_struct *vma, * @addr: target user address to start at * @size: size of map area * @sgl: Start sg entry + * @offset: offset from the start of the page * @iobase: Use stored dma address offset by this address or pfn if -1 * * Note: this is only safe if the mm semaphore is held when called. 
*/ int remap_io_sg(struct vm_area_struct *vma, unsigned long addr, unsigned long size, - struct scatterlist *sgl, resource_size_t iobase) + struct scatterlist *sgl, unsigned long offset, + resource_size_t iobase) { struct remap_pfn r = { .mm = vma->vm_mm, @@ -141,6 +143,14 @@ int remap_io_sg(struct vm_area_struct *vma, /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ GEM_BUG_ON((vma->vm_flags & EXPECTED_FLAGS) != EXPECTED_FLAGS); + while (offset >= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT) { + offset -= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT; + r.sgt = __sgt_iter(__sg_next(r.sgt.sgp), use_dma(iobase)); + if (!r.sgt.sgp) + return -EINVAL; + } + r.sgt.curr = offset << PAGE_SHIFT; + if (!use_dma(iobase)) flush_cache_range(vma, addr, size); diff --git a/drivers/gpu/drm/i915/i915_mm.h b/drivers/gpu/drm/i915/i915_mm.h index 04c8974d822b..69f9351b1a1c 100644 --- a/drivers/gpu/drm/i915/i915_mm.h +++ b/drivers/gpu/drm/i915/i915_mm.h @@ -30,6 +30,7 @@ int remap_io_mapping(struct vm_area_struct *vma, int remap_io_sg(struct vm_area_struct *vma, unsigned long addr, unsigned long size, - struct scatterlist *sgl, resource_size_t iobase); + struct scatterlist *sgl, unsigned long offset, + resource_size_t iobase); #endif /* __I915_MM_H__ */
Re: [PATCH v2] drm/i915/gt: Report full vm address range
Hi Andi, On 3/21/2024 4:17 PM, Andi Shyti wrote: Commit 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm") has reserved an object for kernel space usage. Userspace, though, needs to know the full address range. In the former patch the reserved space was subtracted from the total amount of the VM space. Add it back when the user requests the GTT size through ioctl (I915_CONTEXT_PARAM_GTT_SIZE). Fixes: 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm") Signed-off-by: Andi Shyti Cc: Andrzej Hajda Cc: Chris Wilson Cc: Lionel Landwerlin Cc: Michal Mrozek Cc: Nirmoy Das Cc: # v6.2+ Acked-by: Michal Mrozek Acked-by: Lionel Landwerlin --- Hi, Just proposing a different implementation that doesn't affect i915 internally but provides the same result. Instead of not subtracting the space during the reservation, I add it back during the ioctl call. All the "vm->rsvd.vma->node.size" looks a bit ugly, Yes, this needs to be documented and also vm->total should be vm->total and maybe we should have vm->usable which will be used by kernel internal and return vm->total. For me, I am fine with the kernel change as long as UMD is aware of/fine with the side-effect if UMD ends up using the reserved page. Basically we need to document this well :) Also maybe we should limit reserving this page to platforms where it is required? Regards, Nirmoy but that's how it is. Maybe a comment can help to understand better why there is this addition. I kept the Ack from Michal and Lionel, because the outcome from userspace perspective doesn't really change. 
Andi drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 81f65cab1330..60d9e7fe33b3 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -2454,7 +2454,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, case I915_CONTEXT_PARAM_GTT_SIZE: args->size = 0; vm = i915_gem_context_get_eb_vm(ctx); - args->value = vm->total; + args->value = vm->total + vm->rsvd.vma->node.size; i915_vm_put(vm); break;
Re: [PATCH v2] drm/i915/gem: Execbuffer objects must have struct pages.
On 3/12/2024 3:55 PM, Jonathan Cavitt wrote: We cannot write requests to objects without struct pages, so escape early if the requests are bound to objects that lack them. Signed-off-by: Jonathan Cavitt --- v2: s/vma-obj/vma->obj drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index d3a771afb083e..adb4f9e78cb49 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -3313,6 +3313,13 @@ eb_requests_create(struct i915_execbuffer *eb, struct dma_fence *in_fence, unsigned int i; for_each_batch_create_order(eb, i) { + /* Do not write requests to objects without struct pages. */ + if (eb->batches[i]->vma && + !i915_gem_object_has_struct_page(eb->batches[i]->vma->obj)) { As far as I understand, the motivation of this patch is to avoid doing execbuf on dmabuf-imported BOs which are in an error state or something. i915_gem_object_has_struct_page() checks "obj->mem_flags & I915_BO_FLAG_STRUCT_PAGE" which is very i915 specific. So I think this will not work and will cause a regression in existing programs which are trying to do the same with a valid BO. Unfortunately I don't have any idea how to better detect that at this moment. Regards, Nirmoy + out_fence = ERR_PTR(-EINVAL); + return out_fence; + } + /* Allocate a request for this batch buffer nice and early. */ eb->requests[i] = i915_request_create(eb_find_context(eb, i)); if (IS_ERR(eb->requests[i])) {
Re: [PATCH] drm/i915/gt: Report full vm address range
On 3/14/2024 3:04 PM, Lionel Landwerlin wrote: Hi Andi, In Mesa we've been relying on I915_CONTEXT_PARAM_GTT_SIZE so as long as that is adjusted by the kernel What do you mean by "adjusted by", should it be an aligned size? The I915_CONTEXT_PARAM_GTT_SIZE ioctl is returning vm->total which is adjusted (reduced by a page). This patch might cause silent errors as it is not removing the WABB which is using the reserved page to add a dummy blt, and if userspace is using that page then it will be overwritten. Regards, Nirmoy , we should be able to continue working without issues. Acked-by: Lionel Landwerlin Thanks, -Lionel On 13/03/2024 21:39, Andi Shyti wrote: Commit 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm") has reserved an object for kernel space usage. Userspace, though, needs to know the full address range. Fixes: 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm") Signed-off-by: Andi Shyti Cc: Andrzej Hajda Cc: Chris Wilson Cc: Lionel Landwerlin Cc: Michal Mrozek Cc: Nirmoy Das Cc: # v6.2+ --- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index fa46d2308b0e..d76831f50106 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -982,8 +982,9 @@ static int gen8_init_rsvd(struct i915_address_space *vm) vm->rsvd.vma = i915_vma_make_unshrinkable(vma); vm->rsvd.obj = obj; - vm->total -= vma->node.size; + return 0; + unref: i915_gem_object_put(obj); return ret;
Re: [PATCH] drm/i915/selftests: Pick correct caching mode.
On 3/12/2024 3:28 PM, Andi Shyti wrote: Hi Nirmoy, On Tue, Mar 12, 2024 at 12:18:15PM +0100, Nirmoy Das wrote: Caching mode is HW dependent so pick a correct one using intel_gt_coherent_map_type(). Cc: Andi Shyti Cc: Janusz Krzysztofik Cc: Jonathan Cavitt Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10249 Signed-off-by: Nirmoy Das I think it's a good choice not to have the Fixes tag here either. Yes, fixes tag isn't needed for selftests Reviewed-by: Andi Shyti Thanks, Nirmoy Thanks, Andi
[PATCH] drm/i915/selftests: Pick correct caching mode.
Caching mode is HW dependent so pick a correct one using intel_gt_coherent_map_type(). Cc: Andi Shyti Cc: Janusz Krzysztofik Cc: Jonathan Cavitt Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10249 Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c index d684a70f2c04..65a931ea80e9 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c @@ -7,6 +7,7 @@ #include "i915_drv.h" #include "i915_selftest.h" #include "gem/i915_gem_context.h" +#include "gt/intel_gt.h" #include "mock_context.h" #include "mock_dmabuf.h" @@ -155,6 +156,7 @@ static int verify_access(struct drm_i915_private *i915, struct file *file; u32 *vaddr; int err = 0, i; + unsigned int mode; file = mock_file(i915); if (IS_ERR(file)) @@ -194,7 +196,8 @@ static int verify_access(struct drm_i915_private *i915, if (err) goto out_file; - vaddr = i915_gem_object_pin_map_unlocked(native_obj, I915_MAP_WB); + mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, true); + vaddr = i915_gem_object_pin_map_unlocked(native_obj, mode); if (IS_ERR(vaddr)) { err = PTR_ERR(vaddr); goto out_file; -- 2.42.0
Re: [PATCH v7 2/3] drm/i915: Remove extra multi-gt pm-references
On 3/5/2024 3:35 PM, Janusz Krzysztofik wrote: There was an attempt to fix an issue of illegal attempts to free a still active i915 VMA object when parking a GT believed to be idle, reported by CI on 2-GT Meteor Lake. As a solution, an extra wakeref for a Primary GT was acquired from i915_gem_do_execbuffer() -- see commit f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform"). However, that fix proved insufficient -- the issue was still reported by CI. That wakeref was released on exit from i915_gem_do_execbuffer(), thus potentially before completion of the request and deactivation of its associated VMAs. Moreover, CI reports indicated that single-GT platforms also suffered sporadically from the same race. Since the issue has now been fixed by a preceding patch "drm/i915/vma: Fix UAF on destroy against retire race", drop the no longer useful changes introduced by that insufficient fix. v3: Also drop the no longer used .wakeref_gt0 field from struct i915_execbuffer. v2: Avoid the word "revert" in commit message (Rodrigo), - update commit description reusing relevant chunks dropped from the description of the proper fix (Rodrigo). 
Signed-off-by: Janusz Krzysztofik Cc: Nirmoy Das Cc: Rodrigo Vivi Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 18 -- 1 file changed, 18 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index d3a771afb083e..3f20fe3811999 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -255,7 +255,6 @@ struct i915_execbuffer { struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */ intel_wakeref_t wakeref; - intel_wakeref_t wakeref_gt0; /** our requests to build */ struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; @@ -2686,7 +2685,6 @@ static int eb_select_engine(struct i915_execbuffer *eb) { struct intel_context *ce, *child; - struct intel_gt *gt; unsigned int idx; int err; @@ -2710,17 +2708,10 @@ eb_select_engine(struct i915_execbuffer *eb) } } eb->num_batches = ce->parallel.number_children + 1; - gt = ce->engine->gt; for_each_child(ce, child) intel_context_get(child); eb->wakeref = intel_gt_pm_get(ce->engine->gt); - /* -* Keep GT0 active on MTL so that i915_vma_parked() doesn't -* free VMAs while execbuf ioctl is validating VMAs. -*/ - if (gt->info.id) - eb->wakeref_gt0 = intel_gt_pm_get(to_gt(gt->i915)); if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { err = intel_context_alloc_state(ce); @@ -2759,9 +2750,6 @@ eb_select_engine(struct i915_execbuffer *eb) return err; err: - if (gt->info.id) - intel_gt_pm_put(to_gt(gt->i915), eb->wakeref_gt0); - intel_gt_pm_put(ce->engine->gt, eb->wakeref); for_each_child(ce, child) intel_context_put(child); @@ -2775,12 +2763,6 @@ eb_put_engine(struct i915_execbuffer *eb) struct intel_context *child; i915_vm_put(eb->context->vm); - /* -* This works in conjunction with eb_select_engine() to prevent -* i915_vma_parked() from interfering while execbuf validates vmas. 
-*/ - if (eb->gt->info.id) - intel_gt_pm_put(to_gt(eb->gt->i915), eb->wakeref_gt0); intel_gt_pm_put(eb->context->engine->gt, eb->wakeref); for_each_child(eb->context, child) intel_context_put(child);
Re: [PATCH v7 1/3] drm/i915/vma: Fix UAF on destroy against retire race
On 3/5/2024 3:35 PM, Janusz Krzysztofik wrote: Object debugging tools were sporadically reporting illegal attempts to free a still active i915 VMA object when parking a GT believed to be idle. [161.359441] ODEBUG: free active (active state 0) object: 88811643b958 object type: i915_active hint: __i915_vma_active+0x0/0x50 [i915] [161.360082] WARNING: CPU: 5 PID: 276 at lib/debugobjects.c:514 debug_print_object+0x80/0xb0 ... [161.360304] CPU: 5 PID: 276 Comm: kworker/5:2 Not tainted 6.5.0-rc1-CI_DRM_13375-g003f860e5577+ #1 [161.360314] Hardware name: Intel Corporation Rocket Lake Client Platform/RocketLake S UDIMM 6L RVP, BIOS RKLSFWI1.R00.3173.A03.2204210138 04/21/2022 [161.360322] Workqueue: i915-unordered __intel_wakeref_put_work [i915] [161.360592] RIP: 0010:debug_print_object+0x80/0xb0 ... [161.361347] debug_object_free+0xeb/0x110 [161.361362] i915_active_fini+0x14/0x130 [i915] [161.361866] release_references+0xfe/0x1f0 [i915] [161.362543] i915_vma_parked+0x1db/0x380 [i915] [161.363129] __gt_park+0x121/0x230 [i915] [161.363515] intel_wakeref_put_last+0x1f/0x70 [i915] That has been tracked down to be happening when another thread is deactivating the VMA inside __active_retire() helper, after the VMA's active counter has been already decremented to 0, but before deactivation of the VMA's object is reported to the object debugging tool. We could prevent from that race by serializing i915_active_fini() with __active_retire() via ref->tree_lock, but that wouldn't stop the VMA from being used, e.g. from __i915_vma_retire() called at the end of __active_retire(), after that VMA has been already freed by a concurrent i915_vma_destroy() on return from the i915_active_fini(). Then, we should rather fix the issue at the VMA level, not in i915_active. 
Since __i915_vma_parked() is called from __gt_park() on last put of the GT's wakeref, the issue could be addressed by holding the GT wakeref long enough for __active_retire() to complete before that wakeref is released and the GT parked. I believe the issue was introduced by commit d93939730347 ("drm/i915: Remove the vma refcount") which moved a call to i915_active_fini() from a dropped i915_vma_release(), called on last put of the removed VMA kref, to i915_vma_parked() processing path called on last put of a GT wakeref. However, its visibility to the object debugging tool was suppressed by a bug in i915_active that was fixed two weeks later with commit e92eb246feb9 ("drm/i915/active: Fix missing debug object activation"). A VMA associated with a request doesn't acquire a GT wakeref by itself. Instead, it depends on a wakeref held directly by the request's active intel_context for a GT associated with its VM, and indirectly on that intel_context's engine wakeref if the engine belongs to the same GT as the VMA's VM. Those wakerefs are released asynchronously to VMA deactivation. Fix the issue by getting a wakeref for the VMA's GT when activating it, and putting that wakeref only after the VMA is deactivated. However, exclude global GTT from that processing path, otherwise the GPU never goes idle. Since __i915_vma_retire() may be called from atomic contexts, use async variant of wakeref put. Also, to avoid circular locking dependency, take care of acquiring the wakeref before VM mutex when both are needed. v7: Add inline comments with justifications for: - using untracked variants of intel_gt_pm_get/put() (Nirmoy), - using async variant of _put(), - not getting the wakeref in case of a global GTT, - always getting the first wakeref outside vm->mutex. v6: Since __i915_vma_active/retire() callbacks are not serialized, storing a wakeref tracking handle inside struct i915_vma is not safe, and there is no other good place for that. 
Use untracked variants of intel_gt_pm_get/put_async(). v5: Replace "tile" with "GT" across commit description (Rodrigo), - avoid mentioning multi-GT case in commit description (Rodrigo), - explain why we need to take a temporary wakeref unconditionally inside i915_vma_pin_ww() (Rodrigo). v4: Refresh on top of commit 5e4e06e4087e ("drm/i915: Track gt pm wakerefs") (Andi), - for easier backporting, split out removal of former insufficient workarounds and move them to separate patches (Nirmoy). - clean up commit message and description a bit. v3: Identify root cause more precisely, and a commit to blame, - identify and drop former workarounds, - update commit message and description. v2: Get the wakeref before VM mutex to avoid circular locking dependency, - drop questionable Fixes: tag. Fixes: d93939730347 ("drm/i915: Remove the vma refcount") Closes: https://gitlab.freedesktop.org/drm/intel/issues/8875 Signed-off-by: Janusz Krzysztofik Cc: Thomas Hellström Cc: Nirmoy Das Cc: Andi Shyti Cc: Rodrigo Vivi Cc: sta...@vger.kernel.org # v5.19
Re: [PATCH v7 3/3] Revert "drm/i915: Wait for active retire before i915_active_fini()"
On 3/5/2024 3:35 PM, Janusz Krzysztofik wrote: This reverts commit 7a2280e8dcd2f1f436db9631287c0b21cf6a92b0, obsoleted by "drm/i915/vma: Fix UAF on destroy against retire race". Signed-off-by: Janusz Krzysztofik Cc: Nirmoy Das Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/i915_vma.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index b70715b1411d6..d2f064d2525cc 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -1776,8 +1776,6 @@ static void release_references(struct i915_vma *vma, struct intel_gt *gt, if (vm_ddestroy) i915_vm_resv_put(vma->vm); - /* Wait for async active retire */ - i915_active_wait(&vma->active); i915_active_fini(&vma->active); GEM_WARN_ON(vma->resource); i915_vma_free(vma);
Re: [PATCH v6 1/3] drm/i915/vma: Fix UAF on destroy against retire race
On 3/1/2024 8:29 AM, Janusz Krzysztofik wrote: Object debugging tools were sporadically reporting illegal attempts to free a still active i915 VMA object when parking a GT believed to be idle. [161.359441] ODEBUG: free active (active state 0) object: 88811643b958 object type: i915_active hint: __i915_vma_active+0x0/0x50 [i915] [161.360082] WARNING: CPU: 5 PID: 276 at lib/debugobjects.c:514 debug_print_object+0x80/0xb0 ... [161.360304] CPU: 5 PID: 276 Comm: kworker/5:2 Not tainted 6.5.0-rc1-CI_DRM_13375-g003f860e5577+ #1 [161.360314] Hardware name: Intel Corporation Rocket Lake Client Platform/RocketLake S UDIMM 6L RVP, BIOS RKLSFWI1.R00.3173.A03.2204210138 04/21/2022 [161.360322] Workqueue: i915-unordered __intel_wakeref_put_work [i915] [161.360592] RIP: 0010:debug_print_object+0x80/0xb0 ... [161.361347] debug_object_free+0xeb/0x110 [161.361362] i915_active_fini+0x14/0x130 [i915] [161.361866] release_references+0xfe/0x1f0 [i915] [161.362543] i915_vma_parked+0x1db/0x380 [i915] [161.363129] __gt_park+0x121/0x230 [i915] [161.363515] intel_wakeref_put_last+0x1f/0x70 [i915] That has been tracked down to be happening when another thread is deactivating the VMA inside __active_retire() helper, after the VMA's active counter has been already decremented to 0, but before deactivation of the VMA's object is reported to the object debugging tool. We could prevent from that race by serializing i915_active_fini() with __active_retire() via ref->tree_lock, but that wouldn't stop the VMA from being used, e.g. from __i915_vma_retire() called at the end of __active_retire(), after that VMA has been already freed by a concurrent i915_vma_destroy() on return from the i915_active_fini(). Then, we should rather fix the issue at the VMA level, not in i915_active. 
Since __i915_vma_parked() is called from __gt_park() on last put of the GT's wakeref, the issue could be addressed by holding the GT wakeref long enough for __active_retire() to complete before that wakeref is released and the GT parked. I believe the issue was introduced by commit d93939730347 ("drm/i915: Remove the vma refcount") which moved a call to i915_active_fini() from a dropped i915_vma_release(), called on last put of the removed VMA kref, to i915_vma_parked() processing path called on last put of a GT wakeref. However, its visibility to the object debugging tool was suppressed by a bug in i915_active that was fixed two weeks later with commit e92eb246feb9 ("drm/i915/active: Fix missing debug object activation"). A VMA associated with a request doesn't acquire a GT wakeref by itself. Instead, it depends on a wakeref held directly by the request's active intel_context for a GT associated with its VM, and indirectly on that intel_context's engine wakeref if the engine belongs to the same GT as the VMA's VM. Those wakerefs are released asynchronously to VMA deactivation. Fix the issue by getting a wakeref for the VMA's GT when activating it, and putting that wakeref only after the VMA is deactivated. However, exclude global GTT from that processing path, otherwise the GPU never goes idle. Since __i915_vma_retire() may be called from atomic contexts, use async variant of wakeref put. Also, to avoid circular locking dependency, take care of acquiring the wakeref before VM mutex when both are needed. v6: Since __i915_vma_active/retire() callbacks are not serialized, storing a wakeref tracking handle inside struct i915_vma is not safe, and there is no other good place for that. Use untracked variants of intel_gt_pm_get/put_async(). v5: Replace "tile" with "GT" across commit description (Rodrigo), - avoid mentioning multi-GT case in commit description (Rodrigo), - explain why we need to take a temporary wakeref unconditionally inside i915_vma_pin_ww() (Rodrigo). 
v4: Refresh on top of commit 5e4e06e4087e ("drm/i915: Track gt pm wakerefs") (Andi), - for more easy backporting, split out removal of former insufficient workarounds and move them to separate patches (Nirmoy). - clean up commit message and description a bit. v3: Identify root cause more precisely, and a commit to blame, - identify and drop former workarounds, - update commit message and description. v2: Get the wakeref before VM mutex to avoid circular locking dependency, - drop questionable Fixes: tag. Fixes: d93939730347 ("drm/i915: Remove the vma refcount") Closes: https://gitlab.freedesktop.org/drm/intel/issues/8875 Signed-off-by: Janusz Krzysztofik Cc: Thomas Hellström Cc: Nirmoy Das Cc: Andi Shyti Cc: Rodrigo Vivi Cc: sta...@vger.kernel.org # v5.19+ --- drivers/gpu/drm/i915/i915_vma.c | 26 +++--- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index d09aad34ba37f..ffe81fe338f7e 100644 --- a/drivers/g
Re: [PATCH] drm/i915: Add missing doc for drm_i915_reset_stats
Hi Andi, On 2/29/2024 4:28 PM, Andi Shyti wrote: Hi Nirmoy, On Thu, Feb 29, 2024 at 02:29:18PM +0100, Nirmoy Das wrote: Add missing doc for struct drm_i915_reset_stats. Cc: Andi Shyti Signed-off-by: Nirmoy Das Reviewed-by: Andi Shyti Thanks, merged to din. Nirmoy Thanks, Andi
[PATCH] drm/i915: Add missing doc for drm_i915_reset_stats
Add missing doc for struct drm_i915_reset_stats. Cc: Andi Shyti Signed-off-by: Nirmoy Das --- include/uapi/drm/i915_drm.h | 16 +--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 2ee338860b7e..1279a6b2bece 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -2623,19 +2623,29 @@ struct drm_i915_reg_read { * */ +/* + * struct drm_i915_reset_stats - Return global reset and other context stats + * + * Driver keeps few stats for each contexts and also global reset count. + * This struct can be used to query those stats. + */ struct drm_i915_reset_stats { + /** @ctx_id: ID of the requested context */ __u32 ctx_id; + + /** @flags: MBZ */ __u32 flags; - /* All resets since boot/module reload, for all contexts */ + /** @reset_count: All resets since boot/module reload, for all contexts */ __u32 reset_count; - /* Number of batches lost when active in GPU, for this context */ + /** @batch_active: Number of batches lost when active in GPU, for this context */ __u32 batch_active; - /* Number of batches lost pending for execution, for this context */ + /** @batch_pending: Number of batches lost pending for execution, for this context */ __u32 batch_pending; + /** @pad: MBZ */ __u32 pad; }; -- 2.42.0
Re: [PATCH] drm/i915: check before removing mm notifier
On 2/28/2024 2:24 PM, Tvrtko Ursulin wrote: On 27/02/2024 09:26, Nirmoy Das wrote: Hi Tvrtko, On 2/27/2024 10:04 AM, Tvrtko Ursulin wrote: On 21/02/2024 11:52, Nirmoy Das wrote: Merged it to drm-intel-gt-next with s/check/Check Shouldn't this have had: Fixes: ed29c2691188 ("drm/i915: Fix userptr so we do not have to worry about obj->mm.lock, v7.") Cc: # v5.13+ ? Yes. Sorry, I missed that. Can we still add the tag? I've added them and force pushed the branch since commit was still at the top. Thanks a lot, Tvrtko! FYI + Jani, Joonas and Rodrigo Regards, Tvrtko Thanks, Nirmoy Regards, Tvrtko On 2/19/2024 1:50 PM, Nirmoy Das wrote: Error in mmu_interval_notifier_insert() can leave a NULL notifier.mm pointer. Catch that and return early. Cc: Andi Shyti Cc: Shawn Lee Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 0e21ce9d3e5a..61abfb505766 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -349,6 +349,9 @@ i915_gem_userptr_release(struct drm_i915_gem_object *obj) { GEM_WARN_ON(obj->userptr.page_ref); + if (!obj->userptr.notifier.mm) + return; + mmu_interval_notifier_remove(&obj->userptr.notifier); obj->userptr.notifier.mm = NULL; }
Re: [PATCH] drm/i915: check before removing mm notifier
Hi Tvrtko, On 2/27/2024 10:04 AM, Tvrtko Ursulin wrote: On 21/02/2024 11:52, Nirmoy Das wrote: Merged it to drm-intel-gt-next with s/check/Check Shouldn't this have had: Fixes: ed29c2691188 ("drm/i915: Fix userptr so we do not have to worry about obj->mm.lock, v7.") Cc: # v5.13+ ? Yes. Sorry, I missed that. Can we still add the tag? Thanks, Nirmoy Regards, Tvrtko On 2/19/2024 1:50 PM, Nirmoy Das wrote: Error in mmu_interval_notifier_insert() can leave a NULL notifier.mm pointer. Catch that and return early. Cc: Andi Shyti Cc: Shawn Lee Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 0e21ce9d3e5a..61abfb505766 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -349,6 +349,9 @@ i915_gem_userptr_release(struct drm_i915_gem_object *obj) { GEM_WARN_ON(obj->userptr.page_ref); + if (!obj->userptr.notifier.mm) + return; + mmu_interval_notifier_remove(&obj->userptr.notifier); obj->userptr.notifier.mm = NULL; }
Re: [PATCH] drm/i915: check before removing mm notifier
Merged it to drm-intel-gt-next with s/check/Check On 2/19/2024 1:50 PM, Nirmoy Das wrote: Error in mmu_interval_notifier_insert() can leave a NULL notifier.mm pointer. Catch that and return early. Cc: Andi Shyti Cc: Shawn Lee Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 0e21ce9d3e5a..61abfb505766 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -349,6 +349,9 @@ i915_gem_userptr_release(struct drm_i915_gem_object *obj) { GEM_WARN_ON(obj->userptr.page_ref); + if (!obj->userptr.notifier.mm) + return; + mmu_interval_notifier_remove(&obj->userptr.notifier); obj->userptr.notifier.mm = NULL; }
Re: [PATCH] drm/i915: check before removing mm notifier
Hi Rodrigo, On 2/19/2024 9:12 PM, Rodrigo Vivi wrote: On Mon, Feb 19, 2024 at 01:50:47PM +0100, Nirmoy Das wrote: Error in mmu_interval_notifier_insert() can leave a NULL notifier.mm pointer. Catch that and return early. Cc: Andi Shyti Cc: Shawn Lee Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 0e21ce9d3e5a..61abfb505766 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -349,6 +349,9 @@ i915_gem_userptr_release(struct drm_i915_gem_object *obj) { GEM_WARN_ON(obj->userptr.page_ref); + if (!obj->userptr.notifier.mm) + return; + hmmm... right, it looks that we need this protection. But... I mean, feel free to use Reviewed-by: Rodrigo Vivi for this patch, but I believe that if this mmu insert failed we might have other deeper problems like when checking i915_gem_object_is_userptr() ? No?! We are returning an error if mmu insert fails while creating a userptr object so the obj struct is only available to obj cleanup methods. As far as I see, i915_gem_object_is_userptr() should not happen on such obj struct. Thanks, Nirmoy mmu_interval_notifier_remove(&obj->userptr.notifier); obj->userptr.notifier.mm = NULL; } -- 2.42.0
[PATCH] drm/i915: check before removing mm notifier
Error in mmu_interval_notifier_insert() can leave a NULL notifier.mm pointer. Catch that and return early. Cc: Andi Shyti Cc: Shawn Lee Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 0e21ce9d3e5a..61abfb505766 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -349,6 +349,9 @@ i915_gem_userptr_release(struct drm_i915_gem_object *obj) { GEM_WARN_ON(obj->userptr.page_ref); + if (!obj->userptr.notifier.mm) + return; + mmu_interval_notifier_remove(&obj->userptr.notifier); obj->userptr.notifier.mm = NULL; } -- 2.42.0
Re: [PATCH v5 0/3] drm/i915: Fix VMA UAF on destroy against deactivate race
Hi Janusz, There seems to be a regression in CI related to this: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_129026v2/bat-dg1-7/igt@gem_lmem_swapping@random-engi...@lmem0.html#dmesg-warnings1053 Please have a look. Regards, Nirmoy On 1/24/2024 6:13 PM, Janusz Krzysztofik wrote: Object debugging tools were sporadically reporting illegal attempts to free a still active i915 VMA object when parking a GT believed to be idle. [161.359441] ODEBUG: free active (active state 0) object: 88811643b958 object type: i915_active hint: __i915_vma_active+0x0/0x50 [i915] [161.360082] WARNING: CPU: 5 PID: 276 at lib/debugobjects.c:514 debug_print_object+0x80/0xb0 ... [161.360304] CPU: 5 PID: 276 Comm: kworker/5:2 Not tainted 6.5.0-rc1-CI_DRM_13375-g003f860e5577+ #1 [161.360314] Hardware name: Intel Corporation Rocket Lake Client Platform/RocketLake S UDIMM 6L RVP, BIOS RKLSFWI1.R00.3173.A03.2204210138 04/21/2022 [161.360322] Workqueue: i915-unordered __intel_wakeref_put_work [i915] [161.360592] RIP: 0010:debug_print_object+0x80/0xb0 ... [161.361347] debug_object_free+0xeb/0x110 [161.361362] i915_active_fini+0x14/0x130 [i915] [161.361866] release_references+0xfe/0x1f0 [i915] [161.362543] i915_vma_parked+0x1db/0x380 [i915] [161.363129] __gt_park+0x121/0x230 [i915] [161.363515] intel_wakeref_put_last+0x1f/0x70 [i915] That has been tracked down to be happening when another thread is deactivating the VMA inside __active_retire() helper, after the VMA's active counter has been already decremented to 0, but before deactivation of the VMA's object is reported to the object debugging tool. We could prevent from that race by serializing i915_active_fini() with __active_retire() via ref->tree_lock, but that wouldn't stop the VMA from being used, e.g. from __i915_vma_retire() called at the end of __active_retire(), after that VMA has been already freed by a concurrent i915_vma_destroy() on return from the i915_active_fini(). 
Then, we should rather fix the issue at the VMA level, not in i915_active. Since __i915_vma_parked() is called from __gt_park() on last put of the GT's wakeref, the issue could be addressed by holding the GT wakeref long enough for __active_retire() to complete before that wakeref is released and the GT parked. A VMA associated with a request doesn't acquire a GT wakeref by itself. Instead, it depends on a wakeref held directly by the request's active intel_context for a GT associated with its VM, and indirectly on that intel_context's engine wakeref if the engine belongs to the same GT as the VMA's VM. Those wakerefs are released asynchronously to VMA deactivation. In case of single-GT platforms, at least one of those wakerefs is usually held long enough for the request's VMA to be deactivated on time, before it is destroyed on last put of its VM GT wakeref. However, on multi-GT platforms, a request may use a VMA from a GT other than the one that hosts the request's engine, then it is protected only with the intel_context's VM GT wakeref. There was an attempt to fix the issue on 2-GT Meteor Lake by acquiring an extra wakeref for a Primary GT from i915_gem_do_execbuffer() -- see commit f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform"). However, that fix proved insufficient -- the issue was still reported by CI. That wakeref was released on exit from i915_gem_do_execbuffer(), and thus potentially before completion of the request and deactivation of its associated VMAs. Moreover, CI reports indicated that single-GT platforms also suffered sporadically from the same race. I believe the issue was introduced by commit d93939730347 ("drm/i915: Remove the vma refcount") which moved a call to i915_active_fini() from a dropped i915_vma_release(), called on last put of the removed VMA kref, to i915_vma_parked() processing path called on last put of a GT wakeref. 
However, its visibility to the object debugging tool was suppressed by a bug in i915_active that was fixed two weeks later with commit e92eb246feb9 ("drm/i915/active: Fix missing debug object activation"). Fix the issue by getting a wakeref for the VMA's GT when activating it, and putting that wakeref only after the VMA is deactivated. However, exclude global GTT from that processing path, otherwise the GPU never goes idle. Since __i915_vma_retire() may be called from atomic contexts, use async variant of wakeref put. Also, to avoid circular locking dependency, take care of acquiring the wakeref before VM mutex when both are needed. Having that fixed, stop explicitly acquiring the extra GT0 wakeref from inside i915_gem_do_execbuffer(), and also drop an extra call to i915_active_wait(), introduced by commit 7a2280e8dcd2 ("drm/i915: Wait for active retire before i915_active_fini()") as another insufficient fix for this UAF race. v5: Replace "tile" with "GT" across commit descrip
Re: [PATCH v3 05/16] drm/i915: Disable the "binder"
On 1/19/2024 11:47 AM, Nirmoy Das wrote: On 1/19/2024 12:12 AM, Ville Syrjälä wrote: On Wed, Jan 17, 2024 at 06:46:24PM +0100, Nirmoy Das wrote: On 1/17/2024 3:13 PM, Michał Winiarski wrote: On Tue, Jan 16, 2024 at 09:56:25AM +0200, Ville Syrjala wrote: From: Ville Syrjälä Now that the GGTT PTE updates go straight to GSMBASE (bypassing GTTMMADR) there should be no more risk of system hangs? So the "binder" (ie. update the PTEs via MI_UPDATE_GTT) is no longer necessary, disable it. My main worries with MI_UPDATE_GTT are: - only used on this one platform so very limited testing coverage - async so more opportunities to screw things up - what happens if the engine hangs while we're waiting for MI_UPDATE_GTT to finish? - requires working command submission, so even getting a working display now depends on a lot more extra components working correctly TODO: MI_UPDATE_GTT might be interesting as an optimization though, so perhaps someone should look into always using it (assuming the GPU is alive and well)? v2: Keep using MI_UPDATE_GTT on VM guests Cc: Paz Zcharya Cc: Nirmoy Das Signed-off-by: Ville Syrjälä --- drivers/gpu/drm/i915/gt/intel_gtt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 86f73fe558ca..e83dabc56a14 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -24,7 +24,8 @@ bool i915_ggtt_require_binder(struct drm_i915_private *i915) { /* Wa_13010847436 & Wa_14019519902 */ - return MEDIA_VER_FULL(i915) == IP_VER(13, 0); + return i915_run_as_guest() && + MEDIA_VER_FULL(i915) == IP_VER(13, 0); Note that i915_run_as_guest() is not the most reliable way to decide whether to use MI_UPDATE_GTT or straight to GSMBASE, as it requires the hypervisor to "opt-in" and set the X86_FEATURE_HYPERVISOR. If it's not set - the driver will go into GSMBASE, which is not mapped inside the guest. 
Does the system firmware advertise whether GSMBASE is "open" or "closed" to CPU access in any way? Had a chat with David from IVE team, David suggested to read 0x138914 to determine that. "GOP needs to qualify the WA by reading GFX MMIO offset 0x138914 and verify the value there is 0x1." -> as per the HSD-22018444074 OK, so we can confirm the firmware is on board. I suppose no real harm in doing so even though it would clearly be rather weird if someone would ship some ancient firmware that doesn't handle this. But that still won't help with the guest side handling because that register will read the same in the guest. We are back to the same question :/ How about if (boot_cpu_has(X86_FEATURE_HYPERVISOR) && !i915_run_as_guest() hmm, never mind that was stupid. disable binder Regards, Nirmoy
Re: [PATCH v3 05/16] drm/i915: Disable the "binder"
On 1/19/2024 12:12 AM, Ville Syrjälä wrote: On Wed, Jan 17, 2024 at 06:46:24PM +0100, Nirmoy Das wrote: On 1/17/2024 3:13 PM, Michał Winiarski wrote: On Tue, Jan 16, 2024 at 09:56:25AM +0200, Ville Syrjala wrote: From: Ville Syrjälä Now that the GGTT PTE updates go straight to GSMBASE (bypassing GTTMMADR) there should be no more risk of system hangs? So the "binder" (ie. update the PTEs via MI_UPDATE_GTT) is no longer necessary, disable it. My main worries with MI_UPDATE_GTT are: - only used on this one platform so very limited testing coverage - async so more opportunities to screw things up - what happens if the engine hangs while we're waiting for MI_UPDATE_GTT to finish? - requires working command submission, so even getting a working display now depends on a lot more extra components working correctly TODO: MI_UPDATE_GTT might be interesting as an optimization though, so perhaps someone should look into always using it (assuming the GPU is alive and well)? v2: Keep using MI_UPDATE_GTT on VM guests Cc: Paz Zcharya Cc: Nirmoy Das Signed-off-by: Ville Syrjälä --- drivers/gpu/drm/i915/gt/intel_gtt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 86f73fe558ca..e83dabc56a14 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -24,7 +24,8 @@ bool i915_ggtt_require_binder(struct drm_i915_private *i915) { /* Wa_13010847436 & Wa_14019519902 */ - return MEDIA_VER_FULL(i915) == IP_VER(13, 0); + return i915_run_as_guest() && + MEDIA_VER_FULL(i915) == IP_VER(13, 0); Note that i915_run_as_guest() is not the most reliable way to decide whether to use MI_UPDATE_GTT or straight to GSMBASE, as it requires the hypervisor to "opt-in" and set the X86_FEATURE_HYPERVISOR. If it's not set - the driver will go into GSMBASE, which is not mapped inside the guest. 
Does the system firmware advertise whether GSMBASE is "open" or "closed" to CPU access in any way? Had a chat with David from IVE team, David suggested to read 0x138914 to determine that. "GOP needs to qualify the WA by reading GFX MMIO offset 0x138914 and verify the value there is 0x1." -> as per the HSD-22018444074 OK, so we can confirm the firmware is on board. I suppose no real harm in doing so even though it would clearly be rather weird if someone would ship some ancient firmware that doesn't handle this. But that still won't help with the guest side handling because that register will read the same in the guest. We are back to the same question :/ How about if (boot_cpu_has(X86_FEATURE_HYPERVISOR) && !i915_run_as_guest() disable binder Regards, Nirmoy
Re: [PATCH v3 05/16] drm/i915: Disable the "binder"
On 1/17/2024 3:13 PM, Michał Winiarski wrote: On Tue, Jan 16, 2024 at 09:56:25AM +0200, Ville Syrjala wrote: From: Ville Syrjälä Now that the GGTT PTE updates go straight to GSMBASE (bypassing GTTMMADR) there should be no more risk of system hangs? So the "binder" (ie. update the PTEs via MI_UPDATE_GTT) is no longer necessary, disable it. My main worries with MI_UPDATE_GTT are: - only used on this one platform so very limited testing coverage - async so more opportunities to screw things up - what happens if the engine hangs while we're waiting for MI_UPDATE_GTT to finish? - requires working command submission, so even getting a working display now depends on a lot more extra components working correctly TODO: MI_UPDATE_GTT might be interesting as an optimization though, so perhaps someone should look into always using it (assuming the GPU is alive and well)? v2: Keep using MI_UPDATE_GTT on VM guests Cc: Paz Zcharya Cc: Nirmoy Das Signed-off-by: Ville Syrjälä --- drivers/gpu/drm/i915/gt/intel_gtt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 86f73fe558ca..e83dabc56a14 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -24,7 +24,8 @@ bool i915_ggtt_require_binder(struct drm_i915_private *i915) { /* Wa_13010847436 & Wa_14019519902 */ - return MEDIA_VER_FULL(i915) == IP_VER(13, 0); + return i915_run_as_guest() && + MEDIA_VER_FULL(i915) == IP_VER(13, 0); Note that i915_run_as_guest() is not the most reliable way to decide whether to use MI_UPDATE_GTT or straight to GSMBASE, as it requires the hypervisor to "opt-in" and set the X86_FEATURE_HYPERVISOR. If it's not set - the driver will go into GSMBASE, which is not mapped inside the guest. Does the system firmware advertise whether GSMBASE is "open" or "closed" to CPU access in any way? Had a chat with David from IVE team, David suggested to read 0x138914 to determine that. 
"GOP needs to qualify the WA by reading GFX MMIO offset 0x138914 and verify the value there is 0x1." -> as per the HSD-22018444074 Regards, Nirmoy -Michał } static bool intel_ggtt_update_needs_vtd_wa(struct drm_i915_private *i915) -- 2.41.0
Re: [PATCH v3 07/16] drm/i915: Fix PTE decode during initial plane readout
On 1/16/2024 8:56 AM, Ville Syrjala wrote: From: Ville Syrjälä When multiple pipes are enabled by the BIOS we try to read out each in turn. But we do the readout for the second only after the inherited vma for the first has been rebound into its original place (and thus the PTEs have been rewritten). Unlike the BIOS we set some high caching bits in the PTE on MTL which confuses the readout for the second plane. Filter out the non-address bits from the PTE value appropriately to fix this. I suppose it might also be possible that the BIOS would already set some caching bits as well, in which case we'd run into this same issue already for the first plane. TODO: - should abstract the PTE decoding to avoid details leaking all over - should probably do the readout for all the planes before we touch anything (including the PTEs) so that we truly read out the BIOS state Cc: Paz Zcharya Reviewed-by: Andrzej Hajda Signed-off-by: Ville Syrjälä Acked-by: Nirmoy Das --- drivers/gpu/drm/i915/display/intel_plane_initial.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_plane_initial.c b/drivers/gpu/drm/i915/display/intel_plane_initial.c index a55c09cbd0e4..ffc92b18fcf5 100644 --- a/drivers/gpu/drm/i915/display/intel_plane_initial.c +++ b/drivers/gpu/drm/i915/display/intel_plane_initial.c @@ -72,7 +72,7 @@ initial_plane_vma(struct drm_i915_private *i915, return NULL; } - phys_base = pte & I915_GTT_PAGE_MASK; + phys_base = pte & GEN12_GGTT_PTE_ADDR_MASK; mem = i915->mm.regions[INTEL_REGION_LMEM_0]; /*
Re: [PATCH v3 06/16] drm/i915: Rename the DSM/GSM registers
On 1/16/2024 8:56 AM, Ville Syrjala wrote: From: Ville Syrjälä 0x108100 and 0x1080c0 have been around since snb. Rename the defines appropriately. Cc: Paz Zcharya Reviewed-by: Andrzej Hajda Signed-off-by: Ville Syrjälä Acked-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 4 ++-- drivers/gpu/drm/i915/gt/intel_ggtt.c| 2 +- drivers/gpu/drm/i915/gt/intel_region_lmem.c | 2 +- drivers/gpu/drm/i915/i915_reg.h | 7 --- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index 0b429f1ecd99..ce6b860b393e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -935,7 +935,7 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, u16 type, GEM_BUG_ON((dsm_base + dsm_size) > lmem_size); } else { /* Use DSM base address instead for stolen memory */ - dsm_base = intel_uncore_read64(uncore, GEN12_DSMBASE) & GEN12_BDSM_MASK; + dsm_base = intel_uncore_read64(uncore, GEN6_DSMBASE) & GEN11_BDSM_MASK; if (WARN_ON(lmem_size < dsm_base)) return ERR_PTR(-ENODEV); dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M); @@ -951,7 +951,7 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, u16 type, * Normally this would not work but on MTL the system firmware * should have relaxed the access permissions sufficiently. */ - io_start = intel_uncore_read64(uncore, GEN12_DSMBASE) & GEN12_BDSM_MASK; + io_start = intel_uncore_read64(uncore, GEN6_DSMBASE) & GEN11_BDSM_MASK; io_size = dsm_size; } else if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) { io_start = 0; diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index 7a716ff16070..b87933e7671d 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -1170,7 +1170,7 @@ static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size) * should have relaxed the access permissions sufficiently. 
*/ if (IS_METEORLAKE(i915) && !i915_run_as_guest()) - phys_addr = intel_uncore_read64(uncore, GEN12_GSMBASE) & GEN12_BDSM_MASK; + phys_addr = intel_uncore_read64(uncore, GEN6_GSMBASE) & GEN11_BDSM_MASK; else phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + gen6_gttadr_offset(i915); diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c index af357089da6e..51bb27e10a4f 100644 --- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c +++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c @@ -240,7 +240,7 @@ static struct intel_memory_region *setup_lmem(struct intel_gt *gt) lmem_size -= tile_stolen; } else { /* Stolen starts from GSMBASE without CCS */ - lmem_size = intel_uncore_read64(&i915->uncore, GEN12_GSMBASE); + lmem_size = intel_uncore_read64(&i915->uncore, GEN6_GSMBASE); } i915_resize_lmem_bar(i915, lmem_size); diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 75bc08081fce..0d35173a7718 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -6320,9 +6320,10 @@ enum skl_power_gate { #define GMS_MASKREG_GENMASK(15, 8) #define GGMS_MASK REG_GENMASK(7, 6) -#define GEN12_GSMBASE _MMIO(0x108100) -#define GEN12_DSMBASE _MMIO(0x1080C0) -#define GEN12_BDSM_MASK REG_GENMASK64(63, 20) +#define GEN6_GSMBASE _MMIO(0x108100) +#define GEN6_DSMBASE _MMIO(0x1080C0) +#define GEN6_BDSM_MASK REG_GENMASK64(31, 20) +#define GEN11_BDSM_MASK REG_GENMASK64(63, 20) #define XEHP_CLOCK_GATE_DIS _MMIO(0x101014) #define SGSI_SIDECLK_DISREG_BIT(17)
Re: [PATCH v3 05/16] drm/i915: Disable the "binder"
On 1/16/2024 8:56 AM, Ville Syrjala wrote: From: Ville Syrjälä Now that the GGTT PTE updates go straight to GSMBASE (bypassing GTTMMADR) there should be no more risk of system hangs? So the "binder" (ie. update the PTEs via MI_UPDATE_GTT) is no longer necessary, disable it. My main worries with MI_UPDATE_GTT are: - only used on this one platform so very limited testing coverage - async so more opportunities to screw things up - what happens if the engine hangs while we're waiting for MI_UPDATE_GTT to finish? - requires working command submission, so even getting a working display now depends on a lot more extra components working correctly TODO: MI_UPDATE_GTT might be interesting as an optimization though, so perhaps someone should look into always using it (assuming the GPU is alive and well)? v2: Keep using MI_UPDATE_GTT on VM guests Cc: Paz Zcharya Cc: Nirmoy Das Signed-off-by: Ville Syrjälä Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_gtt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 86f73fe558ca..e83dabc56a14 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -24,7 +24,8 @@ bool i915_ggtt_require_binder(struct drm_i915_private *i915) { /* Wa_13010847436 & Wa_14019519902 */ - return MEDIA_VER_FULL(i915) == IP_VER(13, 0); + return i915_run_as_guest() && + MEDIA_VER_FULL(i915) == IP_VER(13, 0); } static bool intel_ggtt_update_needs_vtd_wa(struct drm_i915_private *i915)
Re: [PATCH v3 04/16] drm/i915: Bypass LMEMBAR/GTTMMADR for MTL stolen memory access
On 1/16/2024 8:56 AM, Ville Syrjala wrote: From: Ville Syrjälä On MTL accessing stolen memory via the BARs is somehow borked, and it can hang the machine. As a workaround let's bypass the BARs and just go straight to DSMBASE/GSMBASE instead. Note that on every other platform this itself would hang the machine, but on MTL the system firmware is expected to relax the access permission guarding stolen memory to enable this workaround, and thus direct CPU accesses should be fine. The raw stolen memory areas won't be passed to VMs so we'll need to risk using the BAR there for the initial setup. Once command submission is up we should switch to MI_UPDATE_GTT which at least shouldn't hang the whole machine. v2: Don't use direct GSM/DSM access on guests Add w/a number Cc: Paz Zcharya Cc: Nirmoy Das Cc: Joonas Lahtinen Reviewed-by: Andrzej Hajda Reviewed-by: Radhakrishna Sripada Signed-off-by: Ville Syrjälä I think i915_run_as_guest() should work. Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 14 +- drivers/gpu/drm/i915/gt/intel_ggtt.c | 16 +++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index ee237043c302..0b429f1ecd99 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -941,7 +941,19 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, u16 type, dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M); } - if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) { + if (IS_METEORLAKE(i915) && !i915_run_as_guest()) { + /* +* Wa_22018444074 +* +* Access via BAR can hang MTL, go directly to DSM, +* except for VM guests which won't have access to it. +* +* Normally this would not work but on MTL the system firmware +* should have relaxed the access permissions sufficiently. 
+*/ + io_start = intel_uncore_read64(uncore, GEN12_DSMBASE) & GEN12_BDSM_MASK; + io_size = dsm_size; + } else if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) { io_start = 0; io_size = 0; } else { diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index 21a7e3191c18..7a716ff16070 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -24,6 +24,7 @@ #include "intel_ring.h" #include "i915_drv.h" #include "i915_pci.h" +#include "i915_reg.h" #include "i915_request.h" #include "i915_scatterlist.h" #include "i915_utils.h" @@ -1152,13 +1153,26 @@ static unsigned int gen6_gttadr_offset(struct drm_i915_private *i915) static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size) { struct drm_i915_private *i915 = ggtt->vm.i915; + struct intel_uncore *uncore = ggtt->vm.gt->uncore; struct pci_dev *pdev = to_pci_dev(i915->drm.dev); phys_addr_t phys_addr; u32 pte_flags; int ret; GEM_WARN_ON(pci_resource_len(pdev, GEN4_GTTMMADR_BAR) != gen6_gttmmadr_size(i915)); - phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + gen6_gttadr_offset(i915); + /* +* Wa_22018444074 +* +* Access via BAR can hang MTL, go directly to GSM, +* except for VM guests which won't have access to it. +* +* Normally this would not work but on MTL the system firmware +* should have relaxed the access permissions sufficiently. +*/ + if (IS_METEORLAKE(i915) && !i915_run_as_guest()) + phys_addr = intel_uncore_read64(uncore, GEN12_GSMBASE) & GEN12_BDSM_MASK; + else + phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + gen6_gttadr_offset(i915); if (needs_wc_ggtt_mapping(i915)) ggtt->gsm = ioremap_wc(phys_addr, size);
Re: [PATCH v3 03/16] drm/i915: Remove ad-hoc lmem/stolen debugs
On 1/16/2024 8:56 AM, Ville Syrjala wrote: From: Ville Syrjälä Now that intel_memory_regions_hw_probe() prints out each and every memory region there's no reason to have ad-hoc debugs to do similar things elsewhere. Cc: Paz Zcharya Reviewed-by: Andrzej Hajda Signed-off-by: Ville Syrjälä Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 4 drivers/gpu/drm/i915/gt/intel_region_lmem.c | 3 --- 2 files changed, 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index d2440c793f84..ee237043c302 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -828,7 +828,6 @@ static const struct intel_memory_region_ops i915_region_stolen_smem_ops = { static int init_stolen_lmem(struct intel_memory_region *mem) { - struct drm_i915_private *i915 = mem->i915; int err; if (GEM_WARN_ON(resource_size(&mem->region) == 0)) @@ -844,9 +843,6 @@ static int init_stolen_lmem(struct intel_memory_region *mem) !io_mapping_init_wc(&mem->iomap, mem->io.start, resource_size(&mem->io))) goto err_cleanup; - drm_dbg(&i915->drm, "Stolen Local DSM: %pR\n", &mem->region); - drm_dbg(&i915->drm, "Stolen Local memory IO: %pR\n", &mem->io); - return 0; err_cleanup: diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c index 6f96a6b70601..af357089da6e 100644 --- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c +++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c @@ -273,9 +273,6 @@ static struct intel_memory_region *setup_lmem(struct intel_gt *gt) if (err) goto err_region_put; - drm_dbg(&i915->drm, "Local memory: %pR\n", &mem->region); - drm_dbg(&i915->drm, "Local memory IO: %pR\n", &mem->io); - if (io_size < lmem_size) drm_info(&i915->drm, "Using a reduced BAR size of %lluMiB. Consider enabling 'Resizable BAR' or similar, if available in the BIOS.\n", (u64)io_size >> 20);
Re: [PATCH v3 01/16] drm/i915: Use struct resource for memory region IO as well
On 1/16/2024 8:56 AM, Ville Syrjala wrote: From: Ville Syrjälä mem->region is a struct resource, but mem->io_start and mem->io_size are not for whatever reason. Let's unify this and convert the io stuff into a struct resource as well. Should make life a little less annoying when you don't have juggle between two different approaches all the time. Mostly done using cocci (with manual tweaks at all the places where we mutate io_size by hand): @@ struct intel_memory_region *M; expression START, SIZE; @@ - M->io_start = START; - M->io_size = SIZE; + M->io = DEFINE_RES_MEM(START, SIZE); @@ struct intel_memory_region *M; @@ - M->io_start + M->io.start @@ struct intel_memory_region M; @@ - M.io_start + M.io.start @@ expression M; @@ - M->io_size + resource_size(&M->io) @@ expression M; @@ - M.io_size + resource_size(&M.io) Cc: Paz Zcharya Reviewed-by: Andrzej Hajda Signed-off-by: Ville Syrjälä Acked-by: Nirmoy Das --- drivers/gpu/drm/i915/display/intel_fbdev_fb.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_region.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 17 + drivers/gpu/drm/i915/gem/i915_gem_ttm.c| 8 .../gpu/drm/i915/gem/selftests/i915_gem_mman.c | 18 +- drivers/gpu/drm/i915/gt/intel_region_lmem.c| 11 +++ drivers/gpu/drm/i915/gt/selftest_tlb.c | 4 ++-- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- drivers/gpu/drm/i915/i915_query.c | 2 +- drivers/gpu/drm/i915/intel_memory_region.c | 15 +++ drivers/gpu/drm/i915/intel_memory_region.h | 3 +-- drivers/gpu/drm/i915/intel_region_ttm.c| 8 .../drm/i915/selftests/intel_memory_region.c | 4 ++-- 13 files changed, 45 insertions(+), 51 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_fbdev_fb.c b/drivers/gpu/drm/i915/display/intel_fbdev_fb.c index 717c3a3237c4..1ac05d90b2e8 100644 --- a/drivers/gpu/drm/i915/display/intel_fbdev_fb.c +++ b/drivers/gpu/drm/i915/display/intel_fbdev_fb.c @@ -78,7 +78,7 @@ int intel_fbdev_fb_fill_info(struct drm_i915_private *i915, struct fb_info *info /* Use fbdev's framebuffer from 
lmem for discrete */ info->fix.smem_start = - (unsigned long)(mem->io_start + + (unsigned long)(mem->io.start + i915_gem_object_get_dma_address(obj, 0)); info->fix.smem_len = obj->base.size; } else { diff --git a/drivers/gpu/drm/i915/gem/i915_gem_region.c b/drivers/gpu/drm/i915/gem/i915_gem_region.c index a4fb577eceb4..b09b74a2448b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_region.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_region.c @@ -129,7 +129,7 @@ i915_gem_object_create_region_at(struct intel_memory_region *mem, return ERR_PTR(-EINVAL); if (!(flags & I915_BO_ALLOC_GPU_ONLY) && - offset + size > mem->io_size && + offset + size > resource_size(&mem->io) && !i915_ggtt_has_aperture(to_gt(mem->i915)->ggtt)) return ERR_PTR(-ENOSPC); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index 8c88075eeab2..d2440c793f84 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -541,7 +541,9 @@ static int i915_gem_init_stolen(struct intel_memory_region *mem) /* Exclude the reserved region from driver use */ mem->region.end = i915->dsm.reserved.start - 1; - mem->io_size = min(mem->io_size, resource_size(&mem->region)); + mem->io = DEFINE_RES_MEM(mem->io.start, +min(resource_size(&mem->io), +resource_size(&mem->region))); i915->dsm.usable_size = resource_size(&mem->region); @@ -752,7 +754,7 @@ static int _i915_gem_object_stolen_init(struct intel_memory_region *mem, * With discrete devices, where we lack a mappable aperture there is no * possible way to ever access this memory on the CPU side. 
*/ - if (mem->type == INTEL_MEMORY_STOLEN_LOCAL && !mem->io_size && + if (mem->type == INTEL_MEMORY_STOLEN_LOCAL && !resource_size(&mem->io) && !(flags & I915_BO_ALLOC_GPU_ONLY)) return -ENOSPC; @@ -838,13 +840,12 @@ static int init_stolen_lmem(struct intel_memory_region *mem) return 0; } - if (mem->io_size && - !io_mapping_init_wc(&mem->iomap, mem->io_start, mem->io_size)) + if (resource_size(&mem->io) && + !io_mapping_init_wc(&mem->iomap, mem->io.start, resource_size(&mem-
Re: [PATCH v3 02/16] drm/i915: Print memory region info during probe
On 1/16/2024 8:56 AM, Ville Syrjala wrote: From: Ville Syrjälä Dump the details about every memory region into dmesg at probe time. Avoids having to dig those out from random places when debugging stuff. Cc: Paz Zcharya Reviewed-by: Andrzej Hajda Signed-off-by: Ville Syrjälä Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/intel_memory_region.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index b2708f8cac2a..52d998e5c21a 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -372,6 +372,24 @@ int intel_memory_regions_hw_probe(struct drm_i915_private *i915) i915->mm.regions[i] = mem; } + for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) { + struct intel_memory_region *mem = i915->mm.regions[i]; + u64 region_size, io_size; + + if (!mem) + continue; + + region_size = resource_size(&mem->region) >> 20; + io_size = resource_size(&mem->io) >> 20; + + if (resource_size(&mem->io)) + drm_dbg(&i915->drm, "Memory region(%d): %s: %llu MiB %pR, io: %llu MiB %pR\n", + mem->id, mem->name, region_size, &mem->region, io_size, &mem->io); + else + drm_dbg(&i915->drm, "Memory region(%d): %s: %llu MiB %pR, io: n/a\n", + mem->id, mem->name, region_size, &mem->region); + } + return 0; out_cleanup:
Re: [PATCH v2 04/15] drm/i915: Bypass LMEMBAR/GTTMMADR for MTL stolen memory access
On 1/12/2024 4:12 PM, Ville Syrjälä wrote: On Wed, Jan 10, 2024 at 11:49:47AM +0100, Nirmoy Das wrote: Hi Ville, Apologies, but I lost track of this series after I returned from sick leave. On 12/15/2023 11:59 AM, Ville Syrjala wrote: From: Ville Syrjälä On MTL accessing stolen memory via the BARs is somehow borked, and it can hang the machine. As a workaround let's bypass the BARs and just go straight to DSMBASE/GSMBASE instead. Note that on every other platform this itself would hang the machine, but on MTL the system firmware is expected to relax the access permission guarding stolen memory to enable this workaround, and thus direct CPU accesses should be fine. TODO: add w/a numbers and whatnot Cc: Paz Zcharya Cc: Nirmoy Das Cc: Radhakrishna Sripada Cc: Joonas Lahtinen Signed-off-by: Ville Syrjälä --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 11 ++- drivers/gpu/drm/i915/gt/intel_ggtt.c | 13 - 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index ee237043c302..252fe5cd6ede 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -941,7 +941,16 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, u16 type, dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M); } - if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) { + if (IS_METEORLAKE(i915)) { + /* +* Workaround: access via BAR can hang MTL, go directly to DSM. +* +* Normally this would not work but on MTL the system firmware +* should have relaxed the access permissions sufficiently. +*/ + io_start = intel_uncore_read64(uncore, GEN12_DSMBASE) & GEN12_BDSM_MASK; + io_size = dsm_size; This will work well on host driver but I am afraid this will not work on VM when someone tries to do direct device assignment of the igfx. GSMBASE/DSMBASE is reserved region so won't show up in VM, last I checked. Hmm. So BARs get passed over but other regions won't be? 
I wonder if there's a way to pass them explicitly... Yes, when a user asks qemu to pass through a pci device then qemu will ensure to map those BARs. This is an obscure usage but are we supposed to support that? If so then we need to detect that and fall back to binder approach. I suppose some people may attempt it. But I'm not sure how well that will work in practice even on other platforms. I don't think we've ever really considered that use case any kind of priority so bug reports tend to go unanswered. My main worry with the MI_UPDATE_GTT stuff is: - only used on this one platform so very limited testing coverage - async so more opportunities to screw things up - what happens if the engine hangs while we're waiting for MI_UPDATE_GTT to finish? - requires working command submission, so even getting a working display now depends on a lot more extra components working correctly hence the patch to disable it. During testing my MTL was very unstable so I wanted to eliminate all potential sources of new bugs. Valid concerns but unfortunately MI_UPDATE_GTT is the only generic solution that came up in the discussions which supports host, vm, also SRIOV case. Hmm. But we can't even use MI_UPDATE_GTT until command submission is up and running, so we still need the direct CPU path for early ggtt setup no? It is very unlikely for the bug to appear when there is only a single user of the GPU. So the HW team is fine with having a small window where we do modify GTT using stolen. How about a modparam which defaults to your approach and have a doc saying to use binder on VM ? It would be nice if i915 could detect if it is running in virtualized environment but I don't have any ideas for that. Regards, Nirmoy So if we can't pass the stolen directly to the VM the only option would be to use the BARs for that and risk hanging the machine. Question how would i915 detect if it is running in VM environment
Re: [PATCH v2 04/15] drm/i915: Bypass LMEMBAR/GTTMMADR for MTL stolen memory access
On 1/10/2024 11:49 AM, Nirmoy Das wrote: Hi Ville, Apologies, but I lost track of this series after I returned from sick leave. Please ignore the uncontextual "but" in the previous response. I need to disable auto correct options. Regards, Nirmoy On 12/15/2023 11:59 AM, Ville Syrjala wrote: From: Ville Syrjälä On MTL accessing stolen memory via the BARs is somehow borked, and it can hang the machine. As a workaround let's bypass the BARs and just go straight to DSMBASE/GSMBASE instead. Note that on every other platform this itself would hang the machine, but on MTL the system firmware is expected to relax the access permission guarding stolen memory to enable this workaround, and thus direct CPU accesses should be fine. TODO: add w/a numbers and whatnot Cc: Paz Zcharya Cc: Nirmoy Das Cc: Radhakrishna Sripada Cc: Joonas Lahtinen Signed-off-by: Ville Syrjälä --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 11 ++- drivers/gpu/drm/i915/gt/intel_ggtt.c | 13 - 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index ee237043c302..252fe5cd6ede 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -941,7 +941,16 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, u16 type, dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M); } - if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) { + if (IS_METEORLAKE(i915)) { + /* + * Workaround: access via BAR can hang MTL, go directly to DSM. + * + * Normally this would not work but on MTL the system firmware + * should have relaxed the access permissions sufficiently. + */ + io_start = intel_uncore_read64(uncore, GEN12_DSMBASE) & GEN12_BDSM_MASK; + io_size = dsm_size; This will work well on host driver but I am afraid this will not work on VM when someone tries to do direct device assignment of the igfx. 
GSMBASE/DSMBASE is reserved region so won't show up in VM, last I checked. This is an obscure usages but are we suppose to support that? If so then we need to detect that and fall back to binder approach. Regards, Nirmoy + } else if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) { io_start = 0; io_size = 0; } else { diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index 21a7e3191c18..ab71d74ec426 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -24,6 +24,7 @@ #include "intel_ring.h" #include "i915_drv.h" #include "i915_pci.h" +#include "i915_reg.h" #include "i915_request.h" #include "i915_scatterlist.h" #include "i915_utils.h" @@ -1152,13 +1153,23 @@ static unsigned int gen6_gttadr_offset(struct drm_i915_private *i915) static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size) { struct drm_i915_private *i915 = ggtt->vm.i915; + struct intel_uncore *uncore = ggtt->vm.gt->uncore; struct pci_dev *pdev = to_pci_dev(i915->drm.dev); phys_addr_t phys_addr; u32 pte_flags; int ret; GEM_WARN_ON(pci_resource_len(pdev, GEN4_GTTMMADR_BAR) != gen6_gttmmadr_size(i915)); - phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + gen6_gttadr_offset(i915); + /* + * Workaround: access via BAR can hang MTL, go directly to GSM. + * + * Normally this would not work but on MTL the system firmware + * should have relaxed the access permissions sufficiently. + */ + if (IS_METEORLAKE(i915)) + phys_addr = intel_uncore_read64(uncore, GEN12_GSMBASE) & GEN12_BDSM_MASK; + else + phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + gen6_gttadr_offset(i915); if (needs_wc_ggtt_mapping(i915)) ggtt->gsm = ioremap_wc(phys_addr, size);
Re: [PATCH v2 04/15] drm/i915: Bypass LMEMBAR/GTTMMADR for MTL stolen memory access
Hi Ville, Apologies, but I lost track of this series after I returned from sick leave. On 12/15/2023 11:59 AM, Ville Syrjala wrote: From: Ville Syrjälä On MTL accessing stolen memory via the BARs is somehow borked, and it can hang the machine. As a workaround let's bypass the BARs and just go straight to DSMBASE/GSMBASE instead. Note that on every other platform this itself would hang the machine, but on MTL the system firmware is expected to relax the access permission guarding stolen memory to enable this workaround, and thus direct CPU accesses should be fine. TODO: add w/a numbers and whatnot Cc: Paz Zcharya Cc: Nirmoy Das Cc: Radhakrishna Sripada Cc: Joonas Lahtinen Signed-off-by: Ville Syrjälä --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 11 ++- drivers/gpu/drm/i915/gt/intel_ggtt.c | 13 - 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index ee237043c302..252fe5cd6ede 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -941,7 +941,16 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, u16 type, dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M); } - if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) { + if (IS_METEORLAKE(i915)) { + /* +* Workaround: access via BAR can hang MTL, go directly to DSM. +* +* Normally this would not work but on MTL the system firmware +* should have relaxed the access permissions sufficiently. +*/ + io_start = intel_uncore_read64(uncore, GEN12_DSMBASE) & GEN12_BDSM_MASK; + io_size = dsm_size; This will work well on host driver but I am afraid this will not work on VM when someone tries to do direct device assignment of the igfx. GSMBASE/DSMBASE is reserved region so won't show up in VM, last I checked. This is an obscure usages but are we suppose to support that? If so then we need to detect that and fall back to binder approach. 
Regards, Nirmoy + } else if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) { io_start = 0; io_size = 0; } else { diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index 21a7e3191c18..ab71d74ec426 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -24,6 +24,7 @@ #include "intel_ring.h" #include "i915_drv.h" #include "i915_pci.h" +#include "i915_reg.h" #include "i915_request.h" #include "i915_scatterlist.h" #include "i915_utils.h" @@ -1152,13 +1153,23 @@ static unsigned int gen6_gttadr_offset(struct drm_i915_private *i915) static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size) { struct drm_i915_private *i915 = ggtt->vm.i915; + struct intel_uncore *uncore = ggtt->vm.gt->uncore; struct pci_dev *pdev = to_pci_dev(i915->drm.dev); phys_addr_t phys_addr; u32 pte_flags; int ret; GEM_WARN_ON(pci_resource_len(pdev, GEN4_GTTMMADR_BAR) != gen6_gttmmadr_size(i915)); - phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + gen6_gttadr_offset(i915); + /* +* Workaround: access via BAR can hang MTL, go directly to GSM. +* +* Normally this would not work but on MTL the system firmware +* should have relaxed the access permissions sufficiently. +*/ + if (IS_METEORLAKE(i915)) + phys_addr = intel_uncore_read64(uncore, GEN12_GSMBASE) & GEN12_BDSM_MASK; + else + phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + gen6_gttadr_offset(i915); if (needs_wc_ggtt_mapping(i915)) ggtt->gsm = ioremap_wc(phys_addr, size);
Re: [PATCH v3 4/4] drm/i915/guc: Use the ce_to_guc() wrapper whenever possible
On 12/6/2023 9:46 PM, Andi Shyti wrote: Get the guc reference from the ce using the ce_to_guc() helper. Just a leftover from previous cleanups. Signed-off-by: Andi Shyti Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 4f51cc5f1604..3c7821ae9f0d 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -3513,7 +3513,7 @@ static inline void sub_context_inflight_prio(struct intel_context *ce, static inline void update_context_prio(struct intel_context *ce) { - struct intel_guc *guc = &ce->engine->gt->uc.guc; + struct intel_guc *guc = ce_to_guc(ce); int i; BUILD_BUG_ON(GUC_CLIENT_PRIORITY_KMD_HIGH != 0);
Re: [PATCH v3 3/4] drm/i915: Use the new gt_to_guc() wrapper
On 12/6/2023 9:46 PM, Andi Shyti wrote: Get the guc reference from the gt using the gt_to_guc() helper. Signed-off-by: Andi Shyti Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/i915_debugfs_params.c | 2 +- drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs_params.c b/drivers/gpu/drm/i915/i915_debugfs_params.c index 8bca02025e09..74b7f2fd8b57 100644 --- a/drivers/gpu/drm/i915/i915_debugfs_params.c +++ b/drivers/gpu/drm/i915/i915_debugfs_params.c @@ -43,7 +43,7 @@ static int notify_guc(struct drm_i915_private *i915) for_each_gt(gt, i915, i) { if (intel_uc_uses_guc_submission(>->uc)) - ret = intel_guc_global_policies_update(>->uc.guc); + ret = intel_guc_global_policies_update(gt_to_guc(gt)); } return ret; diff --git a/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c b/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c index 2990dd4d4a0d..d9d8f0336702 100644 --- a/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c +++ b/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c @@ -65,7 +65,7 @@ int intel_selftest_modify_policy(struct intel_engine_cs *engine, if (!intel_engine_uses_guc(engine)) return 0; - err = intel_guc_global_policies_update(&engine->gt->uc.guc); + err = intel_guc_global_policies_update(gt_to_guc(engine->gt)); if (err) intel_selftest_restore_policy(engine, saved); @@ -84,7 +84,7 @@ int intel_selftest_restore_policy(struct intel_engine_cs *engine, if (!intel_engine_uses_guc(engine)) return 0; - return intel_guc_global_policies_update(&engine->gt->uc.guc); + return intel_guc_global_policies_update(gt_to_guc(engine->gt)); } int intel_selftest_wait_for_rq(struct i915_request *rq)
Re: [PATCH v3 2/4] drm/i915/guc: Use the new gt_to_guc() wrapper
On 12/6/2023 9:46 PM, Andi Shyti wrote: Get the guc reference from the gt using the gt_to_guc() helper. Signed-off-by: Andi Shyti Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c | 4 +-- drivers/gpu/drm/i915/gt/uc/intel_gsc_proxy.c | 3 +- drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c| 2 +- .../gpu/drm/i915/gt/uc/intel_guc_capture.c| 6 ++-- .../gpu/drm/i915/gt/uc/intel_guc_hwconfig.c | 2 +- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 28 +-- drivers/gpu/drm/i915/gt/uc/intel_huc.c| 4 +-- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 4 +-- drivers/gpu/drm/i915/gt/uc/selftest_guc.c | 2 +- 9 files changed, 28 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c index e2e42b3e0d5d..3b69bc6616bd 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c @@ -298,7 +298,7 @@ static int gsc_fw_load_prepare(struct intel_gsc_uc *gsc) memcpy_toio(gsc->local_vaddr, src, gsc->fw.size); memset_io(gsc->local_vaddr + gsc->fw.size, 0, gsc->local->size - gsc->fw.size); - intel_guc_write_barrier(>->uc.guc); + intel_guc_write_barrier(gt_to_guc(gt)); i915_gem_object_unpin_map(gsc->fw.obj); @@ -351,7 +351,7 @@ static int gsc_fw_query_compatibility_version(struct intel_gsc_uc *gsc) void *vaddr; int err; - err = intel_guc_allocate_and_map_vma(>->uc.guc, GSC_VER_PKT_SZ * 2, + err = intel_guc_allocate_and_map_vma(gt_to_guc(gt), GSC_VER_PKT_SZ * 2, &vma, &vaddr); if (err) { gt_err(gt, "failed to allocate vma for GSC version query\n"); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_gsc_proxy.c b/drivers/gpu/drm/i915/gt/uc/intel_gsc_proxy.c index 40817ebcca71..a7d5465655f9 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_gsc_proxy.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_gsc_proxy.c @@ -358,7 +358,8 @@ static int proxy_channel_alloc(struct intel_gsc_uc *gsc) void *vaddr; int err; - err = intel_guc_allocate_and_map_vma(>->uc.guc, GSC_PROXY_CHANNEL_SIZE, + err = 
intel_guc_allocate_and_map_vma(gt_to_guc(gt), +GSC_PROXY_CHANNEL_SIZE, &vma, &vaddr); if (err) return err; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c index 63724e17829a..1ef470e64604 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c @@ -956,7 +956,7 @@ u32 intel_guc_engine_usage_offset(struct intel_guc *guc) struct iosys_map intel_guc_engine_usage_record_map(struct intel_engine_cs *engine) { - struct intel_guc *guc = &engine->gt->uc.guc; + struct intel_guc *guc = gt_to_guc(engine->gt); u8 guc_class = engine_class_to_guc_class(engine->class); size_t offset = offsetof(struct __guc_ads_blob, engine_usage.engines[guc_class][ilog2(engine->logical_mask)]); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c index a4da0208c883..84a8807391c5 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c @@ -1441,7 +1441,7 @@ int intel_guc_capture_print_engine_node(struct drm_i915_error_state_buf *ebuf, if (!cap || !ee->engine) return -ENODEV; - guc = &ee->engine->gt->uc.guc; + guc = gt_to_guc(ee->engine->gt); i915_error_printf(ebuf, "global --- GuC Error Capture on %s command stream:\n", ee->engine->name); @@ -1543,7 +1543,7 @@ bool intel_guc_capture_is_matching_engine(struct intel_gt *gt, if (!gt || !ce || !engine) return false; - guc = >->uc.guc; + guc = gt_to_guc(gt); if (!guc->capture) return false; @@ -1573,7 +1573,7 @@ void intel_guc_capture_get_matching_node(struct intel_gt *gt, if (!gt || !ee || !ce) return; - guc = >->uc.guc; + guc = gt_to_guc(gt); if (!guc->capture) return; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_hwconfig.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_hwconfig.c index cc9569af7f0c..b67a15f74276 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_hwconfig.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_hwconfig.c @@ -111,7 +111,7 @@ 
static bool has_table(struct drm_i915_private *i915) static int guc_hwconfig_init(struct intel_gt *gt) { struct intel_hwconfig *hwconfig = &gt->info.hwconfig; - struct intel_guc *guc = &gt->uc.guc; +
Re: [PATCH v3 1/4] drm/i915/gt: Create the gt_to_guc() wrapper
On 12/6/2023 9:46 PM, Andi Shyti wrote: We already have guc_to_gt() and getting to guc from the GT requires some mental effort. Add the gt_to_guc(). Given the reference to the "gt", the gt_to_guc() will return the pointer to the "guc". Update all the files under the gt/ directory. Signed-off-by: Andi Shyti Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 4 ++-- drivers/gpu/drm/i915/gt/intel_ggtt.c | 9 +++-- drivers/gpu/drm/i915/gt/intel_gt.h| 5 + drivers/gpu/drm/i915/gt/intel_gt_irq.c| 6 +++--- drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c | 2 +- drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c | 8 drivers/gpu/drm/i915/gt/intel_rc6.c | 4 ++-- drivers/gpu/drm/i915/gt/intel_rps.c | 2 +- drivers/gpu/drm/i915/gt/intel_tlb.c | 2 +- drivers/gpu/drm/i915/gt/selftest_slpc.c | 6 +++--- 10 files changed, 25 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 40687806d22a..bede7f09d4af 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -589,7 +589,7 @@ u64 intel_clamp_preempt_timeout_ms(struct intel_engine_cs *engine, u64 value) * NB: The GuC API only supports 32bit values. However, the limit is further * reduced due to internal calculations which would otherwise overflow. */ - if (intel_guc_submission_is_wanted(&engine->gt->uc.guc)) + if (intel_guc_submission_is_wanted(gt_to_guc(engine->gt))) value = min_t(u64, value, guc_policy_max_preempt_timeout_ms()); value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT)); @@ -610,7 +610,7 @@ u64 intel_clamp_timeslice_duration_ms(struct intel_engine_cs *engine, u64 value) * NB: The GuC API only supports 32bit values. However, the limit is further * reduced due to internal calculations which would otherwise overflow. 
*/ - if (intel_guc_submission_is_wanted(&engine->gt->uc.guc)) + if (intel_guc_submission_is_wanted(gt_to_guc(engine->gt))) value = min_t(u64, value, guc_policy_max_exec_quantum_ms()); value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT)); diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index 21a7e3191c18..aa1e9249d393 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -230,11 +230,8 @@ static void guc_ggtt_ct_invalidate(struct intel_gt *gt) struct intel_uncore *uncore = gt->uncore; intel_wakeref_t wakeref; - with_intel_runtime_pm_if_active(uncore->rpm, wakeref) { - struct intel_guc *guc = &gt->uc.guc; - - intel_guc_invalidate_tlb_guc(guc); - } + with_intel_runtime_pm_if_active(uncore->rpm, wakeref) + intel_guc_invalidate_tlb_guc(gt_to_guc(gt)); } static void guc_ggtt_invalidate(struct i915_ggtt *ggtt) @@ -245,7 +242,7 @@ static void guc_ggtt_invalidate(struct i915_ggtt *ggtt) gen8_ggtt_invalidate(ggtt); list_for_each_entry(gt, &ggtt->gt_list, ggtt_link) { - if (intel_guc_tlb_invalidation_is_available(&gt->uc.guc)) + if (intel_guc_tlb_invalidation_is_available(gt_to_guc(gt))) guc_ggtt_ct_invalidate(gt); else if (GRAPHICS_VER(i915) >= 12) intel_uncore_write_fw(gt->uncore, diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index b0e453e27ea8..d7c859039828 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -118,6 +118,11 @@ static inline struct intel_gt *gsc_to_gt(struct intel_gsc *gsc) return container_of(gsc, struct intel_gt, gsc); } +static inline struct intel_guc *gt_to_guc(struct intel_gt *gt) +{ + return &gt->uc.guc; +} + void intel_gt_common_init_early(struct intel_gt *gt); int intel_root_gt_init_early(struct drm_i915_private *i915); int intel_gt_assign_ggtt(struct intel_gt *gt); diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c index 
77fb57223465..ad4c51f18d3a 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c @@ -68,9 +68,9 @@ gen11_other_irq_handler(struct intel_gt *gt, const u8 instance, struct intel_gt *media_gt = gt->i915->media_gt; if (instance == OTHER_GUC_INSTANCE) - return guc_irq_handler(&gt->uc.guc, iir); + return guc_irq_handler(gt_to_guc(gt), iir); if (instance == OTHER_MEDIA_GUC_INSTANCE && media_gt) - return guc_irq_handler(&media_gt->uc.guc, iir); + return guc_irq_handler(gt
Re: [Intel-gfx] [PATCH v2] drm/i915/gt: Convert reset prepare failure log to trace
Hi John, On 12/5/2023 8:50 PM, John Harrison wrote: On 12/5/2023 02:39, Nirmoy Das wrote: Hi John, On 12/5/2023 10:10 AM, John Harrison wrote: On 12/5/2023 00:52, Nirmoy Das wrote: gen8_engine_reset_prepare() can fail when HW fails to set RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal error as driver will retry. Convert the log to a trace log for debugging without triggering unnecessary concerns in CI or for end-users during non-fatal scenarios. I strongly disagree with this change. The hardware spec for the RESET_CTL and GDRST registers are that they will self clear within a matter of microseconds. If something is so badly wrong with the hardware that it can't even manage to reset This message is for reset readiness poll timeout not that the reset is failed which doesn't sound so serious if the subsequent attempt managed reset the engine. Not sure what the distinction is. The reset procedure is poke RESET_CTL wait for it to clear, poke GDRST and wait for it to clear. Just because step one is failing rather than step 2 does not mean that the reset as a whole has not failed. Note that the purpose of RESET_CTL is to pause a bunch of stuff like the command streamers to prevent them from issuing new memory requests while the reset is in progress. If it fails, it likely means that a CS is refusing to stop. Most probably because it can't reach a stopping point because it is stuck waiting on a lost memory request or similar. And the point of stopping further memory requests during reset is that if the memory channel gets out of sync (because only the GT side is reset during a GT reset) then that can result in total system failure. As in potentially even the CPU can no longer get to memory if it is an integrated platform. So yes, it can be quite a serious failure indeed. Thanks bspec didn't explain those details. 
My intention was to acknowledge that engine reset is a complicated process which why the driver retries and don't spook CI/user if subsequent reset works but I get your objection on this. I couldn't get enough details when this can happen that HW takes very long time to set the readiness bit. Is it simply 'taking a long time' or is never clearing at all? If it is just that the timeout is too short then the proper fix would be to increase the timeout. But if it is taking seconds or longer or just never succeeding at all, then something is very bad. I tried with 10x timeout without any help so I think the CS is stuck though re-try works. I will try to get more details from HW team on this issue. then that is something that very much warrants more than a completely silent trace event. It most certainly should be flagged as a failure in CI. Just because the driver will retry does not mean that this is not a serious error. And if the first attempt failed, why would a subsequent attempt succeed? The patch is not ignoring the failure. If the subsequent attempt fails then driver load will fail or it will be wedged if that happens after driver load. One thing I really hate about our driver is the total lack of information when something goes wrong during load. The driver wedges in total silence. There are many error paths that have no reporting at all. Which means you are left with a totally useless bug report. Escalating to FLR may have more success, but that is not something that i915 currently does. Do we still need to do FLR if a subsequent engine reset failure ? Assuming that we are talking about modern(ish) platforms, an engine reset failure would be hit by GuC rather than i915, but that would be escalated to an i915 based full GT reset. Generally speaking though, if the engine reset fails the GT reset isn't going to do much better. It would fix a dead GuC problem but it can't help with memory related issues. 
If the full GT reset fails then we are out of escalation routes as there is no FLR path at present (I think we have that at driver unload on MTL but not for general reset?). The FLR resets a lot more than just the GT, so it does have a chance to fix some issues that a GT reset can't. After driver-level FLR, there is PCI level FLR. Not sure if that involves a full power down and restart, but if not then that would be the last escalation possible. A power cycle really should fix any issues, if it doesn't then it's time to return the system as being totally dead! My recollection is that the vast majority of engine reset failures I've looked at have been completely catastrophic and the system only recovered after a reboot. I.e. after the card was power cycled. Such issues were generally caused by bad memory. Once the path to memory has died, there really is not much of the GPU that can do anything at all and there isn't much that can be done to recover it. Thanks, Nirmoy John. Regards, Nirmoy John.
Re: [Intel-gfx] [PATCH] drm/i915/gt: Reduce log severity on reset prepare.
Hi Tvrtko, On 12/5/2023 11:05 AM, Tvrtko Ursulin wrote: On 05/12/2023 08:50, Nirmoy Das wrote: Hi Tvrtko, On 12/5/2023 9:34 AM, Tvrtko Ursulin wrote: On 01/12/2023 15:44, Nirmoy Das wrote: gen8_engine_reset_prepare() can fail when HW fails to set RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal error as driver will retry. Let the caller of gen8_engine_reset_prepare() decide if a failure in gen8_engine_reset_prepare is an error or not. No complaints per se but I don't see the caller deciding and it is not really reducing log level but converting to trace. So commit message and patch do not align for me which I think should be improved. I meant the return value is checked by the caller, gen8_reset_engines(). I will resend with a improved commit message. Ah okay, maybe my bad for not figuring out that possibility. I guess it might be passable as is, but yes, clearer commit text would be better. I sent a v2 already :) Trace is good enough - we are not usually interested in seeing those as dbg/info/notice? Idea is that all the GT related events are recorded in trace and dmesg could be noisy some times. 
Regards, Nirmoy Regards, Tvrtko Thanks, Nirmoy Regards, Tvrtko Cc: Tvrtko Ursulin Cc: John Harrison Cc: Andi Shyti Cc: Andrzej Hajda Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591 Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_reset.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index d5ed904f355d..e6fbc6202c80 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 700, 0, NULL); if (ret) - gt_err(engine->gt, - "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", - engine->name, request, - intel_uncore_read_fw(uncore, reg)); + GT_TRACE(engine->gt, + "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", + engine->name, request, + intel_uncore_read_fw(uncore, reg)); return ret; }
Re: [Intel-gfx] [PATCH v2] drm/i915/gt: Convert reset prepare failure log to trace
Hi John, On 12/5/2023 10:10 AM, John Harrison wrote: On 12/5/2023 00:52, Nirmoy Das wrote: gen8_engine_reset_prepare() can fail when HW fails to set RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal error as driver will retry. Convert the log to a trace log for debugging without triggering unnecessary concerns in CI or for end-users during non-fatal scenarios. I strongly disagree with this change. The hardware spec for the RESET_CTL and GDRST registers are that they will self clear within a matter of microseconds. If something is so badly wrong with the hardware that it can't even manage to reset This message is for reset readiness poll timeout not that the reset is failed which doesn't sound so serious if the subsequent attempt managed reset the engine. I couldn't get enough details when this can happen that HW takes very long time to set the readiness bit. then that is something that very much warrants more than a completely silent trace event. It most certainly should be flagged as a failure in CI. Just because the driver will retry does not mean that this is not a serious error. And if the first attempt failed, why would a subsequent attempt succeed? The patch is not ignoring the failure. If the subsequent attempt fails then driver load will fail or it will be wedged if that happens after driver load. Escalating to FLR may have more success, but that is not something that i915 currently does. Do we still need to do FLR if a subsequent engine reset failure ? Regards, Nirmoy John. 
v2: Improve commit message(Tvrtko) Cc: Tvrtko Ursulin Cc: John Harrison Cc: Andi Shyti Cc: Andrzej Hajda Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591 Signed-off-by: Nirmoy Das Reviewed-by: Andi Shyti Reviewed-by: Andrzej Hajda --- drivers/gpu/drm/i915/gt/intel_reset.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index d5ed904f355d..e6fbc6202c80 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 700, 0, NULL); if (ret) - gt_err(engine->gt, - "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", - engine->name, request, - intel_uncore_read_fw(uncore, reg)); + GT_TRACE(engine->gt, + "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", + engine->name, request, + intel_uncore_read_fw(uncore, reg)); return ret; }
[Intel-gfx] [PATCH v2] drm/i915/gt: Convert reset prepare failure log to trace
gen8_engine_reset_prepare() can fail when HW fails to set RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal error as driver will retry. Convert the log to a trace log for debugging without triggering unnecessary concerns in CI or for end-users during non-fatal scenarios. v2: Improve commit message(Tvrtko) Cc: Tvrtko Ursulin Cc: John Harrison Cc: Andi Shyti Cc: Andrzej Hajda Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591 Signed-off-by: Nirmoy Das Reviewed-by: Andi Shyti Reviewed-by: Andrzej Hajda --- drivers/gpu/drm/i915/gt/intel_reset.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index d5ed904f355d..e6fbc6202c80 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 700, 0, NULL); if (ret) - gt_err(engine->gt, - "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", - engine->name, request, - intel_uncore_read_fw(uncore, reg)); + GT_TRACE(engine->gt, +"%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", +engine->name, request, +intel_uncore_read_fw(uncore, reg)); return ret; } -- 2.42.0
Re: [Intel-gfx] [PATCH] drm/i915/gt: Reduce log severity on reset prepare.
Hi Tvrtko, On 12/5/2023 9:34 AM, Tvrtko Ursulin wrote: On 01/12/2023 15:44, Nirmoy Das wrote: gen8_engine_reset_prepare() can fail when HW fails to set RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal error as driver will retry. Let the caller of gen8_engine_reset_prepare() decide if a failure in gen8_engine_reset_prepare is an error or not. No complaints per se but I don't see the caller deciding and it is not really reducing log level but converting to trace. So commit message and patch do not align for me which I think should be improved. I meant the return value is checked by the caller, gen8_reset_engines(). I will resend with a improved commit message. Thanks, Nirmoy Regards, Tvrtko Cc: Tvrtko Ursulin Cc: John Harrison Cc: Andi Shyti Cc: Andrzej Hajda Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591 Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_reset.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index d5ed904f355d..e6fbc6202c80 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 700, 0, NULL); if (ret) - gt_err(engine->gt, - "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", - engine->name, request, - intel_uncore_read_fw(uncore, reg)); + GT_TRACE(engine->gt, + "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", + engine->name, request, + intel_uncore_read_fw(uncore, reg)); return ret; }
[Intel-gfx] [PATCH] drm/i915/gt: Reduce log severity on reset prepare.
gen8_engine_reset_prepare() can fail when HW fails to set RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal error as driver will retry. Let the caller of gen8_engine_reset_prepare() decide if a failure in gen8_engine_reset_prepare is an error or not. Cc: Tvrtko Ursulin Cc: John Harrison Cc: Andi Shyti Cc: Andrzej Hajda Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591 Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_reset.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index d5ed904f355d..e6fbc6202c80 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 700, 0, NULL); if (ret) - gt_err(engine->gt, - "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", - engine->name, request, - intel_uncore_read_fw(uncore, reg)); + GT_TRACE(engine->gt, +"%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", +engine->name, request, +intel_uncore_read_fw(uncore, reg)); return ret; } -- 2.42.0
Re: [Intel-gfx] [PATCH] drm/i915/gt: add missing new-line to GT_TRACE
On 11/15/2023 1:10 PM, Andrzej Hajda wrote: Trace requires new-line at the end of message (in opposition to printk), otherwise trace dump becomes messy. Signed-off-by: Andrzej Hajda Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/gt/intel_gt_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c index f5899d503e234b..471b7cdc10ba0f 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c @@ -167,7 +167,7 @@ static void gt_sanitize(struct intel_gt *gt, bool force) enum intel_engine_id id; intel_wakeref_t wakeref; - GT_TRACE(gt, "force:%s", str_yes_no(force)); + GT_TRACE(gt, "force:%s\n", str_yes_no(force)); /* Use a raw wakeref to avoid calling intel_display_power_get early */ wakeref = intel_runtime_pm_get(gt->uncore->rpm); --- base-commit: 1489bab52c281a869295414031a56506a375b036 change-id: 20231115-eols-20f9f52cf338 Best regards,
Re: [Intel-gfx] [PATCH v2] drm/i915: do not clean GT table on error path
On 11/15/2023 11:54 AM, Andrzej Hajda wrote: The only task of intel_gt_release_all is to zero gt table. Calling it on error path prevents intel_gt_driver_late_release_all (called from i915_driver_late_release) to cleanup GTs, causing leakage. After i915_driver_late_release GT array is not used anymore so it does not need cleaning at all. Sample leak report: BUG i915_request (...): Objects remaining in i915_request on __kmem_cache_shutdown() ... Object 0x888113420040 @offset=64 Allocated in __i915_request_create+0x75/0x610 [i915] age=18339 cpu=1 pid=1454 kmem_cache_alloc+0x25b/0x270 __i915_request_create+0x75/0x610 [i915] i915_request_create+0x109/0x290 [i915] __engines_record_defaults+0xca/0x440 [i915] intel_gt_init+0x275/0x430 [i915] i915_gem_init+0x135/0x2c0 [i915] i915_driver_probe+0x8d1/0xdc0 [i915] v2: removed whole intel_gt_release_all Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/8489 Fixes: bec68cc9ea42d8 ("drm/i915: Prepare for multiple GTs") Signed-off-by: Andrzej Hajda Reviewed-by: Nirmoy Das --- - Link to v1: https://lore.kernel.org/r/20231114-dont_clean_gt_on_error_path-v1-1-37f2fa827...@intel.com --- drivers/gpu/drm/i915/gt/intel_gt.c | 11 --- drivers/gpu/drm/i915/i915_driver.c | 4 +--- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index ed32bf5b15464e..ba1186fc524f84 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -982,8 +982,6 @@ int intel_gt_probe_all(struct drm_i915_private *i915) err: i915_probe_error(i915, "Failed to initialize %s! 
(%d)\n", gtdef->name, ret); - intel_gt_release_all(i915); - return ret; } @@ -1002,15 +1000,6 @@ int intel_gt_tiles_init(struct drm_i915_private *i915) return 0; } -void intel_gt_release_all(struct drm_i915_private *i915) -{ - struct intel_gt *gt; - unsigned int id; - - for_each_gt(gt, i915, id) - i915->gt[id] = NULL; -} - void intel_gt_info_print(const struct intel_gt_info *info, struct drm_printer *p) { diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index 01fd25b622d16c..2a1faf4039659c 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -776,7 +776,7 @@ int i915_driver_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ret = i915_driver_mmio_probe(i915); if (ret < 0) - goto out_tiles_cleanup; + goto out_runtime_pm_put; ret = i915_driver_hw_probe(i915); if (ret < 0) @@ -836,8 +836,6 @@ int i915_driver_probe(struct pci_dev *pdev, const struct pci_device_id *ent) i915_ggtt_driver_late_release(i915); out_cleanup_mmio: i915_driver_mmio_release(i915); -out_tiles_cleanup: - intel_gt_release_all(i915); out_runtime_pm_put: enable_rpm_wakeref_asserts(&i915->runtime_pm); i915_driver_late_release(i915); --- base-commit: 1489bab52c281a869295414031a56506a375b036 change-id: 20231114-dont_clean_gt_on_error_path-91cd9c3caa0a Best regards,
Re: [Intel-gfx] [PATCH] drm/i915/mtl: Apply notify_guc to all GTs
On 11/6/2023 1:45 PM, Jani Nikula wrote: On Wed, 25 Oct 2023, Nirmoy Das wrote: Handle platforms with multiple GTs by iterating over all GTs. Add a Fixes commit so this gets propagated for MTL support. Fixes: 213c43676beb ("drm/i915/mtl: Remove the 'force_probe' requirement for Meteor Lake") This came up in another patch. I don't like abusing Fixes: like this. I understand the motivation here, but this patch does not fix the referenced commit. I wasn't aware of a better solution but now I have from your response to the other patch. I will keep that in my mind. Thanks, Nirmoy BR, Jani. Suggested-by: John Harrison Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tvrtko Ursulin Cc: Andi Shyti Cc: Andrzej Hajda Signed-off-by: Nirmoy Das --- drivers/gpu/drm/i915/i915_debugfs_params.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs_params.c b/drivers/gpu/drm/i915/i915_debugfs_params.c index 614bde321589..8bca02025e09 100644 --- a/drivers/gpu/drm/i915/i915_debugfs_params.c +++ b/drivers/gpu/drm/i915/i915_debugfs_params.c @@ -38,10 +38,13 @@ static int i915_param_int_open(struct inode *inode, struct file *file) static int notify_guc(struct drm_i915_private *i915) { - int ret = 0; + struct intel_gt *gt; + int i, ret = 0; - if (intel_uc_uses_guc_submission(&to_gt(i915)->uc)) - ret = intel_guc_global_policies_update(&to_gt(i915)->uc.guc); + for_each_gt(gt, i915, i) { + if (intel_uc_uses_guc_submission(&gt->uc)) + ret = intel_guc_global_policies_update(&gt->uc.guc); + } return ret; }
Re: [Intel-gfx] [PATCH RESEND 2/3] drm/i915: move gpu error debugfs to i915_gpu_error.c
On 10/31/2023 3:18 PM, Jani Nikula wrote: On Tue, 31 Oct 2023, Nirmoy Das wrote: On 10/31/2023 1:45 PM, Jani Nikula wrote: +void i915_gpu_error_debugfs_register(struct drm_i915_private *i915) +{ + struct drm_minor *minor = i915->drm.primary; + + debugfs_create_file("i915_error_state", 0644, nit: s/0644/S_IRUGO | S_IWUSR The direction pretty much across the kernel is to go towards octal permissions because the macros are harder to understand. Personally I prefer octal but didn't realize this is preferred in general[*]. [*]https://lore.kernel.org/lkml/7232ef011d05a92f4caa86a5e9830d87966a2eaf.1470180926.git@perches.com/ Regards, Nirmoy Reviewed-by: Nirmoy Das Thanks, Jani.
Re: [Intel-gfx] [PATCH RESEND 3/3] drm/i915: move gpu error sysfs to i915_gpu_error.c
On 10/31/2023 1:45 PM, Jani Nikula wrote: Hide gpu error specifics in i915_gpu_error.c. This is also cleaner wrt conditional compilation, as i915_gpu_error.c is only built with DRM_I915_CAPTURE_ERROR=y. With this, we can also make i915_first_error_state() static. Signed-off-by: Jani Nikula Reviewed-by: Nirmoy Das --- drivers/gpu/drm/i915/i915_gpu_error.c | 75 - drivers/gpu/drm/i915/i915_gpu_error.h | 17 +++--- drivers/gpu/drm/i915/i915_sysfs.c | 79 +-- 3 files changed, 86 insertions(+), 85 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index f195df91d9e6..00559a75b798 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -57,6 +57,7 @@ #include "i915_memcpy.h" #include "i915_reg.h" #include "i915_scatterlist.h" +#include "i915_sysfs.h" #include "i915_utils.h" #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) @@ -2211,7 +2212,7 @@ void i915_capture_error_state(struct intel_gt *gt, i915_gpu_coredump_put(error); } -struct i915_gpu_coredump * +static struct i915_gpu_coredump * i915_first_error_state(struct drm_i915_private *i915) { struct i915_gpu_coredump *error; @@ -2487,3 +2488,75 @@ void i915_gpu_error_debugfs_register(struct drm_i915_private *i915) debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915, &i915_gpu_info_fops); } + +static ssize_t error_state_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + + struct device *kdev = kobj_to_dev(kobj); + struct drm_i915_private *i915 = kdev_minor_to_i915(kdev); + struct i915_gpu_coredump *gpu; + ssize_t ret = 0; + + /* +* FIXME: Concurrent clients triggering resets and reading + clearing +* dumps can cause inconsistent sysfs reads when a user calls in with a +* non-zero offset to complete a prior partial read but the +* gpu_coredump has been cleared or replaced. 
+*/ + + gpu = i915_first_error_state(i915); + if (IS_ERR(gpu)) { + ret = PTR_ERR(gpu); + } else if (gpu) { + ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count); + i915_gpu_coredump_put(gpu); + } else { + const char *str = "No error state collected\n"; + size_t len = strlen(str); + + if (off < len) { + ret = min_t(size_t, count, len - off); + memcpy(buf, str + off, ret); + } + } + + return ret; +} + +static ssize_t error_state_write(struct file *file, struct kobject *kobj, +struct bin_attribute *attr, char *buf, +loff_t off, size_t count) +{ + struct device *kdev = kobj_to_dev(kobj); + struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev); + + drm_dbg(&dev_priv->drm, "Resetting error state\n"); + i915_reset_error_state(dev_priv); + + return count; +} + +static const struct bin_attribute error_state_attr = { + .attr.name = "error", + .attr.mode = S_IRUSR | S_IWUSR, + .size = 0, + .read = error_state_read, + .write = error_state_write, +}; + +void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915) +{ + struct device *kdev = i915->drm.primary->kdev; + + if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr)) + drm_err(&i915->drm, "error_state sysfs setup failed\n"); +} + +void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915) +{ + struct device *kdev = i915->drm.primary->kdev; + + sysfs_remove_bin_file(&kdev->kobj, &error_state_attr); +} diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index f851189b0ff1..fa886620d3f8 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -325,11 +325,12 @@ static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu) kref_put(&gpu->ref, __i915_gpu_coredump_free); } -struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915); void i915_reset_error_state(struct drm_i915_private *i915); void i915_disable_error_state(struct drm_i915_private *i915, int err); void 
i915_gpu_error_debugfs_register(struct drm_i915_private *i915); +void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915); +void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915); #else @@ -398,12 +399,6 @@ static inline v
Re: [Intel-gfx] [PATCH RESEND 2/3] drm/i915: move gpu error debugfs to i915_gpu_error.c
s/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -2140,7 +2140,7 @@ __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 du return error; } -struct i915_gpu_coredump * +static struct i915_gpu_coredump * i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags) { static DEFINE_MUTEX(capture_mutex); @@ -2378,3 +2378,112 @@ void intel_klog_error_capture(struct intel_gt *gt, drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n", l_count, line++, pos_err); } #endif + +static ssize_t gpu_state_read(struct file *file, char __user *ubuf, + size_t count, loff_t *pos) +{ + struct i915_gpu_coredump *error; + ssize_t ret; + void *buf; + + error = file->private_data; + if (!error) + return 0; + + /* Bounce buffer required because of kernfs __user API convenience. */ + buf = kmalloc(count, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = i915_gpu_coredump_copy_to_buffer(error, buf, *pos, count); + if (ret <= 0) + goto out; + + if (!copy_to_user(ubuf, buf, ret)) + *pos += ret; + else + ret = -EFAULT; + +out: + kfree(buf); + return ret; +} + +static int gpu_state_release(struct inode *inode, struct file *file) +{ + i915_gpu_coredump_put(file->private_data); + return 0; +} + +static int i915_gpu_info_open(struct inode *inode, struct file *file) +{ + struct drm_i915_private *i915 = inode->i_private; + struct i915_gpu_coredump *gpu; + intel_wakeref_t wakeref; + + gpu = NULL; + with_intel_runtime_pm(&i915->runtime_pm, wakeref) + gpu = i915_gpu_coredump(to_gt(i915), ALL_ENGINES, CORE_DUMP_FLAG_NONE); + + if (IS_ERR(gpu)) + return PTR_ERR(gpu); + + file->private_data = gpu; + return 0; +} + +static const struct file_operations i915_gpu_info_fops = { + .owner = THIS_MODULE, + .open = i915_gpu_info_open, + .read = gpu_state_read, + .llseek = default_llseek, + .release = gpu_state_release, +}; + +static ssize_t +i915_error_state_write(struct file *filp, + const char __user *ubuf, + size_t cnt, + 
loff_t *ppos) +{ + struct i915_gpu_coredump *error = filp->private_data; + + if (!error) + return 0; + + drm_dbg(&error->i915->drm, "Resetting error state\n"); + i915_reset_error_state(error->i915); + + return cnt; +} + +static int i915_error_state_open(struct inode *inode, struct file *file) +{ + struct i915_gpu_coredump *error; + + error = i915_first_error_state(inode->i_private); + if (IS_ERR(error)) + return PTR_ERR(error); + + file->private_data = error; + return 0; +} + +static const struct file_operations i915_error_state_fops = { + .owner = THIS_MODULE, + .open = i915_error_state_open, + .read = gpu_state_read, + .write = i915_error_state_write, + .llseek = default_llseek, + .release = gpu_state_release, +}; + +void i915_gpu_error_debugfs_register(struct drm_i915_private *i915) +{ + struct drm_minor *minor = i915->drm.primary; + + debugfs_create_file("i915_error_state", 0644, nit: s/0644/S_IRUGO | S_IWUSR Reviewed-by: Nirmoy Das minor->debugfs_root, i915, + &i915_error_state_fops); + debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915, + &i915_gpu_info_fops); +} diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index 8f9cdf056181..f851189b0ff1 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -278,8 +278,6 @@ static inline void intel_klog_error_capture(struct intel_gt *gt, __printf(2, 3) void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...); -struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt, - intel_engine_mask_t engine_mask, u32 dump_flags); void i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags); @@ -331,6 +329,8 @@ struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915); void i915_reset_error_state(struct drm_i915_private *i915); void i915_disable_error_state(struct drm_i915_private *i915, int err); +void i915_gpu_error_debugfs_register(struct 
drm_i915_private *i915); + #else __printf(2, 3) @@ -413,6 +413,10 @@ static inline void i915_disable_error_state(struct drm_i915_private *i915, { } +static inline void i915_gpu_error_debugfs_register(struct drm_i915_private *i915) +{ +} + #endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */ #endif /* _I915_GPU_ERROR_H_ */
Re: [Intel-gfx] [PATCH RESEND 1/3] drm/i915: make some error capture functions static
On 10/31/2023 1:45 PM, Jani Nikula wrote:
> Not needed outside of i915_gpu_error.c.
>
> Signed-off-by: Jani Nikula

Reviewed-by: Nirmoy Das

> ---
>  drivers/gpu/drm/i915/i915_gpu_error.c | 8 ++++----
>  drivers/gpu/drm/i915/i915_gpu_error.h | 5 -----
>  2 files changed, 4 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 8275f9b6a47d..889db834f07d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -520,7 +520,7 @@ __find_vma(struct i915_vma_coredump *vma, const char *name)
>  	return NULL;
>  }
>
> -struct i915_vma_coredump *
> +static struct i915_vma_coredump *
>  intel_gpu_error_find_batch(const struct intel_engine_coredump *ee)
>  {
>  	return __find_vma(ee->vma, "batch");
> @@ -609,9 +609,9 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
>  	va_end(args);
>  }
>
> -void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
> -			       const struct intel_engine_cs *engine,
> -			       const struct i915_vma_coredump *vma)
> +static void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
> +				      const struct intel_engine_cs *engine,
> +				      const struct i915_vma_coredump *vma)
>  {
>  	char out[ASCII85_BUFSZ];
>  	struct page *page;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 4ce227f7e1e1..8f9cdf056181 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -277,11 +277,6 @@ static inline void intel_klog_error_capture(struct intel_gt *gt,
>
>  __printf(2, 3)
>  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
> -void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
> -			       const struct intel_engine_cs *engine,
> -			       const struct i915_vma_coredump *vma);
> -struct i915_vma_coredump *
> -intel_gpu_error_find_batch(const struct intel_engine_coredump *ee);
>
>  struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
>  					    intel_engine_mask_t engine_mask, u32 dump_flags);