Re: [PATCH] drm/i915/guc: Fix missing enable of Wa_14019159160 on ARL

2024-08-09 Thread Nirmoy Das


On 8/9/2024 2:06 AM, john.c.harri...@intel.com wrote:

From: John Harrison

The previous update to enable the workaround on ARL only changed two
out of three places where the w/a needs to be enabled. That meant the
GuC side was operational but not the KMD side. And as the KMD side is
the trigger, it meant the w/a was not actually active. So fix that.

Fixes: 104bcfae57d8 ("drm/i915/arl: Enable Wa_14019159160 for ARL")
Cc: John Harrison
Cc: Vinay Belgaumkar
Cc: Daniele Ceraolo Spurio
Cc: Andi Shyti
Cc: Lucas De Marchi
Cc: Rodrigo Vivi
Cc: Matt Roper
Cc: Jonathan Cavitt
Cc: Nirmoy Das
Cc: Shuicheng Lin
Signed-off-by: John Harrison

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 9400d0eb682b2..3e1c3bc56daf2 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -4506,7 +4506,7 @@ static void guc_default_vfuncs(struct intel_engine_cs 
*engine)
/* Wa_16019325821 */
/* Wa_14019159160 */
if ((engine->class == COMPUTE_CLASS || engine->class == RENDER_CLASS) &&
-   IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 71)))
+   IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
engine->flags |= I915_ENGINE_USES_WA_HOLD_SWITCHOUT;
  
  	/*

Re: [PATCH 1/2] drm/i915/gem: Do not look for the exact address in node

2024-08-08 Thread Nirmoy Das



On 8/7/2024 12:05 PM, Andi Shyti wrote:

In preparation for the upcoming partial memory mapping feature,
we want to make sure that when looking for a node we consider
also the offset and not just the starting address of the virtual
memory node.

Signed-off-by: Andi Shyti 

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/gem/i915_gem_mman.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index cac6d4184506..d3ee8ef7ea2f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -1071,9 +1071,9 @@ int i915_gem_mmap(struct file *filp, struct 
vm_area_struct *vma)
  
  	rcu_read_lock();

drm_vma_offset_lock_lookup(dev->vma_offset_manager);
-   node = drm_vma_offset_exact_lookup_locked(dev->vma_offset_manager,
- vma->vm_pgoff,
- vma_pages(vma));
+   node = drm_vma_offset_lookup_locked(dev->vma_offset_manager,
+   vma->vm_pgoff,
+   vma_pages(vma));
if (node && drm_vma_node_is_allowed(node, priv)) {
/*
 * Skip 0-refcnted objects as it is in the process of being


Re: [PATCH 2/2] drm/i915/gem: Calculate object page offset for partial memory mapping

2024-08-08 Thread Nirmoy Das



On 8/7/2024 12:05 PM, Andi Shyti wrote:

To enable partial memory mapping of GPU virtual memory, it's
necessary to introduce an offset to the object's memory
(obj->mm.pages) scatterlist. This adjustment compensates for
instances when userspace mappings do not start from the beginning
of the object.

Based on a patch by Chris Wilson.

Signed-off-by: Andi Shyti 
Cc: Chris Wilson 
Cc: Lionel Landwerlin 

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/gem/i915_gem_mman.c |  4 +++-
  drivers/gpu/drm/i915/i915_mm.c   | 12 +++-
  drivers/gpu/drm/i915/i915_mm.h   |  3 ++-
  3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index d3ee8ef7ea2f..bb00af317d59 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -252,6 +252,7 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf)
struct vm_area_struct *area = vmf->vma;
struct i915_mmap_offset *mmo = area->vm_private_data;
struct drm_i915_gem_object *obj = mmo->obj;
+   unsigned long obj_offset;
resource_size_t iomap;
int err;
  
@@ -273,10 +274,11 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf)

iomap -= obj->mm.region->region.start;
}
  
+	obj_offset = area->vm_pgoff - drm_vma_node_start(&mmo->vma_node);

/* PTEs are revoked in obj->ops->put_pages() */
err = remap_io_sg(area,
  area->vm_start, area->vm_end - area->vm_start,
- obj->mm.pages->sgl, iomap);
+ obj->mm.pages->sgl, obj_offset, iomap);
  
  	if (area->vm_flags & VM_WRITE) {

GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c
index 7998bc74ab49..f5c97a620962 100644
--- a/drivers/gpu/drm/i915/i915_mm.c
+++ b/drivers/gpu/drm/i915/i915_mm.c
@@ -122,13 +122,15 @@ int remap_io_mapping(struct vm_area_struct *vma,
   * @addr: target user address to start at
   * @size: size of map area
   * @sgl: Start sg entry
+ * @offset: offset from the start of the page
   * @iobase: Use stored dma address offset by this address or pfn if -1
   *
   *  Note: this is only safe if the mm semaphore is held when called.
   */
  int remap_io_sg(struct vm_area_struct *vma,
unsigned long addr, unsigned long size,
-   struct scatterlist *sgl, resource_size_t iobase)
+   struct scatterlist *sgl, unsigned long offset,
+   resource_size_t iobase)
  {
struct remap_pfn r = {
.mm = vma->vm_mm,
@@ -141,6 +143,14 @@ int remap_io_sg(struct vm_area_struct *vma,
/* We rely on prevalidation of the io-mapping to skip track_pfn(). */
GEM_BUG_ON((vma->vm_flags & EXPECTED_FLAGS) != EXPECTED_FLAGS);
  
+	while (offset >= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT) {

+   offset -= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT;
+   r.sgt = __sgt_iter(__sg_next(r.sgt.sgp), use_dma(iobase));
+   if (!r.sgt.sgp)
+   return -EINVAL;
+   }
+   r.sgt.curr = offset << PAGE_SHIFT;
+
if (!use_dma(iobase))
flush_cache_range(vma, addr, size);
  
diff --git a/drivers/gpu/drm/i915/i915_mm.h b/drivers/gpu/drm/i915/i915_mm.h

index 04c8974d822b..69f9351b1a1c 100644
--- a/drivers/gpu/drm/i915/i915_mm.h
+++ b/drivers/gpu/drm/i915/i915_mm.h
@@ -30,6 +30,7 @@ int remap_io_mapping(struct vm_area_struct *vma,
  
  int remap_io_sg(struct vm_area_struct *vma,

unsigned long addr, unsigned long size,
-   struct scatterlist *sgl, resource_size_t iobase);
+   struct scatterlist *sgl, unsigned long offset,
+   resource_size_t iobase);
  
  #endif /* __I915_MM_H__ */


Re: [PATCH] drm/i915: Allow NULL memory region

2024-07-17 Thread Nirmoy Das



On 7/12/2024 11:41 PM, Jonathan Cavitt wrote:

Prevent a NULL pointer access in intel_memory_regions_hw_probe.

Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning")
Reported-by: Dan Carpenter 
Signed-off-by: Jonathan Cavitt 


Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11704

Reviewed-by: Nirmoy Das 


---
  drivers/gpu/drm/i915/intel_memory_region.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_memory_region.c 
b/drivers/gpu/drm/i915/intel_memory_region.c
index 172dfa7c3588b..d40ee1b42110a 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private 
*i915)
goto out_cleanup;
}
  
-		mem->id = i;

-   i915->mm.regions[i] = mem;
+   if (mem) { /* Skip on non-fatal errors */
+   mem->id = i;
+   i915->mm.regions[i] = mem;
+   }
}
  
  	for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) {


Re: [PATCH] drm/i915: Allow NULL memory region

2024-07-17 Thread Nirmoy Das



On 7/17/2024 5:25 PM, Dan Carpenter wrote:

On Wed, Jul 17, 2024 at 05:05:55PM +0200, Nirmoy Das wrote:

On 7/12/2024 11:41 PM, Jonathan Cavitt wrote:

Prevent a NULL pointer access in intel_memory_regions_hw_probe.

Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning")
Reported-by: Dan Carpenter 
Signed-off-by: Jonathan Cavitt 
---
   drivers/gpu/drm/i915/intel_memory_region.c | 6 --
   1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_memory_region.c 
b/drivers/gpu/drm/i915/intel_memory_region.c
index 172dfa7c3588b..d40ee1b42110a 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private 
*i915)
goto out_cleanup;
}
-   mem->id = i;
-   i915->mm.regions[i] = mem;

There is a check for mem just before that. You could use IS_ERR_OR_NULL(mem)
instead of IS_ERR().

An error pointer return is normally completely different from a NULL
return in how it's handled.



I overlooked intel_memory_regions_driver_release() in the cleanup path.


  Here NULL is a special kind of success.  I
wrote a blog about this.

https://staticthinking.wordpress.com/2022/08/01/mixing-error-pointers-and-null/



I am the perfect target audience for this blog post :)


Thanks,

Nirmoy



regards,
dan carpenter


Re: [PATCH] drm/i915: Allow NULL memory region

2024-07-17 Thread Nirmoy Das



On 7/17/2024 5:30 PM, Cavitt, Jonathan wrote:

-Original Message-
From: Nirmoy Das 
Sent: Wednesday, July 17, 2024 8:22 AM
To: Cavitt, Jonathan ; 
intel-gfx@lists.freedesktop.org
Cc: Gupta, saurabhg ; dan.carpen...@linaro.org; 
chris.p.wil...@linux.intel.com; Andi Shyti 
Subject: Re: [PATCH] drm/i915: Allow NULL memory region


On 7/17/2024 5:11 PM, Cavitt, Jonathan wrote:

-Original Message-
From: Nirmoy Das 
Sent: Wednesday, July 17, 2024 8:06 AM
To: Cavitt, Jonathan ; 
intel-gfx@lists.freedesktop.org
Cc: Gupta, saurabhg ; dan.carpen...@linaro.org; 
chris.p.wil...@linux.intel.com; Andi Shyti 
Subject: Re: [PATCH] drm/i915: Allow NULL memory region

On 7/12/2024 11:41 PM, Jonathan Cavitt wrote:

Prevent a NULL pointer access in intel_memory_regions_hw_probe.

Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning")
Reported-by: Dan Carpenter 
Signed-off-by: Jonathan Cavitt 
---
drivers/gpu/drm/i915/intel_memory_region.c | 6 --
1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_memory_region.c 
b/drivers/gpu/drm/i915/intel_memory_region.c
index 172dfa7c3588b..d40ee1b42110a 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private 
*i915)
goto out_cleanup;
}

-		mem->id = i;

-   i915->mm.regions[i] = mem;

There is a check for mem just before that. You could use
IS_ERR_OR_NULL(mem) instead of IS_ERR().

I think you're referring to the "goto out_cleanup" path?

Yes.


mem being NULL is a valid use case, so we
shouldn't take the error path when it's observed.

Not an error path if you return expected/correct value.

intel_memory_regions_driver_release releases all previously
grabbed memory regions in the out_cleanup path.



Ah, yes. It isn't as simple as I thought. Never mind, ignore my previous
comment.



-Jonathan Cavitt


You could do
diff --git a/drivers/gpu/drm/i915/intel_memory_region.c 
b/drivers/gpu/drm/i915/intel_memory_region.c
index 172dfa7c3588..41ef7fdfa69b 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -362,9 +362,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private 
*i915)

  if (IS_ERR(mem)) {
  err = PTR_ERR(mem);
-   drm_err(&i915->drm,
-   "Failed to setup region(%d) type=%d\n",
-   err, type);
+   if (err)
+   drm_err(&i915->drm,
+   "Failed to setup region(%d) type=%d\n",
+   err, type);
  goto out_cleanup;
  }

PTR_ERR(NULL) should be 0, I think, and we could even add an info message saying
that setting up that region is being skipped.

Regards,
Nirmoy


-Jonathan Cavitt


Regards,

Nirmoy


+   if (mem) { /* Skip on non-fatal errors */
+   mem->id = i;
+   i915->mm.regions[i] = mem;
+   }
}

	for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) {


Re: [PATCH] drm/i915: Allow NULL memory region

2024-07-17 Thread Nirmoy Das



On 7/17/2024 5:11 PM, Cavitt, Jonathan wrote:

-Original Message-
From: Nirmoy Das 
Sent: Wednesday, July 17, 2024 8:06 AM
To: Cavitt, Jonathan ; 
intel-gfx@lists.freedesktop.org
Cc: Gupta, saurabhg ; dan.carpen...@linaro.org; 
chris.p.wil...@linux.intel.com; Andi Shyti 
Subject: Re: [PATCH] drm/i915: Allow NULL memory region


On 7/12/2024 11:41 PM, Jonathan Cavitt wrote:

Prevent a NULL pointer access in intel_memory_regions_hw_probe.

Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning")
Reported-by: Dan Carpenter 
Signed-off-by: Jonathan Cavitt 
---
   drivers/gpu/drm/i915/intel_memory_region.c | 6 --
   1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_memory_region.c 
b/drivers/gpu/drm/i915/intel_memory_region.c
index 172dfa7c3588b..d40ee1b42110a 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private 
*i915)
goto out_cleanup;
}
   
-		mem->id = i;

-   i915->mm.regions[i] = mem;

There is a check for mem just before that. You could use
IS_ERR_OR_NULL(mem) instead of IS_ERR().

I think you're referring to the "goto out_cleanup" path?


Yes.



mem being NULL is a valid use case, so we
shouldn't take the error path when it's observed.

Not an error path if you return expected/correct value.

You could do
diff --git a/drivers/gpu/drm/i915/intel_memory_region.c 
b/drivers/gpu/drm/i915/intel_memory_region.c
index 172dfa7c3588..41ef7fdfa69b 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -362,9 +362,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private 
*i915)

if (IS_ERR(mem)) {
err = PTR_ERR(mem);
-   drm_err(&i915->drm,
-   "Failed to setup region(%d) type=%d\n",
-   err, type);
+   if (err)
+   drm_err(&i915->drm,
+   "Failed to setup region(%d) type=%d\n",
+   err, type);
goto out_cleanup;
}

PTR_ERR(NULL) should be 0, I think, and we could even add an info message saying
that setting up that region is being skipped.

Regards,
Nirmoy


-Jonathan Cavitt



Regards,

Nirmoy


+   if (mem) { /* Skip on non-fatal errors */
+   mem->id = i;
+   i915->mm.regions[i] = mem;
+   }
}
   
   	for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) {


Re: [PATCH] drm/i915: Allow NULL memory region

2024-07-17 Thread Nirmoy Das



On 7/12/2024 11:41 PM, Jonathan Cavitt wrote:

Prevent a NULL pointer access in intel_memory_regions_hw_probe.

Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning")
Reported-by: Dan Carpenter 
Signed-off-by: Jonathan Cavitt 
---
  drivers/gpu/drm/i915/intel_memory_region.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_memory_region.c 
b/drivers/gpu/drm/i915/intel_memory_region.c
index 172dfa7c3588b..d40ee1b42110a 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private 
*i915)
goto out_cleanup;
}
  
-		mem->id = i;

-   i915->mm.regions[i] = mem;


There is a check for mem just before that. You could use 
IS_ERR_OR_NULL(mem) instead of IS_ERR().



Regards,

Nirmoy


+   if (mem) { /* Skip on non-fatal errors */
+   mem->id = i;
+   i915->mm.regions[i] = mem;
+   }
}
  
  	for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) {


Re: [PATCH] drm/i915/gem: Suppress oom warning in favour of ENOMEM to userspace

2024-06-27 Thread Nirmoy Das

Hi Andi,

On 6/27/2024 12:04 PM, Andi Shyti wrote:

Hi Nirmoy,

On Wed, Jun 26, 2024 at 04:33:18PM +0200, Nirmoy Das wrote:

We report object allocation failures to userspace with ENOMEM
so add __GFP_NOWARN to remove superfluous oom warnings.

I think this should be the default behavior.
Yes, when drivers handle the ENOMEM situation, which is the case for the i915/gem
code.

  ENOMEM doesn't
necessarily mean that there is a kernel failure. Most of the time
we just run out of memory, deal with it :-)

Reviewed-by: Andi Shyti 


Thanks!




Thanks,
Andi


Re: [PATCH] drm/i915/gem: Suppress oom warning in favour of ENOMEM to userspace

2024-06-26 Thread Nirmoy Das

Hi Rodrigo,

On 6/26/2024 5:50 PM, Rodrigo Vivi wrote:

On Wed, Jun 26, 2024 at 05:36:43PM +0200, Nirmoy Das wrote:

Hi Rodrigo,
 
On 6/26/2024 5:24 PM, Rodrigo Vivi wrote:
 
On Wed, Jun 26, 2024 at 04:33:18PM +0200, Nirmoy Das wrote:
 
>We report object allocation failures to userspace with ENOMEM

>so add __GFP_NOWARN to remove superfluous oom warnings.
 
>Closes: [1]https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/4936

>Cc: Andi Shyti [2]
>Signed-off-by: Nirmoy Das [3]
>---
> drivers/gpu/drm/i915/i915_scatterlist.c | 8 
> 1 file changed, 4 insertions(+), 4 deletions(-)
 
>diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c

>index e93d2538f298..4d830740946d 100644
>--- a/drivers/gpu/drm/i915/i915_scatterlist.c
>+++ b/drivers/gpu/drm/i915/i915_scatterlist.c
>@@ -90,7 +90,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const 
struct drm_mm_node *node,
>
>GEM_BUG_ON(!max_segment);
>
>-   rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL);
>+   rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN);
>if (!rsgt)
>return ERR_PTR(-ENOMEM);
 
is it really safe?

I don't believe we can guarantee a good fallback plan here if allocation 
fails.
__i915_refct_sgt_init
might end up in a null dereference, no?!
 
The kernel is now returning ENOMEM and also throwing an OOM warning stack.

With __GFP_NOWARN
 
the oom warning stack won't be there in the dmesg but userspace will still

get ENOMEM as expected.

doh! I had misunderstood the flag. Thanks for the confirmation.

Reviewed-by: Rodrigo Vivi 

BTW, what email clients are you using recently?


Using the same client, Thunderbird.



it is hard to parse your responses lately. Please check if it is really
sending/replying in text-only mode.


Thanks for notifying me. Maybe a recent update changed some settings. I
will check.



Nirmoy



 
Let me know if I got your question correctly.
 
Regards,
     
Nirmoy
 
 
 
>

>@@ -104,7 +104,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const 
struct drm_mm_node *node,
>}
>
>if (sg_alloc_table(st, DIV_ROUND_UP_ULL(node->size, segment_pages),
>-  GFP_KERNEL)) {
>+  GFP_KERNEL | __GFP_NOWARN)) {
>i915_refct_sgt_put(rsgt);
>return ERR_PTR(-ENOMEM);
>}
>@@ -178,7 +178,7 @@ struct i915_refct_sgt 
*i915_rsgt_from_buddy_resource(struct ttm_resource *res,
>GEM_BUG_ON(list_empty(blocks));
>GEM_BUG_ON(!max_segment);
>
>-   rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL);
>+   rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN);
>if (!rsgt)
>return ERR_PTR(-ENOMEM);
>
>@@ -190,7 +190,7 @@ struct i915_refct_sgt 
*i915_rsgt_from_buddy_resource(struct ttm_resource *res,
>return ERR_PTR(-E2BIG);
>}
>
>-   if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL)) {
>+   if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL | 
__GFP_NOWARN)) {
>i915_refct_sgt_put(rsgt);
>return ERR_PTR(-ENOMEM);
>}
>--
>2.42.0

References

Visible links
1. https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/4936
2. mailto:andi.sh...@linux.intel.com
3. mailto:nirmoy@intel.com


Re: [PATCH] drm/i915/gem: Suppress oom warning in favour of ENOMEM to userspace

2024-06-26 Thread Nirmoy Das

Hi Rodrigo,

On 6/26/2024 5:24 PM, Rodrigo Vivi wrote:

On Wed, Jun 26, 2024 at 04:33:18PM +0200, Nirmoy Das wrote:

We report object allocation failures to userspace with ENOMEM
so add __GFP_NOWARN to remove superfluous oom warnings.

Closes:https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/4936
Cc: Andi Shyti
Signed-off-by: Nirmoy Das
---
  drivers/gpu/drm/i915/i915_scatterlist.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c 
b/drivers/gpu/drm/i915/i915_scatterlist.c
index e93d2538f298..4d830740946d 100644
--- a/drivers/gpu/drm/i915/i915_scatterlist.c
+++ b/drivers/gpu/drm/i915/i915_scatterlist.c
@@ -90,7 +90,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct 
drm_mm_node *node,
  
  	GEM_BUG_ON(!max_segment);
  
-	rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL);

+   rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN);
if (!rsgt)
return ERR_PTR(-ENOMEM);

is it really safe?
I don't believe we can guarantee a good fallback plan here if allocation fails.
__i915_refct_sgt_init
might end up in a null dereference, no?!


The kernel is now returning ENOMEM and also throwing an OOM warning stack.
With __GFP_NOWARN


the oom warning stack won't be there in the dmesg but userspace will 
still get ENOMEM as expected.


Let me know if I got your question correctly.

Regards,

Nirmoy



  
@@ -104,7 +104,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node,

}
  
  	if (sg_alloc_table(st, DIV_ROUND_UP_ULL(node->size, segment_pages),

-  GFP_KERNEL)) {
+  GFP_KERNEL | __GFP_NOWARN)) {
i915_refct_sgt_put(rsgt);
return ERR_PTR(-ENOMEM);
}
@@ -178,7 +178,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct 
ttm_resource *res,
GEM_BUG_ON(list_empty(blocks));
GEM_BUG_ON(!max_segment);
  
-	rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL);

+   rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN);
if (!rsgt)
return ERR_PTR(-ENOMEM);
  
@@ -190,7 +190,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res,

return ERR_PTR(-E2BIG);
}
  
-	if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL)) {

+   if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL | __GFP_NOWARN)) {
i915_refct_sgt_put(rsgt);
return ERR_PTR(-ENOMEM);
}
--
2.42.0


[PATCH] drm/i915/gem: Suppress oom warning in favour of ENOMEM to userspace

2024-06-26 Thread Nirmoy Das
We report object allocation failures to userspace with ENOMEM
so add __GFP_NOWARN to remove superfluous oom warnings.

Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/4936
Cc: Andi Shyti 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/i915_scatterlist.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c 
b/drivers/gpu/drm/i915/i915_scatterlist.c
index e93d2538f298..4d830740946d 100644
--- a/drivers/gpu/drm/i915/i915_scatterlist.c
+++ b/drivers/gpu/drm/i915/i915_scatterlist.c
@@ -90,7 +90,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct 
drm_mm_node *node,
 
GEM_BUG_ON(!max_segment);
 
-   rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL);
+   rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN);
if (!rsgt)
return ERR_PTR(-ENOMEM);
 
@@ -104,7 +104,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct 
drm_mm_node *node,
}
 
if (sg_alloc_table(st, DIV_ROUND_UP_ULL(node->size, segment_pages),
-  GFP_KERNEL)) {
+  GFP_KERNEL | __GFP_NOWARN)) {
i915_refct_sgt_put(rsgt);
return ERR_PTR(-ENOMEM);
}
@@ -178,7 +178,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct 
ttm_resource *res,
GEM_BUG_ON(list_empty(blocks));
GEM_BUG_ON(!max_segment);
 
-   rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL);
+   rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN);
if (!rsgt)
return ERR_PTR(-ENOMEM);
 
@@ -190,7 +190,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct 
ttm_resource *res,
return ERR_PTR(-E2BIG);
}
 
-   if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL)) {
+   if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL | __GFP_NOWARN)) {
i915_refct_sgt_put(rsgt);
return ERR_PTR(-ENOMEM);
}
-- 
2.42.0



Re: [PATCH] drm/i915/gt: debugfs: Evaluate forcewake usage within locks

2024-06-11 Thread Nirmoy Das



On 6/11/2024 3:58 PM, Tvrtko Ursulin wrote:


On 10/06/2024 10:24, Nirmoy Das wrote:

Hi Andi,

On 6/7/2024 4:51 PM, Andi Shyti wrote:

The forcewake count and domains listing is multi process critical
and the uncore provides a spinlock for such cases.

Lock the forcewake evaluation section in the fw_domains_show()
debugfs interface.

Signed-off-by: Andi Shyti 


Needs a Fixes tag, below seems to be correct one.


Fixes: 9dd4b065446a ("drm/i915/gt: Move pm debug files into a gt 
aware debugfs")


Cc:  # v5.6+

Reviewed-by: Nirmoy Das 


What is the back story here and why would it need backporting? IGT 
cares about the atomic view of user_forcewake_count and individual 
domains or what?


There is no serious back story. This came from a static code analyzer 
report. I keep forgetting debugfs isn't mounted on production systems so 
we don't have to backport this patch.



Thanks,

Nirmoy



Regards,

Tvrtko




Regards,

Nirmoy



---
  drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c

index 4fcba42cfe34..0437fd8217e0 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c
@@ -71,6 +71,8 @@ static int fw_domains_show(struct seq_file *m, 
void *data)

  struct intel_uncore_forcewake_domain *fw_domain;
  unsigned int tmp;
+    spin_lock_irq(&uncore->lock);
+
  seq_printf(m, "user.bypass_count = %u\n",
 uncore->user_forcewake_count);
@@ -79,6 +81,8 @@ static int fw_domains_show(struct seq_file *m, 
void *data)

intel_uncore_forcewake_domain_to_str(fw_domain->id),
 READ_ONCE(fw_domain->wake_count));
+    spin_unlock_irq(&uncore->lock);
+
  return 0;
  }
  DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(fw_domains);


Re: [PATCH] drm/i915/gt: debugfs: Evaluate forcewake usage within locks

2024-06-10 Thread Nirmoy Das

Hi Andi,

On 6/7/2024 4:51 PM, Andi Shyti wrote:

The forcewake count and domains listing is multi process critical
and the uncore provides a spinlock for such cases.

Lock the forcewake evaluation section in the fw_domains_show()
debugfs interface.

Signed-off-by: Andi Shyti 


Needs a Fixes tag, below seems to be correct one.


Fixes: 9dd4b065446a ("drm/i915/gt: Move pm debug files into a gt aware 
debugfs")


Cc:  # v5.6+

Reviewed-by: Nirmoy Das 


Regards,

Nirmoy



---
  drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c
index 4fcba42cfe34..0437fd8217e0 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c
@@ -71,6 +71,8 @@ static int fw_domains_show(struct seq_file *m, void *data)
struct intel_uncore_forcewake_domain *fw_domain;
unsigned int tmp;
  
+	spin_lock_irq(&uncore->lock);

+
seq_printf(m, "user.bypass_count = %u\n",
   uncore->user_forcewake_count);
  
@@ -79,6 +81,8 @@ static int fw_domains_show(struct seq_file *m, void *data)

   intel_uncore_forcewake_domain_to_str(fw_domain->id),
   READ_ONCE(fw_domain->wake_count));
  
+	spin_unlock_irq(&uncore->lock);

+
return 0;
  }
  DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(fw_domains);


Re: [PATCH v2] drm/i915: Increase FLR timeout from 3s to 9s

2024-05-29 Thread Nirmoy Das



On 5/24/2024 1:58 AM, Andi Shyti wrote:

Following the guidelines it takes 3 seconds to perform an FLR
reset. Let's give it a bit more slack because this time can
change depending on the platform and on the firmware

Signed-off-by: Andi Shyti 

Reviewed-by: Nirmoy Das 

---
Hi,

In this second version I removed patch 2 that was ignoring the
FLR reset timeouts, until we develop a proper patch.

This first patch is basically the same as v1. Thanks Nirmoy for
your review.

Andi

  drivers/gpu/drm/i915/intel_uncore.c | 9 -
  1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_uncore.c 
b/drivers/gpu/drm/i915/intel_uncore.c
index 729409a4bada..2eba289d88ad 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -2614,11 +2614,18 @@ void intel_uncore_prune_engine_fw_domains(struct 
intel_uncore *uncore,
  static void driver_initiated_flr(struct intel_uncore *uncore)
  {
struct drm_i915_private *i915 = uncore->i915;
-   const unsigned int flr_timeout_ms = 3000; /* specs recommend a 3s wait 
*/
+   unsigned int flr_timeout_ms;
int ret;
  
  	drm_dbg(&i915->drm, "Triggering Driver-FLR\n");
  
+	/*

+* The specification recommends a 3 seconds FLR reset timeout. To be
+* cautious, we will extend this to 9 seconds, three times the specified
+* timeout.
+*/
+   flr_timeout_ms = 9000;
+
/*
 * Make sure any pending FLR requests have cleared by waiting for the
 * FLR trigger bit to go to zero. Also clear GU_DEBUG's DRIVERFLR_STATUS


Re: [PATCH 2/2] drm/i915: Don't treat FLR resets as errors

2024-05-22 Thread Nirmoy Das

Hi Andi,

On 5/21/2024 12:56 PM, Andi Shyti wrote:

Hi Nirmoy,

On Fri, May 17, 2024 at 10:13:37PM +0200, Nirmoy Das wrote:

Hi Andi,

On 5/17/2024 9:34 PM, Andi Shyti wrote:

 Hi Nirmoy,

 On Fri, May 17, 2024 at 04:00:02PM +0200, Nirmoy Das wrote:

 On 5/17/2024 1:25 PM, Andi Shyti wrote:

 If we timeout while waiting for an FLR reset, there is nothing we
 can do and i915 doesn't have any control on it. In any case the
 system is still perfectly usable

 If a FLR reset fails then we will have a dead GPU, I don't think the 
GPU is
 usable without a cold reboot.

 fact is that the GPU keeps going and even though the timeout has
 expired, the system moves to the next phase.

The current test might look like it has passed, but if you look into the
subsequent tests you can see a dead GPU:

<7>[  369.168121] pci :00:02.0: [drm:intel_uncore_fini_mmio [i915]] 
Triggering Driver-FLR
<3>[  372.170189] pci :00:02.0: [drm] *ERROR* Driver-FLR-teardown wait 
completion failed! -110
<7>[  372.437630] [IGT] i915_selftest: finished subtest requests, SUCCESS
<7>[  372.438356] [IGT] i915_selftest: starting dynamic subtest migrate
<5>[  373.110580] Setting dangerous option live_selftests - tainting kernel
<3>[  373.183499] i915 :00:02.0: Unable to change power state from D0 to 
D0, device inaccessible
<3>[  373.246921] i915 :00:02.0: [drm] *ERROR* Unrecognized display IP 
version 1023.255; disabling display.
<7>[  373.247130] i915 :00:02.0: [drm:intel_step_init [i915]] Using future 
steppings
<7>[  373.247716] i915 :00:02.0: [drm:intel_step_init [i915]] Using future 
steppings
<7>[  373.248263] i915 :00:02.0: [drm:intel_step_init [i915]] Using future 
display steppings
<7>[  373.251843] i915 :00:02.0: [drm:intel_gt_common_init_early [i915]] 
WOPCM: 2048K
<7>[  373.252505] i915 :00:02.0: [drm:intel_uc_init_early [i915]] GT0: 
enable_guc=3 (guc:yes submission:yes huc:no slpc:yes)
<7>[  373.253140] i915 :00:02.0: [drm:intel_gt_probe_all [i915]] GT0: 
Setting up Primary GT
<7>[  373.253556] i915 :00:02.0: [drm:intel_gt_probe_all [i915]] GT1: 
Setting up Standalone Media GT
<7>[  373.253941] i915 :00:02.0: [drm:intel_gt_common_init_early [i915]] 
WOPCM: 2048K
<7>[  373.254365] i915 :00:02.0: [drm:intel_uc_init_early [i915]] GT1: 
enable_guc=3 (guc:yes submission:yes huc:yes slpc:yes)
<3>[  375.256235] i915 :00:02.0: [drm] *ERROR* Device is non-operational; 
MMIO access returns 0x!
<3>[  375.259089] i915 :00:02.0: Device initialization failed (-5)
<3>[  375.260521] i915 :00:02.0: probe with driver i915 failed with error -5
<7>[  375.392209] [IGT] i915_selftest: finished subtest migrate, FAIL

https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_14724/bat-arls-3/dmesg0.txt

Are we sure this is dependent on the FLR reset?


Yes, while in FLR, memory reads will return either all 0s or all Fs.



  There are cases
when the FLR reset doesn't make any difference and in any case
this error is completely ignored by the driver.


This happens very late, with no recovery possible; the only hope is that a
module reload works.





Perhaps we can change it to a warning?


I think it should be an error. CI will still complain even about a warning.





 This is a serious issue and should be reported as an error.  I think we
 need to create a HW ticket to understand why the FLR reset fails.

 Maybe it takes longer and longer to reset. We've been sending
 several patches in the latest years to fix the timings.

HW spec says 3 sec, but we can try increasing it a bit to see if that helps.

We could go, then, with just patch 1 and see if it improves.


Does it help? If it helps, then we can go ahead with the increased timeout.



  Also
because, the FLR reset might also depend on the firmware.


Possible. In that case, should we wait for a firmware fix?


Regards,

Nirmoy



Thanks, Nirmoy,
Andi


Re: [PATCH 2/2] drm/i915: Don't treat FLR resets as errors

2024-05-17 Thread Nirmoy Das

Hi Andi,

On 5/17/2024 9:34 PM, Andi Shyti wrote:

Hi Nirmoy,

On Fri, May 17, 2024 at 04:00:02PM +0200, Nirmoy Das wrote:

On 5/17/2024 1:25 PM, Andi Shyti wrote:

If we timeout while waiting for an FLR reset, there is nothing we
can do and i915 doesn't have any control on it. In any case the
system is still perfectly usable

If a FLR reset fails then we will have a dead GPU, I don't think the GPU is
usable without a cold reboot.

fact is that the GPU keeps going and even though the timeout has
expired, the system moves to the next phase.
The current test might look like it has passed, but if you look into
the subsequent tests you can see a dead GPU:


<7>[  369.168121] pci :00:02.0: [drm:intel_uncore_fini_mmio [i915]] 
Triggering Driver-FLR
*<3>[ 372.170189] pci :00:02.0: [drm] *ERROR* Driver-FLR-teardown 
wait completion failed! -110*

*<7>[ 372.437630] [IGT] i915_selftest: finished subtest requests, SUCCESS*
<7>[  372.438356] [IGT] i915_selftest: starting dynamic subtest migrate
<5>[  373.110580] Setting dangerous option live_selftests - tainting kernel
<3>[  373.183499] i915 :00:02.0: Unable to change power state from D0 to 
D0, device inaccessible
<3>[  373.246921] i915 :00:02.0: [drm] *ERROR* Unrecognized display IP 
version 1023.255; disabling display.
<7>[  373.247130] i915 :00:02.0: [drm:intel_step_init [i915]] Using future 
steppings
<7>[  373.247716] i915 :00:02.0: [drm:intel_step_init [i915]] Using future 
steppings
<7>[  373.248263] i915 :00:02.0: [drm:intel_step_init [i915]] Using future 
display steppings
<7>[  373.251843] i915 :00:02.0: [drm:intel_gt_common_init_early [i915]] 
WOPCM: 2048K
<7>[  373.252505] i915 :00:02.0: [drm:intel_uc_init_early [i915]] GT0: 
enable_guc=3 (guc:yes submission:yes huc:no slpc:yes)
<7>[  373.253140] i915 :00:02.0: [drm:intel_gt_probe_all [i915]] GT0: 
Setting up Primary GT
<7>[  373.253556] i915 :00:02.0: [drm:intel_gt_probe_all [i915]] GT1: 
Setting up Standalone Media GT
<7>[  373.253941] i915 :00:02.0: [drm:intel_gt_common_init_early [i915]] 
WOPCM: 2048K
<7>[  373.254365] i915 :00:02.0: [drm:intel_uc_init_early [i915]] GT1: 
enable_guc=3 (guc:yes submission:yes huc:yes slpc:yes)
*<3>[ 375.256235] i915 :00:02.0: [drm] *ERROR* Device is 
non-operational; MMIO access returns 0x!*

<3>[  375.259089] i915 :00:02.0: Device initialization failed (-5)
<3>[  375.260521] i915 :00:02.0: probe with driver i915 failed with error -5
<7>[  375.392209] [IGT] i915_selftest: finished subtest migrate, FAIL

https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_14724/bat-arls-3/dmesg0.txt




This is a serious issue and should be reported as an error.  I think we need
to create a HW ticket to understand why the FLR reset fails.

Maybe it takes longer and longer to reset. We've been sending
several patches in the latest years to fix the timings.


HW spec says 3 sec, but we can try increasing it a bit to see if that helps.


Regards,

Nirmoy



Andi

Re: [PATCH 2/2] drm/i915: Don't treat FLR resets as errors

2024-05-17 Thread Nirmoy Das

Hi Andi,

On 5/17/2024 1:25 PM, Andi Shyti wrote:

If we timeout while waiting for an FLR reset, there is nothing we
can do and i915 doesn't have any control on it. In any case the
system is still perfectly usable


If a FLR reset fails then we will have a dead GPU, I don't think the GPU 
is usable without a cold reboot.


This is a serious issue and should be reported as an error.  I think we
need to create a HW ticket to understand why the FLR reset fails.


Regards,

Nirmoy




  and the function returns void.

We don't need to be alarmed, therefore, print the timeout
expiration as a debug message instead of an error.

Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/10955
Signed-off-by: Andi Shyti 
---
  drivers/gpu/drm/i915/intel_uncore.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_uncore.c 
b/drivers/gpu/drm/i915/intel_uncore.c
index 2eba289d88ad..a3fa2ed91aae 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -2637,7 +2637,7 @@ static void driver_initiated_flr(struct intel_uncore 
*uncore)
 */
ret = intel_wait_for_register_fw(uncore, GU_CNTL, DRIVERFLR, 0, 
flr_timeout_ms);
if (ret) {
-   drm_err(&i915->drm,
+   drm_dbg(&i915->drm,
"Failed to wait for Driver-FLR bit to clear! %d\n",
ret);
return;
@@ -2652,7 +2652,7 @@ static void driver_initiated_flr(struct intel_uncore 
*uncore)
 DRIVERFLR, 0,
 flr_timeout_ms);
if (ret) {
-   drm_err(&i915->drm, "Driver-FLR-teardown wait completion failed! 
%d\n", ret);
+   drm_dbg(&i915->drm, "Driver-FLR-teardown wait completion failed! 
%d\n", ret);
return;
}
  
@@ -2661,7 +2661,7 @@ static void driver_initiated_flr(struct intel_uncore *uncore)

 DRIVERFLR_STATUS, DRIVERFLR_STATUS,
 flr_timeout_ms);
if (ret) {
-   drm_err(&i915->drm, "Driver-FLR-reinit wait completion failed! 
%d\n", ret);
+   drm_dbg(&i915->drm, "Driver-FLR-reinit wait completion failed! 
%d\n", ret);
return;
}
  


Re: [PATCH] drm/i915/selftests: Set always_coherent to false when reading from CPU

2024-05-17 Thread Nirmoy Das



On 5/17/2024 1:53 PM, Jani Nikula wrote:

On Fri, 17 May 2024, Nirmoy Das  wrote:

Hi Jani,

On 5/17/2024 9:39 AM, Jani Nikula wrote:

On Thu, 16 May 2024, Nirmoy Das  wrote:

The previous commit 'commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick

"previous commit" is a fairly vague reference once this gets
committed. It's not going to be "previous" in any meaningful sense.

Please just start with:

Commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.")
was not complete...

Will do that.



And probably add:

Fixes: 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.")

Do we need a Fixes tag for a selftest? I always assumed it is not required, as
this code is for debug/CI.

Maybe not for stuff that's already in stable, but we do run CI on
drm-next and -rc kernels, and if this causes issues there, why not have
them fixed?


Not sure a commit with Fixes flows from drm-intel-next to drm-next/-rc 
but I see no issue adding Fixes without CC-ing to stable.


Pushed it to drm-intel-next with above modifications.  b4-shazam picked 
Fixes as well which was nice.



Thanks,

Nirmoy



BR,
Jani.



Thanks,

Nirmoy


BR,
Jani.


correct caching mode.")' was not complete as for non LLC  sharing platforms
cpu read can happen from LLC which probably doesn't have the latest
changes made by GPU.

Cc: Andi Shyti 
Cc: Janusz Krzysztofik 
Cc: Jonathan Cavitt 
Signed-off-by: Nirmoy Das 
---
   drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
index 65a931ea80e9..3527b8f446fe 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
@@ -196,7 +196,7 @@ static int verify_access(struct drm_i915_private *i915,
if (err)
goto out_file;
   
-	mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, true);

+   mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, false);
vaddr = i915_gem_object_pin_map_unlocked(native_obj, mode);
if (IS_ERR(vaddr)) {
err = PTR_ERR(vaddr);


Re: [PATCH] drm/i915/selftests: Set always_coherent to false when reading from CPU

2024-05-17 Thread Nirmoy Das

Hi Jani,

On 5/17/2024 9:39 AM, Jani Nikula wrote:

On Thu, 16 May 2024, Nirmoy Das  wrote:

The previous commit 'commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick

"previous commit" is a fairly vague reference once this gets
committed. It's not going to be "previous" in any meaningful sense.

Please just start with:

Commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.")
was not complete...


Will do that.




And probably add:

Fixes: 8d4ba9fc1c6c ("drm/i915/selftests: Pick correct caching mode.")


Do we need a Fixes tag for a selftest? I always assumed it is not required, as
this code is for debug/CI.



Thanks,

Nirmoy



BR,
Jani.


correct caching mode.")' was not complete as for non LLC  sharing platforms
cpu read can happen from LLC which probably doesn't have the latest
changes made by GPU.

Cc: Andi Shyti 
Cc: Janusz Krzysztofik 
Cc: Jonathan Cavitt 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
index 65a931ea80e9..3527b8f446fe 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
@@ -196,7 +196,7 @@ static int verify_access(struct drm_i915_private *i915,
if (err)
goto out_file;
  
-	mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, true);

+   mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, false);
vaddr = i915_gem_object_pin_map_unlocked(native_obj, mode);
if (IS_ERR(vaddr)) {
err = PTR_ERR(vaddr);


[PATCH] drm/i915/selftests: Set always_coherent to false when reading from CPU

2024-05-16 Thread Nirmoy Das
The previous commit 'commit 8d4ba9fc1c6c ("drm/i915/selftests: Pick
correct caching mode.")' was not complete as for non LLC  sharing platforms
cpu read can happen from LLC which probably doesn't have the latest
changes made by GPU.

Cc: Andi Shyti 
Cc: Janusz Krzysztofik 
Cc: Jonathan Cavitt 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
index 65a931ea80e9..3527b8f446fe 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
@@ -196,7 +196,7 @@ static int verify_access(struct drm_i915_private *i915,
if (err)
goto out_file;
 
-   mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, true);
+   mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, false);
vaddr = i915_gem_object_pin_map_unlocked(native_obj, mode);
if (IS_ERR(vaddr)) {
err = PTR_ERR(vaddr);
-- 
2.42.0



[PATCH] drm/i915: Use for_each_child instead of manual for-loop

2024-05-14 Thread Nirmoy Das
Simplify child iteration using for_each_child macro
instead of using manual for loop. There is no functional
change.

Cc: John Harrison 
Cc: Tvrtko Ursulin 
Signed-off-by: Nirmoy Das 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 64 ++-
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 0eaa1064242c..7e88d90e935b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1800,14 +1800,37 @@ __unwind_incomplete_requests(struct intel_context *ce)
spin_unlock_irqrestore(&sched_engine->lock, flags);
 }
 
-static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t 
stalled)
+static void guc_reset_context_state(struct intel_context *ce, 
intel_engine_mask_t stalled)
 {
-   bool guilty;
struct i915_request *rq;
-   unsigned long flags;
+   bool guilty = false;
u32 head;
-   int i, number_children = ce->parallel.number_children;
-   struct intel_context *parent = ce;
+
+   if (!intel_context_is_pinned(ce))
+   return;
+
+   rq = intel_context_get_active_request(ce);
+   if (!rq) {
+   head = ce->ring->tail;
+   goto out_replay;
+   }
+
+   if (i915_request_started(rq))
+   guilty = stalled & ce->engine->mask;
+
+   GEM_BUG_ON(i915_active_is_idle(&ce->active));
+   head = intel_ring_wrap(ce->ring, rq->head);
+
+   __i915_request_reset(rq, guilty);
+   i915_request_put(rq);
+out_replay:
+   guc_reset_state(ce, head, guilty);
+}
+
+static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t 
stalled)
+{
+   struct intel_context *child;
+   unsigned long flags;
 
GEM_BUG_ON(intel_context_is_child(ce));
 
@@ -1826,34 +1849,13 @@ static void __guc_reset_context(struct intel_context 
*ce, intel_engine_mask_t st
 * For each context in the relationship find the hanging request
 * resetting each context / request as needed
 */
-   for (i = 0; i < number_children + 1; ++i) {
-   if (!intel_context_is_pinned(ce))
-   goto next_context;
-
-   guilty = false;
-   rq = intel_context_get_active_request(ce);
-   if (!rq) {
-   head = ce->ring->tail;
-   goto out_replay;
-   }
-
-   if (i915_request_started(rq))
-   guilty = stalled & ce->engine->mask;
-
-   GEM_BUG_ON(i915_active_is_idle(&ce->active));
-   head = intel_ring_wrap(ce->ring, rq->head);
-
-   __i915_request_reset(rq, guilty);
-   i915_request_put(rq);
-out_replay:
-   guc_reset_state(ce, head, guilty);
-next_context:
-   if (i != number_children)
-   ce = list_next_entry(ce, parallel.child_link);
+   guc_reset_context_state(ce, stalled);
+   for_each_child(ce, child) {
+   guc_reset_context_state(child, stalled);
}
 
-   __unwind_incomplete_requests(parent);
-   intel_context_put(parent);
+   __unwind_incomplete_requests(ce);
+   intel_context_put(ce);
 }
 
 void wake_up_all_tlb_invalidate(struct intel_guc *guc)
-- 
2.42.0



Re: [PATCH] drm/i915: Correct error handler

2024-05-13 Thread Nirmoy Das



On 5/11/2024 5:48 PM, Jiasheng Jiang wrote:

Replace "slab_priorities" with "slab_dependencies" in the error handler to 
avoid memory leak.


Nice catch. I would make the subject more like:

drm/i915: Fix memory leak by correcting cache object name in error handler



Fixes: 32eb6bcfdda9 ("drm/i915: Make request allocation caches global")


Also need Cc: stable@vger.kernel.org # v5.2+

With those:

Reviewed-by: Nirmoy Das 


Nirmoy


Signed-off-by: Jiasheng Jiang 
---
  drivers/gpu/drm/i915/i915_scheduler.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_scheduler.c 
b/drivers/gpu/drm/i915/i915_scheduler.c
index 762127dd56c5..70a854557e6e 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -506,6 +506,6 @@ int __init i915_scheduler_module_init(void)
return 0;
  
  err_priorities:

-   kmem_cache_destroy(slab_priorities);
+   kmem_cache_destroy(slab_dependencies);
return -ENOMEM;
  }


Re: [PATCH] Revert "drm/i915: Remove extra multi-gt pm-references"

2024-05-07 Thread Nirmoy Das



On 5/7/2024 7:10 PM, Rodrigo Vivi wrote:

On Tue, May 07, 2024 at 10:54:11AM +0200, Janusz Krzysztofik wrote:

On Tuesday, 7 May 2024 09:30:15 GMT+2 Nirmoy Das wrote:

Hi Janusz,


Just realized we need Fixes tag for this.

Fixes: 1f33dc0c1189 ("drm/i915: Remove extra multi-gt pm-references")

Whoever is going to push this patch, please feel free to add this tag.

dim b4-shazam gets that automagically, now it was sent in reply ;)

Nice!


I just pushed the patch. Thanks for the patch and reviews.



Thanks,

Nirmoy




Thanks,
Janusz



Regards,

Nirmoy

On 5/6/2024 8:02 PM, Janusz Krzysztofik wrote:

This reverts commit 1f33dc0c1189efb9ae19c6fc22b64dd3e26261fb.

There was a patch supposed to fix an issue of illegal attempts to free a
still active i915 VMA object when parking a GT believed to be idle,
reported by CI on 2-GT Meteor Lake.  As a solution, an extra wakeref for
a Primary GT was acquired from i915_gem_do_execbuffer() -- see commit
f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform").

However, that fix proved insufficient -- the issue was still reported by
CI.  That wakeref was released on exit from i915_gem_do_execbuffer(), then
potentially before completion of the request and deactivation of its
associated VMAs.  Moreover, CI reports indicated that single-GT platforms
also suffered sporadically from the same race.

Since that issue was fixed by another commit f3c71b2ded5c ("drm/i915/vma:
Fix UAF on destroy against retire race"), the changes introduced by that
insufficient fix were dropped as no longer useful.  However, that series
resulted in another VMA UAF scenario now being triggered in CI.

<4> [260.290809] [ cut here ]
<4> [260.290988] list_del corruption. prev->next should be 888118c5d990, 
but was 888118c5a510. (prev=888118c5a510)
<4> [260.291004] WARNING: CPU: 2 PID: 1143 at lib/list_debug.c:62 
__list_del_entry_valid_or_report+0xb7/0xe0
..
<4> [260.291055] CPU: 2 PID: 1143 Comm: kms_plane Not tainted 
6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1
<4> [260.291058] Hardware name: Intel Corporation Meteor Lake Client 
Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024
<4> [260.291060] RIP: 0010:__list_del_entry_valid_or_report+0xb7/0xe0
...
<4> [260.291087] Call Trace:
<4> [260.291089]  
<4> [260.291124]  i915_vma_reopen+0x43/0x80 [i915]
<4> [260.291298]  eb_lookup_vmas+0x9cb/0xcc0 [i915]
<4> [260.291579]  i915_gem_do_execbuffer+0xc9a/0x26d0 [i915]
<4> [260.291883]  i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915]
...
<4> [260.292301]  
...
<4> [260.292506] ---[ end trace  ]---
<4> [260.292782] general protection fault, probably for non-canonical address 
0x6b6b6b6b6b6b6ca3:  [#1] PREEMPT SMP NOPTI
<4> [260.303575] CPU: 2 PID: 1143 Comm: kms_plane Tainted: GW  
6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1
<4> [260.313851] Hardware name: Intel Corporation Meteor Lake Client 
Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024
<4> [260.326359] RIP: 0010:eb_validate_vmas+0x114/0xd80 [i915]
...
<4> [260.428756] Call Trace:
<4> [260.431192]  
<4> [639.283393]  i915_gem_do_execbuffer+0xd05/0x26d0 [i915]
<4> [639.305245]  i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915]
...
<4> [639.411134]  
...
<4> [639.449979] ---[ end trace  ]---

We defer actually closing, unbinding and destroying a VMA until next idle
point, or until the object is freed in the meantime.  By postponing the
unbind, we allow for the VMA to be reopened by the client, avoiding the
work required to rebind the VMA.

Starting from commit b0647a5e79b1 ("drm/i915: Avoid live-lock with
i915_vma_parked()"), we assume that as long as a GT is held idle, no VMA
would be reopened while we destroy them.  That assumption is no longer
true in multi-GT configurations, where a VMA we reopen may be handled by a
GT different from the one that we already keep active via its engine while
we set up an execbuf request.

Restoring the extra GT0 PM wakeref removed from i915_gem_do_execbuffer()
processing path seems to fix this issue.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10608
Signed-off-by: Janusz Krzysztofik 
Cc: Rodrigo Vivi 
Cc: Nirmoy Das 
---
   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 18 ++
   1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 42619fc05de48..090724fa766c9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -255,6 +255,7 @@ struct i915_execbuffer {
struct intel_context *context; /* logical state for the request */
struct i915_gem_context *gem_context; /** caller's context *

Re: [PATCH] Revert "drm/i915: Remove extra multi-gt pm-references"

2024-05-07 Thread Nirmoy Das

Hi Janusz,


Just realized we need Fixes tag for this.

Fixes: 1f33dc0c1189 ("drm/i915: Remove extra multi-gt pm-references")


Regards,

Nirmoy

On 5/6/2024 8:02 PM, Janusz Krzysztofik wrote:

This reverts commit 1f33dc0c1189efb9ae19c6fc22b64dd3e26261fb.

There was a patch supposed to fix an issue of illegal attempts to free a
still active i915 VMA object when parking a GT believed to be idle,
reported by CI on 2-GT Meteor Lake.  As a solution, an extra wakeref for
a Primary GT was acquired from i915_gem_do_execbuffer() -- see commit
f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform").

However, that fix proved insufficient -- the issue was still reported by
CI.  That wakeref was released on exit from i915_gem_do_execbuffer(), then
potentially before completion of the request and deactivation of its
associated VMAs.  Moreover, CI reports indicated that single-GT platforms
also suffered sporadically from the same race.

Since that issue was fixed by another commit f3c71b2ded5c ("drm/i915/vma:
Fix UAF on destroy against retire race"), the changes introduced by that
insufficient fix were dropped as no longer useful.  However, that series
resulted in another VMA UAF scenario now being triggered in CI.

<4> [260.290809] [ cut here ]
<4> [260.290988] list_del corruption. prev->next should be 888118c5d990, 
but was 888118c5a510. (prev=888118c5a510)
<4> [260.291004] WARNING: CPU: 2 PID: 1143 at lib/list_debug.c:62 
__list_del_entry_valid_or_report+0xb7/0xe0
..
<4> [260.291055] CPU: 2 PID: 1143 Comm: kms_plane Not tainted 
6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1
<4> [260.291058] Hardware name: Intel Corporation Meteor Lake Client 
Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024
<4> [260.291060] RIP: 0010:__list_del_entry_valid_or_report+0xb7/0xe0
...
<4> [260.291087] Call Trace:
<4> [260.291089]  
<4> [260.291124]  i915_vma_reopen+0x43/0x80 [i915]
<4> [260.291298]  eb_lookup_vmas+0x9cb/0xcc0 [i915]
<4> [260.291579]  i915_gem_do_execbuffer+0xc9a/0x26d0 [i915]
<4> [260.291883]  i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915]
...
<4> [260.292301]  
...
<4> [260.292506] ---[ end trace  ]---
<4> [260.292782] general protection fault, probably for non-canonical address 
0x6b6b6b6b6b6b6ca3:  [#1] PREEMPT SMP NOPTI
<4> [260.303575] CPU: 2 PID: 1143 Comm: kms_plane Tainted: GW  
6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1
<4> [260.313851] Hardware name: Intel Corporation Meteor Lake Client 
Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024
<4> [260.326359] RIP: 0010:eb_validate_vmas+0x114/0xd80 [i915]
...
<4> [260.428756] Call Trace:
<4> [260.431192]  
<4> [639.283393]  i915_gem_do_execbuffer+0xd05/0x26d0 [i915]
<4> [639.305245]  i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915]
...
<4> [639.411134]  
...
<4> [639.449979] ---[ end trace  ]---

We defer actually closing, unbinding and destroying a VMA until next idle
point, or until the object is freed in the meantime.  By postponing the
unbind, we allow for the VMA to be reopened by the client, avoiding the
work required to rebind the VMA.

Starting from commit b0647a5e79b1 ("drm/i915: Avoid live-lock with
i915_vma_parked()"), we assume that as long as a GT is held idle, no VMA
would be reopened while we destroy them.  That assumption is no longer
true in multi-GT configurations, where a VMA we reopen may be handled by a
GT different from the one that we already keep active via its engine while
we set up an execbuf request.

Restoring the extra GT0 PM wakeref removed from i915_gem_do_execbuffer()
processing path seems to fix this issue.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10608
Signed-off-by: Janusz Krzysztofik 
Cc: Rodrigo Vivi 
Cc: Nirmoy Das 
---
  drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 18 ++
  1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 42619fc05de48..090724fa766c9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -255,6 +255,7 @@ struct i915_execbuffer {
struct intel_context *context; /* logical state for the request */
struct i915_gem_context *gem_context; /** caller's context */
intel_wakeref_t wakeref;
+   intel_wakeref_t wakeref_gt0;
  
  	/** our requests to build */

struct i915_request *requests[MAX_ENGINE_INSTANCE + 1];
@@ -2685,6 +2686,7 @@ static int
  eb_select_engine(struct i915_execbuffer *eb)
  {
struct intel_context *ce, *child;
+   struct intel_gt *gt;
unsigned int idx;
int err;
  
@@ -2708,10 +2710,17 @@ e

Re: [PATCH] Revert "drm/i915: Remove extra multi-gt pm-references"

2024-05-06 Thread Nirmoy Das


On 5/6/2024 8:02 PM, Janusz Krzysztofik wrote:

This reverts commit 1f33dc0c1189efb9ae19c6fc22b64dd3e26261fb.

There was a patch supposed to fix an issue of illegal attempts to free a
still active i915 VMA object when parking a GT believed to be idle,
reported by CI on 2-GT Meteor Lake.  As a solution, an extra wakeref for
a Primary GT was acquired from i915_gem_do_execbuffer() -- see commit
f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform").

However, that fix proved insufficient -- the issue was still reported by
CI.  That wakeref was released on exit from i915_gem_do_execbuffer(), then
potentially before completion of the request and deactivation of its
associated VMAs.  Moreover, CI reports indicated that single-GT platforms
also suffered sporadically from the same race.

Since that issue was fixed by another commit f3c71b2ded5c ("drm/i915/vma:
Fix UAF on destroy against retire race"), the changes introduced by that
insufficient fix were dropped as no longer useful.  However, that series
resulted in another VMA UAF scenario now being triggered in CI.

<4> [260.290809] [ cut here ]
<4> [260.290988] list_del corruption. prev->next should be 888118c5d990, 
but was 888118c5a510. (prev=888118c5a510)
<4> [260.291004] WARNING: CPU: 2 PID: 1143 at lib/list_debug.c:62 
__list_del_entry_valid_or_report+0xb7/0xe0
..
<4> [260.291055] CPU: 2 PID: 1143 Comm: kms_plane Not tainted 
6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1
<4> [260.291058] Hardware name: Intel Corporation Meteor Lake Client 
Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024
<4> [260.291060] RIP: 0010:__list_del_entry_valid_or_report+0xb7/0xe0
...
<4> [260.291087] Call Trace:
<4> [260.291089]  
<4> [260.291124]  i915_vma_reopen+0x43/0x80 [i915]
<4> [260.291298]  eb_lookup_vmas+0x9cb/0xcc0 [i915]
<4> [260.291579]  i915_gem_do_execbuffer+0xc9a/0x26d0 [i915]
<4> [260.291883]  i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915]
...
<4> [260.292301]  
...
<4> [260.292506] ---[ end trace  ]---
<4> [260.292782] general protection fault, probably for non-canonical address 
0x6b6b6b6b6b6b6ca3:  [#1] PREEMPT SMP NOPTI
<4> [260.303575] CPU: 2 PID: 1143 Comm: kms_plane Tainted: GW  
6.9.0-rc2-CI_DRM_14524-ga25d180c6853+ #1
<4> [260.313851] Hardware name: Intel Corporation Meteor Lake Client 
Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024
<4> [260.326359] RIP: 0010:eb_validate_vmas+0x114/0xd80 [i915]
...
<4> [260.428756] Call Trace:
<4> [260.431192]  
<4> [639.283393]  i915_gem_do_execbuffer+0xd05/0x26d0 [i915]
<4> [639.305245]  i915_gem_execbuffer2_ioctl+0x123/0x2a0 [i915]
...
<4> [639.411134]  
...
<4> [639.449979] ---[ end trace  ]---

We defer actually closing, unbinding and destroying a VMA until next idle
point, or until the object is freed in the meantime.  By postponing the
unbind, we allow for the VMA to be reopened by the client, avoiding the
work required to rebind the VMA.

Starting from commit b0647a5e79b1 ("drm/i915: Avoid live-lock with
i915_vma_parked()"), we assume that as long as a GT is held idle, no VMA
would be reopened while we destroy them.  That assumption is no longer
true in multi-GT configurations, where a VMA we reopen may be handled by a
GT different from the one that we already keep active via its engine while
we set up an execbuf request.

Restoring the extra GT0 PM wakeref removed from i915_gem_do_execbuffer()
processing path seems to fix this issue.

Closes:https://gitlab.freedesktop.org/drm/intel/-/issues/10608
Signed-off-by: Janusz Krzysztofik
Cc: Rodrigo Vivi
Cc: Nirmoy Das


Reviewed-by: Nirmoy Das 


---
  drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 18 ++
  1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 42619fc05de48..090724fa766c9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -255,6 +255,7 @@ struct i915_execbuffer {
struct intel_context *context; /* logical state for the request */
struct i915_gem_context *gem_context; /** caller's context */
intel_wakeref_t wakeref;
+   intel_wakeref_t wakeref_gt0;
  
  	/** our requests to build */

struct i915_request *requests[MAX_ENGINE_INSTANCE + 1];
@@ -2685,6 +2686,7 @@ static int
  eb_select_engine(struct i915_execbuffer *eb)
  {
struct intel_context *ce, *child;
+   struct intel_gt *gt;
unsigned int idx;
int err;
  
@@ -2708,10 +2710,17 @@ eb_select_engine(struct i915_execbuffer *eb)

}
}
eb->num_batches = ce->parallel.number_children + 1;
+   gt = ce-&g

Re: [PATCH v3] drm/i915/vma: Fix UAF on reopen vs destroy race

2024-05-06 Thread Nirmoy Das
may help with this one, which
started appearing after I reverted that workaround.  However, its
effectiveness is limited to MTL topology.

Perhaps the safer path for this case indeed. Something that could be really
limited to a single platform would be better.



I agree with Rodrigo here. It would be safer to revert the mentioned patch 
now and think about a more robust solution


later on, as the issue is affecting current users.


Regards,

Nirmoy



But I confess that I don't have other better suggestions.
If we need to go with this patch as a quick solution, it is apparently
better than leaving the bug there as is.

+Thomas. Any good thoughts or advice there?

Thanks,
Rodrigo.


Thanks,
Janusz


Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10608
Signed-off-by: Janusz Krzysztofik 
Cc: Chris Wilson 
Cc: Tvrtko Ursulin 
Cc: sta...@vger.kernel.org # v6.0+
---
  .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 10 --
  drivers/gpu/drm/i915/i915_vma.c   | 32 +++
  drivers/gpu/drm/i915/i915_vma.h   |  2 +-
  drivers/gpu/drm/i915/i915_vma_types.h |  3 ++
  4 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 42619fc05de48..97e014f94002e 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -847,9 +847,12 @@ static int __eb_add_lut(struct i915_execbuffer *eb,
if (unlikely(!lut))
return -ENOMEM;
  
+	if (!i915_vma_open(vma)) {

+   err = -EEXIST;  /* let eb_vma_lookup() retry */
+   goto err_lut_free;
+   }
+
i915_vma_get(vma);
-   if (!atomic_fetch_inc(&vma->open_count))
-   i915_vma_reopen(vma);
lut->handle = handle;
lut->ctx = ctx;
  
@@ -880,8 +883,9 @@ static int __eb_add_lut(struct i915_execbuffer *eb,

return 0;
  
  err:

-   i915_vma_close(vma);
i915_vma_put(vma);
+   i915_vma_close(vma);
+err_lut_free:
i915_lut_handle_free(lut);
return err;
  }
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index d2f064d2525cc..4435c76f28c8c 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -1735,14 +1735,33 @@ static void __i915_vma_remove_closed(struct i915_vma 
*vma)
list_del_init(&vma->closed_link);
  }
  
-void i915_vma_reopen(struct i915_vma *vma)

+static struct i915_vma *i915_vma_reopen(struct i915_vma *vma)
+{
+   if (atomic_read(&vma->flags) & I915_VMA_PARKED)
+   return NULL;
+
+   __i915_vma_remove_closed(vma);
+   return vma;
+}
+
+struct i915_vma *i915_vma_open(struct i915_vma *vma)
  {
struct intel_gt *gt = vma->vm->gt;
  
+	if (atomic_inc_not_zero(&vma->open_count))

+   return vma;
+
spin_lock_irq(&gt->closed_lock);
-   if (i915_vma_is_closed(vma))
-   __i915_vma_remove_closed(vma);
+   if (!atomic_inc_not_zero(&vma->open_count)) {
+   if (i915_vma_is_closed(vma))
+   vma = i915_vma_reopen(vma);
+
+   if (vma)
+   atomic_inc(&vma->open_count);
+   }
spin_unlock_irq(&gt->closed_lock);
+
+   return vma;
  }
  
  static void force_unbind(struct i915_vma *vma)

@@ -1770,7 +1789,8 @@ static void release_references(struct i915_vma *vma, 
struct intel_gt *gt,
spin_unlock(&obj->vma.lock);
  
  	spin_lock_irq(>->closed_lock);

-   __i915_vma_remove_closed(vma);
+   if (!(atomic_read(&vma->flags) & I915_VMA_PARKED))
+   __i915_vma_remove_closed(vma);
spin_unlock_irq(&gt->closed_lock);
  
  	if (vm_ddestroy)

@@ -1854,22 +1874,22 @@ void i915_vma_parked(struct intel_gt *gt)
}
  
  		list_move(&vma->closed_link, &closed);

+   atomic_or(I915_VMA_PARKED, &vma->flags);
}
spin_unlock_irq(&gt->closed_lock);
  
-	/* As the GT is held idle, no vma can be reopened as we destroy them */

list_for_each_entry_safe(vma, next, &closed, closed_link) {
struct drm_i915_gem_object *obj = vma->obj;
struct i915_address_space *vm = vma->vm;
  
  		if (i915_gem_object_trylock(obj, NULL)) {

-   INIT_LIST_HEAD(&vma->closed_link);
i915_vma_destroy(vma);
i915_gem_object_unlock(obj);
} else {
/* back you go.. */
spin_lock_irq(&gt->closed_lock);
list_add(&vma->closed_link, &gt->closed_vma);
+   atomic_andnot(I915_VMA_PARKED, &vma->flags);
spin_unlock_irq(&gt->closed_lock);

Re: [PATCH] drm/i915/gt: Disarm breadcrumbs if engines are already idle

2024-04-26 Thread Nirmoy Das



On 4/23/2024 6:23 PM, Janusz Krzysztofik wrote:

From: Chris Wilson 

The breadcrumbs use a GT wakeref for guarding the interrupt, but are
disarmed during release of the engine wakeref. This leaves a hole where
we may attach a breadcrumb just as the engine is parking (after it has
parked its breadcrumbs), execute the irq worker with some signalers still
attached, but never be woken again.

That issue manifests itself in CI with IGT runner timeouts while tests
are waiting indefinitely for release of all GT wakerefs.

<6> [209.151778] i915: Running live_engine_pm_selftests/live_engine_busy_stats
<7> [209.231628] i915 :00:02.0: [drm:intel_power_well_disable [i915]] 
disabling PW_5
<7> [209.231816] i915 :00:02.0: [drm:intel_power_well_disable [i915]] 
disabling PW_4
<7> [209.231944] i915 :00:02.0: [drm:intel_power_well_disable [i915]] 
disabling PW_3
<7> [209.232056] i915 :00:02.0: [drm:intel_power_well_disable [i915]] 
disabling PW_2
<7> [209.232166] i915 :00:02.0: [drm:intel_power_well_disable [i915]] 
disabling DC_off
<7> [209.232270] i915 :00:02.0: [drm:skl_enable_dc6 [i915]] Enabling DC6
<7> [209.232368] i915 :00:02.0: [drm:gen9_set_dc_state.part.0 [i915]] 
Setting DC state from 00 to 02
<4> [299.356116] [IGT] Inactivity timeout exceeded. Killing the current test 
with SIGQUIT.
...
<6> [299.356526] sysrq: Show State
...
<6> [299.373964] task:i915_selftest   state:D stack:11784 pid:5578  tgid:5578  
ppid:873flags:0x4002
<6> [299.373967] Call Trace:
<6> [299.373968]  
<6> [299.373970]  __schedule+0x3bb/0xda0
<6> [299.373974]  schedule+0x41/0x110
<6> [299.373976]  intel_wakeref_wait_for_idle+0x82/0x100 [i915]
<6> [299.374083]  ? __pfx_var_wake_function+0x10/0x10
<6> [299.374087]  live_engine_busy_stats+0x9b/0x500 [i915]
<6> [299.374173]  __i915_subtests+0xbe/0x240 [i915]
<6> [299.374277]  ? __pfx___intel_gt_live_setup+0x10/0x10 [i915]
<6> [299.374369]  ? __pfx___intel_gt_live_teardown+0x10/0x10 [i915]
<6> [299.374456]  intel_engine_live_selftests+0x1c/0x30 [i915]
<6> [299.374547]  __run_selftests+0xbb/0x190 [i915]
<6> [299.374635]  i915_live_selftests+0x4b/0x90 [i915]
<6> [299.374717]  i915_pci_probe+0x10d/0x210 [i915]

At the end of the interrupt worker, if there are no more engines awake,
disarm the breadcrumb and go to sleep.

Fixes: 9d5612ca165a ("drm/i915/gt: Defer enabling the breadcrumb interrupt to after 
submission")
Closes: https://gitlab.freedesktop.org/drm/intel/issues/10026
Signed-off-by: Chris Wilson 
Cc: Andrzej Hajda 
Cc: stable@vger.kernel.org # v5.12+
Signed-off-by: Janusz Krzysztofik 



Acked-by: Nirmoy Das 

I will let others/Andrzej r-b this as I am not very familiar with the code.


Thanks,

Nirmoy


---
  drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 15 +++
  1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c 
b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index d650beb8ed22f..20b9b04ec1e0b 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -263,8 +263,13 @@ static void signal_irq_work(struct irq_work *work)
i915_request_put(rq);
}
  
+	/* Lazy irq enabling after HW submission */

if (!READ_ONCE(b->irq_armed) && !list_empty(&b->signalers))
intel_breadcrumbs_arm_irq(b);
+
+   /* And confirm that we still want irqs enabled before we yield */
+   if (READ_ONCE(b->irq_armed) && !atomic_read(&b->active))
+   intel_breadcrumbs_disarm_irq(b);
  }
  
  struct intel_breadcrumbs *

@@ -315,13 +320,7 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b)
return;
  
  	/* Kick the work once more to drain the signalers, and disarm the irq */

-   irq_work_sync(&b->irq_work);
-   while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) {
-   local_irq_disable();
-   signal_irq_work(&b->irq_work);
-   local_irq_enable();
-   cond_resched();
-   }
+   irq_work_queue(&b->irq_work);
  }
  
  void intel_breadcrumbs_free(struct kref *kref)

@@ -404,7 +403,7 @@ static void insert_breadcrumb(struct i915_request *rq)
 * the request as it may have completed and raised the interrupt as
 * we were attaching it into the lists.
 */
-   if (!b->irq_armed || __i915_request_is_complete(rq))
+   if (!READ_ONCE(b->irq_armed) || __i915_request_is_complete(rq))
irq_work_queue(&b->irq_work);
  }
  


Re: ✗ Fi.CI.IGT: failure for series starting with [v2,1/2] drm/i915: Refactor confusing __intel_gt_reset() (rev2)

2024-04-24 Thread Das, Nirmoy
Thanks a lot!


--

Intel Deutschland GmbH
Registered Address: Am Campeon 10, 85579 Neubiberg, Germany
Tel: +49 89 99 8853-0, www.intel.de <http://www.intel.de>
Managing Directors: Christin Eisenschmid, Sharon Heck, Tiffany Doon Silva
Chairperson of the Supervisory Board: Nicole Lau
Registered Office: Munich
Commercial Register: Amtsgericht Muenchen HRB 186928



From: Andi Shyti 
Sent: Wednesday, April 24, 2024 7:06 PM
To: Nirmoy Das 
Cc: intel-gfx@lists.freedesktop.org ; 
Patchwork ; Das, Nirmoy 
; Andi Shyti 
Subject: Re: ✗ Fi.CI.IGT: failure for series starting with [v2,1/2] drm/i915: 
Refactor confusing __intel_gt_reset() (rev2)

Hi Nirmoy,

On Wed, Apr 24, 2024 at 10:56:36AM +0200, Nirmoy Das wrote:
>
> On 4/24/2024 10:16 AM, Patchwork wrote:
>
> Patch Details
>
> Series:  series starting with [v2,1/2] drm/i915: Refactor confusing
>  __intel_gt_reset() (rev2)
> URL: https://patchwork.freedesktop.org/series/132731/
> State:   failure
> Details: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_132731v2/
>  index.html
>
> CI Bug Log - changes from CI_DRM_14633_full -> Patchwork_132731v2_full
>
> Summary
>
> FAILURE
>
> Serious unknown changes coming with Patchwork_132731v2_full absolutely 
> need
> to be
> verified manually.
>
> If you think the reported changes have nothing to do with the changes
> introduced in Patchwork_132731v2_full, please notify your bug team ('
> i915-ci-in...@lists.freedesktop.org') to allow them
> to document this new failure mode, which will reduce false positives in 
> CI.
>
> External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_132731v2/
> index.html
>
> Participating hosts (9 -> 8)
>
> Missing (1): shard-dg2-set2
>
> Possible new issues
>
> Here are the unknown changes that may have been introduced in
> Patchwork_132731v2_full:
>
> IGT changes
>
> Possible regressions
>
>   □ igt@gem_exec_await@wide-all:
>
>   ☆ shard-dg1: NOTRUN -> INCOMPLETE
>   □ igt@gem_exec_gttfill@engines@ccs0:
>
>   ☆ shard-dg2: NOTRUN -> INCOMPLETE
>
> These are unrelated as the change only affects the case where GuC submission is disabled.
>
> Andi, could you please help me merge this one. My dev machine is still broken.

merged into drm-intel-gt-next.

Thanks,
Andi


Re: [PATCH v2 2/2] drm/i915: Fix gt reset with GuC submission is disabled

2024-04-23 Thread Nirmoy Das

Hi Andi,

On 4/23/2024 11:32 AM, Andi Shyti wrote:

Hi Nirmoy,

On Mon, Apr 22, 2024 at 10:19:51PM +0200, Nirmoy Das wrote:

Currently intel_gt_reset() kills the GuC and then resets requested
engines. This is problematic because there is a dedicated CSB FIFO
which only GuC can access and if that FIFO fills up, the hardware
will block on the next context switch until there is space, which means
the system is effectively hung. If an engine is reset whilst actively
executing a context, a CSB entry will be sent to say that the context
has gone idle. Thus if reset happens on a very busy system then
killing GuC before killing the engines will lead to deadlock because
of filled up CSB FIFO.

is this a fix?


I went quite far back in the commit logs, and it appears to me that 
we've always been using the current reset flow.


I believe we don't perform a GT reset immediately after sending a number 
of requests, which is what the current failed test is doing.


So, I don't think there will be any visible impact on the user with the 
current flow.





To address this issue, the GuC should be killed only after resetting
the requested engines and before calling intel_gt_init_hw().

v2: Improve commit message(John)

Cc: John Harrison 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/intel_reset.c | 16 ++--
  1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index b1393863ca9b..6161f7a3ff70 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -879,8 +879,17 @@ static intel_engine_mask_t reset_prepare(struct intel_gt 
*gt)
intel_engine_mask_t awake = 0;
enum intel_engine_id id;
  
-	/* For GuC mode, ensure submission is disabled before stopping ring */

-   intel_uc_reset_prepare(&gt->uc);
+   /**
+* For GuC mode with submission enabled, ensure submission
+* is disabled before stopping ring.

nit: "stopping *the* ring"

Will fix it while merging if I don't have to resend this again.



+*
+* For GuC mode with submission disabled, ensure that GuC is not
+* sanitized, do that after engine reset. reset_prepare()
+* is followed by engine reset which in this mode requires GuC to
+* process any CSB FIFO entries generated by the resets.
+*/
+   if (intel_uc_uses_guc_submission(&gt->uc))
+   intel_uc_reset_prepare(&gt->uc);
  
  	for_each_engine(engine, gt, id) {

if (intel_engine_pm_get_if_awake(engine))
@@ -1227,6 +1236,9 @@ void intel_gt_reset(struct intel_gt *gt,
  
  	intel_overlay_reset(gt->i915);
  
+	/* sanitize uC after engine reset */

+   if (!intel_uc_uses_guc_submission(&gt->uc))
+   intel_uc_reset_prepare(&gt->uc);

Reviewed-by: Andi Shyti 


Thanks,

Nirmoy


Thanks,
Andi


Re: [PATCH v3 1/3] drm/i915/gem: Increment vma offset when mapping fb objects

2024-04-23 Thread Nirmoy Das

Hi Andi,

On 4/12/2024 2:48 AM, Andi Shyti wrote:

Until now the "vm_pgoff" was not used and there has been no need
to set its offset.

But now, because we want to support partial mappings with a given
offset, we need it to be set.

Suggested-by: Chris Wilson 
Signed-off-by: Andi Shyti 


Do we have an IGT test for partial FB mmap? It would be nice to have one, 
but this patch looks good to me.



Reviewed-by: Nirmoy Das 


Regards,

Nirmoy


---
  drivers/gpu/drm/i915/gem/i915_gem_mman.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index a2195e28b625..ce10dd259812 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -1084,6 +1084,8 @@ int i915_gem_fb_mmap(struct drm_i915_gem_object *obj, 
struct vm_area_struct *vma
mmo = mmap_offset_attach(obj, mmap_type, NULL);
if (IS_ERR(mmo))
return PTR_ERR(mmo);
+
+   vma->vm_pgoff += drm_vma_node_start(&mmo->vma_node);
}
  
  	/*


Re: [PATCH] drm/i915/gt: Refactor uabi engine class/instance list creation

2024-04-23 Thread Nirmoy Das

Hi Andi,

On 4/17/2024 12:49 AM, Andi Shyti wrote:

For the upcoming changes we need a cleaner way to build the list
of uabi engines.

Suggested-by: Tvrtko Ursulin
Signed-off-by: Andi Shyti
---
Hi,

just sending this patch to unburden the coming series from this
single patch inherited from a previously sent series.

Andi

  drivers/gpu/drm/i915/gt/intel_engine_user.c | 29 -
  1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c 
b/drivers/gpu/drm/i915/gt/intel_engine_user.c
index 833987015b8b..11cc06c0c785 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
@@ -203,7 +203,7 @@ static void engine_rename(struct intel_engine_cs *engine, 
const char *name, u16
  
  void intel_engines_driver_register(struct drm_i915_private *i915)

  {
-   u16 name_instance, other_instance = 0;
+   u16 class_instance[I915_LAST_UABI_ENGINE_CLASS + 2] = { };

+2 is confusing here. I think we need a better macro.

struct legacy_ring ring = {};
struct list_head *it, *next;
struct rb_node **p, *prev;
@@ -214,6 +214,8 @@ void intel_engines_driver_register(struct drm_i915_private 
*i915)
prev = NULL;
p = &i915->uabi_engines.rb_node;
list_for_each_safe(it, next, &engines) {
+   u16 uabi_class;
+
struct intel_engine_cs *engine =
container_of(it, typeof(*engine), uabi_list);
  
@@ -222,15 +224,14 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
  
  		GEM_BUG_ON(engine->class >= ARRAY_SIZE(uabi_classes));

engine->uabi_class = uabi_classes[engine->class];
-   if (engine->uabi_class == I915_NO_UABI_CLASS) {
-   name_instance = other_instance++;
-   } else {
-   GEM_BUG_ON(engine->uabi_class >=
-  ARRAY_SIZE(i915->engine_uabi_class_count));
-   name_instance =
-   
i915->engine_uabi_class_count[engine->uabi_class]++;
-   }
-   engine->uabi_instance = name_instance;
+
+   if (engine->uabi_class == I915_NO_UABI_CLASS)
+   uabi_class = I915_LAST_UABI_ENGINE_CLASS + 1;
+   else
+   uabi_class = engine->uabi_class;
+
+   GEM_BUG_ON(uabi_class >= ARRAY_SIZE(class_instance));
+   engine->uabi_instance = class_instance[uabi_class]++;
  
  		/*

 * Replace the internal name with the final user and log facing
@@ -238,11 +239,15 @@ void intel_engines_driver_register(struct 
drm_i915_private *i915)
 */
engine_rename(engine,
  intel_engine_class_repr(engine->class),
- name_instance);
+ engine->uabi_instance);
  
-		if (engine->uabi_class == I915_NO_UABI_CLASS)

+   if (uabi_class > I915_LAST_UABI_ENGINE_CLASS)
continue;
  
+		GEM_BUG_ON(uabi_class >=

+  ARRAY_SIZE(i915->engine_uabi_class_count));
+   i915->engine_uabi_class_count[uabi_class]++;
Shouldn't this be  i915->engine_uabi_class_count[uabi_class] = 
class_instance[uabi_class]; ?


What I see is that this patch is mainly adding this class_instance array 
and the rest looks the same.
Maybe it makes sense to include the other upcoming patches to better understand 
why we need this patch.


Regards,
Nirmoy


+
rb_link_node(&engine->uabi_node, prev, p);
rb_insert_color(&engine->uabi_node, &i915->uabi_engines);
  

[PATCH v2 2/2] drm/i915: Fix gt reset with GuC submission is disabled

2024-04-22 Thread Nirmoy Das
Currently intel_gt_reset() kills the GuC and then resets requested
engines. This is problematic because there is a dedicated CSB FIFO
which only GuC can access and if that FIFO fills up, the hardware
will block on the next context switch until there is space, which means
the system is effectively hung. If an engine is reset whilst actively
executing a context, a CSB entry will be sent to say that the context
has gone idle. Thus if reset happens on a very busy system then
killing GuC before killing the engines will lead to deadlock because
of filled up CSB FIFO.

To address this issue, the GuC should be killed only after resetting
the requested engines and before calling intel_gt_init_hw().

v2: Improve commit message(John)

Cc: John Harrison 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gt/intel_reset.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index b1393863ca9b..6161f7a3ff70 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -879,8 +879,17 @@ static intel_engine_mask_t reset_prepare(struct intel_gt 
*gt)
intel_engine_mask_t awake = 0;
enum intel_engine_id id;
 
-   /* For GuC mode, ensure submission is disabled before stopping ring */
-   intel_uc_reset_prepare(&gt->uc);
+   /**
+* For GuC mode with submission enabled, ensure submission
+* is disabled before stopping ring.
+*
+* For GuC mode with submission disabled, ensure that GuC is not
+* sanitized, do that after engine reset. reset_prepare()
+* is followed by engine reset which in this mode requires GuC to
+* process any CSB FIFO entries generated by the resets.
+*/
+   if (intel_uc_uses_guc_submission(&gt->uc))
+   intel_uc_reset_prepare(&gt->uc);
 
for_each_engine(engine, gt, id) {
if (intel_engine_pm_get_if_awake(engine))
@@ -1227,6 +1236,9 @@ void intel_gt_reset(struct intel_gt *gt,
 
intel_overlay_reset(gt->i915);
 
+   /* sanitize uC after engine reset */
+   if (!intel_uc_uses_guc_submission(&gt->uc))
+   intel_uc_reset_prepare(&gt->uc);
/*
 * Next we need to restore the context, but we don't use those
 * yet either...
-- 
2.42.0



[PATCH v2 1/2] drm/i915: Refactor confusing __intel_gt_reset()

2024-04-22 Thread Nirmoy Das
__intel_gt_reset() is really for resetting engines though
the name might suggest something else. So add a helper function
to remove confusions with no functional changes.

v2: Move intel_gt_reset_all_engines() next to
intel_gt_reset_engine() to make diff simple(John)

Cc: John Harrison 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |  2 +-
 .../drm/i915/gt/intel_execlists_submission.c  |  2 +-
 drivers/gpu/drm/i915/gt/intel_gt.c|  2 +-
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |  2 +-
 drivers/gpu/drm/i915/gt/intel_reset.c | 35 +++
 drivers/gpu/drm/i915/gt/intel_reset.h |  3 +-
 drivers/gpu/drm/i915/gt/selftest_reset.c  |  2 +-
 drivers/gpu/drm/i915/i915_driver.c|  2 +-
 8 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 8c44af1c3451..5c8e9ee3b008 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -678,7 +678,7 @@ void intel_engines_release(struct intel_gt *gt)
 */
GEM_BUG_ON(intel_gt_pm_is_awake(gt));
if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
-   __intel_gt_reset(gt, ALL_ENGINES);
+   intel_gt_reset_all_engines(gt);
 
/* Decouple the backend; but keep the layout for late GPU resets */
for_each_engine(engine, gt, id) {
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 355aab5b38ba..21829439e686 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2898,7 +2898,7 @@ static void enable_error_interrupt(struct intel_engine_cs 
*engine)
drm_err(&engine->i915->drm,
"engine '%s' resumed still in error: %08x\n",
engine->name, status);
-   __intel_gt_reset(engine->gt, engine->mask);
+   intel_gt_reset_engine(engine);
}
 
/*
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index 580b5141ce1e..626b166e67ef 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -832,7 +832,7 @@ void intel_gt_driver_unregister(struct intel_gt *gt)
 
/* Scrub all HW state upon release */
with_intel_runtime_pm(gt->uncore->rpm, wakeref)
-   __intel_gt_reset(gt, ALL_ENGINES);
+   intel_gt_reset_all_engines(gt);
 }
 
 void intel_gt_driver_release(struct intel_gt *gt)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 220ac4f92edf..c08fdb65cc69 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -159,7 +159,7 @@ static bool reset_engines(struct intel_gt *gt)
if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
return false;
 
-   return __intel_gt_reset(gt, ALL_ENGINES) == 0;
+   return intel_gt_reset_all_engines(gt) == 0;
 }
 
 static void gt_sanitize(struct intel_gt *gt, bool force)
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index c8e9aa41fdea..b1393863ca9b 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -764,7 +764,7 @@ wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t 
engine_mask)
 HECI_H_GS1_ER_PREP, 0);
 }
 
-int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
+static int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t 
engine_mask)
 {
const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
reset_func reset;
@@ -978,7 +978,7 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
 
/* Even if the GPU reset fails, it should still stop the engines */
if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
-   __intel_gt_reset(gt, ALL_ENGINES);
+   intel_gt_reset_all_engines(gt);
 
for_each_engine(engine, gt, id)
engine->submit_request = nop_submit_request;
@@ -1089,7 +1089,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
/* We must reset pending GPU events before restoring our submission */
ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
-   ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
+   ok = intel_gt_reset_all_engines(gt) == 0;
if (!ok) {
/*
 * Warn CI about the unrecoverable wedged condition.
@@ -1133,10 +1133,10 @@ static int do_reset(struct intel_gt *gt, 
intel_engine_mask_t stalled_mask)
 {
   

Re: [PATCH 3/3] drm/i915: Fix gt reset with GuC submission disabled

2024-04-19 Thread Nirmoy Das

Hi John,

On 4/19/2024 1:38 AM, John Harrison wrote:

On 4/18/2024 10:10, Nirmoy Das wrote:

Currently intel_gt_reset() happens as follows:

reset_prepare() ---> Sends GDRST to GuC, GuC is in GS_MIA_IN_RESET
do_reset()
   intel_gt_reset_all_engines()
 *_engine_reset_prepare() -->RESET_CTL expects running GuC
Not technically correct. There is no direct connection between 
RESET_CTL and GuC.



 *_reset_engines()
intel_gt_init_hw() --> GuC comes out of GS_MIA_IN_RESET with FW loaded.

Fix the issue by sanitizing the GuC only after resetting requested
engines and before intel_gt_init_hw().

You never actually state what the issue is.

The problem is that there is a dedicated CSB FIFO going to GuC (and 
nothing else has access to it). If that FIFO fills up, the hardware 
will block on the next context switch until there is space. If no-one 
(i.e. GuC) is draining it, that means the system is effectively hung. 
If an engine is reset whilst actively executing a context, a CSB entry 
will be sent to say that the context has gone idle. Thus if you reset 
a very busy system and start with killing GuC before killing the 
engines and only then re-enabling GuC, you run the risk of generating 
more CSB entries than will fit in the FIFO and deadlocking. Whereas, 
if the system is idle then you can reset the engines as much as you 
like while GuC is dead and it won't be a problem.



I wasn't sure if I could talk about internal details, so I kept it 
minimal. I will borrow the above explanation and resend :)






Note intel_uc_reset_finish() and intel_uc_reset() are nop when
guc submission is disabled.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/intel_reset.c | 16 ++--
  1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c

index 6504e8ba9c58..bd166f5aca4b 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -907,8 +907,17 @@ static intel_engine_mask_t reset_prepare(struct 
intel_gt *gt)

  intel_engine_mask_t awake = 0;
  enum intel_engine_id id;
  -    /* For GuC mode, ensure submission is disabled before stopping 
ring */

-    intel_uc_reset_prepare(>->uc);
+    /**
+ * For GuC mode with submission enabled, ensure submission
+ * is disabled before stopping ring.
+ *
+ * For GuC mode with submission disabled, ensure that GuC is not
+ * sanitized, do that at the end in reset_finish(). reset_prepare()
+ * is followed by engine reset which in this mode requires GuC to
+ * be functional to process engine reset events.

-> to process any CSB FIFO entries generated by the resets.


I will add this.


Thanks,

Nirmoy



John.


+ */
+    if (intel_uc_uses_guc_submission(>->uc))
+    intel_uc_reset_prepare(>->uc);
    for_each_engine(engine, gt, id) {
  if (intel_engine_pm_get_if_awake(engine))
@@ -1255,6 +1264,9 @@ void intel_gt_reset(struct intel_gt *gt,
    intel_overlay_reset(gt->i915);
  +    /* sanitize uC after engine reset */
+    if (!intel_uc_uses_guc_submission(>->uc))
+    intel_uc_reset_prepare(>->uc);
  /*
   * Next we need to restore the context, but we don't use those
   * yet either...




Re: [PATCH 2/3] drm/i915 Rename intel_engine_reset to intel_gt_engine_recover

2024-04-19 Thread Nirmoy Das

Hi John,

On 4/19/2024 1:27 AM, John Harrison wrote:

On 4/18/2024 10:10, Nirmoy Das wrote:

intel_engine_reset() not only resets an engine but also
tries to recover it, so give it a proper name without
any functional changes.
Not seeing what the difference is. If this was a super low level 
function (with an __ prefix for example) then one might expect it to 
literally just poke the reset register and leave the engine in a dead 
state. But as a high level function, I think it is reasonable to 
expect a reset function to 'recover' the entity being reset.


Also, many of the callers are tests that are explicitly testing reset. 
So now the tests all talk about attempting resets, resets failing, 
etc. but around a call to 'recover' instead of 'reset', which seems 
confusing.



Didn't think about it, I will drop it.


Thanks,

Nirmoy



John.



Signed-off-by: Nirmoy Das 
---
  .../drm/i915/gem/selftests/i915_gem_context.c |  2 +-
  .../drm/i915/gt/intel_execlists_submission.c  |  2 +-
  drivers/gpu/drm/i915/gt/intel_reset.c |  4 ++--
  drivers/gpu/drm/i915/gt/intel_reset.h |  4 ++--
  drivers/gpu/drm/i915/gt/selftest_hangcheck.c  | 20 +--
  drivers/gpu/drm/i915/gt/selftest_mocs.c   |  4 ++--
  drivers/gpu/drm/i915/gt/selftest_reset.c  |  2 +-
  .../gpu/drm/i915/gt/selftest_workarounds.c    |  6 +++---
  8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c

index 89d4dc8b60c6..4f4cde55f621 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
@@ -1171,7 +1171,7 @@ __sseu_finish(const char *name,
  int ret = 0;
    if (flags & TEST_RESET) {
-    ret = intel_engine_reset(ce->engine, "sseu");
+    ret = intel_gt_engine_recover(ce->engine, "sseu");
  if (ret)
  goto out;
  }
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c

index 21829439e686..9485a622a704 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2404,7 +2404,7 @@ static void execlists_reset(struct 
intel_engine_cs *engine, const char *msg)
    ring_set_paused(engine, 1); /* Freeze the current request in 
place */

  execlists_capture(engine);
-    intel_engine_reset(engine, msg);
+    intel_gt_engine_recover(engine, msg);
tasklet_enable(&engine->sched_engine->tasklet);
  clear_and_wake_up_bit(bit, lock);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c

index b825daace58e..6504e8ba9c58 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1348,7 +1348,7 @@ int __intel_engine_reset_bh(struct 
intel_engine_cs *engine, const char *msg)

  }
    /**
- * intel_engine_reset - reset GPU engine to recover from a hang
+ * intel_gt_engine_recover - reset GPU engine to recover from a hang
   * @engine: engine to reset
   * @msg: reason for GPU reset; or NULL for no drm_notice()
   *
@@ -1360,7 +1360,7 @@ int __intel_engine_reset_bh(struct 
intel_engine_cs *engine, const char *msg)

   *  - reset engine (which will force the engine to idle)
   *  - re-init/configure engine
   */
-int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
+int intel_gt_engine_recover(struct intel_engine_cs *engine, const 
char *msg)

  {
  int err;
  diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h 
b/drivers/gpu/drm/i915/gt/intel_reset.h

index c00de353075c..be984357bf27 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.h
+++ b/drivers/gpu/drm/i915/gt/intel_reset.h
@@ -31,8 +31,8 @@ void intel_gt_handle_error(struct intel_gt *gt,
  void intel_gt_reset(struct intel_gt *gt,
  intel_engine_mask_t stalled_mask,
  const char *reason);
-int intel_engine_reset(struct intel_engine_cs *engine,
-   const char *reason);
+int intel_gt_engine_recover(struct intel_engine_cs *engine,
+    const char *reason);
  int __intel_engine_reset_bh(struct intel_engine_cs *engine,
  const char *reason);
  diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c 
b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c

index 9ce8ff1c04fe..9bfda3f2bd24 100644
--- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -495,9 +495,9 @@ static int igt_reset_nop_engine(void *arg)
    i915_request_add(rq);
  }
-    err = intel_engine_reset(engine, NULL);
+    err = intel_gt_engine_recover(engine, NULL);
  if (err) {
-    pr_err("intel_engine_reset(%s) failed, err:%d\n",
+    pr_err("intel_gt_engine_recover(%

Re: [PATCH 1/3] drm/i915: Refactor confusing __intel_gt_reset()

2024-04-19 Thread Nirmoy Das

Hi John.

On 4/19/2024 1:27 AM, John Harrison wrote:

On 4/18/2024 10:10, Nirmoy Das wrote:

__intel_gt_reset() is really for resetting engines though
the name might suggest something else. So add two helper functions
to remove confusions with no functional changes.
Technically you only added one and just moved the other :). It already 
existed, it just wasn't being used everywhere that it could be!


I did have one more helper function, but I removed it in favor of
intel_gt_reset_engine() and didn't update the commit message :/


Thanks for catching it. I will fix it.





Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |  2 +-
  .../drm/i915/gt/intel_execlists_submission.c  |  2 +-
  drivers/gpu/drm/i915/gt/intel_gt.c    |  2 +-
  drivers/gpu/drm/i915/gt/intel_gt_pm.c |  2 +-
  drivers/gpu/drm/i915/gt/intel_reset.c | 43 ++-
  drivers/gpu/drm/i915/gt/intel_reset.h |  3 +-
  drivers/gpu/drm/i915/gt/selftest_reset.c  |  2 +-
  drivers/gpu/drm/i915/i915_driver.c    |  2 +-
  8 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c

index 8c44af1c3451..5c8e9ee3b008 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -678,7 +678,7 @@ void intel_engines_release(struct intel_gt *gt)
   */
  GEM_BUG_ON(intel_gt_pm_is_awake(gt));
  if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
-    __intel_gt_reset(gt, ALL_ENGINES);
+    intel_gt_reset_all_engines(gt);
    /* Decouple the backend; but keep the layout for late GPU 
resets */

  for_each_engine(engine, gt, id) {
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c

index 355aab5b38ba..21829439e686 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2898,7 +2898,7 @@ static void enable_error_interrupt(struct 
intel_engine_cs *engine)

  drm_err(&engine->i915->drm,
  "engine '%s' resumed still in error: %08x\n",
  engine->name, status);
-    __intel_gt_reset(engine->gt, engine->mask);
+    intel_gt_reset_engine(engine);
  }
    /*
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c

index 580b5141ce1e..626b166e67ef 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -832,7 +832,7 @@ void intel_gt_driver_unregister(struct intel_gt *gt)
    /* Scrub all HW state upon release */
  with_intel_runtime_pm(gt->uncore->rpm, wakeref)
-    __intel_gt_reset(gt, ALL_ENGINES);
+    intel_gt_reset_all_engines(gt);
  }
    void intel_gt_driver_release(struct intel_gt *gt)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm.c

index 220ac4f92edf..c08fdb65cc69 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -159,7 +159,7 @@ static bool reset_engines(struct intel_gt *gt)
  if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
  return false;
  -    return __intel_gt_reset(gt, ALL_ENGINES) == 0;
+    return intel_gt_reset_all_engines(gt) == 0;
  }
    static void gt_sanitize(struct intel_gt *gt, bool force)
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c

index c8e9aa41fdea..b825daace58e 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -764,7 +764,7 @@ wa_14015076503_end(struct intel_gt *gt, 
intel_engine_mask_t engine_mask)

   HECI_H_GS1_ER_PREP, 0);
  }
  -int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t 
engine_mask)
+static int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t 
engine_mask)

  {
  const int retries = engine_mask == ALL_ENGINES ? 
RESET_MAX_RETRIES : 1;

  reset_func reset;
@@ -795,6 +795,34 @@ int __intel_gt_reset(struct intel_gt *gt, 
intel_engine_mask_t engine_mask)

  return ret;
  }
  +/**
+ * intel_gt_reset_all_engines() - Reset all engines in the given gt.
+ * @gt: the GT to reset all engines for.
+ *
+ * This function resets all engines within the given gt.
+ *
+ * Returns:
+ * Zero on success, negative error code on failure.
+ */
+int intel_gt_reset_all_engines(struct intel_gt *gt)
+{
+    return __intel_gt_reset(gt, ALL_ENGINES);
+}
+
+/**
+ * intel_gt_reset_engine() - Reset a specific engine within a gt.
+ * @engine: engine to be reset.
+ *
+ * This function resets the specified engine within a gt.
+ *
+ * Returns:
+ * Zero on success, negative error code on failure.
+ */
+int intel_gt_reset_engine(struct intel_engine_cs *engine)
+{
+    return __intel_gt_reset(engine->gt, engine->mask);
+}
+

[PATCH 3/3] drm/i915: Fix gt reset with GuC submission disabled

2024-04-18 Thread Nirmoy Das
Currently intel_gt_reset() happens as follows:

reset_prepare() ---> Sends GDRST to GuC, GuC is in GS_MIA_IN_RESET
do_reset()
  intel_gt_reset_all_engines()
*_engine_reset_prepare() -->RESET_CTL expects running GuC
*_reset_engines()
intel_gt_init_hw() --> GuC comes out of GS_MIA_IN_RESET with FW loaded.

Fix the issue by sanitizing the GuC only after resetting requested
engines and before intel_gt_init_hw().

Note that intel_uc_reset_finish() and intel_uc_reset() are no-ops when
GuC submission is disabled.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gt/intel_reset.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index 6504e8ba9c58..bd166f5aca4b 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -907,8 +907,17 @@ static intel_engine_mask_t reset_prepare(struct intel_gt 
*gt)
intel_engine_mask_t awake = 0;
enum intel_engine_id id;
 
-   /* For GuC mode, ensure submission is disabled before stopping ring */
-   intel_uc_reset_prepare(&gt->uc);
+   /**
+* For GuC mode with submission enabled, ensure submission
+* is disabled before stopping ring.
+*
+* For GuC mode with submission disabled, ensure that GuC is not
+* sanitized, do that at the end in reset_finish(). reset_prepare()
+* is followed by engine reset which in this mode requires GuC to
+* be functional to process engine reset events.
+*/
+   if (intel_uc_uses_guc_submission(&gt->uc))
+   intel_uc_reset_prepare(&gt->uc);
 
for_each_engine(engine, gt, id) {
if (intel_engine_pm_get_if_awake(engine))
@@ -1255,6 +1264,9 @@ void intel_gt_reset(struct intel_gt *gt,
 
intel_overlay_reset(gt->i915);
 
+   /* sanitize uC after engine reset */
+   if (!intel_uc_uses_guc_submission(&gt->uc))
+   intel_uc_reset_prepare(&gt->uc);
/*
 * Next we need to restore the context, but we don't use those
 * yet either...
-- 
2.42.0



[PATCH 2/3] drm/i915 Rename intel_engine_reset to intel_gt_engine_recover

2024-04-18 Thread Nirmoy Das
intel_engine_reset() not only resets an engine but also
tries to recover it, so give it a proper name without
any functional changes.

Signed-off-by: Nirmoy Das 
---
 .../drm/i915/gem/selftests/i915_gem_context.c |  2 +-
 .../drm/i915/gt/intel_execlists_submission.c  |  2 +-
 drivers/gpu/drm/i915/gt/intel_reset.c |  4 ++--
 drivers/gpu/drm/i915/gt/intel_reset.h |  4 ++--
 drivers/gpu/drm/i915/gt/selftest_hangcheck.c  | 20 +--
 drivers/gpu/drm/i915/gt/selftest_mocs.c   |  4 ++--
 drivers/gpu/drm/i915/gt/selftest_reset.c  |  2 +-
 .../gpu/drm/i915/gt/selftest_workarounds.c|  6 +++---
 8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
index 89d4dc8b60c6..4f4cde55f621 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
@@ -1171,7 +1171,7 @@ __sseu_finish(const char *name,
int ret = 0;
 
if (flags & TEST_RESET) {
-   ret = intel_engine_reset(ce->engine, "sseu");
+   ret = intel_gt_engine_recover(ce->engine, "sseu");
if (ret)
goto out;
}
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 21829439e686..9485a622a704 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2404,7 +2404,7 @@ static void execlists_reset(struct intel_engine_cs 
*engine, const char *msg)
 
ring_set_paused(engine, 1); /* Freeze the current request in place */
execlists_capture(engine);
-   intel_engine_reset(engine, msg);
+   intel_gt_engine_recover(engine, msg);
 
tasklet_enable(&engine->sched_engine->tasklet);
clear_and_wake_up_bit(bit, lock);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index b825daace58e..6504e8ba9c58 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1348,7 +1348,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs 
*engine, const char *msg)
 }
 
 /**
- * intel_engine_reset - reset GPU engine to recover from a hang
+ * intel_gt_engine_recover - reset GPU engine to recover from a hang
  * @engine: engine to reset
  * @msg: reason for GPU reset; or NULL for no drm_notice()
  *
@@ -1360,7 +1360,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs 
*engine, const char *msg)
  *  - reset engine (which will force the engine to idle)
  *  - re-init/configure engine
  */
-int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
+int intel_gt_engine_recover(struct intel_engine_cs *engine, const char *msg)
 {
int err;
 
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h 
b/drivers/gpu/drm/i915/gt/intel_reset.h
index c00de353075c..be984357bf27 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.h
+++ b/drivers/gpu/drm/i915/gt/intel_reset.h
@@ -31,8 +31,8 @@ void intel_gt_handle_error(struct intel_gt *gt,
 void intel_gt_reset(struct intel_gt *gt,
intel_engine_mask_t stalled_mask,
const char *reason);
-int intel_engine_reset(struct intel_engine_cs *engine,
-  const char *reason);
+int intel_gt_engine_recover(struct intel_engine_cs *engine,
+   const char *reason);
 int __intel_engine_reset_bh(struct intel_engine_cs *engine,
const char *reason);
 
diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c 
b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
index 9ce8ff1c04fe..9bfda3f2bd24 100644
--- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -495,9 +495,9 @@ static int igt_reset_nop_engine(void *arg)
 
i915_request_add(rq);
}
-   err = intel_engine_reset(engine, NULL);
+   err = intel_gt_engine_recover(engine, NULL);
if (err) {
-   pr_err("intel_engine_reset(%s) failed, 
err:%d\n",
+   pr_err("intel_gt_engine_recover(%s) failed, 
err:%d\n",
   engine->name, err);
break;
}
@@ -574,7 +574,7 @@ static int igt_reset_fail_engine(void *arg)
&gt->reset.flags));
 
force_reset_timeout(engine);
-   err = intel_engine_reset(engine, NULL);
+   err = intel_gt_engine_recover(engine, NULL);
cancel_reset_timeout(engine);
if (err == 0) /* timeouts only generated on gen8+ */
goto

[PATCH 1/3] drm/i915: Refactor confusing __intel_gt_reset()

2024-04-18 Thread Nirmoy Das
__intel_gt_reset() is really for resetting engines, though
the name might suggest something else. So add two helper functions
to remove the confusion, with no functional changes.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |  2 +-
 .../drm/i915/gt/intel_execlists_submission.c  |  2 +-
 drivers/gpu/drm/i915/gt/intel_gt.c|  2 +-
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |  2 +-
 drivers/gpu/drm/i915/gt/intel_reset.c | 43 ++-
 drivers/gpu/drm/i915/gt/intel_reset.h |  3 +-
 drivers/gpu/drm/i915/gt/selftest_reset.c  |  2 +-
 drivers/gpu/drm/i915/i915_driver.c|  2 +-
 8 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 8c44af1c3451..5c8e9ee3b008 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -678,7 +678,7 @@ void intel_engines_release(struct intel_gt *gt)
 */
GEM_BUG_ON(intel_gt_pm_is_awake(gt));
if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
-   __intel_gt_reset(gt, ALL_ENGINES);
+   intel_gt_reset_all_engines(gt);
 
/* Decouple the backend; but keep the layout for late GPU resets */
for_each_engine(engine, gt, id) {
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 355aab5b38ba..21829439e686 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2898,7 +2898,7 @@ static void enable_error_interrupt(struct intel_engine_cs 
*engine)
drm_err(&engine->i915->drm,
"engine '%s' resumed still in error: %08x\n",
engine->name, status);
-   __intel_gt_reset(engine->gt, engine->mask);
+   intel_gt_reset_engine(engine);
}
 
/*
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index 580b5141ce1e..626b166e67ef 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -832,7 +832,7 @@ void intel_gt_driver_unregister(struct intel_gt *gt)
 
/* Scrub all HW state upon release */
with_intel_runtime_pm(gt->uncore->rpm, wakeref)
-   __intel_gt_reset(gt, ALL_ENGINES);
+   intel_gt_reset_all_engines(gt);
 }
 
 void intel_gt_driver_release(struct intel_gt *gt)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 220ac4f92edf..c08fdb65cc69 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -159,7 +159,7 @@ static bool reset_engines(struct intel_gt *gt)
if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
return false;
 
-   return __intel_gt_reset(gt, ALL_ENGINES) == 0;
+   return intel_gt_reset_all_engines(gt) == 0;
 }
 
 static void gt_sanitize(struct intel_gt *gt, bool force)
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index c8e9aa41fdea..b825daace58e 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -764,7 +764,7 @@ wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t 
engine_mask)
 HECI_H_GS1_ER_PREP, 0);
 }
 
-int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
+static int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t 
engine_mask)
 {
const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
reset_func reset;
@@ -795,6 +795,34 @@ int __intel_gt_reset(struct intel_gt *gt, 
intel_engine_mask_t engine_mask)
return ret;
 }
 
+/**
+ * intel_gt_reset_all_engines() - Reset all engines in the given gt.
+ * @gt: the GT to reset all engines for.
+ *
+ * This function resets all engines within the given gt.
+ *
+ * Returns:
+ * Zero on success, negative error code on failure.
+ */
+int intel_gt_reset_all_engines(struct intel_gt *gt)
+{
+   return __intel_gt_reset(gt, ALL_ENGINES);
+}
+
+/**
+ * intel_gt_reset_engine() - Reset a specific engine within a gt.
+ * @engine: engine to be reset.
+ *
+ * This function resets the specified engine within a gt.
+ *
+ * Returns:
+ * Zero on success, negative error code on failure.
+ */
+int intel_gt_reset_engine(struct intel_engine_cs *engine)
+{
+   return __intel_gt_reset(engine->gt, engine->mask);
+}
+
 bool intel_has_gpu_reset(const struct intel_gt *gt)
 {
if (!gt->i915->params.reset)
@@ -978,7 +1006,7 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
 
/* Even if the GPU reset fails, it should still stop the engines */
if (!INTEL_INFO(gt->i915)->g

Re: [RFC PATCH] drm/i915: Don't reset GuC before engine reset on full GT reset

2024-04-17 Thread Nirmoy Das

Hi John,

On 4/17/2024 2:37 AM, John Harrison wrote:

On 4/15/2024 09:44, Nirmoy Das wrote:

Currently intel_gt_reset() happens as follows:

reset_prepare() ---> Sends GDRST to GuC, GuC is in GS_MIA_IN_RESET
do_reset()
__intel_gt_reset()
    *_engine_reset_prepare() -->RESET_CTL expects running
    GuC
    *_reset_engines()
intel_gt_init_hw() --> GuC FW loading happens, GuC comes out of
GS_MIA_IN_RESET.

Fix the above flow so that the GuC reset happens after all the
engine resets are done.

Cc: John Harrison 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/intel_reset.c |  9 --
  drivers/gpu/drm/i915/gt/uc/intel_uc.c | 42 +--
  drivers/gpu/drm/i915/gt/uc/intel_uc.h |  1 +
  3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c

index c8e9aa41fdea..9ebd68ce0c22 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -879,8 +879,11 @@ static intel_engine_mask_t reset_prepare(struct 
intel_gt *gt)

  intel_engine_mask_t awake = 0;
  enum intel_engine_id id;
  -    /* For GuC mode, ensure submission is disabled before stopping 
ring */

-    intel_uc_reset_prepare(&gt->uc);
+    /*
+ * For GuC mode, ensure submission is disabled before stopping 
ring.

+ * Don't reset the GuC a engine reset requires GuC to be running.
These two lines appear to be mutually exclusive unless there is a test 
for GuC submission being enabled, which I am not seeing. Note that 
"ensure submission is disabled" means "reset the GuC".



+ */
+    intel_uc_reset_prepare_without_guc_reset(&gt->uc);
    for_each_engine(engine, gt, id) {
  if (intel_engine_pm_get_if_awake(engine))
@@ -1227,6 +1230,8 @@ void intel_gt_reset(struct intel_gt *gt,
    intel_overlay_reset(gt->i915);
  +    /* Now that all engines are clean, Reset the GuC */
+    intel_uc_reset_prepare(&gt->uc);
  /*
   * Next we need to restore the context, but we don't use those
   * yet either...
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc.c

index 7a63abf8f644..5feee4db2ccc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -345,7 +345,7 @@ static void __uc_fini(struct intel_uc *uc)
  intel_guc_fini(&uc->guc);
  }
  -static int __uc_sanitize(struct intel_uc *uc)
+static void __uc_sanitize_without_guc_reset(struct intel_uc *uc)
  {
  struct intel_guc *guc = &uc->guc;
  struct intel_huc *huc = &uc->huc;
@@ -354,7 +354,11 @@ static int __uc_sanitize(struct intel_uc *uc)
    intel_huc_sanitize(huc);
  intel_guc_sanitize(guc);
+}
This seems like an extremely bad idea. You are wiping out all the GuC 
communication structures on the host side while the GuC itself is 
still executing and using those same structures.


Is the failure when doing individual engine resets or when doing a 
full GT reset?


The failed test is doing "intel_gt_reset(gt, ALL_ENGINES, NULL)" so a 
full GT reset.





If the former, I think a better approach would be to just not reset 
GuC at all (or indeed any UC) if not using GuC submission. Although, 
looking at the code, I'm not seeing an engine only reset path that 
does nuke the UC layers?



Yes, intel_engine_reset() doesn't touch the UC layer.



If it is the latter, 


This is the case here.


then how/why are individual engine resets happening in the middle of a 
full GT reset? Don't we just splat everything all at once? 


It seems we use __intel_gt_reset(engine->gt, engine_mask) to reset all 
or some engines.


Either way, it would be safer to split at the GT reset code layer 
rather than inside the UC layer. That is, when not using GuC 
submission, do the entire prepare/reset/init sequence of the UC layers 
as one 'atomic' operation either before the GT/engine reset or after 
it (or potentially both before and after?).


I think this should work. Let me try it out


Thanks,

Nirmoy




John.



  +static int __uc_sanitize(struct intel_uc *uc)
+{
+    __uc_sanitize_without_guc_reset(uc);
  return __intel_uc_reset_hw(uc);
  }
  @@ -593,13 +597,7 @@ static void __uc_fini_hw(struct intel_uc *uc)
  __uc_sanitize(uc);
  }
  -/**
- * intel_uc_reset_prepare - Prepare for reset
- * @uc: the intel_uc structure
- *
- * Preparing for full gpu reset.
- */
-void intel_uc_reset_prepare(struct intel_uc *uc)
+static void __intel_uc_reset_prepare(struct intel_uc *uc, bool 
reset_guc)

  {
  struct intel_guc *guc = &uc->guc;
  @@ -617,9 +615,35 @@ void intel_uc_reset_prepare(struct intel_uc *uc)
  intel_guc_submission_reset_prepare(guc);
    sanitize:
-    __uc_sanitize(uc);
+    if (reset_guc)
+    __uc_sanitize(uc);
+    else
+    __uc_sanitize_without_guc_reset(uc);
  }
  +/**
+

Re: [PATCH i-g-t] i915/gem_mmap_offset: Partial mmap and munmap

2024-04-17 Thread Nirmoy Das



On 4/12/2024 2:42 AM, Andi Shyti wrote:

From: Chris Wilson 

Based on a test case developed by Lionel Landwerlin, this exercises
creation of partial mmaps both directly, with a partial mmap()
(where the mmap() only covers a portion of the object), and by using
munmap() to achieve the same.

Signed-off-by: Chris Wilson 
Signed-off-by: Andi Shyti 
---
  tests/intel/gem_mmap_offset.c | 84 +++
  1 file changed, 84 insertions(+)

diff --git a/tests/intel/gem_mmap_offset.c b/tests/intel/gem_mmap_offset.c
index 95d2158ca88f..0ba2f9591f85 100644
--- a/tests/intel/gem_mmap_offset.c
+++ b/tests/intel/gem_mmap_offset.c
@@ -56,6 +56,8 @@
   * SUBTEST: isolation
   * SUBTEST: oob-read
   * SUBTEST: open-flood
+ * SUBTEST: partial-mmap
+ * SUBTEST: partial-unmap
   * SUBTEST: perf
   * SUBTEST: pf-nonblock
   * SUBTEST: ptrace
@@ -874,6 +876,83 @@ static void blt_coherency(int i915)
igt_assert_f(compare_ok, "Problem with coherency, flush is too late\n");
  }
  
+static void partial_mmap(int i915)

+{
+   uint32_t handle;
+
+   handle = gem_create(i915, SZ_2M);
+
+   for_each_mmap_offset_type(i915, t) {
+   struct drm_i915_gem_mmap_offset arg = {
+   .handle = handle,
+   .flags = t->type,
+   };
+   uint32_t *ptr;
+
+   if (mmap_offset_ioctl(i915, &arg))
+   continue;
+
+   ptr = mmap(0, SZ_4K, PROT_WRITE, MAP_SHARED, i915, arg.offset);
+   if (ptr == MAP_FAILED)
+   continue;
+
+   memset(ptr, 0xcc, SZ_4K);
+   munmap(ptr, SZ_4K);
+
+   ptr = mmap(0, SZ_4K, PROT_READ, MAP_SHARED, i915, arg.offset + 
SZ_2M - SZ_4K);
+   igt_assert(ptr != MAP_FAILED);
+
+   for (uint32_t i = 0; i < SZ_4K / sizeof(uint32_t); i++)
+   igt_assert_eq_u32(ptr[i], 0);
+
+   munmap(ptr, SZ_4K);
+   }
+
+   gem_close(i915, handle);
+}
+
+static void partial_unmap(int i915)
+{
+   uint32_t handle;
+
+   handle = gem_create(i915, SZ_2M);
+
+   for_each_mmap_offset_type(i915, t) {
+   uint8_t *ptr_a, *ptr_b;
+
+   /* mmap the same GEM BO twice */
+   ptr_a = __mmap_offset(i915, handle, 0, SZ_2M,
+   PROT_READ | PROT_WRITE,
+   t->type);
+   if (!ptr_a)
+   continue;
+
+   ptr_b = __mmap_offset(i915, handle, 0, SZ_2M,
+   PROT_READ | PROT_WRITE,
+   t->type);
+   if (!ptr_b)
+   continue;
+
+   /* unmap the first mapping but the last 4k */
+   munmap(ptr_a, SZ_2M - SZ_4K);
+
+   /* memset that remaining 4k with 0xcc */
+   memset(ptr_a + SZ_2M - SZ_4K, 0xcc, SZ_4K);
+
+   /* memset the first page of the 2Mb with 0xdd */
+   memset(ptr_b, 0xdd, SZ_4K);
+
+   for (uint32_t i = 0; i < SZ_4K; i++)
+   igt_assert_eq_u32(ptr_a[SZ_2M - SZ_4K + i], 0xcc);
+
+   munmap(ptr_a + SZ_2M - SZ_4K, SZ_4K);
+   memset(ptr_b, 0, SZ_2M);


Do we need this extra memset()? Otherwise

Reviewed-by: Nirmoy Das 



+   munmap(ptr_b, SZ_2M);
+   }
+
+   gem_close(i915, handle);
+}
+
  static int mmap_gtt_version(int i915)
  {
int gtt_version = -1;
@@ -931,6 +1010,11 @@ igt_main
igt_subtest_f("open-flood")
open_flood(i915, 20);
  
+	igt_subtest_f("partial-mmap")

+   partial_mmap(i915);
+   igt_subtest_f("partial-unmap")
+   partial_unmap(i915);
+
igt_subtest_with_dynamic("clear") {
for_each_memory_region(r, i915) {
igt_dynamic_f("%s", r->name)


Re: [PATCH v3 20/21] drm/i915/display: perform transient flush

2024-04-15 Thread Nirmoy Das

Hi Matt,

On 4/15/2024 7:07 PM, Matt Roper wrote:

On Mon, Apr 15, 2024 at 01:44:22PM +0530, Balasubramani Vivekanandan wrote:

From: Matthew Auld 

Perform manual transient cache flush prior to flip and at the end of
frontbuffer_flush. This is needed to ensure display engine doesn't see
garbage if the surface is L3:XD dirty.

Testcase: igt@xe-pat@display-vs-wb-transient

Has the IGT patch for this been sent yet?


Yes, the test seems to be available at
https://gitlab.freedesktop.org/drm/igt-gpu-tools/-/blob/master/tests/intel/xe_pat.c#L728




Regards,

Nirmoy


   If not, we should probably
make sure that happens soon, and then use the CI Test-with: thing if
there winds up being another revision of this series so that this will
be included in the CI results.

Anyway, the changes here look good to me,

Reviewed-by: Matt Roper 


Signed-off-by: Matthew Auld 
Signed-off-by: Balasubramani Vivekanandan 
Acked-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/display/intel_display.c  |  3 +++
  .../gpu/drm/i915/display/intel_frontbuffer.c  |  2 ++
  drivers/gpu/drm/i915/display/intel_tdf.h  | 25 +++
  drivers/gpu/drm/xe/Makefile   |  3 ++-
  drivers/gpu/drm/xe/display/xe_tdf.c   | 13 ++
  5 files changed, 45 insertions(+), 1 deletion(-)
  create mode 100644 drivers/gpu/drm/i915/display/intel_tdf.h
  create mode 100644 drivers/gpu/drm/xe/display/xe_tdf.c

diff --git a/drivers/gpu/drm/i915/display/intel_display.c 
b/drivers/gpu/drm/i915/display/intel_display.c
index 67697d9a559c..4fc46edcb4ad 100644
--- a/drivers/gpu/drm/i915/display/intel_display.c
+++ b/drivers/gpu/drm/i915/display/intel_display.c
@@ -110,6 +110,7 @@
  #include "intel_sdvo.h"
  #include "intel_snps_phy.h"
  #include "intel_tc.h"
+#include "intel_tdf.h"
  #include "intel_tv.h"
  #include "intel_vblank.h"
  #include "intel_vdsc.h"
@@ -7242,6 +7243,8 @@ static void intel_atomic_commit_tail(struct 
intel_atomic_state *state)
  
  	intel_atomic_commit_fence_wait(state);
  
+	intel_td_flush(dev_priv);

+
drm_atomic_helper_wait_for_dependencies(&state->base);
drm_dp_mst_atomic_wait_for_dependencies(&state->base);
intel_atomic_global_state_wait_for_dependencies(state);
diff --git a/drivers/gpu/drm/i915/display/intel_frontbuffer.c 
b/drivers/gpu/drm/i915/display/intel_frontbuffer.c
index 2ea37c0414a9..4923c340a0b6 100644
--- a/drivers/gpu/drm/i915/display/intel_frontbuffer.c
+++ b/drivers/gpu/drm/i915/display/intel_frontbuffer.c
@@ -65,6 +65,7 @@
  #include "intel_fbc.h"
  #include "intel_frontbuffer.h"
  #include "intel_psr.h"
+#include "intel_tdf.h"
  
  /**

   * frontbuffer_flush - flush frontbuffer
@@ -93,6 +94,7 @@ static void frontbuffer_flush(struct drm_i915_private *i915,
trace_intel_frontbuffer_flush(i915, frontbuffer_bits, origin);
  
  	might_sleep();

+   intel_td_flush(i915);
intel_drrs_flush(i915, frontbuffer_bits);
intel_psr_flush(i915, frontbuffer_bits, origin);
intel_fbc_flush(i915, frontbuffer_bits, origin);
diff --git a/drivers/gpu/drm/i915/display/intel_tdf.h 
b/drivers/gpu/drm/i915/display/intel_tdf.h
new file mode 100644
index ..353cde21f6c2
--- /dev/null
+++ b/drivers/gpu/drm/i915/display/intel_tdf.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#ifndef __INTEL_TDF_H__
+#define __INTEL_TDF_H__
+
+/*
+ * TDF (Transient-Data-Flush) is needed for Xe2+ where special L3:XD caching 
can
+ * be enabled through various PAT index modes. Idea is to use this caching mode
+ * when for example rendering onto the display surface, with the promise that
+ * KMD will ensure transient cache entries are always flushed by the time we do
+ * the display flip, since display engine is never coherent with CPU/GPU 
caches.
+ */
+
+struct drm_i915_private;
+
+#ifdef I915
+static inline void intel_td_flush(struct drm_i915_private *i915) {}
+#else
+void intel_td_flush(struct drm_i915_private *i915);
+#endif
+
+#endif
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index 6015c9e41f24..97a8674cdd76 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -198,7 +198,8 @@ xe-$(CONFIG_DRM_XE_DISPLAY) += \
display/xe_dsb_buffer.o \
display/xe_fb_pin.o \
display/xe_hdcp_gsc.o \
-   display/xe_plane_initial.o
+   display/xe_plane_initial.o \
+   display/xe_tdf.o
  
  # SOC code shared with i915

  xe-$(CONFIG_DRM_XE_DISPLAY) += \
diff --git a/drivers/gpu/drm/xe/display/xe_tdf.c 
b/drivers/gpu/drm/xe/display/xe_tdf.c
new file mode 100644
index ..2c0d4e144e09
--- /dev/null
+++ b/drivers/gpu/drm/xe/display/xe_tdf.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#include "xe_device.h"
+#include "intel_display_types.h"
+#include "intel_tdf.h"
+
+void intel_td_flush(struct drm_i915_private *i915)
+{
+   xe_device_td_flush(i915);
+}
--
2.25.1



[RFC PATCH] drm/i915: Don't reset GuC before engine reset on full GT reset

2024-04-15 Thread Nirmoy Das
Currently intel_gt_reset() happens as follows:

reset_prepare() ---> Sends GDRST to GuC, GuC is in GS_MIA_IN_RESET
do_reset()
__intel_gt_reset()
*_engine_reset_prepare() -->RESET_CTL expects running
GuC
*_reset_engines()
intel_gt_init_hw() --> GuC FW loading happens, GuC comes out of
GS_MIA_IN_RESET.

Fix the above flow so that the GuC reset happens after all the
engine resets are done.

Cc: John Harrison 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gt/intel_reset.c |  9 --
 drivers/gpu/drm/i915/gt/uc/intel_uc.c | 42 +--
 drivers/gpu/drm/i915/gt/uc/intel_uc.h |  1 +
 3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index c8e9aa41fdea..9ebd68ce0c22 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -879,8 +879,11 @@ static intel_engine_mask_t reset_prepare(struct intel_gt 
*gt)
intel_engine_mask_t awake = 0;
enum intel_engine_id id;
 
-   /* For GuC mode, ensure submission is disabled before stopping ring */
-   intel_uc_reset_prepare(&gt->uc);
+   /*
+* For GuC mode, ensure submission is disabled before stopping ring.
+* Don't reset the GuC a engine reset requires GuC to be running.
+*/
+   intel_uc_reset_prepare_without_guc_reset(&gt->uc);
 
for_each_engine(engine, gt, id) {
if (intel_engine_pm_get_if_awake(engine))
@@ -1227,6 +1230,8 @@ void intel_gt_reset(struct intel_gt *gt,
 
intel_overlay_reset(gt->i915);
 
+   /* Now that all engines are clean, Reset the GuC */
+   intel_uc_reset_prepare(&gt->uc);
/*
 * Next we need to restore the context, but we don't use those
 * yet either...
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 7a63abf8f644..5feee4db2ccc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -345,7 +345,7 @@ static void __uc_fini(struct intel_uc *uc)
intel_guc_fini(&uc->guc);
 }
 
-static int __uc_sanitize(struct intel_uc *uc)
+static void __uc_sanitize_without_guc_reset(struct intel_uc *uc)
 {
struct intel_guc *guc = &uc->guc;
struct intel_huc *huc = &uc->huc;
@@ -354,7 +354,11 @@ static int __uc_sanitize(struct intel_uc *uc)
 
intel_huc_sanitize(huc);
intel_guc_sanitize(guc);
+}
 
+static int __uc_sanitize(struct intel_uc *uc)
+{
+   __uc_sanitize_without_guc_reset(uc);
return __intel_uc_reset_hw(uc);
 }
 
@@ -593,13 +597,7 @@ static void __uc_fini_hw(struct intel_uc *uc)
__uc_sanitize(uc);
 }
 
-/**
- * intel_uc_reset_prepare - Prepare for reset
- * @uc: the intel_uc structure
- *
- * Preparing for full gpu reset.
- */
-void intel_uc_reset_prepare(struct intel_uc *uc)
+static void __intel_uc_reset_prepare(struct intel_uc *uc, bool reset_guc)
 {
struct intel_guc *guc = &uc->guc;
 
@@ -617,9 +615,35 @@ void intel_uc_reset_prepare(struct intel_uc *uc)
intel_guc_submission_reset_prepare(guc);
 
 sanitize:
-   __uc_sanitize(uc);
+   if (reset_guc)
+   __uc_sanitize(uc);
+   else
+   __uc_sanitize_without_guc_reset(uc);
 }
 
+/**
+ * intel_uc_reset_prepare - Prepare for reset
+ * @uc: the intel_uc structure
+ *
+ * Preparing for full gpu reset.
+ */
+void intel_uc_reset_prepare(struct intel_uc *uc)
+{
+   __intel_uc_reset_prepare(uc, true);
+}
+/**
+ * intel_uc_reset_prepare_without_guc_reset - Prepare for reset but don't reset
+ * the GuC
+ * @uc: the intel_uc structure
+ *
+ * Preparing for full gpu reset.
+ */
+void intel_uc_reset_prepare_without_guc_reset(struct intel_uc *uc)
+{
+   __intel_uc_reset_prepare(uc, false);
+}
+
+
 void intel_uc_reset(struct intel_uc *uc, intel_engine_mask_t stalled)
 {
struct intel_guc *guc = &uc->guc;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_uc.h
index 014bb7d83689..9d6191ece498 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.h
@@ -46,6 +46,7 @@ void intel_uc_driver_late_release(struct intel_uc *uc);
 void intel_uc_driver_remove(struct intel_uc *uc);
 void intel_uc_init_mmio(struct intel_uc *uc);
 void intel_uc_reset_prepare(struct intel_uc *uc);
+void intel_uc_reset_prepare_without_guc_reset(struct intel_uc *uc);
 void intel_uc_reset(struct intel_uc *uc, intel_engine_mask_t stalled);
 void intel_uc_reset_finish(struct intel_uc *uc);
 void intel_uc_cancel_requests(struct intel_uc *uc);
-- 
2.42.0



Re: [PATCH v2 2/2] drm/i915/gem: Calculate object page offset for partial memory mapping

2024-04-11 Thread Nirmoy Das

Hi Andi,

On 3/29/2024 5:39 PM, Andi Shyti wrote:

To enable partial memory mapping of GPU virtual memory, it's
necessary to introduce an offset to the object's memory
(obj->mm.pages) scatterlist. This adjustment compensates for
instances when userspace mappings do not start from the beginning
of the object.


I quickly tried 
https://gitlab.freedesktop.org/llandwerlin/igt-gpu-tools/-/tree/wip/gem_mmap_offset-partial-unmap?ref_type=heads 
that didn't work for GTT.


Please make sure a proper IGT test is available for this, as this looks 
like a very risky change.



Regards,

Nirmoy



Based on a patch by Chris Wilson.

Signed-off-by: Andi Shyti 
Cc: Chris Wilson 
Cc: Lionel Landwerlin 
---
  drivers/gpu/drm/i915/gem/i915_gem_mman.c | 10 +++---
  drivers/gpu/drm/i915/i915_mm.c   | 12 +++-
  drivers/gpu/drm/i915/i915_mm.h   |  3 ++-
  3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index ce10dd259812..9bd2b4c2e501 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -252,6 +252,7 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf)
struct vm_area_struct *area = vmf->vma;
struct i915_mmap_offset *mmo = area->vm_private_data;
struct drm_i915_gem_object *obj = mmo->obj;
+   unsigned long obj_offset;
resource_size_t iomap;
int err;
  
@@ -273,10 +274,11 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf)

iomap -= obj->mm.region->region.start;
}
  
+	obj_offset = area->vm_pgoff - drm_vma_node_start(&mmo->vma_node);

/* PTEs are revoked in obj->ops->put_pages() */
err = remap_io_sg(area,
  area->vm_start, area->vm_end - area->vm_start,
- obj->mm.pages->sgl, iomap);
+ obj->mm.pages->sgl, obj_offset, iomap);
  
  	if (area->vm_flags & VM_WRITE) {

GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
@@ -302,14 +304,16 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
bool write = area->vm_flags & VM_WRITE;
struct i915_gem_ww_ctx ww;
+   unsigned long obj_offset;
intel_wakeref_t wakeref;
struct i915_vma *vma;
pgoff_t page_offset;
int srcu;
int ret;
  
-	/* We don't use vmf->pgoff since that has the fake offset */

+   obj_offset = area->vm_pgoff - drm_vma_node_start(&mmo->vma_node);
page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
+   page_offset += obj_offset;
  
  	trace_i915_gem_object_fault(obj, page_offset, true, write);
  
@@ -404,7 +408,7 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
  
  	/* Finally, remap it using the new GTT offset */

ret = remap_io_mapping(area,
-  area->vm_start + (vma->gtt_view.partial.offset 
<< PAGE_SHIFT),
+  area->vm_start + ((vma->gtt_view.partial.offset - 
obj_offset) << PAGE_SHIFT),
   (ggtt->gmadr.start + i915_ggtt_offset(vma)) >> 
PAGE_SHIFT,
   min_t(u64, vma->size, area->vm_end - 
area->vm_start),
   &ggtt->iomap);
diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c
index 7998bc74ab49..f5c97a620962 100644
--- a/drivers/gpu/drm/i915/i915_mm.c
+++ b/drivers/gpu/drm/i915/i915_mm.c
@@ -122,13 +122,15 @@ int remap_io_mapping(struct vm_area_struct *vma,
   * @addr: target user address to start at
   * @size: size of map area
   * @sgl: Start sg entry
+ * @offset: offset from the start of the page
   * @iobase: Use stored dma address offset by this address or pfn if -1
   *
   *  Note: this is only safe if the mm semaphore is held when called.
   */
  int remap_io_sg(struct vm_area_struct *vma,
unsigned long addr, unsigned long size,
-   struct scatterlist *sgl, resource_size_t iobase)
+   struct scatterlist *sgl, unsigned long offset,
+   resource_size_t iobase)
  {
struct remap_pfn r = {
.mm = vma->vm_mm,
@@ -141,6 +143,14 @@ int remap_io_sg(struct vm_area_struct *vma,
/* We rely on prevalidation of the io-mapping to skip track_pfn(). */
GEM_BUG_ON((vma->vm_flags & EXPECTED_FLAGS) != EXPECTED_FLAGS);
  
+	while (offset >= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT) {

+   offset -= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT;
+   r.sgt = __sgt_iter(__sg_next(r.sgt.sgp), use_dma(iobase));
+   if (!r.sgt.sgp)
+   return -EINVAL;
+   }
+   r.sgt.curr = offset << PAGE_SHIFT;
+
 

Re: [PATCH v2 23/25] drm/xe/device: implement transient flush

2024-04-03 Thread Nirmoy Das

Hi Bala,

On 4/3/2024 1:22 PM, Balasubramani Vivekanandan wrote:

From: Nirmoy Das 

Display surfaces can be tagged as transient by mapping it using one of
the various L3:XD PAT index modes on Xe2. The expectation is that KMD
needs to request transient data flush at the start of flip sequence to
ensure all transient data in L3 cache is flushed to memory. Add a
routine for this which we can then call from the display code.

Signed-off-by: Nirmoy Das 
Co-developed-by: Matthew Auld 
Signed-off-by: Matthew Auld 
Signed-off-by: Balasubramani Vivekanandan 
---
  drivers/gpu/drm/xe/regs/xe_gt_regs.h |  3 ++
  drivers/gpu/drm/xe/xe_device.c   | 52 
  drivers/gpu/drm/xe/xe_device.h   |  2 ++
  3 files changed, 57 insertions(+)

diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h 
b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index 6617c86a096b..7afe810b3441 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -306,6 +306,9 @@
  
  #define XE2LPM_L3SQCREG5			XE_REG_MCR(0xb658)
  
+#define XE2_TDF_CTRL   XE_REG(0xb418)

+#define   TRANSIENT_FLUSH_REQUEST  REG_BIT(0)
+
  #define XEHP_MERT_MOD_CTRL    XE_REG_MCR(0xcf28)
  #define RENDER_MOD_CTRL   XE_REG_MCR(0xcf2c)
  #define COMP_MOD_CTRL XE_REG_MCR(0xcf30)
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 01bd5ccf05ca..0c9769fe04f6 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -641,6 +641,58 @@ void xe_device_wmb(struct xe_device *xe)
xe_mmio_write32(gt, SOFTWARE_FLAGS_SPR33, 0);
  }
  
+/**

+ * xe_device_td_flush() - Flush transient L3 cache entries
+ * @xe: The device
+ *
+ * Display engine has direct access to memory and is never coherent with L3/L4
+ * caches (or CPU caches), however KMD is responsible for specifically flushing
+ * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
+ * can happen from such a surface without seeing corruption.
+ *
+ * Display surfaces can be tagged as transient by mapping it using one of the
+ * various L3:XD PAT index modes on Xe2.
+ *
+ * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is 
flushed
+ * at the end of each submission via PIPE_CONTROL for compute/render, since SA
+ * Media is not coherent with L3 and we want to support render-vs-media
+ * use cases. For other engines like copy/blt the HW internally forces uncached
+ * behaviour, hence why we can skip the TDF on such platforms.
+ */
+void xe_device_td_flush(struct xe_device *xe)
+{
+   struct xe_gt *gt;
+   int err;
+   u8 id;
+
+   if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
+   return;
+
+   for_each_gt(gt, xe, id) {
+   if (xe_gt_is_media_type(gt))
+   continue;
+
+   err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
+   if (err)
+   return;


This can be if (xe_force_wake_get()..) without needing the err variable. 
Sorry, this was my oversight  from this morning.



Regards,

Nirmoy


+
+   xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);
+   /*
+* FIXME: We can likely do better here with our choice of
+* timeout.  Currently we just assume the worst case, but really
+* we should make this dependent on how much actual L3 there is
+* for this system. Recommendation is to allow ~64us in the worst
+* case for 8M of L3 (assumes all entries are transient and need
+* to be flushed).
+*/
+   if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0,
+  150, NULL, false))
+   xe_gt_err_once(gt, "TD flush timeout\n");
+
+   xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
+   }
+}
+
  u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
  {
return xe_device_has_flat_ccs(xe) ?
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index d413bc2c6be5..d3430f4b820a 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -176,4 +176,6 @@ void xe_device_snapshot_print(struct xe_device *xe, struct 
drm_printer *p);
  u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address);
  u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);
  
+void xe_device_td_flush(struct xe_device *xe);

+
  #endif


Re: [PATCH v2 24/25] drm/i915/display: perform transient flush

2024-04-03 Thread Nirmoy Das

+Jouni

On 4/3/2024 1:22 PM, Balasubramani Vivekanandan wrote:

From: Matthew Auld 

Perform manual transient cache flush prior to flip and at the end of
frontbuffer_flush. This is needed to ensure display engine doesn't see
garbage if the surface is L3:XD dirty.

Testcase: igt@xe-pat@display-vs-wb-transient
Signed-off-by: Matthew Auld 
Signed-off-by: Balasubramani Vivekanandan 

Acked-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/display/intel_display.c  |  3 +++
  .../gpu/drm/i915/display/intel_frontbuffer.c  |  2 ++
  drivers/gpu/drm/i915/display/intel_tdf.h  | 25 +++
  drivers/gpu/drm/xe/Makefile   |  3 ++-
  drivers/gpu/drm/xe/display/xe_tdf.c   | 13 ++
  5 files changed, 45 insertions(+), 1 deletion(-)
  create mode 100644 drivers/gpu/drm/i915/display/intel_tdf.h
  create mode 100644 drivers/gpu/drm/xe/display/xe_tdf.c

diff --git a/drivers/gpu/drm/i915/display/intel_display.c 
b/drivers/gpu/drm/i915/display/intel_display.c
index aed25890b6f5..0a720e9d12a7 100644
--- a/drivers/gpu/drm/i915/display/intel_display.c
+++ b/drivers/gpu/drm/i915/display/intel_display.c
@@ -110,6 +110,7 @@
  #include "intel_sdvo.h"
  #include "intel_snps_phy.h"
  #include "intel_tc.h"
+#include "intel_tdf.h"
  #include "intel_tv.h"
  #include "intel_vblank.h"
  #include "intel_vdsc.h"
@@ -7095,6 +7096,8 @@ static void intel_atomic_commit_tail(struct 
intel_atomic_state *state)
  
  	intel_atomic_commit_fence_wait(state);
  
+	intel_td_flush(dev_priv);

+
drm_atomic_helper_wait_for_dependencies(&state->base);
drm_dp_mst_atomic_wait_for_dependencies(&state->base);
intel_atomic_global_state_wait_for_dependencies(state);
diff --git a/drivers/gpu/drm/i915/display/intel_frontbuffer.c 
b/drivers/gpu/drm/i915/display/intel_frontbuffer.c
index 2ea37c0414a9..4923c340a0b6 100644
--- a/drivers/gpu/drm/i915/display/intel_frontbuffer.c
+++ b/drivers/gpu/drm/i915/display/intel_frontbuffer.c
@@ -65,6 +65,7 @@
  #include "intel_fbc.h"
  #include "intel_frontbuffer.h"
  #include "intel_psr.h"
+#include "intel_tdf.h"
  
  /**

   * frontbuffer_flush - flush frontbuffer
@@ -93,6 +94,7 @@ static void frontbuffer_flush(struct drm_i915_private *i915,
trace_intel_frontbuffer_flush(i915, frontbuffer_bits, origin);
  
  	might_sleep();

+   intel_td_flush(i915);
intel_drrs_flush(i915, frontbuffer_bits);
intel_psr_flush(i915, frontbuffer_bits, origin);
intel_fbc_flush(i915, frontbuffer_bits, origin);
diff --git a/drivers/gpu/drm/i915/display/intel_tdf.h 
b/drivers/gpu/drm/i915/display/intel_tdf.h
new file mode 100644
index ..353cde21f6c2
--- /dev/null
+++ b/drivers/gpu/drm/i915/display/intel_tdf.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#ifndef __INTEL_TDF_H__
+#define __INTEL_TDF_H__
+
+/*
+ * TDF (Transient-Data-Flush) is needed for Xe2+ where special L3:XD caching 
can
+ * be enabled through various PAT index modes. Idea is to use this caching mode
+ * when for example rendering onto the display surface, with the promise that
+ * KMD will ensure transient cache entries are always flushed by the time we do
+ * the display flip, since display engine is never coherent with CPU/GPU 
caches.
+ */
+
+struct drm_i915_private;
+
+#ifdef I915
+static inline void intel_td_flush(struct drm_i915_private *i915) {}
+#else
+void intel_td_flush(struct drm_i915_private *i915);
+#endif
+
+#endif
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index e5b1715f721e..401a4492c625 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -196,7 +196,8 @@ xe-$(CONFIG_DRM_XE_DISPLAY) += \
display/xe_dsb_buffer.o \
display/xe_fb_pin.o \
display/xe_hdcp_gsc.o \
-   display/xe_plane_initial.o
+   display/xe_plane_initial.o \
+   display/xe_tdf.o
  
  # SOC code shared with i915

  xe-$(CONFIG_DRM_XE_DISPLAY) += \
diff --git a/drivers/gpu/drm/xe/display/xe_tdf.c 
b/drivers/gpu/drm/xe/display/xe_tdf.c
new file mode 100644
index ..2c0d4e144e09
--- /dev/null
+++ b/drivers/gpu/drm/xe/display/xe_tdf.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#include "xe_device.h"
+#include "intel_display_types.h"
+#include "intel_tdf.h"
+
+void intel_td_flush(struct drm_i915_private *i915)
+{
+   xe_device_td_flush(i915);
+}


Re: [PATCH v2 22/25] drm/xe/gt_print: add xe_gt_err_once()

2024-04-03 Thread Nirmoy Das



On 4/3/2024 1:22 PM, Balasubramani Vivekanandan wrote:

From: Matthew Auld 

Needed in an upcoming patch, where we want a GT-level print, but only
wish to trigger it once to avoid flooding dmesg.

Signed-off-by: Matthew Auld 
Signed-off-by: Balasubramani Vivekanandan 

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/xe/xe_gt_printk.h | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_gt_printk.h 
b/drivers/gpu/drm/xe/xe_gt_printk.h
index c2b004d3f48e..d6228baaff1e 100644
--- a/drivers/gpu/drm/xe/xe_gt_printk.h
+++ b/drivers/gpu/drm/xe/xe_gt_printk.h
@@ -13,6 +13,9 @@
  #define xe_gt_printk(_gt, _level, _fmt, ...) \
drm_##_level(&gt_to_xe(_gt)->drm, "GT%u: " _fmt, (_gt)->info.id, 
##__VA_ARGS__)
  
+#define xe_gt_err_once(_gt, _fmt, ...) \

+   xe_gt_printk((_gt), err_once, _fmt, ##__VA_ARGS__)
+
  #define xe_gt_err(_gt, _fmt, ...) \
xe_gt_printk((_gt), err, _fmt, ##__VA_ARGS__)
  


Re: [PATCH 23/25] drm/xe/device: implement transient flush

2024-04-03 Thread Nirmoy Das
There is a new fixup patch (PR#630) which modifies this patch. Could you 
please bring that in as well?



Regards,

Nirmoy

On 4/3/2024 12:51 PM, Balasubramani Vivekanandan wrote:

From: Nirmoy Das 

Display surfaces can be tagged as transient by mapping it using one of
the various L3:XD PAT index modes on Xe2. The expectation is that KMD
needs to request transient data flush at the start of flip sequence to
ensure all transient data in L3 cache is flushed to memory. Add a
routine for this which we can then call from the display code.

Signed-off-by: Nirmoy Das 
Co-developed-by: Matthew Auld 
Signed-off-by: Matthew Auld 
Signed-off-by: Balasubramani Vivekanandan 
---
  drivers/gpu/drm/xe/regs/xe_gt_regs.h |  3 ++
  drivers/gpu/drm/xe/xe_device.c   | 49 
  drivers/gpu/drm/xe/xe_device.h   |  2 ++
  3 files changed, 54 insertions(+)

diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h 
b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index d5b21f03beaa..9c6549830e24 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -305,6 +305,9 @@
  
  #define XE2LPM_L3SQCREG5			XE_REG_MCR(0xb658)
  
+#define XE2_TDF_CTRL   XE_REG(0xb418)

+#define   TRANSIENT_FLUSH_REQUEST  REG_BIT(0)
+
  #define XEHP_MERT_MOD_CTRL    XE_REG_MCR(0xcf28)
  #define RENDER_MOD_CTRL   XE_REG_MCR(0xcf2c)
  #define COMP_MOD_CTRL XE_REG_MCR(0xcf30)
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 01bd5ccf05ca..66182220e663 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -641,6 +641,55 @@ void xe_device_wmb(struct xe_device *xe)
xe_mmio_write32(gt, SOFTWARE_FLAGS_SPR33, 0);
  }
  
+/**

+ * xe_device_td_flush() - Flush transient L3 cache entries
+ * @xe: The device
+ *
+ * Display engine has direct access to memory and is never coherent with L3/L4
+ * caches (or CPU caches), however KMD is responsible for specifically flushing
+ * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
+ * can happen from such a surface without seeing corruption.
+ *
+ * Display surfaces can be tagged as transient by mapping it using one of the
+ * various L3:XD PAT index modes on Xe2.
+ *
+ * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is 
flushed
+ * at the end of each submission via PIPE_CONTROL for compute/render, since SA
+ * Media is not coherent with L3 and we want to support render-vs-media
+ * use cases. For other engines like copy/blt the HW internally forces uncached
+ * behaviour, hence why we can skip the TDF on such platforms.
+ */
+void xe_device_td_flush(struct xe_device *xe)
+{
+   struct xe_gt *gt;
+   u8 id;
+
+   if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
+   return;
+
+   for_each_gt(gt, xe, id) {
+   if (xe_gt_is_media_type(gt))
+   continue;
+
+   xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
+
+   xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);
+   /*
+* FIXME: We can likely do better here with our choice of
+* timeout.  Currently we just assume the worst case, but really
+* we should make this dependent on how much actual L3 there is
+* for this system. Recommendation is to allow ~64us in the worst
+* case for 8M of L3 (assumes all entries are transient and need
+* to be flushed).
+*/
+   if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0,
+  150, NULL, false))
+   xe_gt_err_once(gt, "TD flush timeout\n");
+
+   xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
+   }
+}
+
  u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
  {
return xe_device_has_flat_ccs(xe) ?
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index d413bc2c6be5..d3430f4b820a 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -176,4 +176,6 @@ void xe_device_snapshot_print(struct xe_device *xe, struct 
drm_printer *p);
  u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address);
  u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);
  
+void xe_device_td_flush(struct xe_device *xe);

+
  #endif


Re: [PATCH] drm/i915/guc: Fix the fix for reset lock confusion

2024-04-03 Thread Nirmoy Das



On 3/30/2024 12:53 AM, john.c.harri...@intel.com wrote:

From: John Harrison 

The previous fix for the circular lock splat about the busyness
worker wasn't quite complete. Even though the reset-in-progress flag
is cleared at the start of intel_uc_reset_finish, the entire function
is still inside the reset mutex lock. Not sure why the patch appeared
to fix the issue both locally and in CI. However, it is now back
again.

There is a further complication: the wedge code path within
intel_gt_reset() jumps around so much it results in nested
reset_prepare/_finish calls. That is, the call sequence is:
   intel_gt_reset
   | reset_prepare
   | __intel_gt_set_wedged
   | | reset_prepare
   | | reset_finish
   | reset_finish

The nested finish means that even if the clear of the in-progress flag
was moved to the end of _finish, it would still be clear for the
entire second call. Surprisingly, this does not seem to be causing any
other problems at present.

As an aside, a wedge on fini does not call the finish functions at
all. The reset_in_progress flag is left set (twice).

So instead of trying to cancel the worker anywhere at all in the reset
path, just add a cancel to intel_guc_submission_fini instead. Note
that it is not a problem if the worker is still active during a reset.
Either it will run before the reset path starts locking things and
will simply block the reset code for a tiny amount of time. Or it will
run after the locks have been acquired and will early exit due to the
try-lock.

Also, do not use the reset-in-progress flag to decide whether a
synchronous cancel is safe (from a lockdep perspective) or not.
Instead, use the actual reset mutex state (both the genuine one and
the custom rolled BACKOFF one).

Fixes: 0e00a8814eec ("drm/i915/guc: Avoid circular locking issue on busyness 
flush")
Signed-off-by: John Harrison 
Cc: Zhanjun Dong 
Cc: John Harrison 
Cc: Andi Shyti 
Cc: Daniel Vetter 
Cc: Daniel Vetter 
Cc: Rodrigo Vivi 
Cc: Nirmoy Das 
Cc: Tvrtko Ursulin 
Cc: Umesh Nerlige Ramappa 
Cc: Andrzej Hajda 
Cc: Matt Roper 
Cc: Jonathan Cavitt 
Cc: Prathap Kumar Valsan 
Cc: Alan Previn 
Cc: Madhumitha Tolakanahalli Pradeep 

Cc: Daniele Ceraolo Spurio 
Cc: Ashutosh Dixit 
Cc: Dnyaneshwar Bhadane 


Thanks for the details, looks good to me:

Reviewed-by: Nirmoy Das 


---
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 23 ---
  drivers/gpu/drm/i915/gt/uc/intel_uc.c |  4 
  2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 16640d6dd0589..00757d6333e88 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1403,14 +1403,17 @@ static void guc_cancel_busyness_worker(struct intel_guc 
*guc)
 * Trying to pass a 'need_sync' or 'in_reset' flag all the way down 
through
 * every possible call stack is unfeasible. It would be too intrusive 
to many
 * areas that really don't care about the GuC backend. However, there 
is the
-* 'reset_in_progress' flag available, so just use that.
+* I915_RESET_BACKOFF flag and the gt->reset.mutex can be tested for 
is_locked.
+* So just use those. Note that testing both is required due to the 
hideously
+* complex nature of the i915 driver's reset code paths.
 *
 * And note that in the case of a reset occurring during driver unload
-* (wedge_on_fini), skipping the cancel in _prepare (when the reset 
flag is set
-* is fine because there is another cancel in _finish (when the reset 
flag is
-* not).
+* (wedged_on_fini), skipping the cancel in reset_prepare/reset_fini 
(when the
+* reset flag/mutex are set) is fine because there is another explicit 
cancel in
+* intel_guc_submission_fini (when the reset flag/mutex are not).
 */
-   if (guc_to_gt(guc)->uc.reset_in_progress)
+   if (mutex_is_locked(&guc_to_gt(guc)->reset.mutex) ||
+   test_bit(I915_RESET_BACKOFF, &guc_to_gt(guc)->reset.flags))
cancel_delayed_work(&guc->timestamp.work);
else
cancel_delayed_work_sync(&guc->timestamp.work);
@@ -1424,8 +1427,6 @@ static void __reset_guc_busyness_stats(struct intel_guc 
*guc)
unsigned long flags;
ktime_t unused;
  
-	guc_cancel_busyness_worker(guc);

-
spin_lock_irqsave(&guc->timestamp.lock, flags);
  
  	guc_update_pm_timestamp(guc, &unused);

@@ -2004,13 +2005,6 @@ void intel_guc_submission_cancel_requests(struct 
intel_guc *guc)
  
  void intel_guc_submission_reset_finish(struct intel_guc *guc)

  {
-   /*
-* Ensure the busyness worker gets cancelled even on a fatal wedge.
-* Note that reset_prepare is

Re: [PATCH] drm/i915/gt: Limit the reserved VM space to only the platforms that need it

2024-03-28 Thread Nirmoy Das

Hi Andi,

On 3/27/2024 9:05 PM, Andi Shyti wrote:

Commit 9bb66c179f50 ("drm/i915: Reserve some kernel space per
vm") reduces the available VM space of one page in order to apply
Wa_16018031267 and Wa_16018063123.

This page was reserved indiscriminately in all platforms even when
not needed. Limit it to DG2 onwards.


I would use "Limit it to platforms that need the WAs", as those WAs are only 
needed till 12.71; otherwise:


Reviewed-by: Nirmoy Das 



Fixes: 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm")
Signed-off-by: Andi Shyti 
Cc: Andrzej Hajda 
Cc: Chris Wilson 
Cc: Jonathan Cavitt 
Cc: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 3 +++
  drivers/gpu/drm/i915/gt/intel_gt.c   | 6 ++
  drivers/gpu/drm/i915/gt/intel_gt.h   | 9 +
  3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c 
b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index 1bd0e041e15c..398d60a66410 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -961,6 +961,9 @@ static int gen8_init_rsvd(struct i915_address_space *vm)
struct i915_vma *vma;
int ret;
  
+	if (!intel_gt_needs_wa_16018031267(vm->gt))

+   return 0;
+
/* The memory will be used only by GPU. */
obj = i915_gem_object_create_lmem(i915, PAGE_SIZE,
  I915_BO_ALLOC_VOLATILE |
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index 2c6d31b8fc1a..580b5141ce1e 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -1024,6 +1024,12 @@ enum i915_map_type intel_gt_coherent_map_type(struct 
intel_gt *gt,
return I915_MAP_WC;
  }
  
+bool intel_gt_needs_wa_16018031267(struct intel_gt *gt)

+{
+   /* Wa_16018031267, Wa_16018063123 */
+   return IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 55), IP_VER(12, 71));
+}
+
  bool intel_gt_needs_wa_22016122933(struct intel_gt *gt)
  {
return MEDIA_VER_FULL(gt->i915) == IP_VER(13, 0) && gt->type == 
GT_MEDIA;
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h 
b/drivers/gpu/drm/i915/gt/intel_gt.h
index 6e7cab60834c..b5e114d284ad 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt.h
@@ -82,17 +82,18 @@ struct drm_printer;
  ##__VA_ARGS__);   \
  } while (0)
  
-#define NEEDS_FASTCOLOR_BLT_WABB(engine) ( \

-   IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 55), IP_VER(12, 71)) && \
-   engine->class == COPY_ENGINE_CLASS && engine->instance == 0)
-
  static inline bool gt_is_root(struct intel_gt *gt)
  {
return !gt->info.id;
  }
  
+bool intel_gt_needs_wa_16018031267(struct intel_gt *gt);

  bool intel_gt_needs_wa_22016122933(struct intel_gt *gt);
  
+#define NEEDS_FASTCOLOR_BLT_WABB(engine) ( \

+   intel_gt_needs_wa_16018031267(engine->gt) && \
+   engine->class == COPY_ENGINE_CLASS && engine->instance == 0)
+
  static inline struct intel_gt *uc_to_gt(struct intel_uc *uc)
  {
return container_of(uc, struct intel_gt, uc);


Re: [PATCH] drm/i915/gem: Calculate object page offset for partial memory mapping

2024-03-26 Thread Nirmoy Das

Hi Andi,

On 3/26/2024 12:12 PM, Andi Shyti wrote:

Hi Nirmoy,

...


diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index a2195e28b625..57a2dda2c3cc 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -276,7 +276,7 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf)
/* PTEs are revoked in obj->ops->put_pages() */
err = remap_io_sg(area,
  area->vm_start, area->vm_end - area->vm_start,
- obj->mm.pages->sgl, iomap);
+ obj->mm.pages->sgl, 0, iomap);

Why don't we need partial mmap for CPU but only for GTT ?

As far as I understood we don't. I have a version with the CPU
offset as well in trybot[*]

But without support for segmented buffer objects, I don't know
how much this has any effect.


You confused me more :) Why is a segmented buffer object needed for 
partial CPU mmap but not for GTT?


From a high level, GTT and CPU should both support partial mmap, unless I 
am missing something here.





Sounds like this also needs to be covered by an IGT test.

Yes, it does need some igt work, working on it.


Don't we need "Fixes" tag for this?

Why should we? I'm not fixing anything here,


If userspace expects partial mmap to work, then this is a bug/gap in 
i915, so we need to


backport this as far back as possible. Could you share some information about the 
requirement, i.e. why we suddenly need this patch?



Regards,

Nirmoy


  I'm just
recalculating the mapping not starting from the beginning of the
scatter page.

Andi

[*] https://patchwork.freedesktop.org/patch/584474/?series=131539&rev=2


Re: [PATCH] drm/i915/gem: Calculate object page offset for partial memory mapping

2024-03-25 Thread Nirmoy Das

Hi Andi,

I have too many questions :) I think the patch makes sense but I need more 
context, see below:


On 3/25/2024 2:40 PM, Andi Shyti wrote:

To enable partial memory mapping of GPU virtual memory, it's
necessary to introduce an offset to the object's memory
(obj->mm.pages) scatterlist. This adjustment compensates for
instances when userspace mappings do not start from the beginning
of the object.

Based on a patch by Chris Wilson.

Signed-off-by: Andi Shyti 
Cc: Chris Wilson 
Cc: Lionel Landwerlin 
---
  drivers/gpu/drm/i915/gem/i915_gem_mman.c |  8 +---
  drivers/gpu/drm/i915/i915_mm.c   | 12 +++-
  drivers/gpu/drm/i915/i915_mm.h   |  3 ++-
  3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index a2195e28b625..57a2dda2c3cc 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -276,7 +276,7 @@ static vm_fault_t vm_fault_cpu(struct vm_fault *vmf)
/* PTEs are revoked in obj->ops->put_pages() */
err = remap_io_sg(area,
  area->vm_start, area->vm_end - area->vm_start,
- obj->mm.pages->sgl, iomap);
+ obj->mm.pages->sgl, 0, iomap);


Why don't we need partial mmap for CPU but only for GTT ?

Sounds like this also needs to be covered by an IGT test.  Don't we need 
a "Fixes" tag for this?


Regards,

Nirmoy

  
  	if (area->vm_flags & VM_WRITE) {

GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
@@ -302,14 +302,16 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
struct i915_ggtt *ggtt = to_gt(i915)->ggtt;
bool write = area->vm_flags & VM_WRITE;
struct i915_gem_ww_ctx ww;
+   unsigned long obj_offset;
intel_wakeref_t wakeref;
struct i915_vma *vma;
pgoff_t page_offset;
int srcu;
int ret;
  
-	/* We don't use vmf->pgoff since that has the fake offset */

+   obj_offset = area->vm_pgoff - drm_vma_node_start(&mmo->vma_node);
page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
+   page_offset += obj_offset;
  
  	trace_i915_gem_object_fault(obj, page_offset, true, write);
  
@@ -404,7 +406,7 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
  
  	/* Finally, remap it using the new GTT offset */

ret = remap_io_mapping(area,
-  area->vm_start + (vma->gtt_view.partial.offset 
<< PAGE_SHIFT),
+  area->vm_start + ((vma->gtt_view.partial.offset - 
obj_offset) << PAGE_SHIFT),
   (ggtt->gmadr.start + i915_ggtt_offset(vma)) >> 
PAGE_SHIFT,
   min_t(u64, vma->size, area->vm_end - 
area->vm_start),
   &ggtt->iomap);
diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c
index 7998bc74ab49..f5c97a620962 100644
--- a/drivers/gpu/drm/i915/i915_mm.c
+++ b/drivers/gpu/drm/i915/i915_mm.c
@@ -122,13 +122,15 @@ int remap_io_mapping(struct vm_area_struct *vma,
   * @addr: target user address to start at
   * @size: size of map area
   * @sgl: Start sg entry
+ * @offset: offset from the start of the page
   * @iobase: Use stored dma address offset by this address or pfn if -1
   *
   *  Note: this is only safe if the mm semaphore is held when called.
   */
  int remap_io_sg(struct vm_area_struct *vma,
unsigned long addr, unsigned long size,
-   struct scatterlist *sgl, resource_size_t iobase)
+   struct scatterlist *sgl, unsigned long offset,
+   resource_size_t iobase)
  {
struct remap_pfn r = {
.mm = vma->vm_mm,
@@ -141,6 +143,14 @@ int remap_io_sg(struct vm_area_struct *vma,
/* We rely on prevalidation of the io-mapping to skip track_pfn(). */
GEM_BUG_ON((vma->vm_flags & EXPECTED_FLAGS) != EXPECTED_FLAGS);
  
+	while (offset >= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT) {

+   offset -= sg_dma_len(r.sgt.sgp) >> PAGE_SHIFT;
+   r.sgt = __sgt_iter(__sg_next(r.sgt.sgp), use_dma(iobase));
+   if (!r.sgt.sgp)
+   return -EINVAL;
+   }
+   r.sgt.curr = offset << PAGE_SHIFT;
+
if (!use_dma(iobase))
flush_cache_range(vma, addr, size);
  
diff --git a/drivers/gpu/drm/i915/i915_mm.h b/drivers/gpu/drm/i915/i915_mm.h

index 04c8974d822b..69f9351b1a1c 100644
--- a/drivers/gpu/drm/i915/i915_mm.h
+++ b/drivers/gpu/drm/i915/i915_mm.h
@@ -30,6 +30,7 @@ int remap_io_mapping(struct vm_area_struct *vma,
  
  int remap_io_sg(struct vm_area_struct *vma,

unsigned long addr, unsigned long size,
-   struct scatterlist *sgl, resource_size_t iobase);
+   struct scatterlist *sgl, unsigned long offset,
+   resource_size_t iobase);
  
  #endif /* __I915_MM_H__ */


Re: [PATCH v2] drm/i915/gt: Report full vm address range

2024-03-21 Thread Nirmoy Das

Hi Andi,

On 3/21/2024 4:17 PM, Andi Shyti wrote:

Commit 9bb66c179f50 ("drm/i915: Reserve some kernel space per
vm") has reserved an object for kernel space usage.

Userspace, though, needs to know the full address range.

In the former patch the reserved space was subtracted from the
total amount of the VM space. Add it back when the user requests
the GTT size through ioctl (I915_CONTEXT_PARAM_GTT_SIZE).

Fixes: 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm")
Signed-off-by: Andi Shyti 
Cc: Andrzej Hajda 
Cc: Chris Wilson 
Cc: Lionel Landwerlin 
Cc: Michal Mrozek 
Cc: Nirmoy Das 
Cc:  # v6.2+
Acked-by: Michal Mrozek 
Acked-by: Lionel Landwerlin 
---
Hi,

Just proposing a different implementation that doesn't affect
i915 internally but provides the same result. Instead of not
subtracting the space during the reservation, I add it back
during the ioctl call.

All the "vm->rsvd.vma->node.size" looks a bit ugly,


Yes, this needs to be documented and also vm->total should be vm->total and 
maybe we should have


vm->usable which will be used internally by the kernel, and return vm->total.

For me, I am fine with the kernel change as long as UMD is aware of/fine with 
the side-effect if


UMD ended up using the reserved page. Basically we need to document this 
well :)


Also, maybe we should limit reserving this page to only the platforms where 
it is required?



Regards,

Nirmoy


  but that's
how it is. Maybe a comment can help to understand better why
there is this addition.

I kept the Ack from Michal and Lionel, because the outcome from
userspace perspective doesn't really change.

Andi

  drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 81f65cab1330..60d9e7fe33b3 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -2454,7 +2454,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device 
*dev, void *data,
case I915_CONTEXT_PARAM_GTT_SIZE:
args->size = 0;
vm = i915_gem_context_get_eb_vm(ctx);
-   args->value = vm->total;
+   args->value = vm->total + vm->rsvd.vma->node.size;
i915_vm_put(vm);
  
  		break;


Re: [PATCH v2] drm/i915/gem: Execbuffer objects must have struct pages.

2024-03-14 Thread Nirmoy Das



On 3/12/2024 3:55 PM, Jonathan Cavitt wrote:

We cannot write requests to objects without struct pages, so escape
early if the requests are bound to objects that lack them.

Signed-off-by: Jonathan Cavitt 
---

v2: s/vma-obj/vma->obj

  drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 7 +++
  1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index d3a771afb083e..adb4f9e78cb49 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -3313,6 +3313,13 @@ eb_requests_create(struct i915_execbuffer *eb, struct 
dma_fence *in_fence,
unsigned int i;
  
  	for_each_batch_create_order(eb, i) {

+   /* Do not write requests to objects without struct pages. */
+   if (eb->batches[i]->vma &&
+   !i915_gem_object_has_struct_page(eb->batches[i]->vma->obj)) 
{


As far as I understand, the motivation of this patch is to avoid doing 
execbuf on dmabuf imported BOs which are in an error state or something. 
i915_gem_object_has_struct_page()  checks "obj->mem_flags & 
I915_BO_FLAG_STRUCT_PAGE" which is very i915 specific.


So I think this will not work and will cause regressions in existing 
programs which are trying to do the same with a valid BO. Unfortunately I 
don't have any idea how to better detect that at this moment.



Regards,

Nirmoy


+   out_fence = ERR_PTR(-EINVAL);
+   return out_fence;
+   }
+
/* Allocate a request for this batch buffer nice and early. */
eb->requests[i] = i915_request_create(eb_find_context(eb, i));
if (IS_ERR(eb->requests[i])) {


Re: [PATCH] drm/i915/gt: Report full vm address range

2024-03-14 Thread Nirmoy Das



On 3/14/2024 3:04 PM, Lionel Landwerlin wrote:

Hi Andi,

In Mesa we've been relying on I915_CONTEXT_PARAM_GTT_SIZE so as long 
as that is adjusted by the kernel


What do you mean by adjusted by, should it be an aligned size?

I915_CONTEXT_PARAM_GTT_SIZE ioctl is returning vm->total which is 
adjusted(reduced by a page).


This patch might cause silent errors as it is not removing the WABB, which is 
using the reserved page to add a dummy blt, and if userspace is using that


page then it will be overwritten.


Regards,

Nirmoy


, we should be able to continue working without issues.

Acked-by: Lionel Landwerlin 

Thanks,

-Lionel

On 13/03/2024 21:39, Andi Shyti wrote:

Commit 9bb66c179f50 ("drm/i915: Reserve some kernel space per
vm") has reserved an object for kernel space usage.

Userspace, though, needs to know the full address range.

Fixes: 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm")
Signed-off-by: Andi Shyti 
Cc: Andrzej Hajda 
Cc: Chris Wilson 
Cc: Lionel Landwerlin 
Cc: Michal Mrozek 
Cc: Nirmoy Das 
Cc:  # v6.2+
---
  drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c 
b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c

index fa46d2308b0e..d76831f50106 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -982,8 +982,9 @@ static int gen8_init_rsvd(struct 
i915_address_space *vm)

    vm->rsvd.vma = i915_vma_make_unshrinkable(vma);
  vm->rsvd.obj = obj;
-    vm->total -= vma->node.size;
+
  return 0;
+
  unref:
  i915_gem_object_put(obj);
  return ret;





Re: [PATCH] drm/i915/selftests: Pick correct caching mode.

2024-03-12 Thread Nirmoy Das



On 3/12/2024 3:28 PM, Andi Shyti wrote:

Hi Nirmoy,

On Tue, Mar 12, 2024 at 12:18:15PM +0100, Nirmoy Das wrote:

Caching mode is HW dependent so pick a correct one using
intel_gt_coherent_map_type().

Cc: Andi Shyti 
Cc: Janusz Krzysztofik 
Cc: Jonathan Cavitt 
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10249
Signed-off-by: Nirmoy Das 

I think it's a good choice not to have the Fixes tag here either.


Yes, a Fixes tag isn't needed for selftests




Reviewed-by: Andi Shyti 


Thanks,

Nirmoy



Thanks,
Andi


[PATCH] drm/i915/selftests: Pick correct caching mode.

2024-03-12 Thread Nirmoy Das
Caching mode is HW dependent so pick a correct one using
intel_gt_coherent_map_type().

Cc: Andi Shyti 
Cc: Janusz Krzysztofik 
Cc: Jonathan Cavitt 
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10249
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
index d684a70f2c04..65a931ea80e9 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c
@@ -7,6 +7,7 @@
 #include "i915_drv.h"
 #include "i915_selftest.h"
 #include "gem/i915_gem_context.h"
+#include "gt/intel_gt.h"
 
 #include "mock_context.h"
 #include "mock_dmabuf.h"
@@ -155,6 +156,7 @@ static int verify_access(struct drm_i915_private *i915,
struct file *file;
u32 *vaddr;
int err = 0, i;
+   unsigned int mode;
 
file = mock_file(i915);
if (IS_ERR(file))
@@ -194,7 +196,8 @@ static int verify_access(struct drm_i915_private *i915,
if (err)
goto out_file;
 
-   vaddr = i915_gem_object_pin_map_unlocked(native_obj, I915_MAP_WB);
+   mode = intel_gt_coherent_map_type(to_gt(i915), native_obj, true);
+   vaddr = i915_gem_object_pin_map_unlocked(native_obj, mode);
if (IS_ERR(vaddr)) {
err = PTR_ERR(vaddr);
goto out_file;
-- 
2.42.0



Re: [PATCH v7 2/3] drm/i915: Remove extra multi-gt pm-references

2024-03-05 Thread Nirmoy Das


On 3/5/2024 3:35 PM, Janusz Krzysztofik wrote:

There was an attempt to fix an issue of illegal attempts to free a still
active i915 VMA object when parking a GT believed to be idle, reported by
CI on 2-GT Meteor Lake.  As a solution, an extra wakeref for a Primary GT
was acquired from i915_gem_do_execbuffer() -- see commit f56fe3e91787
("drm/i915: Fix a VMA UAF for multi-gt platform").

However, that fix turned out to be insufficient -- the issue was still reported by
CI.  That wakeref was released on exit from i915_gem_do_execbuffer(), then
potentially before completion of the request and deactivation of its
associated VMAs.  Moreover, CI reports indicated that single-GT platforms
also suffered sporadically from the same race.

Since the issue has now been fixed by a preceding patch "drm/i915/vma: Fix
UAF on destroy against retire race", drop the no longer useful changes
introduced by that insufficient fix.

v3: Also drop the no longer used .wakeref_gt0 field from struct
 i915_execbuffer.
v2: Avoid the word "revert" in commit message (Rodrigo),
   - update commit description reusing relevant chunks dropped from the
 description of the proper fix (Rodrigo).

Signed-off-by: Janusz Krzysztofik
Cc: Nirmoy Das
Cc: Rodrigo Vivi


Reviewed-by: Nirmoy Das 


---
  drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 18 --
  1 file changed, 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index d3a771afb083e..3f20fe3811999 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -255,7 +255,6 @@ struct i915_execbuffer {
struct intel_context *context; /* logical state for the request */
struct i915_gem_context *gem_context; /** caller's context */
intel_wakeref_t wakeref;
-   intel_wakeref_t wakeref_gt0;
  
  	/** our requests to build */

struct i915_request *requests[MAX_ENGINE_INSTANCE + 1];
@@ -2686,7 +2685,6 @@ static int
  eb_select_engine(struct i915_execbuffer *eb)
  {
struct intel_context *ce, *child;
-   struct intel_gt *gt;
unsigned int idx;
int err;
  
@@ -2710,17 +2708,10 @@ eb_select_engine(struct i915_execbuffer *eb)

}
}
eb->num_batches = ce->parallel.number_children + 1;
-   gt = ce->engine->gt;
  
  	for_each_child(ce, child)

intel_context_get(child);
eb->wakeref = intel_gt_pm_get(ce->engine->gt);
-   /*
-* Keep GT0 active on MTL so that i915_vma_parked() doesn't
-* free VMAs while execbuf ioctl is validating VMAs.
-*/
-   if (gt->info.id)
-   eb->wakeref_gt0 = intel_gt_pm_get(to_gt(gt->i915));
  
  	if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) {

err = intel_context_alloc_state(ce);
@@ -2759,9 +2750,6 @@ eb_select_engine(struct i915_execbuffer *eb)
return err;
  
  err:

-   if (gt->info.id)
-   intel_gt_pm_put(to_gt(gt->i915), eb->wakeref_gt0);
-
intel_gt_pm_put(ce->engine->gt, eb->wakeref);
for_each_child(ce, child)
intel_context_put(child);
@@ -2775,12 +2763,6 @@ eb_put_engine(struct i915_execbuffer *eb)
struct intel_context *child;
  
  	i915_vm_put(eb->context->vm);

-   /*
-* This works in conjunction with eb_select_engine() to prevent
-* i915_vma_parked() from interfering while execbuf validates vmas.
-*/
-   if (eb->gt->info.id)
-   intel_gt_pm_put(to_gt(eb->gt->i915), eb->wakeref_gt0);
intel_gt_pm_put(eb->context->engine->gt, eb->wakeref);
for_each_child(eb->context, child)
intel_context_put(child);

Re: [PATCH v7 1/3] drm/i915/vma: Fix UAF on destroy against retire race

2024-03-05 Thread Nirmoy Das


On 3/5/2024 3:35 PM, Janusz Krzysztofik wrote:

Object debugging tools were sporadically reporting illegal attempts to
free a still active i915 VMA object when parking a GT believed to be idle.

[161.359441] ODEBUG: free active (active state 0) object: 88811643b958 
object type: i915_active hint: __i915_vma_active+0x0/0x50 [i915]
[161.360082] WARNING: CPU: 5 PID: 276 at lib/debugobjects.c:514 
debug_print_object+0x80/0xb0
...
[161.360304] CPU: 5 PID: 276 Comm: kworker/5:2 Not tainted 
6.5.0-rc1-CI_DRM_13375-g003f860e5577+ #1
[161.360314] Hardware name: Intel Corporation Rocket Lake Client 
Platform/RocketLake S UDIMM 6L RVP, BIOS RKLSFWI1.R00.3173.A03.2204210138 
04/21/2022
[161.360322] Workqueue: i915-unordered __intel_wakeref_put_work [i915]
[161.360592] RIP: 0010:debug_print_object+0x80/0xb0
...
[161.361347] debug_object_free+0xeb/0x110
[161.361362] i915_active_fini+0x14/0x130 [i915]
[161.361866] release_references+0xfe/0x1f0 [i915]
[161.362543] i915_vma_parked+0x1db/0x380 [i915]
[161.363129] __gt_park+0x121/0x230 [i915]
[161.363515] intel_wakeref_put_last+0x1f/0x70 [i915]

That has been tracked down to be happening when another thread is
deactivating the VMA inside __active_retire() helper, after the VMA's
active counter has been already decremented to 0, but before deactivation
of the VMA's object is reported to the object debugging tool.

We could prevent that race by serializing i915_active_fini() with
__active_retire() via ref->tree_lock, but that wouldn't stop the VMA from
being used, e.g. from __i915_vma_retire() called at the end of
__active_retire(), after that VMA has been already freed by a concurrent
i915_vma_destroy() on return from the i915_active_fini().  Then, we should
rather fix the issue at the VMA level, not in i915_active.

Since __i915_vma_parked() is called from __gt_park() on last put of the
GT's wakeref, the issue could be addressed by holding the GT wakeref long
enough for __active_retire() to complete before that wakeref is released
and the GT parked.

I believe the issue was introduced by commit d93939730347 ("drm/i915:
Remove the vma refcount") which moved a call to i915_active_fini() from
a dropped i915_vma_release(), called on last put of the removed VMA kref,
to i915_vma_parked() processing path called on last put of a GT wakeref.
However, its visibility to the object debugging tool was suppressed by a
bug in i915_active that was fixed two weeks later with commit e92eb246feb9
("drm/i915/active: Fix missing debug object activation").

A VMA associated with a request doesn't acquire a GT wakeref by itself.
Instead, it depends on a wakeref held directly by the request's active
intel_context for a GT associated with its VM, and indirectly on that
intel_context's engine wakeref if the engine belongs to the same GT as the
VMA's VM.  Those wakerefs are released asynchronously to VMA deactivation.

Fix the issue by getting a wakeref for the VMA's GT when activating it,
and putting that wakeref only after the VMA is deactivated.  However,
exclude global GTT from that processing path, otherwise the GPU never goes
idle.  Since __i915_vma_retire() may be called from atomic contexts, use
async variant of wakeref put.  Also, to avoid circular locking dependency,
take care of acquiring the wakeref before VM mutex when both are needed.

v7: Add inline comments with justifications for:
 - using untracked variants of intel_gt_pm_get/put() (Nirmoy),
 - using async variant of _put(),
 - not getting the wakeref in case of a global GTT,
 - always getting the first wakeref outside vm->mutex.
v6: Since __i915_vma_active/retire() callbacks are not serialized, storing
 a wakeref tracking handle inside struct i915_vma is not safe, and
 there is no other good place for that.  Use untracked variants of
 intel_gt_pm_get/put_async().
v5: Replace "tile" with "GT" across commit description (Rodrigo),
   - avoid mentioning multi-GT case in commit description (Rodrigo),
   - explain why we need to take a temporary wakeref unconditionally inside
 i915_vma_pin_ww() (Rodrigo).
v4: Refresh on top of commit 5e4e06e4087e ("drm/i915: Track gt pm
 wakerefs") (Andi),
   - for more easy backporting, split out removal of former insufficient
 workarounds and move them to separate patches (Nirmoy).
   - clean up commit message and description a bit.
v3: Identify root cause more precisely, and a commit to blame,
   - identify and drop former workarounds,
   - update commit message and description.
v2: Get the wakeref before VM mutex to avoid circular locking dependency,
   - drop questionable Fixes: tag.

Fixes: d93939730347 ("drm/i915: Remove the vma refcount")
Closes:https://gitlab.freedesktop.org/drm/intel/issues/8875
Signed-off-by: Janusz Krzysztofik
Cc: Thomas Hellström
Cc: Nirmoy Das
Cc: Andi Shyti
Cc: Rodrigo Vivi
Cc:sta...@vger.kernel.org  # v5.19

Re: [PATCH v7 3/3] Revert "drm/i915: Wait for active retire before i915_active_fini()"

2024-03-05 Thread Nirmoy Das


On 3/5/2024 3:35 PM, Janusz Krzysztofik wrote:

This reverts commit 7a2280e8dcd2f1f436db9631287c0b21cf6a92b0, obsoleted
by "drm/i915/vma: Fix UAF on destroy against retire race".

Signed-off-by: Janusz Krzysztofik
Cc: Nirmoy Das


Reviewed-by: Nirmoy Das 


---
  drivers/gpu/drm/i915/i915_vma.c | 2 --
  1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index b70715b1411d6..d2f064d2525cc 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -1776,8 +1776,6 @@ static void release_references(struct i915_vma *vma, 
struct intel_gt *gt,
if (vm_ddestroy)
i915_vm_resv_put(vma->vm);
  
-	/* Wait for async active retire */

-   i915_active_wait(&vma->active);
i915_active_fini(&vma->active);
GEM_WARN_ON(vma->resource);
i915_vma_free(vma);

Re: [PATCH v6 1/3] drm/i915/vma: Fix UAF on destroy against retire race

2024-03-04 Thread Nirmoy Das



On 3/1/2024 8:29 AM, Janusz Krzysztofik wrote:

Object debugging tools were sporadically reporting illegal attempts to
free a still active i915 VMA object when parking a GT believed to be idle.

[161.359441] ODEBUG: free active (active state 0) object: 88811643b958 
object type: i915_active hint: __i915_vma_active+0x0/0x50 [i915]
[161.360082] WARNING: CPU: 5 PID: 276 at lib/debugobjects.c:514 
debug_print_object+0x80/0xb0
...
[161.360304] CPU: 5 PID: 276 Comm: kworker/5:2 Not tainted 
6.5.0-rc1-CI_DRM_13375-g003f860e5577+ #1
[161.360314] Hardware name: Intel Corporation Rocket Lake Client 
Platform/RocketLake S UDIMM 6L RVP, BIOS RKLSFWI1.R00.3173.A03.2204210138 
04/21/2022
[161.360322] Workqueue: i915-unordered __intel_wakeref_put_work [i915]
[161.360592] RIP: 0010:debug_print_object+0x80/0xb0
...
[161.361347] debug_object_free+0xeb/0x110
[161.361362] i915_active_fini+0x14/0x130 [i915]
[161.361866] release_references+0xfe/0x1f0 [i915]
[161.362543] i915_vma_parked+0x1db/0x380 [i915]
[161.363129] __gt_park+0x121/0x230 [i915]
[161.363515] intel_wakeref_put_last+0x1f/0x70 [i915]

That has been tracked down to be happening when another thread is
deactivating the VMA inside __active_retire() helper, after the VMA's
active counter has been already decremented to 0, but before deactivation
of the VMA's object is reported to the object debugging tool.

We could prevent that race by serializing i915_active_fini() with
__active_retire() via ref->tree_lock, but that wouldn't stop the VMA from
being used, e.g. from __i915_vma_retire() called at the end of
__active_retire(), after that VMA has been already freed by a concurrent
i915_vma_destroy() on return from the i915_active_fini().  Then, we should
rather fix the issue at the VMA level, not in i915_active.

Since __i915_vma_parked() is called from __gt_park() on last put of the
GT's wakeref, the issue could be addressed by holding the GT wakeref long
enough for __active_retire() to complete before that wakeref is released
and the GT parked.

I believe the issue was introduced by commit d93939730347 ("drm/i915:
Remove the vma refcount") which moved a call to i915_active_fini() from
a dropped i915_vma_release(), called on last put of the removed VMA kref,
to i915_vma_parked() processing path called on last put of a GT wakeref.
However, its visibility to the object debugging tool was suppressed by a
bug in i915_active that was fixed two weeks later with commit e92eb246feb9
("drm/i915/active: Fix missing debug object activation").

A VMA associated with a request doesn't acquire a GT wakeref by itself.
Instead, it depends on a wakeref held directly by the request's active
intel_context for a GT associated with its VM, and indirectly on that
intel_context's engine wakeref if the engine belongs to the same GT as the
VMA's VM.  Those wakerefs are released asynchronously to VMA deactivation.

Fix the issue by getting a wakeref for the VMA's GT when activating it,
and putting that wakeref only after the VMA is deactivated.  However,
exclude global GTT from that processing path, otherwise the GPU never goes
idle.  Since __i915_vma_retire() may be called from atomic contexts, use
async variant of wakeref put.  Also, to avoid circular locking dependency,
take care of acquiring the wakeref before VM mutex when both are needed.

v6: Since __i915_vma_active/retire() callbacks are not serialized, storing
 a wakeref tracking handle inside struct i915_vma is not safe, and
 there is no other good place for that.  Use untracked variants of
 intel_gt_pm_get/put_async().
v5: Replace "tile" with "GT" across commit description (Rodrigo),
   - avoid mentioning multi-GT case in commit description (Rodrigo),
   - explain why we need to take a temporary wakeref unconditionally inside
 i915_vma_pin_ww() (Rodrigo).
v4: Refresh on top of commit 5e4e06e4087e ("drm/i915: Track gt pm
 wakerefs") (Andi),
   - for more easy backporting, split out removal of former insufficient
 workarounds and move them to separate patches (Nirmoy).
   - clean up commit message and description a bit.
v3: Identify root cause more precisely, and a commit to blame,
   - identify and drop former workarounds,
   - update commit message and description.
v2: Get the wakeref before VM mutex to avoid circular locking dependency,
   - drop questionable Fixes: tag.

Fixes: d93939730347 ("drm/i915: Remove the vma refcount")
Closes: https://gitlab.freedesktop.org/drm/intel/issues/8875
Signed-off-by: Janusz Krzysztofik 
Cc: Thomas Hellström 
Cc: Nirmoy Das 
Cc: Andi Shyti 
Cc: Rodrigo Vivi 
Cc: sta...@vger.kernel.org # v5.19+
---
  drivers/gpu/drm/i915/i915_vma.c | 26 +++---
  1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index d09aad34ba37f..ffe81fe338f7e 100644
--- a/drivers/g

Re: [PATCH] drm/i915: Add missing doc for drm_i915_reset_stats

2024-03-01 Thread Nirmoy Das

Hi Andi,

On 2/29/2024 4:28 PM, Andi Shyti wrote:

Hi Nirmoy,

On Thu, Feb 29, 2024 at 02:29:18PM +0100, Nirmoy Das wrote:

Add missing doc for struct drm_i915_reset_stats.

Cc: Andi Shyti 
Signed-off-by: Nirmoy Das 

Reviewed-by: Andi Shyti 


Thanks, merged to din.

Nirmoy



Thanks,
Andi


[PATCH] drm/i915: Add missing doc for drm_i915_reset_stats

2024-02-29 Thread Nirmoy Das
Add missing doc for struct drm_i915_reset_stats.

Cc: Andi Shyti 
Signed-off-by: Nirmoy Das 
---
 include/uapi/drm/i915_drm.h | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 2ee338860b7e..1279a6b2bece 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -2623,19 +2623,29 @@ struct drm_i915_reg_read {
  *
  */
 
+/*
+ * struct drm_i915_reset_stats - Return global reset and other context stats
+ *
+ * Driver keeps few stats for each contexts and also global reset count.
+ * This struct can be used to query those stats.
+ */
 struct drm_i915_reset_stats {
+   /** @ctx_id: ID of the requested context */
__u32 ctx_id;
+
+   /** @flags: MBZ */
__u32 flags;
 
-   /* All resets since boot/module reload, for all contexts */
+   /** @reset_count: All resets since boot/module reload, for all contexts 
*/
__u32 reset_count;
 
-   /* Number of batches lost when active in GPU, for this context */
+   /** @batch_active: Number of batches lost when active in GPU, for this 
context */
__u32 batch_active;
 
-   /* Number of batches lost pending for execution, for this context */
+   /** @batch_pending: Number of batches lost pending for execution, for 
this context */
__u32 batch_pending;
 
+   /** @pad: MBZ */
__u32 pad;
 };
 
-- 
2.42.0



Re: [PATCH] drm/i915: check before removing mm notifier

2024-02-28 Thread Nirmoy Das



On 2/28/2024 2:24 PM, Tvrtko Ursulin wrote:


On 27/02/2024 09:26, Nirmoy Das wrote:

Hi Tvrtko,

On 2/27/2024 10:04 AM, Tvrtko Ursulin wrote:


On 21/02/2024 11:52, Nirmoy Das wrote:

Merged it to drm-intel-gt-next with s/check/Check


Shouldn't this have had:

Fixes: ed29c2691188 ("drm/i915: Fix userptr so we do not have to 
worry about obj->mm.lock, v7.")

Cc:  # v5.13+

?


Yes. Sorry, I missed that. Can we still add the tag ?


I've added them and force pushed the branch since commit was still at 
the top.


Thanks a lot, Tvrtko!




FYI + Jani, Joonas and Rodrigo

Regards,

Tvrtko




Thanks,

Nirmoy


Regards,

Tvrtko


On 2/19/2024 1:50 PM, Nirmoy Das wrote:

Error in mmu_interval_notifier_insert() can leave a NULL
notifier.mm pointer. Catch that and return early.

Cc: Andi Shyti 
Cc: Shawn Lee 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c

index 0e21ce9d3e5a..61abfb505766 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -349,6 +349,9 @@ i915_gem_userptr_release(struct 
drm_i915_gem_object *obj)

  {
  GEM_WARN_ON(obj->userptr.page_ref);
+    if (!obj->userptr.notifier.mm)
+    return;
+
mmu_interval_notifier_remove(&obj->userptr.notifier);
  obj->userptr.notifier.mm = NULL;
  }


Re: [PATCH] drm/i915: check before removing mm notifier

2024-02-27 Thread Nirmoy Das

Hi Tvrtko,

On 2/27/2024 10:04 AM, Tvrtko Ursulin wrote:


On 21/02/2024 11:52, Nirmoy Das wrote:

Merged it to drm-intel-gt-next with s/check/Check


Shouldn't this have had:

Fixes: ed29c2691188 ("drm/i915: Fix userptr so we do not have to worry 
about obj->mm.lock, v7.")

Cc:  # v5.13+

?


Yes. Sorry, I missed that. Can we still add the tag ?


Thanks,

Nirmoy


Regards,

Tvrtko


On 2/19/2024 1:50 PM, Nirmoy Das wrote:

Error in mmu_interval_notifier_insert() can leave a NULL
notifier.mm pointer. Catch that and return early.

Cc: Andi Shyti 
Cc: Shawn Lee 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c

index 0e21ce9d3e5a..61abfb505766 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -349,6 +349,9 @@ i915_gem_userptr_release(struct 
drm_i915_gem_object *obj)

  {
  GEM_WARN_ON(obj->userptr.page_ref);
+    if (!obj->userptr.notifier.mm)
+    return;
+
mmu_interval_notifier_remove(&obj->userptr.notifier);
  obj->userptr.notifier.mm = NULL;
  }


Re: [PATCH] drm/i915: check before removing mm notifier

2024-02-21 Thread Nirmoy Das

Merged it to drm-intel-gt-next with s/check/Check

On 2/19/2024 1:50 PM, Nirmoy Das wrote:

Error in mmu_interval_notifier_insert() can leave a NULL
notifier.mm pointer. Catch that and return early.

Cc: Andi Shyti 
Cc: Shawn Lee 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 0e21ce9d3e5a..61abfb505766 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -349,6 +349,9 @@ i915_gem_userptr_release(struct drm_i915_gem_object *obj)
  {
GEM_WARN_ON(obj->userptr.page_ref);
  
+	if (!obj->userptr.notifier.mm)

+   return;
+
mmu_interval_notifier_remove(&obj->userptr.notifier);
obj->userptr.notifier.mm = NULL;
  }


Re: [PATCH] drm/i915: check before removing mm notifier

2024-02-20 Thread Nirmoy Das

Hi Rodrigo,

On 2/19/2024 9:12 PM, Rodrigo Vivi wrote:

On Mon, Feb 19, 2024 at 01:50:47PM +0100, Nirmoy Das wrote:

Error in mmu_interval_notifier_insert() can leave a NULL
notifier.mm pointer. Catch that and return early.

Cc: Andi Shyti
Cc: Shawn Lee
Signed-off-by: Nirmoy Das
---
  drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 0e21ce9d3e5a..61abfb505766 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -349,6 +349,9 @@ i915_gem_userptr_release(struct drm_i915_gem_object *obj)
  {
GEM_WARN_ON(obj->userptr.page_ref);
  
+	if (!obj->userptr.notifier.mm)

+   return;
+

hmmm... right, it looks that we need this protection. But...

I mean, feel free to use
Reviewed-by: Rodrigo Vivi

for this patch,

but I believe that if this mmu insert failed we might have other
deeper problems like when checking i915_gem_object_is_userptr() ?

No?!


We are returning an error if mmu insert fails while creating a userptr 
object  so the obj struct is only available to obj cleanup methods.


As far as I see, i915_gem_object_is_userptr() should not be called on such an obj 
struct.

Thanks,
Nirmoy


mmu_interval_notifier_remove(&obj->userptr.notifier);
obj->userptr.notifier.mm = NULL;
  }
--
2.42.0


[PATCH] drm/i915: check before removing mm notifier

2024-02-19 Thread Nirmoy Das
Error in mmu_interval_notifier_insert() can leave a NULL
notifier.mm pointer. Catch that and return early.

Cc: Andi Shyti 
Cc: Shawn Lee 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 0e21ce9d3e5a..61abfb505766 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -349,6 +349,9 @@ i915_gem_userptr_release(struct drm_i915_gem_object *obj)
 {
GEM_WARN_ON(obj->userptr.page_ref);
 
+   if (!obj->userptr.notifier.mm)
+   return;
+
mmu_interval_notifier_remove(&obj->userptr.notifier);
obj->userptr.notifier.mm = NULL;
 }
-- 
2.42.0



Re: [PATCH v5 0/3] drm/i915: Fix VMA UAF on destroy against deactivate race

2024-01-29 Thread Nirmoy Das

Hi Janusz,

There seems to be a regression in CI related to this:

https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_129026v2/bat-dg1-7/igt@gem_lmem_swapping@random-engi...@lmem0.html#dmesg-warnings1053

Please have a look.


Regards,

Nirmoy

On 1/24/2024 6:13 PM, Janusz Krzysztofik wrote:

Object debugging tools were sporadically reporting illegal attempts to
free a still active i915 VMA object when parking a GT believed to be idle.

[161.359441] ODEBUG: free active (active state 0) object: 88811643b958 
object type: i915_active hint: __i915_vma_active+0x0/0x50 [i915]
[161.360082] WARNING: CPU: 5 PID: 276 at lib/debugobjects.c:514 
debug_print_object+0x80/0xb0
...
[161.360304] CPU: 5 PID: 276 Comm: kworker/5:2 Not tainted 
6.5.0-rc1-CI_DRM_13375-g003f860e5577+ #1
[161.360314] Hardware name: Intel Corporation Rocket Lake Client 
Platform/RocketLake S UDIMM 6L RVP, BIOS RKLSFWI1.R00.3173.A03.2204210138 
04/21/2022
[161.360322] Workqueue: i915-unordered __intel_wakeref_put_work [i915]
[161.360592] RIP: 0010:debug_print_object+0x80/0xb0
...
[161.361347] debug_object_free+0xeb/0x110
[161.361362] i915_active_fini+0x14/0x130 [i915]
[161.361866] release_references+0xfe/0x1f0 [i915]
[161.362543] i915_vma_parked+0x1db/0x380 [i915]
[161.363129] __gt_park+0x121/0x230 [i915]
[161.363515] intel_wakeref_put_last+0x1f/0x70 [i915]

That has been tracked down to be happening when another thread is
deactivating the VMA inside __active_retire() helper, after the VMA's
active counter has been already decremented to 0, but before deactivation
of the VMA's object is reported to the object debugging tool.

We could prevent that race by serializing i915_active_fini() with
__active_retire() via ref->tree_lock, but that wouldn't stop the VMA from
being used, e.g. from __i915_vma_retire() called at the end of
__active_retire(), after that VMA has been already freed by a concurrent
i915_vma_destroy() on return from the i915_active_fini().  Then, we should
rather fix the issue at the VMA level, not in i915_active.

Since __i915_vma_parked() is called from __gt_park() on last put of the
GT's wakeref, the issue could be addressed by holding the GT wakeref long
enough for __active_retire() to complete before that wakeref is released
and the GT parked.

A VMA associated with a request doesn't acquire a GT wakeref by itself.
Instead, it depends on a wakeref held directly by the request's active
intel_context for a GT associated with its VM, and indirectly on that
intel_context's engine wakeref if the engine belongs to the same GT as the
VMA's VM.  Those wakerefs are released asynchronously to VMA deactivation.

In case of single-GT platforms, at least one of those wakerefs is usually
held long enough for the request's VMA to be deactivated on time, before
it is destroyed on last put of its VM GT wakeref.  However, on multi-GT
platforms, a request may use a VMA from a GT other than the one that hosts
the request's engine, then it is protected only with the intel_context's
VM GT wakeref.

There was an attempt to fix the issue on 2-GT Meteor Lake by acquiring an
extra wakeref for a Primary GT from i915_gem_do_execbuffer() -- see commit
f56fe3e91787 ("drm/i915: Fix a VMA UAF for multi-gt platform").  However,
that fix proved insufficient -- the issue was still reported by CI.
That wakeref was released on exit from i915_gem_do_execbuffer(), then
potentially before completion of the request and deactivation of its
associated VMAs.  Moreover, CI reports indicated that single-GT platforms
also suffered sporadically from the same race.

I believe the issue was introduced by commit d93939730347 ("drm/i915:
Remove the vma refcount") which moved a call to i915_active_fini() from
a dropped i915_vma_release(), called on last put of the removed VMA kref,
to i915_vma_parked() processing path called on last put of a GT wakeref.
However, its visibility to the object debugging tool was suppressed by a
bug in i915_active that was fixed two weeks later with commit e92eb246feb9
("drm/i915/active: Fix missing debug object activation").

Fix the issue by getting a wakeref for the VMA's GT when activating it,
and putting that wakeref only after the VMA is deactivated.  However,
exclude global GTT from that processing path, otherwise the GPU never goes
idle.  Since __i915_vma_retire() may be called from atomic contexts, use
async variant of wakeref put.  Also, to avoid circular locking dependency,
take care of acquiring the wakeref before VM mutex when both are needed.

Having that fixed, stop explicitly acquiring the extra GT0 wakeref from
inside i915_gem_do_execbuffer(), and also drop an extra call to
i915_active_wait(), introduced by commit 7a2280e8dcd2 ("drm/i915: Wait for
active retire before i915_active_fini()") as another insufficient fix for
this UAF race.

v5: Replace "tile" with "GT" across commit descrip

Re: [PATCH v3 05/16] drm/i915: Disable the "binder"

2024-01-19 Thread Nirmoy Das


On 1/19/2024 11:47 AM, Nirmoy Das wrote:



On 1/19/2024 12:12 AM, Ville Syrjälä wrote:

On Wed, Jan 17, 2024 at 06:46:24PM +0100, Nirmoy Das wrote:

On 1/17/2024 3:13 PM, Michał Winiarski wrote:

On Tue, Jan 16, 2024 at 09:56:25AM +0200, Ville Syrjala wrote:

From: Ville Syrjälä

Now that the GGTT PTE updates go straight to GSMBASE (bypassing
GTTMMADR) there should be no more risk of system hangs? So the
"binder" (ie. update the PTEs via MI_UPDATE_GTT) is no longer
necessary, disable it.

My main worries with MI_UPDATE_GTT are:
- only used on this one platform so very limited testing coverage
- async so more opportunities to screw things up
- what happens if the engine hangs while we're waiting for MI_UPDATE_GTT
to finish?
- requires working command submission, so even getting a working
display now depends on a lot more extra components working correctly

TODO: MI_UPDATE_GTT might be interesting as an optimization
though, so perhaps someone should look into always using it
(assuming the GPU is alive and well)?

v2: Keep using MI_UPDATE_GTT on VM guests

Cc: Paz Zcharya
Cc: Nirmoy Das
Signed-off-by: Ville Syrjälä
---
   drivers/gpu/drm/i915/gt/intel_gtt.c | 3 ++-
   1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index 86f73fe558ca..e83dabc56a14 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -24,7 +24,8 @@
   bool i915_ggtt_require_binder(struct drm_i915_private *i915)
   {
/* Wa_13010847436 & Wa_14019519902 */
-   return MEDIA_VER_FULL(i915) == IP_VER(13, 0);
+   return i915_run_as_guest() &&
+   MEDIA_VER_FULL(i915) == IP_VER(13, 0);

Note that i915_run_as_guest() is not the most reliable way to decide
whether to use MI_UPDATE_GTT or straight to GSMBASE, as it requires the
hypervisor to "opt-in" and set the X86_FEATURE_HYPERVISOR.
If it's not set - the driver will go into GSMBASE, which is not mapped
inside the guest.
Does the system firmware advertise whether GSMBASE is "open" or "closed"
to CPU access in any way?

Had a chat with David from IVE team, David suggested to read 0x138914 to
determine that.  "GOP needs to qualify the WA by reading GFX MMIO offset
0x138914 and verify the value there is 0x1." -> as per the HSD-22018444074

OK, so we can confirm the firmware is on board. I suppose no real harm
in doing so even though it would clearly be rather weird if someone
would ship some ancient firmware that doesn't handle this.

But that still won't help with the guest side handling because that
register will read the same in the guest.



We are back to the same question :/ How about
if (boot_cpu_has(X86_FEATURE_HYPERVISOR) && !i915_run_as_guest()


hmm, never mind that was stupid.



disable binder

Regards,

Nirmoy


Re: [PATCH v3 05/16] drm/i915: Disable the "binder"

2024-01-19 Thread Nirmoy Das


On 1/19/2024 12:12 AM, Ville Syrjälä wrote:

On Wed, Jan 17, 2024 at 06:46:24PM +0100, Nirmoy Das wrote:

On 1/17/2024 3:13 PM, Michał Winiarski wrote:

On Tue, Jan 16, 2024 at 09:56:25AM +0200, Ville Syrjala wrote:

From: Ville Syrjälä

Now that the GGTT PTE updates go straight to GSMBASE (bypassing
GTTMMADR) there should be no more risk of system hangs? So the
"binder" (ie. update the PTEs via MI_UPDATE_GTT) is no longer
necessary, disable it.

My main worries with MI_UPDATE_GTT are:
- only used on this one platform so very limited testing coverage
- async so more opportunities to screw things up
- what happens if the engine hangs while we're waiting for MI_UPDATE_GTT
to finish?
- requires working command submission, so even getting a working
display now depends on a lot more extra components working correctly

TODO: MI_UPDATE_GTT might be interesting as an optimization
though, so perhaps someone should look into always using it
(assuming the GPU is alive and well)?

v2: Keep using MI_UPDATE_GTT on VM guests

Cc: Paz Zcharya
Cc: Nirmoy Das
Signed-off-by: Ville Syrjälä
---
   drivers/gpu/drm/i915/gt/intel_gtt.c | 3 ++-
   1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index 86f73fe558ca..e83dabc56a14 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -24,7 +24,8 @@
   bool i915_ggtt_require_binder(struct drm_i915_private *i915)
   {
/* Wa_13010847436 & Wa_14019519902 */
-   return MEDIA_VER_FULL(i915) == IP_VER(13, 0);
+   return i915_run_as_guest() &&
+   MEDIA_VER_FULL(i915) == IP_VER(13, 0);

Note that i915_run_as_guest() is not the most reliable way to decide
whether to use MI_UPDATE_GTT or straight to GSMBASE, as it requires the
hypervisor to "opt-in" and set the X86_FEATURE_HYPERVISOR.
If it's not set - the driver will go into GSMBASE, which is not mapped
inside the guest.
Does the system firmware advertise whether GSMBASE is "open" or "closed"
to CPU access in any way?

Had a chat with David from IVE team, David suggested to read 0x138914 to
determine that.  "GOP needs to qualify the WA by reading GFX MMIO offset
0x138914 and verify the value there is 0x1." -> as per the HSD-22018444074

OK, so we can confirm the firmware is on board. I suppose no real harm
in doing so even though it would clearly be rather weird if someone
would ship some ancient firmware that doesn't handle this.

But that still won't help with the guest side handling because that
register will read the same in the guest.



We are back to the same question :/ How about
if (boot_cpu_has(X86_FEATURE_HYPERVISOR) && !i915_run_as_guest()

disable binder

Regards,

Nirmoy



Re: [PATCH v3 05/16] drm/i915: Disable the "binder"

2024-01-17 Thread Nirmoy Das



On 1/17/2024 3:13 PM, Michał Winiarski wrote:

On Tue, Jan 16, 2024 at 09:56:25AM +0200, Ville Syrjala wrote:

From: Ville Syrjälä 

Now that the GGTT PTE updates go straight to GSMBASE (bypassing
GTTMMADR) there should be no more risk of system hangs? So the
"binder" (ie. update the PTEs via MI_UPDATE_GTT) is no longer
necessary, disable it.

My main worries with MI_UPDATE_GTT are:
- only used on this one platform so very limited testing coverage
- async so more opportunities to screw things up
- what happens if the engine hangs while we're waiting for MI_UPDATE_GTT
   to finish?
- requires working command submission, so even getting a working
   display now depends on a lot more extra components working correctly

TODO: MI_UPDATE_GTT might be interesting as an optimization
though, so perhaps someone should look into always using it
(assuming the GPU is alive and well)?

v2: Keep using MI_UPDATE_GTT on VM guests

Cc: Paz Zcharya 
Cc: Nirmoy Das 
Signed-off-by: Ville Syrjälä 
---
  drivers/gpu/drm/i915/gt/intel_gtt.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index 86f73fe558ca..e83dabc56a14 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -24,7 +24,8 @@
  bool i915_ggtt_require_binder(struct drm_i915_private *i915)
  {
/* Wa_13010847436 & Wa_14019519902 */
-   return MEDIA_VER_FULL(i915) == IP_VER(13, 0);
+   return i915_run_as_guest() &&
+   MEDIA_VER_FULL(i915) == IP_VER(13, 0);

Note that i915_run_as_guest() is not the most reliable way to decide
whether to use MI_UPDATE_GTT or straight to GSMBASE, as it requires the
hypervisor to "opt-in" and set the X86_FEATURE_HYPERVISOR.
If it's not set - the driver will go into GSMBASE, which is not mapped
inside the guest.
Does the system firmware advertise whether GSMBASE is "open" or "closed"
to CPU access in any way?


Had a chat with David from IVE team, David suggested to read 0x138914 to 
determine that.  "GOP needs to qualify the WA by reading GFX MMIO offset 
0x138914 and verify the value there is 0x1." -> as per the HSD-22018444074




Regards,

Nirmoy



-Michał


  }
  
  static bool intel_ggtt_update_needs_vtd_wa(struct drm_i915_private *i915)

--
2.41.0



Re: [PATCH v3 07/16] drm/i915: Fix PTE decode during initial plane readout

2024-01-16 Thread Nirmoy Das



On 1/16/2024 8:56 AM, Ville Syrjala wrote:

From: Ville Syrjälä 

When multiple pipes are enabled by the BIOS we try to read out each
in turn. But we do the readout for the second only after the inherited
vma for the first has been rebound into its original place (and thus
the PTEs have been rewritten). Unlike the BIOS we set some high caching
bits in the PTE on MTL which confuses the readout for the second plane.
Filter out the non-address bits from the PTE value appropriately to
fix this.

I suppose it might also be possible that the BIOS would already set
some caching bits as well, in which case we'd run into this same
issue already for the first plane.

TODO:
- should abstract the PTE decoding to avoid details leaking all over
- should probably do the readout for all the planes before
   we touch anything (including the PTEs) so that we truly read
   out the BIOS state

Cc: Paz Zcharya 
Reviewed-by: Andrzej Hajda 
Signed-off-by: Ville Syrjälä 

Acked-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/display/intel_plane_initial.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/display/intel_plane_initial.c 
b/drivers/gpu/drm/i915/display/intel_plane_initial.c
index a55c09cbd0e4..ffc92b18fcf5 100644
--- a/drivers/gpu/drm/i915/display/intel_plane_initial.c
+++ b/drivers/gpu/drm/i915/display/intel_plane_initial.c
@@ -72,7 +72,7 @@ initial_plane_vma(struct drm_i915_private *i915,
return NULL;
}
  
-		phys_base = pte & I915_GTT_PAGE_MASK;

+   phys_base = pte & GEN12_GGTT_PTE_ADDR_MASK;
mem = i915->mm.regions[INTEL_REGION_LMEM_0];
  
  		/*


Re: [PATCH v3 06/16] drm/i915: Rename the DSM/GSM registers

2024-01-16 Thread Nirmoy Das



On 1/16/2024 8:56 AM, Ville Syrjala wrote:

From: Ville Syrjälä 

0x108100 and 0x1080c0 have been around since snb. Rename the
defines appropriately.

Cc: Paz Zcharya 
Reviewed-by: Andrzej Hajda 
Signed-off-by: Ville Syrjälä 

Acked-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c  | 4 ++--
  drivers/gpu/drm/i915/gt/intel_ggtt.c| 2 +-
  drivers/gpu/drm/i915/gt/intel_region_lmem.c | 2 +-
  drivers/gpu/drm/i915/i915_reg.h | 7 ---
  4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c 
b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
index 0b429f1ecd99..ce6b860b393e 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
@@ -935,7 +935,7 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, 
u16 type,
GEM_BUG_ON((dsm_base + dsm_size) > lmem_size);
} else {
/* Use DSM base address instead for stolen memory */
-   dsm_base = intel_uncore_read64(uncore, GEN12_DSMBASE) & 
GEN12_BDSM_MASK;
+   dsm_base = intel_uncore_read64(uncore, GEN6_DSMBASE) & 
GEN11_BDSM_MASK;
if (WARN_ON(lmem_size < dsm_base))
return ERR_PTR(-ENODEV);
dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M);
@@ -951,7 +951,7 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, 
u16 type,
 * Normally this would not work but on MTL the system firmware
 * should have relaxed the access permissions sufficiently.
 */
-   io_start = intel_uncore_read64(uncore, GEN12_DSMBASE) & 
GEN12_BDSM_MASK;
+   io_start = intel_uncore_read64(uncore, GEN6_DSMBASE) & 
GEN11_BDSM_MASK;
io_size = dsm_size;
} else if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) {
io_start = 0;
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 7a716ff16070..b87933e7671d 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -1170,7 +1170,7 @@ static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 
size)
 * should have relaxed the access permissions sufficiently.
 */
if (IS_METEORLAKE(i915) && !i915_run_as_guest())
-   phys_addr = intel_uncore_read64(uncore, GEN12_GSMBASE) & 
GEN12_BDSM_MASK;
+   phys_addr = intel_uncore_read64(uncore, GEN6_GSMBASE) & 
GEN11_BDSM_MASK;
else
phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + 
gen6_gttadr_offset(i915);
  
diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c

index af357089da6e..51bb27e10a4f 100644
--- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c
+++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
@@ -240,7 +240,7 @@ static struct intel_memory_region *setup_lmem(struct 
intel_gt *gt)
lmem_size -= tile_stolen;
} else {
/* Stolen starts from GSMBASE without CCS */
-   lmem_size = intel_uncore_read64(&i915->uncore, GEN12_GSMBASE);
+   lmem_size = intel_uncore_read64(&i915->uncore, GEN6_GSMBASE);
}
  
  	i915_resize_lmem_bar(i915, lmem_size);

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 75bc08081fce..0d35173a7718 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -6320,9 +6320,10 @@ enum skl_power_gate {
  #define   GMS_MASKREG_GENMASK(15, 8)
  #define   GGMS_MASK   REG_GENMASK(7, 6)
  
-#define GEN12_GSMBASE			_MMIO(0x108100)

-#define GEN12_DSMBASE  _MMIO(0x1080C0)
-#define   GEN12_BDSM_MASK  REG_GENMASK64(63, 20)
+#define GEN6_GSMBASE   _MMIO(0x108100)
+#define GEN6_DSMBASE   _MMIO(0x1080C0)
+#define   GEN6_BDSM_MASK   REG_GENMASK64(31, 20)
+#define   GEN11_BDSM_MASK  REG_GENMASK64(63, 20)
  
  #define XEHP_CLOCK_GATE_DIS		_MMIO(0x101014)

  #define   SGSI_SIDECLK_DISREG_BIT(17)


Re: [PATCH v3 05/16] drm/i915: Disable the "binder"

2024-01-16 Thread Nirmoy Das



On 1/16/2024 8:56 AM, Ville Syrjala wrote:

From: Ville Syrjälä 

Now that the GGTT PTE updates go straight to GSMBASE (bypassing
GTTMMADR) there should be no more risk of system hangs? So the
"binder" (ie. update the PTEs via MI_UPDATE_GTT) is no longer
necessary, disable it.

My main worries with MI_UPDATE_GTT are:
- only used on this one platform so very limited testing coverage
- async so more opportunities to screw things up
- what happens if the engine hangs while we're waiting for MI_UPDATE_GTT
   to finish?
- requires working command submission, so even getting a working
   display now depends on a lot more extra components working correctly

TODO: MI_UPDATE_GTT might be interesting as an optimization
though, so perhaps someone should look into always using it
(assuming the GPU is alive and well)?

v2: Keep using MI_UPDATE_GTT on VM guests

Cc: Paz Zcharya 
Cc: Nirmoy Das 
Signed-off-by: Ville Syrjälä 


Reviewed-by: Nirmoy Das 


---
  drivers/gpu/drm/i915/gt/intel_gtt.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index 86f73fe558ca..e83dabc56a14 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -24,7 +24,8 @@
  bool i915_ggtt_require_binder(struct drm_i915_private *i915)
  {
/* Wa_13010847436 & Wa_14019519902 */
-   return MEDIA_VER_FULL(i915) == IP_VER(13, 0);
+   return i915_run_as_guest() &&
+   MEDIA_VER_FULL(i915) == IP_VER(13, 0);
  }
  
  static bool intel_ggtt_update_needs_vtd_wa(struct drm_i915_private *i915)


Re: [PATCH v3 04/16] drm/i915: Bypass LMEMBAR/GTTMMADR for MTL stolen memory access

2024-01-16 Thread Nirmoy Das


On 1/16/2024 8:56 AM, Ville Syrjala wrote:

From: Ville Syrjälä

On MTL accessing stolen memory via the BARs is somehow borked,
and it can hang the machine. As a workaround let's bypass the
BARs and just go straight to DSMBASE/GSMBASE instead.

Note that on every other platform this itself would hang the
machine, but on MTL the system firmware is expected to relax
the access permission guarding stolen memory to enable this
workaround, and thus direct CPU accesses should be fine.

The raw stolen memory areas won't be passed to VMs so we'll
need to risk using the BAR there for the initial setup. Once
command submission is up we should switch to MI_UPDATE_GTT
which at least shouldn't hang the whole machine.

v2: Don't use direct GSM/DSM access on guests
 Add w/a number

Cc: Paz Zcharya
Cc: Nirmoy Das
Cc: Joonas Lahtinen
Reviewed-by: Andrzej Hajda
Reviewed-by: Radhakrishna Sripada
Signed-off-by: Ville Syrjälä


I think i915_run_as_guest() should work.

Reviewed-by: Nirmoy Das 


---
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 14 +-
  drivers/gpu/drm/i915/gt/intel_ggtt.c   | 16 +++-
  2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c 
b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
index ee237043c302..0b429f1ecd99 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
@@ -941,7 +941,19 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, 
u16 type,
dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M);
}
  
-	if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) {

+   if (IS_METEORLAKE(i915) && !i915_run_as_guest()) {
+   /*
+* Wa_22018444074
+*
+* Access via BAR can hang MTL, go directly to DSM,
+* except for VM guests which won't have access to it.
+*
+* Normally this would not work but on MTL the system firmware
+* should have relaxed the access permissions sufficiently.
+*/
+   io_start = intel_uncore_read64(uncore, GEN12_DSMBASE) & 
GEN12_BDSM_MASK;
+   io_size = dsm_size;
+   } else if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) {
io_start = 0;
io_size = 0;
} else {
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 21a7e3191c18..7a716ff16070 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -24,6 +24,7 @@
  #include "intel_ring.h"
  #include "i915_drv.h"
  #include "i915_pci.h"
+#include "i915_reg.h"
  #include "i915_request.h"
  #include "i915_scatterlist.h"
  #include "i915_utils.h"
@@ -1152,13 +1153,26 @@ static unsigned int gen6_gttadr_offset(struct 
drm_i915_private *i915)
  static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size)
  {
struct drm_i915_private *i915 = ggtt->vm.i915;
+   struct intel_uncore *uncore = ggtt->vm.gt->uncore;
struct pci_dev *pdev = to_pci_dev(i915->drm.dev);
phys_addr_t phys_addr;
u32 pte_flags;
int ret;
  
  	GEM_WARN_ON(pci_resource_len(pdev, GEN4_GTTMMADR_BAR) != gen6_gttmmadr_size(i915));

-   phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + 
gen6_gttadr_offset(i915);
+   /*
+* Wa_22018444074
+*
+* Access via BAR can hang MTL, go directly to GSM,
+* except for VM guests which won't have access to it.
+*
+* Normally this would not work but on MTL the system firmware
+* should have relaxed the access permissions sufficiently.
+*/
+   if (IS_METEORLAKE(i915) && !i915_run_as_guest())
+   phys_addr = intel_uncore_read64(uncore, GEN12_GSMBASE) & 
GEN12_BDSM_MASK;
+   else
+   phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + 
gen6_gttadr_offset(i915);
  
  	if (needs_wc_ggtt_mapping(i915))

ggtt->gsm = ioremap_wc(phys_addr, size);

Re: [PATCH v3 03/16] drm/i915: Remove ad-hoc lmem/stolen debugs

2024-01-16 Thread Nirmoy Das



On 1/16/2024 8:56 AM, Ville Syrjala wrote:

From: Ville Syrjälä 

Now that intel_memory_regions_hw_probe() prints out each and every
memory region there's no reason to have ad-hoc debugs to do similar
things elsewhere.

Cc: Paz Zcharya 
Reviewed-by: Andrzej Hajda 
Signed-off-by: Ville Syrjälä 

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c  | 4 
  drivers/gpu/drm/i915/gt/intel_region_lmem.c | 3 ---
  2 files changed, 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c 
b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
index d2440c793f84..ee237043c302 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
@@ -828,7 +828,6 @@ static const struct intel_memory_region_ops 
i915_region_stolen_smem_ops = {
  
  static int init_stolen_lmem(struct intel_memory_region *mem)

  {
-   struct drm_i915_private *i915 = mem->i915;
int err;
  
  	if (GEM_WARN_ON(resource_size(&mem->region) == 0))

@@ -844,9 +843,6 @@ static int init_stolen_lmem(struct intel_memory_region *mem)
!io_mapping_init_wc(&mem->iomap, mem->io.start, 
resource_size(&mem->io)))
goto err_cleanup;
  
-	drm_dbg(&i915->drm, "Stolen Local DSM: %pR\n", &mem->region);

-   drm_dbg(&i915->drm, "Stolen Local memory IO: %pR\n", &mem->io);
-
return 0;
  
  err_cleanup:

diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c 
b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
index 6f96a6b70601..af357089da6e 100644
--- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c
+++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
@@ -273,9 +273,6 @@ static struct intel_memory_region *setup_lmem(struct 
intel_gt *gt)
if (err)
goto err_region_put;
  
-	drm_dbg(&i915->drm, "Local memory: %pR\n", &mem->region);

-   drm_dbg(&i915->drm, "Local memory IO: %pR\n", &mem->io);
-
if (io_size < lmem_size)
drm_info(&i915->drm, "Using a reduced BAR size of %lluMiB. Consider 
enabling 'Resizable BAR' or similar, if available in the BIOS.\n",
 (u64)io_size >> 20);


Re: [PATCH v3 01/16] drm/i915: Use struct resource for memory region IO as well

2024-01-16 Thread Nirmoy Das



On 1/16/2024 8:56 AM, Ville Syrjala wrote:

From: Ville Syrjälä 

mem->region is a struct resource, but mem->io_start and
mem->io_size are not for whatever reason. Let's unify this
and convert the io stuff into a struct resource as well.
Should make life a little less annoying when you don't have to
juggle between two different approaches all the time.

Mostly done using cocci (with manual tweaks at all the
places where we mutate io_size by hand):
@@
struct intel_memory_region *M;
expression START, SIZE;
@@
- M->io_start = START;
- M->io_size = SIZE;
+ M->io = DEFINE_RES_MEM(START, SIZE);

@@
struct intel_memory_region *M;
@@
- M->io_start
+ M->io.start

@@
struct intel_memory_region M;
@@
- M.io_start
+ M.io.start

@@
expression M;
@@
- M->io_size
+ resource_size(&M->io)

@@
expression M;
@@
- M.io_size
+ resource_size(&M.io)

Cc: Paz Zcharya 
Reviewed-by: Andrzej Hajda 
Signed-off-by: Ville Syrjälä 

Acked-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/display/intel_fbdev_fb.c  |  2 +-
  drivers/gpu/drm/i915/gem/i915_gem_region.c |  2 +-
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 17 +
  drivers/gpu/drm/i915/gem/i915_gem_ttm.c|  8 
  .../gpu/drm/i915/gem/selftests/i915_gem_mman.c | 18 +-
  drivers/gpu/drm/i915/gt/intel_region_lmem.c| 11 +++
  drivers/gpu/drm/i915/gt/selftest_tlb.c |  4 ++--
  drivers/gpu/drm/i915/i915_gpu_error.c  |  2 +-
  drivers/gpu/drm/i915/i915_query.c  |  2 +-
  drivers/gpu/drm/i915/intel_memory_region.c | 15 +++
  drivers/gpu/drm/i915/intel_memory_region.h |  3 +--
  drivers/gpu/drm/i915/intel_region_ttm.c|  8 
  .../drm/i915/selftests/intel_memory_region.c   |  4 ++--
  13 files changed, 45 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/i915/display/intel_fbdev_fb.c 
b/drivers/gpu/drm/i915/display/intel_fbdev_fb.c
index 717c3a3237c4..1ac05d90b2e8 100644
--- a/drivers/gpu/drm/i915/display/intel_fbdev_fb.c
+++ b/drivers/gpu/drm/i915/display/intel_fbdev_fb.c
@@ -78,7 +78,7 @@ int intel_fbdev_fb_fill_info(struct drm_i915_private *i915, 
struct fb_info *info
  
  		/* Use fbdev's framebuffer from lmem for discrete */

info->fix.smem_start =
-   (unsigned long)(mem->io_start +
+   (unsigned long)(mem->io.start +
i915_gem_object_get_dma_address(obj, 
0));
info->fix.smem_len = obj->base.size;
} else {
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_region.c 
b/drivers/gpu/drm/i915/gem/i915_gem_region.c
index a4fb577eceb4..b09b74a2448b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_region.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_region.c
@@ -129,7 +129,7 @@ i915_gem_object_create_region_at(struct intel_memory_region 
*mem,
return ERR_PTR(-EINVAL);
  
  	if (!(flags & I915_BO_ALLOC_GPU_ONLY) &&

-   offset + size > mem->io_size &&
+   offset + size > resource_size(&mem->io) &&
!i915_ggtt_has_aperture(to_gt(mem->i915)->ggtt))
return ERR_PTR(-ENOSPC);
  
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c

index 8c88075eeab2..d2440c793f84 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
@@ -541,7 +541,9 @@ static int i915_gem_init_stolen(struct intel_memory_region 
*mem)
  
  	/* Exclude the reserved region from driver use */

mem->region.end = i915->dsm.reserved.start - 1;
-   mem->io_size = min(mem->io_size, resource_size(&mem->region));
+   mem->io = DEFINE_RES_MEM(mem->io.start,
+min(resource_size(&mem->io),
+resource_size(&mem->region)));
  
  	i915->dsm.usable_size = resource_size(&mem->region);
  
@@ -752,7 +754,7 @@ static int _i915_gem_object_stolen_init(struct intel_memory_region *mem,

 * With discrete devices, where we lack a mappable aperture there is no
 * possible way to ever access this memory on the CPU side.
 */
-   if (mem->type == INTEL_MEMORY_STOLEN_LOCAL && !mem->io_size &&
+   if (mem->type == INTEL_MEMORY_STOLEN_LOCAL && !resource_size(&mem->io) 
&&
!(flags & I915_BO_ALLOC_GPU_ONLY))
return -ENOSPC;
  
@@ -838,13 +840,12 @@ static int init_stolen_lmem(struct intel_memory_region *mem)

return 0;
}
  
-	if (mem->io_size &&

-   !io_mapping_init_wc(&mem->iomap, mem->io_start, mem->io_size))
+   if (resource_size(&mem->io) &&
+   !io_mapping_init_wc(&mem->iomap, mem->io.start, 
resource_size(&mem-

Re: [PATCH v3 02/16] drm/i915: Print memory region info during probe

2024-01-16 Thread Nirmoy Das



On 1/16/2024 8:56 AM, Ville Syrjala wrote:

From: Ville Syrjälä 

Dump the details about every memory region into dmesg at probe time.
Avoids having to dig those out from random places when debugging stuff.

Cc: Paz Zcharya 
Reviewed-by: Andrzej Hajda 
Signed-off-by: Ville Syrjälä 

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/intel_memory_region.c | 18 ++
  1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_memory_region.c 
b/drivers/gpu/drm/i915/intel_memory_region.c
index b2708f8cac2a..52d998e5c21a 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -372,6 +372,24 @@ int intel_memory_regions_hw_probe(struct drm_i915_private 
*i915)
i915->mm.regions[i] = mem;
}
  
+	for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) {

+   struct intel_memory_region *mem = i915->mm.regions[i];
+   u64 region_size, io_size;
+
+   if (!mem)
+   continue;
+
+   region_size = resource_size(&mem->region) >> 20;
+   io_size = resource_size(&mem->io) >> 20;
+
+   if (resource_size(&mem->io))
+   drm_dbg(&i915->drm, "Memory region(%d): %s: %llu MiB %pR, 
io: %llu MiB %pR\n",
+   mem->id, mem->name, region_size, &mem->region, 
io_size, &mem->io);
+   else
+   drm_dbg(&i915->drm, "Memory region(%d): %s: %llu MiB %pR, 
io: n/a\n",
+   mem->id, mem->name, region_size, &mem->region);
+   }
+
return 0;
  
  out_cleanup:


Re: [PATCH v2 04/15] drm/i915: Bypass LMEMBAR/GTTMMADR for MTL stolen memory access

2024-01-12 Thread Nirmoy Das


On 1/12/2024 4:12 PM, Ville Syrjälä wrote:

On Wed, Jan 10, 2024 at 11:49:47AM +0100, Nirmoy Das wrote:

Hi Ville,

Apologies, but I lost track of this series after I returned from sick leave.


On 12/15/2023 11:59 AM, Ville Syrjala wrote:

From: Ville Syrjälä

On MTL accessing stolen memory via the BARs is somehow borked,
and it can hang the machine. As a workaround let's bypass the
BARs and just go straight to DSMBASE/GSMBASE instead.

Note that on every other platform this itself would hang the
machine, but on MTL the system firmware is expected to relax
the access permission guarding stolen memory to enable this
workaround, and thus direct CPU accesses should be fine.

TODO: add w/a numbers and whatnot

Cc: Paz Zcharya
Cc: Nirmoy Das
Cc: Radhakrishna Sripada
Cc: Joonas Lahtinen
Signed-off-by: Ville Syrjälä
---
   drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 11 ++-
   drivers/gpu/drm/i915/gt/intel_ggtt.c   | 13 -
   2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c 
b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
index ee237043c302..252fe5cd6ede 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
@@ -941,7 +941,16 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, 
u16 type,
dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M);
}
   
-	if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) {

+   if (IS_METEORLAKE(i915)) {
+   /*
+* Workaround: access via BAR can hang MTL, go directly to DSM.
+*
+* Normally this would not work but on MTL the system firmware
+* should have relaxed the access permissions sufficiently.
+*/
+   io_start = intel_uncore_read64(uncore, GEN12_DSMBASE) & 
GEN12_BDSM_MASK;
+   io_size = dsm_size;

This will work well on host driver but I am afraid this will not work on
VM when someone tries to do direct device assignment of the igfx.

GSMBASE/DSMBASE is reserved region so won't show up in VM, last I checked.

Hmm. So BARs get passed over but other regions won't be? I wonder if
there's a way to pass them explicitly...


Yes, when a user asks QEMU to pass through a PCI device, QEMU will
ensure that those BARs are mapped.




This is an obscure usage, but are we supposed to support that? If so, then
we need to detect that and fall back to the binder approach.

I suppose some people may attempt it. But I'm not sure how well that
will work in practice even on other platforms. I don't think we've
ever really considered that use case any kind of priority so bug
reports tend to go unanswered.

My main worry with the MI_UPDATE_GTT stuff is:
- only used on this one platform so very limited testing coverage
- async so more opportunities to screw things up
- what happens if the engine hangs while we're waiting for MI_UPDATE_GTT
   to finish?
- requires working command submission, so even getting a working
   display now depends on a lot more extra components working correctly

hence the patch to disable it. During testing my MTL was very unstable
so I wanted to eliminate all potential sources of new bugs.


Valid concerns, but unfortunately MI_UPDATE_GTT is the only generic
solution that came up in the discussions which supports the host, VM,
and SR-IOV cases.



Hmm. But we can't even use MI_UPDATE_GTT until command submission is
up and running, so we still need the direct CPU path for early ggtt
setup no?


It is very unlikely for the bug to appear when there is only a single user
of the GPU, so the HW team is fine with having a small window where we do
modify the GTT using stolen.


How about a modparam which defaults to your approach, plus documentation
saying to use the binder on a VM?


It would be nice if i915 could detect whether it is running in a virtualized
environment, but I don't have any ideas for that.



Regards,

Nirmoy



  So if we can't pass the stolen directly to the VM the only
option would be to use the BARs for that and risk hanging the machine.

Question: how would i915 detect if it is running in a VM environment?


Re: [PATCH v2 04/15] drm/i915: Bypass LMEMBAR/GTTMMADR for MTL stolen memory access

2024-01-10 Thread Nirmoy Das



On 1/10/2024 11:49 AM, Nirmoy Das wrote:

Hi Ville,

Apologies, but I lost track of this series after I returned from sick 
leave.


Please ignore the uncontextual "but" in the previous response. I need to 
disable auto correct options.



Regards,

Nirmoy





On 12/15/2023 11:59 AM, Ville Syrjala wrote:

From: Ville Syrjälä 

On MTL accessing stolen memory via the BARs is somehow borked,
and it can hang the machine. As a workaround let's bypass the
BARs and just go straight to DSMBASE/GSMBASE instead.

Note that on every other platform this itself would hang the
machine, but on MTL the system firmware is expected to relax
the access permission guarding stolen memory to enable this
workaround, and thus direct CPU accesses should be fine.

TODO: add w/a numbers and whatnot

Cc: Paz Zcharya 
Cc: Nirmoy Das 
Cc: Radhakrishna Sripada 
Cc: Joonas Lahtinen 
Signed-off-by: Ville Syrjälä 
---
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 11 ++-
  drivers/gpu/drm/i915/gt/intel_ggtt.c   | 13 -
  2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c 
b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c

index ee237043c302..252fe5cd6ede 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
@@ -941,7 +941,16 @@ i915_gem_stolen_lmem_setup(struct 
drm_i915_private *i915, u16 type,

  dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M);
  }
  -    if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) {
+    if (IS_METEORLAKE(i915)) {
+    /*
+ * Workaround: access via BAR can hang MTL, go directly to DSM.
+ *
+ * Normally this would not work but on MTL the system firmware
+ * should have relaxed the access permissions sufficiently.
+ */
+    io_start = intel_uncore_read64(uncore, GEN12_DSMBASE) & 
GEN12_BDSM_MASK;

+    io_size = dsm_size;


This will work well on host driver but I am afraid this will not work 
on VM when someone tries to do direct device assignment of the igfx.


GSMBASE/DSMBASE is reserved region so won't show up in VM, last I 
checked.


This is an obscure usage, but are we supposed to support that? If so,
then we need to detect that and fall back to the binder approach.



Regards,

Nirmoy


+    } else if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) {
  io_start = 0;
  io_size = 0;
  } else {
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c

index 21a7e3191c18..ab71d74ec426 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -24,6 +24,7 @@
  #include "intel_ring.h"
  #include "i915_drv.h"
  #include "i915_pci.h"
+#include "i915_reg.h"
  #include "i915_request.h"
  #include "i915_scatterlist.h"
  #include "i915_utils.h"
@@ -1152,13 +1153,23 @@ static unsigned int gen6_gttadr_offset(struct 
drm_i915_private *i915)

  static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size)
  {
  struct drm_i915_private *i915 = ggtt->vm.i915;
+    struct intel_uncore *uncore = ggtt->vm.gt->uncore;
  struct pci_dev *pdev = to_pci_dev(i915->drm.dev);
  phys_addr_t phys_addr;
  u32 pte_flags;
  int ret;
    GEM_WARN_ON(pci_resource_len(pdev, GEN4_GTTMMADR_BAR) != 
gen6_gttmmadr_size(i915));
-    phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + 
gen6_gttadr_offset(i915);

+    /*
+ * Workaround: access via BAR can hang MTL, go directly to GSM.
+ *
+ * Normally this would not work but on MTL the system firmware
+ * should have relaxed the access permissions sufficiently.
+ */
+    if (IS_METEORLAKE(i915))
+    phys_addr = intel_uncore_read64(uncore, GEN12_GSMBASE) & 
GEN12_BDSM_MASK;

+    else
+    phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + 
gen6_gttadr_offset(i915);

    if (needs_wc_ggtt_mapping(i915))
  ggtt->gsm = ioremap_wc(phys_addr, size);


Re: [PATCH v2 04/15] drm/i915: Bypass LMEMBAR/GTTMMADR for MTL stolen memory access

2024-01-10 Thread Nirmoy Das

Hi Ville,

Apologies, but I lost track of this series after I returned from sick leave.


On 12/15/2023 11:59 AM, Ville Syrjala wrote:

From: Ville Syrjälä 

On MTL accessing stolen memory via the BARs is somehow borked,
and it can hang the machine. As a workaround let's bypass the
BARs and just go straight to DSMBASE/GSMBASE instead.

Note that on every other platform this itself would hang the
machine, but on MTL the system firmware is expected to relax
the access permission guarding stolen memory to enable this
workaround, and thus direct CPU accesses should be fine.

TODO: add w/a numbers and whatnot

Cc: Paz Zcharya 
Cc: Nirmoy Das 
Cc: Radhakrishna Sripada 
Cc: Joonas Lahtinen 
Signed-off-by: Ville Syrjälä 
---
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 11 ++-
  drivers/gpu/drm/i915/gt/intel_ggtt.c   | 13 -
  2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c 
b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
index ee237043c302..252fe5cd6ede 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
@@ -941,7 +941,16 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, 
u16 type,
dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M);
}
  
-	if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) {

+   if (IS_METEORLAKE(i915)) {
+   /*
+* Workaround: access via BAR can hang MTL, go directly to DSM.
+*
+* Normally this would not work but on MTL the system firmware
+* should have relaxed the access permissions sufficiently.
+*/
+   io_start = intel_uncore_read64(uncore, GEN12_DSMBASE) & 
GEN12_BDSM_MASK;
+   io_size = dsm_size;


This will work well on host driver but I am afraid this will not work on 
VM when someone tries to do direct device assignment of the igfx.


GSMBASE/DSMBASE is reserved region so won't show up in VM, last I checked.

This is an obscure usage, but are we supposed to support that? If so, then
we need to detect that and fall back to the binder approach.



Regards,

Nirmoy


+   } else if (pci_resource_len(pdev, GEN12_LMEM_BAR) < lmem_size) {
io_start = 0;
io_size = 0;
} else {
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 21a7e3191c18..ab71d74ec426 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -24,6 +24,7 @@
  #include "intel_ring.h"
  #include "i915_drv.h"
  #include "i915_pci.h"
+#include "i915_reg.h"
  #include "i915_request.h"
  #include "i915_scatterlist.h"
  #include "i915_utils.h"
@@ -1152,13 +1153,23 @@ static unsigned int gen6_gttadr_offset(struct 
drm_i915_private *i915)
  static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size)
  {
struct drm_i915_private *i915 = ggtt->vm.i915;
+   struct intel_uncore *uncore = ggtt->vm.gt->uncore;
struct pci_dev *pdev = to_pci_dev(i915->drm.dev);
phys_addr_t phys_addr;
u32 pte_flags;
int ret;
  
  	GEM_WARN_ON(pci_resource_len(pdev, GEN4_GTTMMADR_BAR) != gen6_gttmmadr_size(i915));

-   phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + 
gen6_gttadr_offset(i915);
+   /*
+* Workaround: access via BAR can hang MTL, go directly to GSM.
+*
+* Normally this would not work but on MTL the system firmware
+* should have relaxed the access permissions sufficiently.
+*/
+   if (IS_METEORLAKE(i915))
+   phys_addr = intel_uncore_read64(uncore, GEN12_GSMBASE) & 
GEN12_BDSM_MASK;
+   else
+   phys_addr = pci_resource_start(pdev, GEN4_GTTMMADR_BAR) + 
gen6_gttadr_offset(i915);
  
  	if (needs_wc_ggtt_mapping(i915))

ggtt->gsm = ioremap_wc(phys_addr, size);


Re: [PATCH v3 4/4] drm/i915/guc: Use the ce_to_guc() wrapper whenever possible

2023-12-21 Thread Nirmoy Das



On 12/6/2023 9:46 PM, Andi Shyti wrote:

Get the guc reference from the ce using the ce_to_guc() helper.
Just a leftover from previous cleanups.

Signed-off-by: Andi Shyti 

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 4f51cc5f1604..3c7821ae9f0d 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -3513,7 +3513,7 @@ static inline void sub_context_inflight_prio(struct 
intel_context *ce,
  
  static inline void update_context_prio(struct intel_context *ce)

  {
-   struct intel_guc *guc = &ce->engine->gt->uc.guc;
+   struct intel_guc *guc = ce_to_guc(ce);
int i;
  
  	BUILD_BUG_ON(GUC_CLIENT_PRIORITY_KMD_HIGH != 0);


Re: [PATCH v3 3/4] drm/i915: Use the new gt_to_guc() wrapper

2023-12-21 Thread Nirmoy Das



On 12/6/2023 9:46 PM, Andi Shyti wrote:

Get the guc reference from the gt using the gt_to_guc() helper.

Signed-off-by: Andi Shyti 

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/i915_debugfs_params.c   | 2 +-
  drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c | 4 ++--
  2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs_params.c 
b/drivers/gpu/drm/i915/i915_debugfs_params.c
index 8bca02025e09..74b7f2fd8b57 100644
--- a/drivers/gpu/drm/i915/i915_debugfs_params.c
+++ b/drivers/gpu/drm/i915/i915_debugfs_params.c
@@ -43,7 +43,7 @@ static int notify_guc(struct drm_i915_private *i915)
  
  	for_each_gt(gt, i915, i) {

if (intel_uc_uses_guc_submission(&gt->uc))
-   ret = intel_guc_global_policies_update(&gt->uc.guc);
+   ret = intel_guc_global_policies_update(gt_to_guc(gt));
}
  
  	return ret;

diff --git a/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c 
b/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c
index 2990dd4d4a0d..d9d8f0336702 100644
--- a/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c
+++ b/drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c
@@ -65,7 +65,7 @@ int intel_selftest_modify_policy(struct intel_engine_cs 
*engine,
if (!intel_engine_uses_guc(engine))
return 0;
  
-	err = intel_guc_global_policies_update(&engine->gt->uc.guc);

+   err = intel_guc_global_policies_update(gt_to_guc(engine->gt));
if (err)
intel_selftest_restore_policy(engine, saved);
  
@@ -84,7 +84,7 @@ int intel_selftest_restore_policy(struct intel_engine_cs *engine,

if (!intel_engine_uses_guc(engine))
return 0;
  
-	return intel_guc_global_policies_update(&engine->gt->uc.guc);

+   return intel_guc_global_policies_update(gt_to_guc(engine->gt));
  }
  
  int intel_selftest_wait_for_rq(struct i915_request *rq)


Re: [PATCH v3 2/4] drm/i915/guc: Use the new gt_to_guc() wrapper

2023-12-21 Thread Nirmoy Das



On 12/6/2023 9:46 PM, Andi Shyti wrote:

Get the guc reference from the gt using the gt_to_guc() helper.

Signed-off-by: Andi Shyti 

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c |  4 +--
  drivers/gpu/drm/i915/gt/uc/intel_gsc_proxy.c  |  3 +-
  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c|  2 +-
  .../gpu/drm/i915/gt/uc/intel_guc_capture.c|  6 ++--
  .../gpu/drm/i915/gt/uc/intel_guc_hwconfig.c   |  2 +-
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 28 +--
  drivers/gpu/drm/i915/gt/uc/intel_huc.c|  4 +--
  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  |  4 +--
  drivers/gpu/drm/i915/gt/uc/selftest_guc.c |  2 +-
  9 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c 
b/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c
index e2e42b3e0d5d..3b69bc6616bd 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c
@@ -298,7 +298,7 @@ static int gsc_fw_load_prepare(struct intel_gsc_uc *gsc)
memcpy_toio(gsc->local_vaddr, src, gsc->fw.size);
memset_io(gsc->local_vaddr + gsc->fw.size, 0, gsc->local->size - 
gsc->fw.size);
  
-	intel_guc_write_barrier(&gt->uc.guc);

+   intel_guc_write_barrier(gt_to_guc(gt));
  
  	i915_gem_object_unpin_map(gsc->fw.obj);
  
@@ -351,7 +351,7 @@ static int gsc_fw_query_compatibility_version(struct intel_gsc_uc *gsc)

void *vaddr;
int err;
  
-	err = intel_guc_allocate_and_map_vma(&gt->uc.guc, GSC_VER_PKT_SZ * 2,

+   err = intel_guc_allocate_and_map_vma(gt_to_guc(gt), GSC_VER_PKT_SZ * 2,
 &vma, &vaddr);
if (err) {
gt_err(gt, "failed to allocate vma for GSC version query\n");
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_gsc_proxy.c 
b/drivers/gpu/drm/i915/gt/uc/intel_gsc_proxy.c
index 40817ebcca71..a7d5465655f9 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_gsc_proxy.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_gsc_proxy.c
@@ -358,7 +358,8 @@ static int proxy_channel_alloc(struct intel_gsc_uc *gsc)
void *vaddr;
int err;
  
-	err = intel_guc_allocate_and_map_vma(&gt->uc.guc, GSC_PROXY_CHANNEL_SIZE,

+   err = intel_guc_allocate_and_map_vma(gt_to_guc(gt),
+GSC_PROXY_CHANNEL_SIZE,
 &vma, &vaddr);
if (err)
return err;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
index 63724e17829a..1ef470e64604 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -956,7 +956,7 @@ u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
  
  struct iosys_map intel_guc_engine_usage_record_map(struct intel_engine_cs *engine)

  {
-   struct intel_guc *guc = &engine->gt->uc.guc;
+   struct intel_guc *guc = gt_to_guc(engine->gt);
u8 guc_class = engine_class_to_guc_class(engine->class);
size_t offset = offsetof(struct __guc_ads_blob,
 
engine_usage.engines[guc_class][ilog2(engine->logical_mask)]);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index a4da0208c883..84a8807391c5 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -1441,7 +1441,7 @@ int intel_guc_capture_print_engine_node(struct 
drm_i915_error_state_buf *ebuf,
if (!cap || !ee->engine)
return -ENODEV;
  
-	guc = &ee->engine->gt->uc.guc;

+   guc = gt_to_guc(ee->engine->gt);
  
  	i915_error_printf(ebuf, "global --- GuC Error Capture on %s command stream:\n",

  ee->engine->name);
@@ -1543,7 +1543,7 @@ bool intel_guc_capture_is_matching_engine(struct intel_gt 
*gt,
if (!gt || !ce || !engine)
return false;
  
-	guc = &gt->uc.guc;

+   guc = gt_to_guc(gt);
if (!guc->capture)
return false;
  
@@ -1573,7 +1573,7 @@ void intel_guc_capture_get_matching_node(struct intel_gt *gt,

if (!gt || !ee || !ce)
return;
  
-	guc = &gt->uc.guc;

+   guc = gt_to_guc(gt);
if (!guc->capture)
return;
  
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_hwconfig.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_hwconfig.c

index cc9569af7f0c..b67a15f74276 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_hwconfig.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_hwconfig.c
@@ -111,7 +111,7 @@ static bool has_table(struct drm_i915_private *i915)
  static int guc_hwconfig_init(struct intel_gt *gt)
  {
struct intel_hwconfig *hwconfig = &gt->info.hwconfig;
-   struct intel_guc *guc = &gt->uc.guc;
+  

Re: [PATCH v3 1/4] drm/i915/gt: Create the gt_to_guc() wrapper

2023-12-21 Thread Nirmoy Das



On 12/6/2023 9:46 PM, Andi Shyti wrote:

We already have guc_to_gt(), and getting to the guc from the GT
requires some mental effort. Add gt_to_guc().

Given a reference to the "gt", gt_to_guc() will return the
pointer to the "guc".

Update all the files under the gt/ directory.

Signed-off-by: Andi Shyti 

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/gt/intel_engine_cs.c | 4 ++--
  drivers/gpu/drm/i915/gt/intel_ggtt.c  | 9 +++--
  drivers/gpu/drm/i915/gt/intel_gt.h| 5 +
  drivers/gpu/drm/i915/gt/intel_gt_irq.c| 6 +++---
  drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c | 2 +-
  drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c   | 8 
  drivers/gpu/drm/i915/gt/intel_rc6.c   | 4 ++--
  drivers/gpu/drm/i915/gt/intel_rps.c   | 2 +-
  drivers/gpu/drm/i915/gt/intel_tlb.c   | 2 +-
  drivers/gpu/drm/i915/gt/selftest_slpc.c   | 6 +++---
  10 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 40687806d22a..bede7f09d4af 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -589,7 +589,7 @@ u64 intel_clamp_preempt_timeout_ms(struct intel_engine_cs 
*engine, u64 value)
 * NB: The GuC API only supports 32bit values. However, the limit is 
further
 * reduced due to internal calculations which would otherwise overflow.
 */
-   if (intel_guc_submission_is_wanted(&engine->gt->uc.guc))
+   if (intel_guc_submission_is_wanted(gt_to_guc(engine->gt)))
value = min_t(u64, value, guc_policy_max_preempt_timeout_ms());
  
  	value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT));

@@ -610,7 +610,7 @@ u64 intel_clamp_timeslice_duration_ms(struct 
intel_engine_cs *engine, u64 value)
 * NB: The GuC API only supports 32bit values. However, the limit is 
further
 * reduced due to internal calculations which would otherwise overflow.
 */
-   if (intel_guc_submission_is_wanted(&engine->gt->uc.guc))
+   if (intel_guc_submission_is_wanted(gt_to_guc(engine->gt)))
value = min_t(u64, value, guc_policy_max_exec_quantum_ms());
  
  	value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT));

diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 21a7e3191c18..aa1e9249d393 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -230,11 +230,8 @@ static void guc_ggtt_ct_invalidate(struct intel_gt *gt)
struct intel_uncore *uncore = gt->uncore;
intel_wakeref_t wakeref;
  
-	with_intel_runtime_pm_if_active(uncore->rpm, wakeref) {

-   struct intel_guc *guc = &gt->uc.guc;
-
-   intel_guc_invalidate_tlb_guc(guc);
-   }
+   with_intel_runtime_pm_if_active(uncore->rpm, wakeref)
+   intel_guc_invalidate_tlb_guc(gt_to_guc(gt));
  }
  
  static void guc_ggtt_invalidate(struct i915_ggtt *ggtt)

@@ -245,7 +242,7 @@ static void guc_ggtt_invalidate(struct i915_ggtt *ggtt)
gen8_ggtt_invalidate(ggtt);
  
  	list_for_each_entry(gt, &ggtt->gt_list, ggtt_link) {

-   if (intel_guc_tlb_invalidation_is_available(&gt->uc.guc))
+   if (intel_guc_tlb_invalidation_is_available(gt_to_guc(gt)))
guc_ggtt_ct_invalidate(gt);
else if (GRAPHICS_VER(i915) >= 12)
intel_uncore_write_fw(gt->uncore,
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h 
b/drivers/gpu/drm/i915/gt/intel_gt.h
index b0e453e27ea8..d7c859039828 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt.h
@@ -118,6 +118,11 @@ static inline struct intel_gt *gsc_to_gt(struct intel_gsc 
*gsc)
return container_of(gsc, struct intel_gt, gsc);
  }
  
+static inline struct intel_guc *gt_to_guc(struct intel_gt *gt)

+{
+   return &gt->uc.guc;
+}
+
  void intel_gt_common_init_early(struct intel_gt *gt);
  int intel_root_gt_init_early(struct drm_i915_private *i915);
  int intel_gt_assign_ggtt(struct intel_gt *gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c 
b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
index 77fb57223465..ad4c51f18d3a 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
@@ -68,9 +68,9 @@ gen11_other_irq_handler(struct intel_gt *gt, const u8 
instance,
struct intel_gt *media_gt = gt->i915->media_gt;
  
  	if (instance == OTHER_GUC_INSTANCE)

-   return guc_irq_handler(&gt->uc.guc, iir);
+   return guc_irq_handler(gt_to_guc(gt), iir);
if (instance == OTHER_MEDIA_GUC_INSTANCE && media_gt)
-   return guc_irq_handler(&media_gt->uc.guc, iir);
+   return guc_irq_handler(gt

Re: [Intel-gfx] [PATCH v2] drm/i915/gt: Convert reset prepare failure log to trace

2023-12-06 Thread Nirmoy Das

Hi John,

On 12/5/2023 8:50 PM, John Harrison wrote:

On 12/5/2023 02:39, Nirmoy Das wrote:

Hi John,

On 12/5/2023 10:10 AM, John Harrison wrote:

On 12/5/2023 00:52, Nirmoy Das wrote:

gen8_engine_reset_prepare() can fail when HW fails to set
RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal
error as driver will retry.

Convert the log to a trace log for debugging without triggering
unnecessary concerns in CI or for end-users during non-fatal 
scenarios.
I strongly disagree with this change. The hardware spec for the 
RESET_CTL and GDRST registers are that they will self clear within a 
matter of microseconds. If something is so badly wrong with the 
hardware that it can't even manage to reset



This message is for the reset readiness poll timeout, not for the reset
itself having failed, which doesn't sound so serious if the subsequent
attempt manages to reset the engine.
Not sure what the distinction is. The reset procedure is poke 
RESET_CTL wait for it to clear, poke GDRST and wait for it to clear. 
Just because step one is failing rather than step 2 does not mean that 
the reset as a whole has not failed.


Note that the purpose of RESET_CTL is to pause a bunch of stuff like 
the command streamers to prevent them from issuing new memory requests 
while the reset is in progress. If it fails, it likely means that a CS 
is refusing to stop. Most probably because it can't reach a stopping 
point because it is stuck waiting on a lost memory request or similar. 
And the point of stopping further memory requests during reset is that 
if the memory channel gets out of sync (because only the GT side is 
reset during a GT reset) then that can result in total system failure. 
As in potentially even the CPU can no longer get to memory if it is an 
integrated platform. So yes, it can be quite a serious failure indeed.




Thanks, the bspec didn't explain those details. My intention was to
acknowledge that engine reset is a complicated process, which is why the
driver retries, and to not spook CI/users if a subsequent reset works, but I
get your objection on this.




I couldn't get enough details on when it can happen that the HW takes a very
long time to set the readiness bit.
Is it simply 'taking a long time' or is never clearing at all? If it 
is just that the timeout is too short then the proper fix would be to 
increase the timeout. But if it is taking seconds or longer or just 
never succeeding at all, then something is very bad.


I tried a 10x timeout and it did not help, so I think the CS is stuck,
though a retry works. I will try to get more details from the HW team on this
issue.







then that is something that very much warrants more than a 
completely silent trace event. It most certainly should be flagged 
as a failure in CI.


Just because the driver will retry does not mean that this is not a 
serious error. And if the first attempt failed, why would a 
subsequent attempt succeed?


The patch is not ignoring the failure. If the subsequent attempt 
fails then driver load will fail or it will be wedged if that happens 
after driver load.
One thing I really hate about our driver is the total lack of 
information when something goes wrong during load. The driver wedges 
in total silence. There are many error paths that have no reporting at 
all. Which means you are left with a totally useless bug report.






Escalating to FLR may have more success, but that is not something 
that i915 currently does.


Do we still need to do FLR if a subsequent engine reset fails?
Assuming that we are talking about modern(ish) platforms, an engine 
reset failure would be hit by GuC rather than i915, but that would be 
escalated to an i915 based full GT reset. Generally speaking though, 
if the engine reset fails the GT reset isn't going to do much better. 
It would fix a dead GuC problem but it can't help with memory related 
issues. If the full GT reset fails then we are out of escalation 
routes as there is no FLR path at present (I think we have that at 
driver unload on MTL but not for general reset?). The FLR resets a lot 
more than just the GT, so it does have a chance to fix some issues 
that a GT reset can't. After driver-level FLR, there is PCI level FLR. 
Not sure if that involves a full power down and restart, but if not 
then that would be the last escalation possible. A power cycle really 
should fix any issues, if it doesn't then it's time to return the 
system as being totally dead!


My recollection is that the vast majority of engine reset failures 
I've looked at have been completely catastrophic and the system only 
recovered after a reboot. I.e. after the card was power cycled. Such 
issues were generally caused by bad memory. Once the path to memory 
has died, there really is not much of the GPU that can do anything at 
all and there isn't much that can be done to recover it.



Thanks,

Nirmoy



John.





Regards,

Nirmoy



John.



Re: [Intel-gfx] [PATCH] drm/i915/gt: Reduce log severity on reset prepare.

2023-12-05 Thread Nirmoy Das

Hi Tvrtko,

On 12/5/2023 11:05 AM, Tvrtko Ursulin wrote:


On 05/12/2023 08:50, Nirmoy Das wrote:

Hi Tvrtko,

On 12/5/2023 9:34 AM, Tvrtko Ursulin wrote:


On 01/12/2023 15:44, Nirmoy Das wrote:

gen8_engine_reset_prepare() can fail when HW fails to set
RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal
error as driver will retry.

Let the caller of gen8_engine_reset_prepare() decide if a
failure in gen8_engine_reset_prepare is an error or not.


No complaints per se but I don't see the caller deciding and it is 
not really reducing log level but converting to trace. So commit 
message and patch do not align for me which I think should be improved.



I meant the return value is checked by the caller,
gen8_reset_engines(). I will resend with an improved commit message.


Ah okay, maybe my bad for not figuring out that possibility. I guess 
it might be passable as is, but yes, clearer commit text would be better.


I sent a v2 already :)




Trace is good enough - we are not usually interested in seeing those 
as dbg/info/notice?



The idea is that all the GT-related events are recorded in the trace, and
dmesg can be noisy sometimes.



Regards,

Nirmoy



Regards,

Tvrtko



Thanks,

Nirmoy



Regards,

Tvrtko


Cc: Tvrtko Ursulin 
Cc: John Harrison 
Cc: Andi Shyti 
Cc: Andrzej Hajda 
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/intel_reset.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c

index d5ed904f355d..e6fbc6202c80 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct 
intel_engine_cs *engine)

  ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
 700, 0, NULL);
  if (ret)
-    gt_err(engine->gt,
-   "%s reset request timed out: {request: %08x, 
RESET_CTL: %08x}\n",

-   engine->name, request,
-   intel_uncore_read_fw(uncore, reg));
+    GT_TRACE(engine->gt,
+ "%s reset request timed out: {request: %08x, 
RESET_CTL: %08x}\n",

+ engine->name, request,
+ intel_uncore_read_fw(uncore, reg));
    return ret;
  }


Re: [Intel-gfx] [PATCH v2] drm/i915/gt: Convert reset prepare failure log to trace

2023-12-05 Thread Nirmoy Das

Hi John,

On 12/5/2023 10:10 AM, John Harrison wrote:

On 12/5/2023 00:52, Nirmoy Das wrote:

gen8_engine_reset_prepare() can fail when HW fails to set
RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal
error as driver will retry.

Convert the log to a trace log for debugging without triggering
unnecessary concerns in CI or for end-users during non-fatal scenarios.
I strongly disagree with this change. The hardware spec for the 
RESET_CTL and GDRST registers is that they will self-clear within a 
matter of microseconds. If something is so badly wrong with the 
hardware that it can't even manage to reset



This message is for the reset readiness poll timeout, not for a failed 
reset, which doesn't sound so serious if the subsequent attempt manages 
to reset the engine.


I couldn't get enough details on when it can happen that the HW takes a 
very long time to set the readiness bit.



then that is something that very much warrants more than a completely 
silent trace event. It most certainly should be flagged as a failure 
in CI.


Just because the driver will retry does not mean that this is not a 
serious error. And if the first attempt failed, why would a subsequent 
attempt succeed?


The patch is not ignoring the failure. If the subsequent attempt fails 
then driver load will fail or it will be wedged if that happens after 
driver load.



Escalating to FLR may have more success, but that is not something 
that i915 currently does.


Do we still need to do an FLR if a subsequent engine reset fails?


Regards,

Nirmoy



John.




v2: Improve commit message(Tvrtko)

Cc: Tvrtko Ursulin 
Cc: John Harrison 
Cc: Andi Shyti 
Cc: Andrzej Hajda 
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591
Signed-off-by: Nirmoy Das 
Reviewed-by: Andi Shyti 
Reviewed-by: Andrzej Hajda 
---
  drivers/gpu/drm/i915/gt/intel_reset.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c

index d5ed904f355d..e6fbc6202c80 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct 
intel_engine_cs *engine)

  ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
 700, 0, NULL);
  if (ret)
-    gt_err(engine->gt,
-   "%s reset request timed out: {request: %08x, 
RESET_CTL: %08x}\n",

-   engine->name, request,
-   intel_uncore_read_fw(uncore, reg));
+    GT_TRACE(engine->gt,
+ "%s reset request timed out: {request: %08x, RESET_CTL: 
%08x}\n",

+ engine->name, request,
+ intel_uncore_read_fw(uncore, reg));
    return ret;
  }




[Intel-gfx] [PATCH v2] drm/i915/gt: Convert reset prepare failure log to trace

2023-12-05 Thread Nirmoy Das
gen8_engine_reset_prepare() can fail when HW fails to set
RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal
error as driver will retry.

Convert the log to a trace log for debugging without triggering
unnecessary concerns in CI or for end-users during non-fatal scenarios.

v2: Improve commit message(Tvrtko)

Cc: Tvrtko Ursulin 
Cc: John Harrison 
Cc: Andi Shyti 
Cc: Andrzej Hajda 
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591
Signed-off-by: Nirmoy Das 
Reviewed-by: Andi Shyti 
Reviewed-by: Andrzej Hajda 
---
 drivers/gpu/drm/i915/gt/intel_reset.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index d5ed904f355d..e6fbc6202c80 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct 
intel_engine_cs *engine)
ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
   700, 0, NULL);
if (ret)
-   gt_err(engine->gt,
-  "%s reset request timed out: {request: %08x, RESET_CTL: 
%08x}\n",
-  engine->name, request,
-  intel_uncore_read_fw(uncore, reg));
+   GT_TRACE(engine->gt,
+"%s reset request timed out: {request: %08x, 
RESET_CTL: %08x}\n",
+engine->name, request,
+intel_uncore_read_fw(uncore, reg));
 
return ret;
 }
-- 
2.42.0



Re: [Intel-gfx] [PATCH] drm/i915/gt: Reduce log severity on reset prepare.

2023-12-05 Thread Nirmoy Das

Hi Tvrtko,

On 12/5/2023 9:34 AM, Tvrtko Ursulin wrote:


On 01/12/2023 15:44, Nirmoy Das wrote:

gen8_engine_reset_prepare() can fail when HW fails to set
RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal
error as driver will retry.

Let the caller of gen8_engine_reset_prepare() decide if a
failure in gen8_engine_reset_prepare is an error or not.


No complaints per se but I don't see the caller deciding and it is not 
really reducing log level but converting to trace. So commit message 
and patch do not align for me which I think should be improved.



I meant the return value is checked by the caller, gen8_reset_engines(). 
I will resend with an improved commit message.


Thanks,

Nirmoy



Regards,

Tvrtko


Cc: Tvrtko Ursulin 
Cc: John Harrison 
Cc: Andi Shyti 
Cc: Andrzej Hajda 
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/intel_reset.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c

index d5ed904f355d..e6fbc6202c80 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct 
intel_engine_cs *engine)

  ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
 700, 0, NULL);
  if (ret)
-    gt_err(engine->gt,
-   "%s reset request timed out: {request: %08x, 
RESET_CTL: %08x}\n",

-   engine->name, request,
-   intel_uncore_read_fw(uncore, reg));
+    GT_TRACE(engine->gt,
+ "%s reset request timed out: {request: %08x, RESET_CTL: 
%08x}\n",

+ engine->name, request,
+ intel_uncore_read_fw(uncore, reg));
    return ret;
  }


[Intel-gfx] [PATCH] drm/i915/gt: Reduce log severity on reset prepare.

2023-12-01 Thread Nirmoy Das
gen8_engine_reset_prepare() can fail when HW fails to set
RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal
error as driver will retry.

Let the caller of gen8_engine_reset_prepare() decide if a
failure in gen8_engine_reset_prepare is an error or not.

Cc: Tvrtko Ursulin 
Cc: John Harrison 
Cc: Andi Shyti 
Cc: Andrzej Hajda 
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gt/intel_reset.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index d5ed904f355d..e6fbc6202c80 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct 
intel_engine_cs *engine)
ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
   700, 0, NULL);
if (ret)
-   gt_err(engine->gt,
-  "%s reset request timed out: {request: %08x, RESET_CTL: 
%08x}\n",
-  engine->name, request,
-  intel_uncore_read_fw(uncore, reg));
+   GT_TRACE(engine->gt,
+"%s reset request timed out: {request: %08x, 
RESET_CTL: %08x}\n",
+engine->name, request,
+intel_uncore_read_fw(uncore, reg));
 
return ret;
 }
-- 
2.42.0



Re: [Intel-gfx] [PATCH] drm/i915/gt: add missing new-line to GT_TRACE

2023-11-15 Thread Nirmoy Das



On 11/15/2023 1:10 PM, Andrzej Hajda wrote:

Trace requires a new-line at the end of the message (as opposed to printk),
otherwise trace dump becomes messy.

Signed-off-by: Andrzej Hajda 


Reviewed-by: Nirmoy Das 


---
  drivers/gpu/drm/i915/gt/intel_gt_pm.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index f5899d503e234b..471b7cdc10ba0f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -167,7 +167,7 @@ static void gt_sanitize(struct intel_gt *gt, bool force)
enum intel_engine_id id;
intel_wakeref_t wakeref;
  
-	GT_TRACE(gt, "force:%s", str_yes_no(force));

+   GT_TRACE(gt, "force:%s\n", str_yes_no(force));
  
  	/* Use a raw wakeref to avoid calling intel_display_power_get early */

wakeref = intel_runtime_pm_get(gt->uncore->rpm);

---
base-commit: 1489bab52c281a869295414031a56506a375b036
change-id: 20231115-eols-20f9f52cf338

Best regards,


Re: [Intel-gfx] [PATCH v2] drm/i915: do not clean GT table on error path

2023-11-15 Thread Nirmoy Das



On 11/15/2023 11:54 AM, Andrzej Hajda wrote:

The only task of intel_gt_release_all is to zero gt table. Calling
it on the error path prevents intel_gt_driver_late_release_all (called from
i915_driver_late_release) from cleaning up GTs, causing a leak.
After i915_driver_late_release GT array is not used anymore so
it does not need cleaning at all.

Sample leak report:

BUG i915_request (...): Objects remaining in i915_request on 
__kmem_cache_shutdown()
...
Object 0x888113420040 @offset=64
Allocated in __i915_request_create+0x75/0x610 [i915] age=18339 cpu=1 pid=1454
  kmem_cache_alloc+0x25b/0x270
  __i915_request_create+0x75/0x610 [i915]
  i915_request_create+0x109/0x290 [i915]
  __engines_record_defaults+0xca/0x440 [i915]
  intel_gt_init+0x275/0x430 [i915]
  i915_gem_init+0x135/0x2c0 [i915]
  i915_driver_probe+0x8d1/0xdc0 [i915]

v2: removed whole intel_gt_release_all

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/8489
Fixes: bec68cc9ea42d8 ("drm/i915: Prepare for multiple GTs")
Signed-off-by: Andrzej Hajda 


Reviewed-by: Nirmoy Das 



---
- Link to v1: 
https://lore.kernel.org/r/20231114-dont_clean_gt_on_error_path-v1-1-37f2fa827...@intel.com
---
  drivers/gpu/drm/i915/gt/intel_gt.c | 11 ---
  drivers/gpu/drm/i915/i915_driver.c |  4 +---
  2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index ed32bf5b15464e..ba1186fc524f84 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -982,8 +982,6 @@ int intel_gt_probe_all(struct drm_i915_private *i915)
  
  err:

i915_probe_error(i915, "Failed to initialize %s! (%d)\n", gtdef->name, 
ret);
-   intel_gt_release_all(i915);
-
return ret;
  }
  
@@ -1002,15 +1000,6 @@ int intel_gt_tiles_init(struct drm_i915_private *i915)

return 0;
  }
  
-void intel_gt_release_all(struct drm_i915_private *i915)

-{
-   struct intel_gt *gt;
-   unsigned int id;
-
-   for_each_gt(gt, i915, id)
-   i915->gt[id] = NULL;
-}
-
  void intel_gt_info_print(const struct intel_gt_info *info,
 struct drm_printer *p)
  {
diff --git a/drivers/gpu/drm/i915/i915_driver.c 
b/drivers/gpu/drm/i915/i915_driver.c
index 01fd25b622d16c..2a1faf4039659c 100644
--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -776,7 +776,7 @@ int i915_driver_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
  
  	ret = i915_driver_mmio_probe(i915);

if (ret < 0)
-   goto out_tiles_cleanup;
+   goto out_runtime_pm_put;
  
  	ret = i915_driver_hw_probe(i915);

if (ret < 0)
@@ -836,8 +836,6 @@ int i915_driver_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
i915_ggtt_driver_late_release(i915);
  out_cleanup_mmio:
i915_driver_mmio_release(i915);
-out_tiles_cleanup:
-   intel_gt_release_all(i915);
  out_runtime_pm_put:
enable_rpm_wakeref_asserts(&i915->runtime_pm);
i915_driver_late_release(i915);

---
base-commit: 1489bab52c281a869295414031a56506a375b036
change-id: 20231114-dont_clean_gt_on_error_path-91cd9c3caa0a

Best regards,


Re: [Intel-gfx] [PATCH] drm/i915/mtl: Apply notify_guc to all GTs

2023-11-06 Thread Nirmoy Das



On 11/6/2023 1:45 PM, Jani Nikula wrote:

On Wed, 25 Oct 2023, Nirmoy Das  wrote:

Handle platforms with multiple GTs by iterating over all GTs.
Add a Fixes commit so this gets propagated for MTL support.

Fixes: 213c43676beb ("drm/i915/mtl: Remove the 'force_probe' requirement for Meteor 
Lake")

This came up in another patch. I don't like abusing Fixes: like this. I
understand the motivation here, but this patch does not fix the
referenced commit.


I wasn't aware of a better solution, but now I am, thanks to your response 
to the other patch.


I will keep that in mind.


Thanks,

Nirmoy



BR,
Jani.


Suggested-by: John Harrison 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Tvrtko Ursulin 
Cc: Andi Shyti 
Cc: Andrzej Hajda 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/i915_debugfs_params.c | 9 ++---
  1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs_params.c 
b/drivers/gpu/drm/i915/i915_debugfs_params.c
index 614bde321589..8bca02025e09 100644
--- a/drivers/gpu/drm/i915/i915_debugfs_params.c
+++ b/drivers/gpu/drm/i915/i915_debugfs_params.c
@@ -38,10 +38,13 @@ static int i915_param_int_open(struct inode *inode, struct 
file *file)
  
  static int notify_guc(struct drm_i915_private *i915)

  {
-   int ret = 0;
+   struct intel_gt *gt;
+   int i, ret = 0;
  
-	if (intel_uc_uses_guc_submission(&to_gt(i915)->uc))

-   ret = intel_guc_global_policies_update(&to_gt(i915)->uc.guc);
+   for_each_gt(gt, i915, i) {
+   if (intel_uc_uses_guc_submission(&gt->uc))
+   ret = intel_guc_global_policies_update(&gt->uc.guc);
+   }
  
  	return ret;

  }


Re: [Intel-gfx] [PATCH RESEND 2/3] drm/i915: move gpu error debugfs to i915_gpu_error.c

2023-10-31 Thread Nirmoy Das



On 10/31/2023 3:18 PM, Jani Nikula wrote:

On Tue, 31 Oct 2023, Nirmoy Das  wrote:

On 10/31/2023 1:45 PM, Jani Nikula wrote:

+void i915_gpu_error_debugfs_register(struct drm_i915_private *i915)
+{
+   struct drm_minor *minor = i915->drm.primary;
+
+   debugfs_create_file("i915_error_state", 0644,

nit: s/0644/S_IRUGO | S_IWUSR

The direction pretty much across the kernel is to go towards octal
permissions because the macros are harder to understand.


Personally I prefer octal but didn't realize this is preferred in 
general[*].


[*]https://lore.kernel.org/lkml/7232ef011d05a92f4caa86a5e9830d87966a2eaf.1470180926.git@perches.com/


Regards,

Nirmoy




Reviewed-by: Nirmoy Das 

Thanks,
Jani.



Re: [Intel-gfx] [PATCH RESEND 3/3] drm/i915: move gpu error sysfs to i915_gpu_error.c

2023-10-31 Thread Nirmoy Das



On 10/31/2023 1:45 PM, Jani Nikula wrote:

Hide gpu error specifics in i915_gpu_error.c. This is also cleaner wrt
conditional compilation, as i915_gpu_error.c is only built with
DRM_I915_CAPTURE_ERROR=y.

With this, we can also make i915_first_error_state() static.

Signed-off-by: Jani Nikula 

Reviewed-by: Nirmoy Das 


---
  drivers/gpu/drm/i915/i915_gpu_error.c | 75 -
  drivers/gpu/drm/i915/i915_gpu_error.h | 17 +++---
  drivers/gpu/drm/i915/i915_sysfs.c | 79 +--
  3 files changed, 86 insertions(+), 85 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index f195df91d9e6..00559a75b798 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -57,6 +57,7 @@
  #include "i915_memcpy.h"
  #include "i915_reg.h"
  #include "i915_scatterlist.h"
+#include "i915_sysfs.h"
  #include "i915_utils.h"
  
  #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)

@@ -2211,7 +2212,7 @@ void i915_capture_error_state(struct intel_gt *gt,
i915_gpu_coredump_put(error);
  }
  
-struct i915_gpu_coredump *

+static struct i915_gpu_coredump *
  i915_first_error_state(struct drm_i915_private *i915)
  {
struct i915_gpu_coredump *error;
@@ -2487,3 +2488,75 @@ void i915_gpu_error_debugfs_register(struct 
drm_i915_private *i915)
debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915,
&i915_gpu_info_fops);
  }
+
+static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
+   struct bin_attribute *attr, char *buf,
+   loff_t off, size_t count)
+{
+
+   struct device *kdev = kobj_to_dev(kobj);
+   struct drm_i915_private *i915 = kdev_minor_to_i915(kdev);
+   struct i915_gpu_coredump *gpu;
+   ssize_t ret = 0;
+
+   /*
+* FIXME: Concurrent clients triggering resets and reading + clearing
+* dumps can cause inconsistent sysfs reads when a user calls in with a
+* non-zero offset to complete a prior partial read but the
+* gpu_coredump has been cleared or replaced.
+*/
+
+   gpu = i915_first_error_state(i915);
+   if (IS_ERR(gpu)) {
+   ret = PTR_ERR(gpu);
+   } else if (gpu) {
+   ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count);
+   i915_gpu_coredump_put(gpu);
+   } else {
+   const char *str = "No error state collected\n";
+   size_t len = strlen(str);
+
+   if (off < len) {
+   ret = min_t(size_t, count, len - off);
+   memcpy(buf, str + off, ret);
+   }
+   }
+
+   return ret;
+}
+
+static ssize_t error_state_write(struct file *file, struct kobject *kobj,
+struct bin_attribute *attr, char *buf,
+loff_t off, size_t count)
+{
+   struct device *kdev = kobj_to_dev(kobj);
+   struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev);
+
+   drm_dbg(&dev_priv->drm, "Resetting error state\n");
+   i915_reset_error_state(dev_priv);
+
+   return count;
+}
+
+static const struct bin_attribute error_state_attr = {
+   .attr.name = "error",
+   .attr.mode = S_IRUSR | S_IWUSR,
+   .size = 0,
+   .read = error_state_read,
+   .write = error_state_write,
+};
+
+void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915)
+{
+   struct device *kdev = i915->drm.primary->kdev;
+
+   if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr))
+   drm_err(&i915->drm, "error_state sysfs setup failed\n");
+}
+
+void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915)
+{
+   struct device *kdev = i915->drm.primary->kdev;
+
+   sysfs_remove_bin_file(&kdev->kobj, &error_state_attr);
+}
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
b/drivers/gpu/drm/i915/i915_gpu_error.h
index f851189b0ff1..fa886620d3f8 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -325,11 +325,12 @@ static inline void i915_gpu_coredump_put(struct 
i915_gpu_coredump *gpu)
kref_put(&gpu->ref, __i915_gpu_coredump_free);
  }
  
-struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915);

  void i915_reset_error_state(struct drm_i915_private *i915);
  void i915_disable_error_state(struct drm_i915_private *i915, int err);
  
  void i915_gpu_error_debugfs_register(struct drm_i915_private *i915);

+void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915);
+void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915);
  
  #else
  
@@ -398,12 +399,6 @@ static inline v

Re: [Intel-gfx] [PATCH RESEND 2/3] drm/i915: move gpu error debugfs to i915_gpu_error.c

2023-10-31 Thread Nirmoy Das
s/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -2140,7 +2140,7 @@ __i915_gpu_coredump(struct intel_gt *gt, 
intel_engine_mask_t engine_mask, u32 du
return error;
  }
  
-struct i915_gpu_coredump *

+static struct i915_gpu_coredump *
  i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 
dump_flags)
  {
static DEFINE_MUTEX(capture_mutex);
@@ -2378,3 +2378,112 @@ void intel_klog_error_capture(struct intel_gt *gt,
drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n", l_count, 
line++, pos_err);
  }
  #endif
+
+static ssize_t gpu_state_read(struct file *file, char __user *ubuf,
+ size_t count, loff_t *pos)
+{
+   struct i915_gpu_coredump *error;
+   ssize_t ret;
+   void *buf;
+
+   error = file->private_data;
+   if (!error)
+   return 0;
+
+   /* Bounce buffer required because of kernfs __user API convenience. */
+   buf = kmalloc(count, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   ret = i915_gpu_coredump_copy_to_buffer(error, buf, *pos, count);
+   if (ret <= 0)
+   goto out;
+
+   if (!copy_to_user(ubuf, buf, ret))
+   *pos += ret;
+   else
+   ret = -EFAULT;
+
+out:
+   kfree(buf);
+   return ret;
+}
+
+static int gpu_state_release(struct inode *inode, struct file *file)
+{
+   i915_gpu_coredump_put(file->private_data);
+   return 0;
+}
+
+static int i915_gpu_info_open(struct inode *inode, struct file *file)
+{
+   struct drm_i915_private *i915 = inode->i_private;
+   struct i915_gpu_coredump *gpu;
+   intel_wakeref_t wakeref;
+
+   gpu = NULL;
+   with_intel_runtime_pm(&i915->runtime_pm, wakeref)
+   gpu = i915_gpu_coredump(to_gt(i915), ALL_ENGINES, 
CORE_DUMP_FLAG_NONE);
+
+   if (IS_ERR(gpu))
+   return PTR_ERR(gpu);
+
+   file->private_data = gpu;
+   return 0;
+}
+
+static const struct file_operations i915_gpu_info_fops = {
+   .owner = THIS_MODULE,
+   .open = i915_gpu_info_open,
+   .read = gpu_state_read,
+   .llseek = default_llseek,
+   .release = gpu_state_release,
+};
+
+static ssize_t
+i915_error_state_write(struct file *filp,
+  const char __user *ubuf,
+  size_t cnt,
+  loff_t *ppos)
+{
+   struct i915_gpu_coredump *error = filp->private_data;
+
+   if (!error)
+   return 0;
+
+   drm_dbg(&error->i915->drm, "Resetting error state\n");
+   i915_reset_error_state(error->i915);
+
+   return cnt;
+}
+
+static int i915_error_state_open(struct inode *inode, struct file *file)
+{
+   struct i915_gpu_coredump *error;
+
+   error = i915_first_error_state(inode->i_private);
+   if (IS_ERR(error))
+   return PTR_ERR(error);
+
+   file->private_data  = error;
+   return 0;
+}
+
+static const struct file_operations i915_error_state_fops = {
+   .owner = THIS_MODULE,
+   .open = i915_error_state_open,
+   .read = gpu_state_read,
+   .write = i915_error_state_write,
+   .llseek = default_llseek,
+   .release = gpu_state_release,
+};
+
+void i915_gpu_error_debugfs_register(struct drm_i915_private *i915)
+{
+   struct drm_minor *minor = i915->drm.primary;
+
+   debugfs_create_file("i915_error_state", 0644,


nit: s/0644/S_IRUGO | S_IWUSR

Reviewed-by: Nirmoy Das 


minor->debugfs_root, i915,
+   &i915_error_state_fops);
+   debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915,
+   &i915_gpu_info_fops);
+}
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
b/drivers/gpu/drm/i915/i915_gpu_error.h
index 8f9cdf056181..f851189b0ff1 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -278,8 +278,6 @@ static inline void intel_klog_error_capture(struct intel_gt 
*gt,
  __printf(2, 3)
  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, 
...);
  
-struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,

-   intel_engine_mask_t engine_mask, 
u32 dump_flags);
  void i915_capture_error_state(struct intel_gt *gt,
  intel_engine_mask_t engine_mask, u32 dump_flags);
  
@@ -331,6 +329,8 @@ struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915);

  void i915_reset_error_state(struct drm_i915_private *i915);
  void i915_disable_error_state(struct drm_i915_private *i915, int err);
  
+void i915_gpu_error_debugfs_register(struct drm_i915_private *i915);

+
  #else
  
  __printf(2, 3)

@@ -413,6 +413,10 @@ static inline void i915_disable_error_state(struct 
drm_i915_private *i915,
  {
  }
  
+static inline void i915_gpu_error_debugfs_register(struct drm_i915_private *i915)

+{
+}
+
  #endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */
  
  #endif /* _I915_GPU_ERROR_H_ */


Re: [Intel-gfx] [PATCH RESEND 1/3] drm/i915: make some error capture functions static

2023-10-31 Thread Nirmoy Das



On 10/31/2023 1:45 PM, Jani Nikula wrote:

Not needed outside of i915_gpu_error.c.

Signed-off-by: Jani Nikula 

Reviewed-by: Nirmoy Das 

---
  drivers/gpu/drm/i915/i915_gpu_error.c | 8 
  drivers/gpu/drm/i915/i915_gpu_error.h | 5 -
  2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 8275f9b6a47d..889db834f07d 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -520,7 +520,7 @@ __find_vma(struct i915_vma_coredump *vma, const char *name)
return NULL;
  }
  
-struct i915_vma_coredump *

+static struct i915_vma_coredump *
  intel_gpu_error_find_batch(const struct intel_engine_coredump *ee)
  {
return __find_vma(ee->vma, "batch");
@@ -609,9 +609,9 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, 
const char *f, ...)
va_end(args);
  }
  
-void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,

-  const struct intel_engine_cs *engine,
-  const struct i915_vma_coredump *vma)
+static void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
+ const struct intel_engine_cs *engine,
+ const struct i915_vma_coredump *vma)
  {
char out[ASCII85_BUFSZ];
struct page *page;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
b/drivers/gpu/drm/i915/i915_gpu_error.h
index 4ce227f7e1e1..8f9cdf056181 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -277,11 +277,6 @@ static inline void intel_klog_error_capture(struct 
intel_gt *gt,
  
  __printf(2, 3)

  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, 
...);
-void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
-  const struct intel_engine_cs *engine,
-  const struct i915_vma_coredump *vma);
-struct i915_vma_coredump *
-intel_gpu_error_find_batch(const struct intel_engine_coredump *ee);
  
  struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,

intel_engine_mask_t engine_mask, 
u32 dump_flags);


  1   2   3   4   5   6   7   8   >