Re: [PATCH 2/2] drm/i915/ttm: fix CCS handling

2022-08-07 Thread Ramalingam C
On 2022-08-05 at 14:22:40 +0100, Matthew Auld wrote:
> Crucible + recent Mesa seems to sometimes hit:
> 
> GEM_BUG_ON(num_ccs_blks > NUM_CCS_BLKS_PER_XFER)
> 
> And it looks like we can also trigger this with gem_lmem_swapping, if we
> modify the test to use slightly larger object sizes.
> 
> Looking closer it looks like we have the following issues in
> migrate_copy():
> 
>   - We are using plain integers in various places, which can easily
> overflow with a large object.
> 
>   - We pass the entire object size (when the src is lmem) into
> emit_pte() and then try to copy it, which doesn't work, since we
> only have a few fixed sized windows in which to map the pages and
> perform the copy. With an object > 8M we therefore aren't properly
> copying the pages. And then with an object > 64M we trigger the
> GEM_BUG_ON(num_ccs_blks > NUM_CCS_BLKS_PER_XFER).
> 
> So it looks like our copy handling for any object > 8M (which is our
> CHUNK_SZ) is currently broken on DG2.
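A minimal sketch of the chunking constraint described above (CHUNK_SZ and
min_t() are from the quoted code; the loop shape is illustrative, not the
driver's exact control flow): each blitter pass can only map CHUNK_SZ bytes
into its fixed window, and the running byte count must be u64 so a large
object cannot overflow it.

	static void copy_in_chunks(u64 bytes_to_cpy)
	{
		while (bytes_to_cpy) {
			/* never feed more than one mapping window per pass */
			u64 src_sz = min_t(u64, bytes_to_cpy, CHUNK_SZ);

			/* ... emit_pte() + emit_copy() for src_sz bytes ... */
			bytes_to_cpy -= src_sz;
		}
	}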
> 
> Fixes: da0595ae91da ("drm/i915/migrate: Evict and restore the flatccs capable lmem obj")
> Testcase: igt@gem_lmem_swapping@basic-big
> Testcase: igt@gem_lmem_swapping@verify-ccs-big
> Signed-off-by: Matthew Auld 
> Cc: Thomas Hellström 
> Cc: Ramalingam C 
> ---
>  drivers/gpu/drm/i915/gt/intel_migrate.c | 44 -
>  1 file changed, 21 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
> b/drivers/gpu/drm/i915/gt/intel_migrate.c
> index 1bbed7aa436a..aaaf1906026c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> @@ -609,9 +609,9 @@ static int emit_copy(struct i915_request *rq,
>   return 0;
>  }
>  
> -static int scatter_list_length(struct scatterlist *sg)
> +static u64 scatter_list_length(struct scatterlist *sg)
>  {
> - int len = 0;
> + u64 len = 0;
>  
>   while (sg && sg_dma_len(sg)) {
>   len += sg_dma_len(sg);
> @@ -621,28 +621,26 @@ static int scatter_list_length(struct scatterlist *sg)
>   return len;
>  }
>  
> -static void
> +static int
>  calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
> -int *src_sz, u32 bytes_to_cpy, u32 ccs_bytes_to_cpy)
> +u64 bytes_to_cpy, u64 ccs_bytes_to_cpy)
>  {
> - if (ccs_bytes_to_cpy) {
> - if (!src_is_lmem)
> - /*
> -  * When CHUNK_SZ is passed all the pages up to CHUNK_SZ
> -  * will be taken for the blt. On Flat-CCS supported
> -  * platforms an smem obj will have more pages than required
> -  * for main memory, hence limit it to the required size
> -  * for main memory.
> -  */
> - *src_sz = min_t(int, bytes_to_cpy, CHUNK_SZ);
> - } else { /* ccs handling is not required */
> - *src_sz = CHUNK_SZ;
> - }
> + if (ccs_bytes_to_cpy && !src_is_lmem)
Yes, this is needed for the ccs copy of an obj >8M from lmem to smem.

Reviewed-by: Ramalingam C
> + /*
> +  * When CHUNK_SZ is passed all the pages up to CHUNK_SZ
> +  * will be taken for the blt. On Flat-CCS supported
> +  * platforms an smem obj will have more pages than required
> +  * for main memory, hence limit it to the required size
> +  * for main memory.
> +  */
> + return min_t(u64, bytes_to_cpy, CHUNK_SZ);
> + else
> + return CHUNK_SZ;
>  }
>  
> -static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
> +static void get_ccs_sg_sgt(struct sgt_dma *it, u64 bytes_to_cpy)
>  {
> - u32 len;
> + u64 len;
>  
>   do {
>   GEM_BUG_ON(!it->sg || !sg_dma_len(it->sg));
> @@ -673,12 +671,12 @@ intel_context_migrate_copy(struct intel_context *ce,
>  {
>   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst), it_ccs;
>   struct drm_i915_private *i915 = ce->engine->i915;
> - u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
> + u64 ccs_bytes_to_cpy = 0, bytes_to_cpy;
>   enum i915_cache_level ccs_cache_level;
>   u32 src_offset, dst_offset;
>   u8 src_access, dst_access;
>   struct i915_request *rq;
> - int src_sz, dst_sz;
> + u64 src_sz, dst_sz;
>   bool ccs_is_src, overwrite_ccs;
>   int err;
>  
> @@ -761,8 +759,8 @@ intel_context_migrate_copy(struct intel_context *ce,
>   if (err)
>   goto out_rq;
>  
> - calculate_chunk_sz(i915, src_is_lmem, &src_sz,
> -bytes_to_cpy, ccs_bytes_to_cpy);
> + src_sz = calculate_chunk_sz(i915, src_is_lmem,
> + bytes_to_cpy, ccs_bytes_to_cpy);
>  
> len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
>  src_offset, src_sz);
> -- 
> 2.37.1
> 


Re: [PATCH 1/2] drm/i915/ttm: remove calc_ctrl_surf_instr_size

2022-08-07 Thread Ramalingam C
On 2022-08-05 at 14:22:39 +0100, Matthew Auld wrote:
> We only ever need to emit one ccs block copy command.
Since the maximum size we handle at a time is CHUNK_SZ, we only ever need
one command.

Reviewed-by: Ramalingam C
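The arithmetic behind needing only one command, assuming the flat-CCS ratio
of 1 CCS byte per 256 main-memory bytes (the ratio itself is not stated in
this thread, so treat it as an assumption):

	/* Per pass we copy at most CHUNK_SZ = 8M of main memory, so:
	 *   ccs_bytes = 8M / 256 = 32K   (assumed 1:256 flat-CCS ratio)
	 *   num_blks  = 32K / 256 = 128  (XY_CTRL_SURF_COPY_BLT moves CCS
	 *                                 in 256-byte blocks)
	 * 128 <= NUM_CCS_BLKS_PER_XFER (1024), so one command always suffices.
	 */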
> 
> Signed-off-by: Matthew Auld 
> Cc: Thomas Hellström 
> Cc: Ramalingam C 
> ---
>  drivers/gpu/drm/i915/gt/intel_migrate.c | 35 +++--
>  1 file changed, 3 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
> b/drivers/gpu/drm/i915/gt/intel_migrate.c
> index 9a0814422ba4..1bbed7aa436a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> @@ -511,44 +511,16 @@ static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
>   return cmd;
>  }
>  
> -static u32 calc_ctrl_surf_instr_size(struct drm_i915_private *i915, int size)
> -{
> - u32 num_cmds, num_blks, total_size;
> -
> - if (!GET_CCS_BYTES(i915, size))
> - return 0;
> -
> - /*
> -  * XY_CTRL_SURF_COPY_BLT transfers CCS in 256 byte
> -  * blocks. one XY_CTRL_SURF_COPY_BLT command can
> -  * transfer upto 1024 blocks.
> -  */
> - num_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
> - NUM_CCS_BYTES_PER_BLOCK);
> - num_cmds = DIV_ROUND_UP(num_blks, NUM_CCS_BLKS_PER_XFER);
> - total_size = XY_CTRL_SURF_INSTR_SIZE * num_cmds;
> -
> - /*
> -  * Adding a flush before and after XY_CTRL_SURF_COPY_BLT
> -  */
> - total_size += 2 * MI_FLUSH_DW_SIZE;
> -
> - return total_size;
> -}
> -
>  static int emit_copy_ccs(struct i915_request *rq,
>u32 dst_offset, u8 dst_access,
>u32 src_offset, u8 src_access, int size)
>  {
>   struct drm_i915_private *i915 = rq->engine->i915;
>   int mocs = rq->engine->gt->mocs.uc_index << 1;
> - u32 num_ccs_blks, ccs_ring_size;
> + u32 num_ccs_blks;
>   u32 *cs;
>  
> - ccs_ring_size = calc_ctrl_surf_instr_size(i915, size);
> - WARN_ON(!ccs_ring_size);
> -
> - cs = intel_ring_begin(rq, round_up(ccs_ring_size, 2));
> + cs = intel_ring_begin(rq, 12);
>   if (IS_ERR(cs))
>   return PTR_ERR(cs);
>  
> @@ -583,8 +555,7 @@ static int emit_copy_ccs(struct i915_request *rq,
>   FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
>  
>   cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
> - if (ccs_ring_size & 1)
> - *cs++ = MI_NOOP;
> + *cs++ = MI_NOOP;
>  
>   intel_ring_advance(rq, cs);
>  
> -- 
> 2.37.1
> 


Re: [RFC 10/10] drm/i915/vm_bind: Fix vm->vm_bind_mutex and vm->mutex nesting

2022-07-06 Thread Ramalingam C
On 2022-07-05 at 10:40:56 +0200, Thomas Hellström wrote:
> On Fri, 2022-07-01 at 15:50 -0700, Niranjana Vishwanathapura wrote:
> > VM_BIND functionality maintains that vm->vm_bind_mutex will never be
> > taken while holding vm->mutex.
> > However, while closing 'vm', the vma is destroyed while holding
> > vm->mutex. But vma release needs to take vm->vm_bind_mutex in order
> > to delete the vma from the vm_bind_list. To avoid this, destroy the
> > vma outside vm->mutex while closing the 'vm'.
> > 
> > Signed-off-by: Niranjana Vishwanathapura
> 
> First, when introducing a new feature like this, we should not need to
> end the series with "Fix.." patches like this, rather whatever needs to
> be fixed should be fixed where the code was introduced.
Thanks Thomas for the review. I will fix it.
> 
> Second, an analogy whith linux kernel CPU mapping, could we instead
> think of the vm_bind_lock being similar to the mmap_lock, and the
> vm_mutex being similar to the i_mmap_lock, the former being used for VA
> manipulation and the latter when attaching / removing the backing store
> from the VA?
> 
> Then we would not need to take the vm_bind_lock from vma destruction
> since the VA would already have been reclaimed at that point. For vm
> destruction here we'd loop over all relevant vm bind VAs under the
> vm_bind lock and call vm_unbind? Would that work?

Sounds reasonable. I will try this locking approach.

Ram
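
A hypothetical sketch of what Thomas suggests; i915_gem_vm_unbind_vma() is
an assumed helper, while vm_bind_lock, vm_bind_list and vm_bind_link come
from the series:

	static void i915_vm_unbind_all(struct i915_address_space *vm)
	{
		struct i915_vma *vma, *vn;

		mutex_lock(&vm->vm_bind_lock);
		list_for_each_entry_safe(vma, vn, &vm->vm_bind_list, vm_bind_link)
			i915_gem_vm_unbind_vma(vm, vma);	/* reclaims the VA */
		mutex_unlock(&vm->vm_bind_lock);

		/* vma destruction then never needs vm_bind_lock: the VA is gone */
	}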
> 
> /Thomas
> 
> 
> > 
> > ---
> >  drivers/gpu/drm/i915/gt/intel_gtt.c | 23 ++-
> >  1 file changed, 18 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c
> > b/drivers/gpu/drm/i915/gt/intel_gtt.c
> > index 4ab3bda644ff..4f707d0eb3ef 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gtt.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
> > @@ -109,7 +109,8 @@ int map_pt_dma_locked(struct i915_address_space
> > *vm, struct drm_i915_gem_object
> > return 0;
> >  }
> >  
> > -static void clear_vm_list(struct list_head *list)
> > +static void clear_vm_list(struct list_head *list,
> > + struct list_head *destroy_list)
> >  {
> > struct i915_vma *vma, *vn;
> >  
> > @@ -138,8 +139,7 @@ static void clear_vm_list(struct list_head *list)
> > i915_vm_resv_get(vma->vm);
> > vma->vm_ddestroy = true;
> > } else {
> > -   i915_vma_destroy_locked(vma);
> > -   i915_gem_object_put(obj);
> > +   list_move_tail(&vma->vm_link, destroy_list);
> > }
> >  
> > }
> > @@ -147,16 +147,29 @@ static void clear_vm_list(struct list_head
> > *list)
> >  
> >  static void __i915_vm_close(struct i915_address_space *vm)
> >  {
> > +   struct i915_vma *vma, *vn;
> > +   struct list_head list;
> > +
> > +   INIT_LIST_HEAD(&list);
> > +
> > mutex_lock(&vm->mutex);
> >  
> > -   clear_vm_list(&vm->bound_list);
> > -   clear_vm_list(&vm->unbound_list);
> > +   clear_vm_list(&vm->bound_list, &list);
> > +   clear_vm_list(&vm->unbound_list, &list);
> >  
> > /* Check for must-fix unanticipated side-effects */
> > GEM_BUG_ON(!list_empty(&vm->bound_list));
> > GEM_BUG_ON(!list_empty(&vm->unbound_list));
> >  
> > mutex_unlock(&vm->mutex);
> > +
> > +   /* Destroy vmas outside vm->mutex */
> > +   list_for_each_entry_safe(vma, vn, &list, vm_link) {
> > +   struct drm_i915_gem_object *obj = vma->obj;
> > +
> > +   i915_vma_destroy(vma);
> > +   i915_gem_object_put(obj);
> > +   }
> >  }
> >  
> >  /* lock the vm into the current ww, if we lock one, we lock all */
> 


Re: [Intel-gfx] [RFC 05/10] drm/i915/vm_bind: Handle persistent vmas

2022-07-05 Thread Ramalingam C
On 2022-07-04 at 17:05:38 +, Zeng, Oak wrote:
> 
> 
> Thanks,
> Oak
> 
> > -Original Message-
> > From: Intel-gfx  On Behalf Of
> > Niranjana Vishwanathapura
> > Sent: July 1, 2022 6:51 PM
> > To: intel-...@lists.freedesktop.org; dri-devel@lists.freedesktop.org
> > Cc: Zanoni, Paulo R ; Hellstrom, Thomas
> > ; Auld, Matthew ;
> > Vetter, Daniel ; christian.koe...@amd.com
> > Subject: [Intel-gfx] [RFC 05/10] drm/i915/vm_bind: Handle persistent vmas
> > 
> > Treat VM_BIND vmas as persistent and handle them during the request
> > submission in the execbuff path.
> 
> Hi Niranjana,
> 
> Is the meaning of "persistent" above persistent across all the subsequent 
> execbuf ioctls?

Yes, Oak. That's correct: persistent across multiple execbuf ioctls.

Regards,
Ram.
> 
> Thanks,
> Oak 
> 
> > 
> > Support eviction by maintaining a list of evicted persistent vmas for 
> > rebinding
> > during next submission.
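
An illustrative sketch of that rebind step at submission time;
vm_rebind_list, vm_rebind_link and the purged flag come from the patch
below, while the loop itself and i915_vma_is_purged() are assumptions:

	/* before the next submission, under vm_rebind_lock */
	list_for_each_entry_safe(vma, vn, &vm->vm_rebind_list, vm_rebind_link) {
		if (i915_vma_is_purged(vma))	/* assumed counterpart of i915_vma_set_purged() */
			continue;
		list_del_init(&vma->vm_rebind_link);
		/* ... rebind vma (re-emit its PTEs) before executing the batch ... */
	}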
> > 
> > Signed-off-by: Niranjana Vishwanathapura
> > 
> > ---
> >  drivers/gpu/drm/i915/gem/i915_gem_object.c|  1 +
> >  drivers/gpu/drm/i915/gem/i915_gem_vm_bind.h   |  3 +
> >  .../drm/i915/gem/i915_gem_vm_bind_object.c| 12 ++-
> >  drivers/gpu/drm/i915/gt/intel_gtt.c   |  2 +
> >  drivers/gpu/drm/i915/gt/intel_gtt.h   |  2 +
> >  drivers/gpu/drm/i915/i915_gem_gtt.h   | 22 ++
> >  drivers/gpu/drm/i915/i915_vma.c   | 32 +++-
> >  drivers/gpu/drm/i915/i915_vma.h   | 78 +--
> >  drivers/gpu/drm/i915/i915_vma_types.h | 23 ++
> >  9 files changed, 163 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c
> > b/drivers/gpu/drm/i915/gem/i915_gem_object.c
> > index ccec4055fde3..5121f02ba95c 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
> > @@ -38,6 +38,7 @@
> >  #include "i915_gem_mman.h"
> >  #include "i915_gem_object.h"
> >  #include "i915_gem_ttm.h"
> > +#include "i915_gem_vm_bind.h"
> >  #include "i915_memcpy.h"
> >  #include "i915_trace.h"
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind.h
> > b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind.h
> > index 849bf3c1061e..eaadf5a6ab09 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind.h
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind.h
> > @@ -6,6 +6,7 @@
> >  #ifndef __I915_GEM_VM_BIND_H
> >  #define __I915_GEM_VM_BIND_H
> > 
> > +#include 
> >  #include "i915_drv.h"
> > 
> > #define assert_vm_bind_held(vm)   lockdep_assert_held(&(vm)->vm_bind_lock)
> > @@ -26,6 +27,8 @@ static inline void i915_gem_vm_bind_unlock(struct
> > i915_address_space *vm)
> > mutex_unlock(&vm->vm_bind_lock);
> >  }
> > 
> > +#define assert_vm_priv_held(vm)   assert_object_held((vm)->root_obj)
> > +
> >  static inline int i915_gem_vm_priv_lock(struct i915_address_space *vm,
> > struct i915_gem_ww_ctx *ww)
> >  {
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c
> > b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c
> > index 96f139cc8060..1a8efa83547f 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c
> > @@ -85,6 +85,13 @@ void i915_gem_vm_bind_remove(struct i915_vma
> > *vma, bool release_obj)  {
> > assert_vm_bind_held(vma->vm);
> > 
> > +   spin_lock(&vma->vm->vm_rebind_lock);
> > +   if (!list_empty(&vma->vm_rebind_link))
> > +   list_del_init(&vma->vm_rebind_link);
> > +   i915_vma_set_purged(vma);
> > +   i915_vma_set_freed(vma);
> > +   spin_unlock(&vma->vm->vm_rebind_lock);
> > +
> > if (!list_empty(&vma->vm_bind_link)) {
> > list_del_init(&vma->vm_bind_link);
> > list_del_init(&vma->non_priv_vm_bind_link);
> > @@ -220,6 +227,7 @@ static struct i915_vma *vm_bind_get_vma(struct
> > i915_address_space *vm,
> > 
> > vma->start = va->start;
> > vma->last = va->start + va->length - 1;
> > +   i915_vma_set_persistent(vma);
> > 
> > return vma;
> >  }
> > @@ -304,8 +312,10 @@ int i915_gem_vm_bind_obj(struct
> > i915_address_space *vm,
> > 
> > i915_vm_bind_put_fence(vma);
> >  put_vma:
> > -   if (ret)
> > +   if (ret) {
> > +   i915_vma_set_freed(vma);
> > i915_vma_destroy(vma);
> > +   }
> > 
> > i915_gem_ww_ctx_fini(&ww);
> >  unlock_vm:
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c
> > b/drivers/gpu/drm/i915/gt/intel_gtt.c
> > index df0a8459c3c6..55d5389b2c6c 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gtt.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
> > @@ -293,6 +293,8 @@ void i915_address_space_init(struct
> > i915_address_space *vm, int subclass)
> > INIT_LIST_HEAD(&vm->non_priv_vm_bind_list);
> > vm->root_obj = i915_gem_object_create_internal(vm->i915,
> > PAGE_SIZE);
> > GEM_BUG_ON(IS_ERR(vm->root_obj));
> > +   INIT_LIST_HEAD(&vm->vm_rebind_list);
> > +   spin_lock_init(&vm->vm_rebind_lock);
> >  }
> > 
> >  void *__px_vaddr(struct drm_i915_gem_object *p) 

Re: [PATCH v3 12/13] drm/i915/ttm: disallow CPU fallback mode for ccs pages

2022-06-29 Thread Ramalingam C
On 2022-06-29 at 13:14:26 +0100, Matthew Auld wrote:
> Falling back to memcpy/memset shouldn't be allowed if we know we have
> CCS state to manage using the blitter. Otherwise we are potentially
> leaving the aux CCS state in an unknown state, which smells like an info
> leak.
> 
> Fixes: 48760ffe923a ("drm/i915/gt: Clear compress metadata for Flat-ccs objects")

Looks good to me.

Reviewed-by: Ramalingam C 

> Signed-off-by: Matthew Auld 
> Cc: Thomas Hellström 
> Cc: Lionel Landwerlin 
> Cc: Tvrtko Ursulin 
> Cc: Jon Bloomfield 
> Cc: Daniel Vetter 
> Cc: Jordan Justen 
> Cc: Kenneth Graunke 
> Cc: Akeem G Abodunrin 
> Cc: Ramalingam C 
> ---
>  drivers/gpu/drm/i915/gem/i915_gem_object.c   | 26 
>  drivers/gpu/drm/i915/gem/i915_gem_object.h   |  2 ++
>  drivers/gpu/drm/i915/gem/i915_gem_ttm.c  | 18 --
>  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c |  3 +++
>  4 files changed, 31 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_object.c
> index 642a5d59ce26..ccec4055fde3 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
> @@ -717,6 +717,32 @@ bool i915_gem_object_placement_possible(struct 
> drm_i915_gem_object *obj,
>   return false;
>  }
>  
> +/**
> + * i915_gem_object_needs_ccs_pages - Check whether the object requires extra
> + * pages when placed in system-memory, in order to save and later restore the
> + * flat-CCS aux state when the object is moved between local-memory and
> + * system-memory
> + * @obj: Pointer to the object
> + *
> + * Return: True if the object needs extra ccs pages. False otherwise.
> + */
> +bool i915_gem_object_needs_ccs_pages(struct drm_i915_gem_object *obj)
> +{
> + bool lmem_placement = false;
> + int i;
> +
> + for (i = 0; i < obj->mm.n_placements; i++) {
> + /* Compression is not allowed for the objects with smem 
> placement */
> + if (obj->mm.placements[i]->type == INTEL_MEMORY_SYSTEM)
> + return false;
> + if (!lmem_placement &&
> + obj->mm.placements[i]->type == INTEL_MEMORY_LOCAL)
> + lmem_placement = true;
> + }
> +
> + return lmem_placement;
> +}
> +
>  void i915_gem_init__objects(struct drm_i915_private *i915)
>  {
> INIT_DELAYED_WORK(&i915->mm.free_work, __i915_gem_free_work);
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h 
> b/drivers/gpu/drm/i915/gem/i915_gem_object.h
> index 0bf3ee27a2a8..6f0a3ce35567 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
> @@ -618,6 +618,8 @@ int i915_gem_object_wait_migration(struct 
> drm_i915_gem_object *obj,
>  bool i915_gem_object_placement_possible(struct drm_i915_gem_object *obj,
>   enum intel_memory_type type);
>  
> +bool i915_gem_object_needs_ccs_pages(struct drm_i915_gem_object *obj);
> +
>  int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st,
>size_t size, struct intel_memory_region *mr,
>struct address_space *mapping,
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index 098409a33e10..7e1f8b83077f 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -266,24 +266,6 @@ static const struct i915_refct_sgt_ops tt_rsgt_ops = {
>   .release = i915_ttm_tt_release
>  };
>  
> -static inline bool
> -i915_gem_object_needs_ccs_pages(struct drm_i915_gem_object *obj)
> -{
> - bool lmem_placement = false;
> - int i;
> -
> - for (i = 0; i < obj->mm.n_placements; i++) {
> - /* Compression is not allowed for the objects with smem 
> placement */
> - if (obj->mm.placements[i]->type == INTEL_MEMORY_SYSTEM)
> - return false;
> - if (!lmem_placement &&
> - obj->mm.placements[i]->type == INTEL_MEMORY_LOCAL)
> - lmem_placement = true;
> - }
> -
> - return lmem_placement;
> -}
> -
>  static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
>uint32_t page_flags)
>  {
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
> index 364e7fe8efb1..d22e38aad6b9 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
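
The i915_gem_ttm_move.c hunk is cut off above. Going by the commit message,
the change amounts to refusing the CPU copy/clear fallback when CCS state
must be managed by the blitter; a sketch of that shape (the exact function
it lands in is an assumption):

	if (i915_gem_object_needs_ccs_pages(obj))
		/* no memcpy/memset fallback for flat-CCS objects */
		return ERR_PTR(-EINVAL);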

Re: [PATCH] drm/i915/gt: handle null ptr at sg traversing

2022-06-28 Thread Ramalingam C
On 2022-06-28 at 10:40:56 +0100, Matthew Auld wrote:
> On 27/06/2022 18:35, Ramalingam C wrote:
> > When calculating the starting address for the ccs data in the smem
> > scatterlist, handle the NULL pointer returned from sg_next, in case
> > the scatterlist is smaller than the required size.
> 
> Do we have some more information on how we can hit this? Is this a
> programmer error? Do we have a testcase?
Typically we will never get NULL at this point, as we allocate the smem
at a size equal to the lmem obj size plus the required ccs size. So we
will never run into NULL when we traverse the sg for the size of lmem in
smem's sg.

If NULL were returned in this scenario, we could report it with a BUG_ON,
let it hit the NULL pointer dereference, or return an error code.

Either way, I couldn't think of a scenario where this would hit. After
thinking further, leaving the NULL pointer dereference itself seems
sufficient, as the other error handling doesn't do a good job of it
either. Please share your thoughts.
Ram
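
The sizing invariant being relied on here, as a sketch (GET_CCS_BYTES as
used by the migrate code quoted elsewhere in this archive):

	/* smem backing for a flat-CCS lmem object is allocated as */
	smem_sz = lmem_obj_sz + GET_CCS_BYTES(i915, lmem_obj_sz);

	/* so walking lmem_obj_sz bytes into the smem sg list can only hit
	 * a NULL sg_next() if the allocation itself was undersized */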
> 
> > 
> > Signed-off-by: Ramalingam C 
> > ---
> >   drivers/gpu/drm/i915/gt/intel_migrate.c | 13 ++---
> >   1 file changed, 10 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
> > b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > index 2c35324b5f68..c206fb4f4186 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -669,7 +669,7 @@ calculate_chunk_sz(struct drm_i915_private *i915, bool 
> > src_is_lmem,
> > }
> >   }
> > -static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
> > +static int get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
> >   {
> > u32 len;
> > @@ -684,9 +684,13 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 
> > bytes_to_cpy)
> > bytes_to_cpy -= len;
> > it->sg = __sg_next(it->sg);
> > +   if (!it->sg)
> > +   return -EINVAL;
> > it->dma = sg_dma_address(it->sg);
> > it->max = it->dma + sg_dma_len(it->sg);
> > } while (bytes_to_cpy);
> > +
> > +   return 0;
> >   }
> >   int
> > @@ -745,8 +749,11 @@ intel_context_migrate_copy(struct intel_context *ce,
> >  * Need to fix it.
> >  */
> > ccs_bytes_to_cpy = src_sz != dst_sz ? GET_CCS_BYTES(i915, 
> > bytes_to_cpy) : 0;
> > -   if (ccs_bytes_to_cpy)
> > -   get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);
> > +   if (ccs_bytes_to_cpy) {
> > +   err = get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);
> > +   if (err)
> > +   return err;
> > +   }
> > }
> > src_offset = 0;


Re: [PATCH 3/3] drm/i915: Do not use reserved requests for virtual engines

2022-06-27 Thread Ramalingam C
On 2022-06-27 at 10:18:59 -0700, Matthew Brost wrote:
> On Wed, Jun 15, 2022 at 12:13:48AM +0530, Ramalingam C wrote:
> > Do not use reserved requests for virtual engines as this is only
> > needed for kernel contexts.
> > 
> > Signed-off-by: Ramalingam C 
> > Suggested-by: Matthew Brost 
> 
> With the patch squashed into the previous patch:
> Reviewed-by: Matthew Brost 
Thank you Matthew. I will squash them while merging.

Ram
> 
> > ---
> >  drivers/gpu/drm/i915/i915_request.c | 5 -
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_request.c 
> > b/drivers/gpu/drm/i915/i915_request.c
> > index c71905d8e154..f0392b053bca 100644
> > --- a/drivers/gpu/drm/i915/i915_request.c
> > +++ b/drivers/gpu/drm/i915/i915_request.c
> > @@ -135,6 +135,8 @@ static void i915_fence_release(struct dma_fence *fence)
> >  
> > /*
> >  * Keep one request on each engine for reserved use under mempressure
> > +* do not use with virtual engines as this really is only needed for
> > +* kernel contexts.
> >  *
> >  * We do not hold a reference to the engine here and so have to be
> >  * very careful in what rq->engine we poke. The virtual engine is
> > @@ -164,7 +166,8 @@ static void i915_fence_release(struct dma_fence *fence)
> >  * know that if the rq->execution_mask is a single bit, rq->engine
> >  * can be a physical engine with the exact corresponding mask.
> >  */
> > -   if (is_power_of_2(rq->execution_mask) &&
> > +   if (!intel_engine_is_virtual(rq->engine) &&
> > +   is_power_of_2(rq->execution_mask) &&
> > !cmpxchg(&rq->engine->request_pool, NULL, rq))
> > return;
> >  
> > -- 
> > 2.20.1
> > 


[PATCH] drm/i915/gt: handle null ptr at sg traversing

2022-06-27 Thread Ramalingam C
When calculating the starting address for the ccs data in the smem
scatterlist, handle the NULL pointer returned from sg_next, in case the
scatterlist is smaller than the required size.

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 2c35324b5f68..c206fb4f4186 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -669,7 +669,7 @@ calculate_chunk_sz(struct drm_i915_private *i915, bool 
src_is_lmem,
}
 }
 
-static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
+static int get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
 {
u32 len;
 
@@ -684,9 +684,13 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 
bytes_to_cpy)
bytes_to_cpy -= len;
 
it->sg = __sg_next(it->sg);
+   if (!it->sg)
+   return -EINVAL;
it->dma = sg_dma_address(it->sg);
it->max = it->dma + sg_dma_len(it->sg);
} while (bytes_to_cpy);
+
+   return 0;
 }
 
 int
@@ -745,8 +749,11 @@ intel_context_migrate_copy(struct intel_context *ce,
 * Need to fix it.
 */
ccs_bytes_to_cpy = src_sz != dst_sz ? GET_CCS_BYTES(i915, 
bytes_to_cpy) : 0;
-   if (ccs_bytes_to_cpy)
-   get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);
+   if (ccs_bytes_to_cpy) {
+   err = get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);
+   if (err)
+   return err;
+   }
}
 
src_offset = 0;
-- 
2.20.1



[PATCH 3/3] drm/i915: Do not use reserved requests for virtual engines

2022-06-14 Thread Ramalingam C
Do not use reserved requests for virtual engines as this is only
needed for kernel contexts.

Signed-off-by: Ramalingam C 
Suggested-by: Matthew Brost 
---
 drivers/gpu/drm/i915/i915_request.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c 
b/drivers/gpu/drm/i915/i915_request.c
index c71905d8e154..f0392b053bca 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -135,6 +135,8 @@ static void i915_fence_release(struct dma_fence *fence)
 
/*
 * Keep one request on each engine for reserved use under mempressure
+* do not use with virtual engines as this really is only needed for
+* kernel contexts.
 *
 * We do not hold a reference to the engine here and so have to be
 * very careful in what rq->engine we poke. The virtual engine is
@@ -164,7 +166,8 @@ static void i915_fence_release(struct dma_fence *fence)
 * know that if the rq->execution_mask is a single bit, rq->engine
 * can be a physical engine with the exact corresponding mask.
 */
-   if (is_power_of_2(rq->execution_mask) &&
+   if (!intel_engine_is_virtual(rq->engine) &&
+   is_power_of_2(rq->execution_mask) &&
!cmpxchg(&rq->engine->request_pool, NULL, rq))
return;
 
-- 
2.20.1
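
For context, a sketch of the only intended consumer of
engine->request_pool, the slow-path allocation used while parking under
memory pressure; the helper name and exact shape are assumptions, only
the cmpxchg() producer above is from the patch:

	/* reclaim the reserved request with an atomic exchange, mirroring
	 * the cmpxchg() that stashed it in i915_fence_release() */
	static struct i915_request *request_pool_get(struct intel_engine_cs *engine)
	{
		return xchg(&engine->request_pool, NULL);
	}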



[PATCH 2/3] Revert "drm/i915: Hold reference to intel_context over life of i915_request"

2022-06-14 Thread Ramalingam C
From: Niranjana Vishwanathapura 

This reverts commit 1e98d8c52ed5dfbaf273c4423c636525c2ce59e7.

The problem with this patch is that it makes i915_request hold a
reference to intel_context, which in turn holds a reference on the VM.
This strong back-referencing can lead to reference loops, which lead to
resource leaks.

An example is the upcoming VM_BIND work, which requires the VM to hold
a reference to some shared VM-specific BO. But this BO's dma-resv
fences hold a reference to the i915_request, thus leading to a
reference loop.

Signed-off-by: Niranjana Vishwanathapura 
Signed-off-by: Ramalingam C 
Suggested-by: Matthew Brost 
---
 drivers/gpu/drm/i915/i915_request.c | 55 +
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c 
b/drivers/gpu/drm/i915/i915_request.c
index 7f6998bf390c..c71905d8e154 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -134,17 +134,39 @@ static void i915_fence_release(struct dma_fence *fence)
i915_sw_fence_fini(&rq->semaphore);
 
/*
-* Keep one request on each engine for reserved use under mempressure,
-* do not use with virtual engines as this really is only needed for
-* kernel contexts.
+* Keep one request on each engine for reserved use under mempressure
+*
+* We do not hold a reference to the engine here and so have to be
+* very careful in what rq->engine we poke. The virtual engine is
+* referenced via the rq->context and we released that ref during
+* i915_request_retire(), ergo we must not dereference a virtual
+* engine here. Not that we would want to, as the only consumer of
+* the reserved engine->request_pool is the power management parking,
+* which must-not-fail, and that is only run on the physical engines.
+*
+* Since the request must have been executed to be have completed,
+* we know that it will have been processed by the HW and will
+* not be unsubmitted again, so rq->engine and rq->execution_mask
+* at this point is stable. rq->execution_mask will be a single
+* bit if the last and _only_ engine it could execution on was a
+* physical engine, if it's multiple bits then it started on and
+* could still be on a virtual engine. Thus if the mask is not a
+* power-of-two we assume that rq->engine may still be a virtual
+* engine and so a dangling invalid pointer that we cannot dereference
+*
+* For example, consider the flow of a bonded request through a virtual
+* engine. The request is created with a wide engine mask (all engines
+* that we might execute on). On processing the bond, the request mask
+* is reduced to one or more engines. If the request is subsequently
+* bound to a single engine, it will then be constrained to only
+* execute on that engine and never returned to the virtual engine
+* after timeslicing away, see __unwind_incomplete_requests(). Thus we
+* know that if the rq->execution_mask is a single bit, rq->engine
+* can be a physical engine with the exact corresponding mask.
 */
-   if (!intel_engine_is_virtual(rq->engine) &&
-   !cmpxchg(&rq->engine->request_pool, NULL, rq)) {
-   intel_context_put(rq->context);
+   if (is_power_of_2(rq->execution_mask) &&
+   !cmpxchg(&rq->engine->request_pool, NULL, rq))
return;
-   }
-
-   intel_context_put(rq->context);
 
kmem_cache_free(slab_requests, rq);
 }
@@ -921,19 +943,7 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp)
}
}
 
-   /*
-* Hold a reference to the intel_context over life of an i915_request.
-* Without this an i915_request can exist after the context has been
-* destroyed (e.g. request retired, context closed, but user space holds
-* a reference to the request from an out fence). In the case of GuC
-* submission + virtual engine, the engine that the request references
-* is also destroyed which can trigger bad pointer dref in fence ops
-* (e.g. i915_fence_get_driver_name). We could likely change these
-* functions to avoid touching the engine but let's just be safe and
-* hold the intel_context reference. In execlist mode the request always
-* eventually points to a physical engine so this isn't an issue.
-*/
-   rq->context = intel_context_get(ce);
+   rq->context = ce;
rq->engine = ce->engine;
rq->ring = ce->ring;
rq->execution_mask = ce->engine->mask;
@@ -1009,7 +1019,6 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp)
GEM_BUG_ON(!list_empty(>sched.w

[PATCH 1/3] drm/i915: Do not access rq->engine without a reference

2022-06-14 Thread Ramalingam C
From: Niranjana Vishwanathapura 

In i915_fence_get_driver_name(), the user may not hold a
reference to rq->engine. Hence do not access it. Instead,
store the required device private pointer in 'rq->i915' and use it.

Signed-off-by: Niranjana Vishwanathapura 
Suggested-by: Matthew Brost 
---
 drivers/gpu/drm/i915/i915_request.c | 3 ++-
 drivers/gpu/drm/i915/i915_request.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c 
b/drivers/gpu/drm/i915/i915_request.c
index 73d5195146b0..7f6998bf390c 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -60,7 +60,7 @@ static struct kmem_cache *slab_execute_cbs;
 
 static const char *i915_fence_get_driver_name(struct dma_fence *fence)
 {
-   return dev_name(to_request(fence)->engine->i915->drm.dev);
+   return dev_name(to_request(fence)->i915->drm.dev);
 }
 
 static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
@@ -937,6 +937,7 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp)
rq->engine = ce->engine;
rq->ring = ce->ring;
rq->execution_mask = ce->engine->mask;
+   rq->i915 = ce->engine->i915;
 
	ret = intel_timeline_get_seqno(tl, rq, &seqno);
if (ret)
diff --git a/drivers/gpu/drm/i915/i915_request.h 
b/drivers/gpu/drm/i915/i915_request.h
index 28b1f9db5487..47041ec68df8 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -196,6 +196,8 @@ struct i915_request {
struct dma_fence fence;
spinlock_t lock;
 
+   struct drm_i915_private *i915;
+
/**
 * Context and ring buffer related to this request
 * Contexts are refcounted, so when this request is associated with a
-- 
2.20.1



[PATCH 0/3] Break VM to rq reference loop

2022-06-14 Thread Ramalingam C
The i915_request holds a reference to intel_context, which in
turn holds a reference on the VM. But the dma-resv update for
the VM_BIND feature would require the VM to hold a reference to
the i915_request through the dma-resv fences of VM_PRIVATE objects
(which share a per-VM dma-resv object).

Thus, we have a circular reference pattern causing the VM
reference to never reach 0, hence the VM is never destroyed.

Break this by reverting the patch below, which makes the
i915_request hold a reference on intel_context:
"drm/i915: Hold reference to intel_context over life of i915_request"

This means we can't access rq->engine in i915_fence_get_driver_name(),
as the user does not hold a reference on rq->engine there. So, instead,
store the required device private pointer in 'rq->i915' and use it.
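
The cycle being broken, sketched as a comment (arrows are refcounted
references; this restates the cover letter, it is not code from the series):

	/*
	 *   i915_request --> intel_context --> VM (i915_address_space)
	 *        ^                                      |
	 *        |                                      v
	 *   dma-resv fence  <--  VM_PRIVATE object's per-VM dma-resv
	 *
	 * Patch 2 drops the request -> context reference to break the loop;
	 * patch 1 caches rq->i915 so the fence ops no longer chase rq->engine.
	 */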

Niranjana Vishwanathapura (2):
  drm/i915: Do not access rq->engine without a reference
  Revert "drm/i915: Hold reference to intel_context over life of
    i915_request"

Ramalingam C (1):
  drm/i915: Do not use reserved requests for virtual engines

 drivers/gpu/drm/i915/i915_request.c | 55 ++---
 drivers/gpu/drm/i915/i915_request.h |  2 ++
 2 files changed, 36 insertions(+), 21 deletions(-)

-- 
2.20.1



Re: [PATCH v3] uapi/drm/i915: Document memory residency and Flat-CCS capability of obj

2022-05-18 Thread Ramalingam C
On 2022-05-13 at 14:06:11 -0700, Jordan Justen wrote:
> On 2022-05-13 05:31:00, Lionel Landwerlin wrote:
> > On 02/05/2022 17:15, Ramalingam C wrote:
> > > Capture the impact of memory region preference list of the objects, on
> > > their memory residency and Flat-CCS capability.
> > >
> > > v2:
> > >Fix the Flat-CCS capability of an obj with {lmem, smem} preference
> > >list [Thomas]
> > > v3:
> > >Reworded the doc [Matt]
> > >
> > > Signed-off-by: Ramalingam C 
> > > cc: Matthew Auld 
> > > cc: Thomas Hellstrom 
> > > cc: Daniel Vetter 
> > > cc: Jon Bloomfield 
> > > cc: Lionel Landwerlin 
> > > cc: Kenneth Graunke 
> > > cc: mesa-...@lists.freedesktop.org
> > > cc: Jordan Justen 
> > > cc: Tony Ye 
> > > Reviewed-by: Matthew Auld 
> > > ---
> > >   include/uapi/drm/i915_drm.h | 16 
> > >   1 file changed, 16 insertions(+)
> > >
> > > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> > > index a2def7b27009..b7e1c2fe08dc 100644
> > > --- a/include/uapi/drm/i915_drm.h
> > > +++ b/include/uapi/drm/i915_drm.h
> > > @@ -3443,6 +3443,22 @@ struct drm_i915_gem_create_ext {
> > >* At which point we get the object handle in 
> > > _i915_gem_create_ext.handle,
> > >* along with the final object size in _i915_gem_create_ext.size, 
> > > which
> > >* should account for any rounding up, if required.
> > > + *
> > > + * Note that userspace has no means of knowing the current backing region
> > > + * for objects where @num_regions is larger than one. The kernel will only
> > > + * ensure that the priority order of the @regions array is honoured, either
> > > + * when initially placing the object, or when moving memory around due to
> > > + * memory pressure
> > > + *
> > > + * On Flat-CCS capable HW, compression is supported for the objects residing
> > > + * in I915_MEMORY_CLASS_DEVICE. When such (compressed) objects have another
> > > + * memory class in @regions and are migrated (by I915, due to memory
> > > + * constraints) to a non I915_MEMORY_CLASS_DEVICE region, then I915 needs to
> > > + * decompress the content. But I915 doesn't have the required information to
> > > + * decompress the userspace compressed objects.
> > > + *
> > > + * So I915 supports Flat-CCS only on the objects which can reside only on
> > > + * I915_MEMORY_CLASS_DEVICE regions.
> > 
> > I think it's fine to assume a Flat-CCS surface will always be in lmem.
> > 
> > I see no issue for the Anv Vulkan driver.
> > 
> > Maybe Nanley or Ken can speak for the Iris GL driver?
> > 
> 
> Acked-by: Jordan Justen 
Thank you Jordan for the Ack!

Ram
> 
> I think Nanley has accounted for this on iris with:
> 
> https://gitlab.freedesktop.org/mesa/mesa/-/commit/42a865730ef72574e179b56a314f30fdccc6cba8
> 
> -Jordan
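
For reference, a userspace sketch of creating an object that stays
Flat-CCS capable by listing only device-local memory (uapi names as in
i915_drm.h; the fd and the 2M size are placeholders):

	struct drm_i915_gem_memory_class_instance lmem = {
		.memory_class = I915_MEMORY_CLASS_DEVICE,
		.memory_instance = 0,
	};
	struct drm_i915_gem_create_ext_memory_regions regions = {
		.base.name = I915_GEM_CREATE_EXT_MEMORY_REGIONS,
		.num_regions = 1,
		.regions = (uintptr_t)&lmem,
	};
	struct drm_i915_gem_create_ext create = {
		.size = 2 * 1024 * 1024,		/* placeholder */
		.extensions = (uintptr_t)&regions,
	};

	ioctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &create);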


[PATCH v3 3/3] drm/i915/gt: Document the eviction of the Flat-CCS objects

2022-05-02 Thread Ramalingam C
Capture the eviction details for Flat-CCS capable lmem objects.

v2:
  Fix the Flat-CCS capability of an lmem obj with smem residency
  possibility [Thomas]
v3:
  Fixed the suggestions [Matt]

Signed-off-by: Ramalingam C 
cc: Thomas Hellstrom 
cc: Matthew Auld 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index fc6975e55fae..509955885b93 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -485,16 +485,21 @@ static bool wa_1209644611_applies(int ver, u32 size)
  * And CCS data can be copied in and out of CCS region through
  * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
  *
- * When we exhaust the lmem, if the object's placements support smem, then we 
can
- * directly decompress the compressed lmem object into smem and start using it
- * from smem itself.
+ * I915 supports Flat-CCS on lmem-only objects. When an object has smem in
+ * its preference list, on memory pressure, i915 needs to migrate the lmem
+ * content into smem. If the lmem object is Flat-CCS compressed by userspace,
+ * then i915 needs to decompress it. But I915 lacks the required information
+ * for such decompression. Hence I915 supports Flat-CCS only on lmem-only
+ * objects.
  *
- * But when we need to swapout the compressed lmem object into a smem region
- * though objects' placement doesn't support smem, then we copy the lmem 
content
- * as it is into smem region along with ccs data (using XY_CTRL_SURF_COPY_BLT).
- * When the object is referred, lmem content will be swaped in along with
- * restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at corresponding
- * location.
+ * When we exhaust the lmem, Flat-CCS capable objects' lmem backing memory can
+ * be temporarily evicted to smem, along with the auxiliary CCS state, where
+ * it can be potentially swapped-out at a later point, if required.
+ * If userspace later touches the evicted pages, then we always move
+ * the backing memory back to lmem, which includes restoring the saved CCS 
state,
+ * and potentially performing any required swap-in.
+ *
+ * For the migration of lmem objects with smem in the placement list, such
+ * as {lmem, smem}, the objects are treated as non Flat-CCS capable objects.
  */
 
 static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
-- 
2.20.1
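
The documented flow, condensed as a sketch (XY_CTRL_SURF_COPY_BLT is named
in the doc above; using the blitter for the main-surface copy is an
assumption based on the migrate code):

	/* evict (lmem -> smem):
	 *   main surface : blitter copy, lmem pages -> smem pages
	 *   CCS state    : XY_CTRL_SURF_COPY_BLT    -> extra smem pages
	 *
	 * restore (smem -> lmem), on the next access:
	 *   main surface : blitter copy back to lmem
	 *   CCS state    : XY_CTRL_SURF_COPY_BLT back to the CCS region
	 */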



[PATCH v3 2/3] drm/i915/gt: optimize the ccs_sz calculation per chunk

2022-05-02 Thread Ramalingam C
Calculate the ccs_sz that needs to be emitted based on the src
and dst pages emitted per chunk. And handle the return value of emit_pte
for the ccs pages.

v2:
  ccs_sz moved to the reduced scope [Matt]

Signed-off-by: Ramalingam C 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 36 +
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 168d17b6f48a..fc6975e55fae 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -647,17 +647,9 @@ static int scatter_list_length(struct scatterlist *sg)
 
 static void
 calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
-  int *src_sz, int *ccs_sz, u32 bytes_to_cpy,
-  u32 ccs_bytes_to_cpy)
+  int *src_sz, u32 bytes_to_cpy, u32 ccs_bytes_to_cpy)
 {
if (ccs_bytes_to_cpy) {
-   /*
-* We can only copy the ccs data corresponding to
-* the CHUNK_SZ of lmem which is
-* GET_CCS_BYTES(i915, CHUNK_SZ))
-*/
-   *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, 
CHUNK_SZ));
-
if (!src_is_lmem)
/*
 * When CHUNK_SZ is passed all the pages up to CHUNK_SZ
@@ -717,10 +709,10 @@ intel_context_migrate_copy(struct intel_context *ce,
struct drm_i915_private *i915 = ce->engine->i915;
u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
enum i915_cache_level ccs_cache_level;
-   int src_sz, dst_sz, ccs_sz;
u32 src_offset, dst_offset;
u8 src_access, dst_access;
struct i915_request *rq;
+   int src_sz, dst_sz;
bool ccs_is_src;
int err;
 
@@ -803,7 +795,7 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
 
-   calculate_chunk_sz(i915, src_is_lmem, &src_sz, &ccs_sz,
+   calculate_chunk_sz(i915, src_is_lmem, &src_sz,
   bytes_to_cpy, ccs_bytes_to_cpy);
 
len = emit_pte(rq, _src, src_cache_level, src_is_lmem,
@@ -837,37 +829,35 @@ intel_context_migrate_copy(struct intel_context *ce,
bytes_to_cpy -= len;
 
if (ccs_bytes_to_cpy) {
+   int ccs_sz;
+
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
goto out_rq;
 
+   ccs_sz = GET_CCS_BYTES(i915, len);
	err = emit_pte(rq, &it_ccs, ccs_cache_level, false,
   ccs_is_src ? src_offset : dst_offset,
   ccs_sz);
+   if (err < 0)
+   goto out_rq;
+   if (err < ccs_sz) {
+   err = -EINVAL;
+   goto out_rq;
+   }
 
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
goto out_rq;
 
-   /*
-* Using max of src_sz and dst_sz, as we need to
-* pass the lmem size corresponding to the ccs
-* blocks we need to handle.
-*/
-   ccs_sz = max_t(int, ccs_is_src ? ccs_sz : src_sz,
-  ccs_is_src ? dst_sz : ccs_sz);
-
err = emit_copy_ccs(rq, dst_offset, dst_access,
-   src_offset, src_access, ccs_sz);
+   src_offset, src_access, len);
if (err)
goto out_rq;
 
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
goto out_rq;
-
-   /* Converting back to ccs bytes */
-   ccs_sz = GET_CCS_BYTES(rq->engine->i915, ccs_sz);
ccs_bytes_to_cpy -= ccs_sz;
}
 
-- 
2.20.1



[PATCH v3 1/3] drm/i915/gt: BUG_ON unexpected NULL at scatterlist walking

2022-05-02 Thread Ramalingam C
While locating the start of the ccs scatterlist in the smem scatterlist,
which has to be of size lmem obj size + corresponding ccs data size,
report a bug if the scatterlist terminates before that length.

v2:
  s/GEM_BUG_ON/BUG_ON with more commenting [Matt]
v3:
  Converted GEM_BUG_ON into BUG_ON with more documentation [Matt]

Signed-off-by: Ramalingam C 
Reviewed-by: Matthew Auld  (v1)
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 9d552f30b627..168d17b6f48a 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -687,6 +687,16 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 
bytes_to_cpy)
bytes_to_cpy -= len;
 
it->sg = __sg_next(it->sg);
+
+   /*
+* On Flat-CCS capable platform when we back the lmem pages with
+* smem pages we add extra pages at the end of the smem
+* scatterlist, to store the ccs data corresponding to the lmem
+* pages. get_ccs_sg_sgt() is called to get the pointer for the
+* start of the extra pages added at the end of smem 
scatterlist.
+* So scatterlist can't end at or before bytes_to_cpy.
+*/
+   BUG_ON(!it->sg);
it->dma = sg_dma_address(it->sg);
it->max = it->dma + sg_dma_len(it->sg);
} while (bytes_to_cpy);
@@ -748,8 +758,10 @@ intel_context_migrate_copy(struct intel_context *ce,
 * Need to fix it.
 */
ccs_bytes_to_cpy = src_sz != dst_sz ? GET_CCS_BYTES(i915, 
bytes_to_cpy) : 0;
-   if (ccs_bytes_to_cpy)
+   if (ccs_bytes_to_cpy) {
+   WARN_ON(abs(src_sz - dst_sz) < ccs_bytes_to_cpy);
	get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);
+   }
}
 
src_offset = 0;
-- 
2.20.1



[PATCH v3 0/3] Flat-CCS eviction enhancements

2022-05-02 Thread Ramalingam C
Flat-CCS eviction enhancements

v3:
  Incorporated the review suggestions [Matt]

Ramalingam C (3):
  drm/i915/gt: BUG_ON unexpected NULL at scatterlist walking
  drm/i915/gt: optimize the ccs_sz calculation per chunk
  drm/i915/gt: Document the eviction of the Flat-CCS objects

 drivers/gpu/drm/i915/gt/intel_migrate.c | 73 ++---
 1 file changed, 40 insertions(+), 33 deletions(-)

-- 
2.20.1



[PATCH v3] uapi/drm/i915: Document memory residency and Flat-CCS capability of obj

2022-05-02 Thread Ramalingam C
Capture the impact of memory region preference list of the objects, on
their memory residency and Flat-CCS capability.

v2:
  Fix the Flat-CCS capability of an obj with {lmem, smem} preference
  list [Thomas]
v3:
  Reworded the doc [Matt]

Signed-off-by: Ramalingam C 
cc: Matthew Auld 
cc: Thomas Hellstrom 
cc: Daniel Vetter 
cc: Jon Bloomfield 
cc: Lionel Landwerlin 
cc: Kenneth Graunke 
cc: mesa-...@lists.freedesktop.org
cc: Jordan Justen 
cc: Tony Ye 
Reviewed-by: Matthew Auld 
---
 include/uapi/drm/i915_drm.h | 16 
 1 file changed, 16 insertions(+)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index a2def7b27009..b7e1c2fe08dc 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -3443,6 +3443,22 @@ struct drm_i915_gem_create_ext {
  * At which point we get the object handle in _i915_gem_create_ext.handle,
  * along with the final object size in _i915_gem_create_ext.size, which
  * should account for any rounding up, if required.
+ *
+ * Note that userspace has no means of knowing the current backing region
+ * for objects where @num_regions is larger than one. The kernel will only
+ * ensure that the priority order of the @regions array is honoured, either
+ * when initially placing the object, or when moving memory around due to
+ * memory pressure
+ *
+ * On Flat-CCS capable HW, compression is supported for the objects residing
+ * in I915_MEMORY_CLASS_DEVICE. When such (compressed) objects have another
+ * memory class in @regions and are migrated (by I915, due to memory
+ * constraints) to a non I915_MEMORY_CLASS_DEVICE region, then I915 needs to
+ * decompress the content. But I915 doesn't have the required information to
+ * decompress the userspace compressed objects.
+ *
+ * So I915 supports Flat-CCS only on the objects which can reside only on
+ * I915_MEMORY_CLASS_DEVICE regions.
  */
 struct drm_i915_gem_create_ext_memory_regions {
/** @base: Extension link. See struct i915_user_extension. */
-- 
2.20.1



Re: [PATCH v4 1/4] drm/i915/gt: Explicitly clear BB_OFFSET for new contexts

2022-05-02 Thread Ramalingam C
On 2022-05-02 at 16:40:00 +0530, Ramalingam C wrote:
> From: Chris Wilson 
> 
> Even though the initial protocontext we load onto HW has the register
> cleared, by the time we save it into the default image, BB_OFFSET has
> had the enable bit set. Reclear BB_OFFSET for each new context.
> 
> Testcase: igt/i915_selftests/gt_lrc
> 
> v2:
>   Extend it for gen8.
> v3:
>   BB_OFFSET is recorded per engine from Gen9 onwards
> 
> Signed-off-by: Chris Wilson 
> Cc: Mika Kuoppala 
> Signed-off-by: Ramalingam C 
> Reviewed-by: Thomas Hellstrom 
Thomas,

Could you please reconfirm your R-b for v3? This R-b was given for v1.

Ram
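
For readers of the diff below: regs[] is the logical ring context image,
and lrc_ring_bb_offset() returns the dword index of the RING_BB_OFFSET
slot within an MI_LOAD_REGISTER_IMM sequence, so the value sits one dword
later (an editorial sketch of the layout, matching the selftest check):

	/* regs[loc]     == i915_mmio_reg_offset(RING_BB_OFFSET(base))
	 * regs[loc + 1] == the register's value -> cleared to 0 by this patch
	 */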
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_regs.h |  1 +
>  drivers/gpu/drm/i915/gt/intel_lrc.c | 20 
>  drivers/gpu/drm/i915/gt/selftest_lrc.c  |  5 +
>  3 files changed, 26 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_regs.h 
> b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
> index 75a0c55c5aa5..8c65f3a7acfb 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_regs.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
> @@ -109,6 +109,7 @@
>  #define RING_SBBSTATE(base)  _MMIO((base) + 0x118) /* hsw+ */
>  #define RING_SBBADDR_UDW(base)   _MMIO((base) + 0x11c) 
> /* gen8+ */
>  #define RING_BBADDR(base)_MMIO((base) + 0x140)
> +#define RING_BB_OFFSET(base) _MMIO((base) + 0x158)
>  #define RING_BBADDR_UDW(base)_MMIO((base) + 0x168) 
> /* gen8+ */
>  #define CCID(base)   _MMIO((base) + 0x180)
>  #define   CCID_ENBIT(0)
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c 
> b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index eec73c66406c..ee8ab7470a62 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -662,6 +662,21 @@ static int lrc_ring_mi_mode(const struct intel_engine_cs 
> *engine)
>   return -1;
>  }
>  
> +static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
> +{
> + if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
> + return 0x80;
> + else if (GRAPHICS_VER(engine->i915) >= 12)
> + return 0x70;
> + else if (GRAPHICS_VER(engine->i915) >= 9)
> + return 0x64;
> + else if (GRAPHICS_VER(engine->i915) >= 8 &&
> +  engine->class == RENDER_CLASS)
> + return 0xc4;
> + else
> + return -1;
> +}
> +
>  static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
>  {
>   if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
> @@ -768,6 +783,7 @@ static void init_common_regs(u32 * const regs,
>bool inhibit)
>  {
>   u32 ctl;
> + int loc;
>  
>   ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
>   ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
> @@ -779,6 +795,10 @@ static void init_common_regs(u32 * const regs,
>   regs[CTX_CONTEXT_CONTROL] = ctl;
>  
>   regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
> +
> + loc = lrc_ring_bb_offset(engine);
> + if (loc != -1)
> + regs[loc + 1] = 0;
>  }
>  
>  static void init_wa_bb_regs(u32 * const regs,
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
> b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 8b2c11dbe354..c4bd4e1ac5ef 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -357,6 +357,11 @@ static int live_lrc_fixed(void *arg)
>   lrc_ring_cmd_buf_cctl(engine),
>   "RING_CMD_BUF_CCTL"
>   },
> + {
> + i915_mmio_reg_offset(RING_BB_OFFSET(engine->mmio_base)),
> + lrc_ring_bb_offset(engine),
> + "RING_BB_OFFSET"
> + },
>   { },
>   }, *t;
>   u32 *hw;
> -- 
> 2.20.1
> 


[PATCH v4 3/4] drm/i915/selftest: Always cancel semaphore on error

2022-05-02 Thread Ramalingam C
From: Chris Wilson 

Ensure that we always signal the semaphore when timing out, so that if it
happens to be stuck waiting for the semaphore we will quickly recover
without having to wait for a reset.

Reported-by: CQ Tang 
Signed-off-by: Chris Wilson 
Cc: CQ Tang 
cc: Joonas Lahtinen 
Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 3271f01fe7db..e4d5d74489bf 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -1460,18 +1460,17 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
}
 
err = poison_registers(B, poison, sema);
-   if (err) {
-   WRITE_ONCE(*sema, -1);
-   i915_request_put(rq);
-   goto err_result1;
-   }
-
-   if (i915_request_wait(rq, 0, HZ / 2) < 0) {
-   i915_request_put(rq);
+   if (err == 0 && i915_request_wait(rq, 0, HZ / 2) < 0) {
+   pr_err("%s(%s): wait for results timed out\n",
+  __func__, engine->name);
err = -ETIME;
-   goto err_result1;
}
+
+   /* Always cancel the semaphore wait, just in case the GPU gets stuck */
+   WRITE_ONCE(*sema, -1);
i915_request_put(rq);
+   if (err)
+   goto err_result1;
 
err = compare_isolation(engine, ref, result, A, poison);
 
-- 
2.20.1



[PATCH v4 2/4] drm/i915/selftests: Check for incomplete LRI from the context image

2022-05-02 Thread Ramalingam C
From: Chris Wilson 

In order to keep the context image parser simple, we assume that all
commands follow a similar format. A few, especially non-MI commands on
the render engines, have fixed lengths not encoded in a length field.
This caused us to incorrectly skip over 3D state commands, and start
interpreting context data as instructions. Eventually, as Daniele
discovered, this would lead us to find additional LRI as part of the data
and mistakenly add invalid LRI commands to the context probes.

Stop parsing after we see the first !MI command, as we know we will have
seen all the context registers by that point. (Mostly true for all gen
so far, though the render context does have LRI after the first page
that we have been ignoring so far. It would be useful to extract those
as well so that we have the full list of user accessible registers.)

Similarly, emit a warning if we do try to emit an invalid zero-length
LRI.
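
The LRI encoding the parser relies on, as a sketch derived from the masks
and the (len + 1) / 2 pairing used in the diff below:

	/* MI_LOAD_REGISTER_IMM dword 0:
	 *   [31:29] client == INSTR_MI_CLIENT  -> non-MI means stop parsing
	 *   [28:23] opcode == 0x22             -> LRI_HEADER
	 *   [7:0]   length == 2 * nregs - 1    -> (len + 1) / 2 register pairs
	 * followed by nregs (offset, value) dword pairs; a zero length field
	 * is the invalid LRI the new warning catches.
	 */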

Reported-by: Daniele Ceraolo Spurio 
Signed-off-by: Chris Wilson 
Cc: Daniele Ceraolo Spurio 
Signed-off-by: Ramalingam C 
Acked-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 61 +++---
 1 file changed, 54 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index c4bd4e1ac5ef..3271f01fe7db 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -27,6 +27,9 @@
 #define NUM_GPR 16
 #define NUM_GPR_DW (NUM_GPR * 2) /* each GPR is 2 dwords */
 
+#define LRI_HEADER MI_INSTR(0x22, 0)
+#define LRI_LENGTH_MASK GENMASK(7, 0)
+
 static struct i915_vma *create_scratch(struct intel_gt *gt)
 {
	return __vm_create_scratch_for_read_pinned(&gt->ggtt->vm, PAGE_SIZE);
@@ -202,7 +205,7 @@ static int live_lrc_layout(void *arg)
continue;
}
 
-   if ((lri & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
+   if ((lri & GENMASK(31, 23)) != LRI_HEADER) {
pr_err("%s: Expected LRI command at dword %d, 
found %08x\n",
   engine->name, dw, lri);
err = -EINVAL;
@@ -992,18 +995,40 @@ store_context(struct intel_context *ce, struct i915_vma 
*scratch)
hw = defaults;
hw += LRC_STATE_OFFSET / sizeof(*hw);
do {
-   u32 len = hw[dw] & 0x7f;
+   u32 len = hw[dw] & LRI_LENGTH_MASK;
+
+   /*
+* Keep it simple, skip parsing complex commands
+*
+* At present, there are no more MI_LOAD_REGISTER_IMM
+* commands after the first 3D state command. Rather
+* than include a table (see i915_cmd_parser.c) of all
+* the possible commands and their instruction lengths
+* (or mask for variable length instructions), assume
+* we have gathered the complete list of registers and
+* bail out.
+*/
+   if ((hw[dw] >> INSTR_CLIENT_SHIFT) != INSTR_MI_CLIENT)
+   break;
 
if (hw[dw] == 0) {
dw++;
continue;
}
 
-   if ((hw[dw] & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
+   if ((hw[dw] & GENMASK(31, 23)) != LRI_HEADER) {
+   /* Assume all other MI commands match LRI length mask */
dw += len + 2;
continue;
}
 
+   if (!len) {
+   pr_err("%s: invalid LRI found in context image\n",
+  ce->engine->name);
+   igt_hexdump(defaults, PAGE_SIZE);
+   break;
+   }
+
dw++;
len = (len + 1) / 2;
while (len--) {
@@ -1155,18 +1180,29 @@ static struct i915_vma *load_context(struct 
intel_context *ce, u32 poison)
hw = defaults;
hw += LRC_STATE_OFFSET / sizeof(*hw);
do {
-   u32 len = hw[dw] & 0x7f;
+   u32 len = hw[dw] & LRI_LENGTH_MASK;
+
+   /* For simplicity, break parsing at the first complex command */
+   if ((hw[dw] >> INSTR_CLIENT_SHIFT) != INSTR_MI_CLIENT)
+   break;
 
if (hw[dw] == 0) {
dw++;
continue;
}
 
-   if ((hw[dw] & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
+   if ((hw[dw] & GENMASK(31, 23)) != LRI_HEADER) {
dw += len + 2;
continue;
}
 
+   if (!len) {
+   pr_err("%s: invalid LRI found in context image\n",
+   

[PATCH v4 4/4] drm/i915/selftest: Clear the output buffers before GPU writes

2022-05-02 Thread Ramalingam C
From: Chris Wilson 

When testing whether we can get the GPU to leak information about
non-privileged state, we first need to ensure that the output buffer is
set to a known value, as the HW may opt to skip the write into memory for
a non-privileged read of a sensitive register. We chose POISON_INUSE (0x5a)
so that it is both non-zero and distinct from the poison values used during
the test.

v2:
  Use i915_gem_object_pin_map_unlocked

Reported-by: CQ Tang 
Signed-off-by: Chris Wilson 
Cc: CQ Tang 
cc: Joonas Lahtinen 
Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 32 ++
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index e4d5d74489bf..d04d08d9d92e 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -1395,6 +1395,30 @@ static int compare_isolation(struct intel_engine_cs 
*engine,
return err;
 }
 
+static struct i915_vma *
+create_result_vma(struct i915_address_space *vm, unsigned long sz)
+{
+   struct i915_vma *vma;
+   void *ptr;
+
+   vma = create_user_vma(vm, sz);
+   if (IS_ERR(vma))
+   return vma;
+
+   /* Set the results to a known value distinct from the poison */
+   ptr = i915_gem_object_pin_map_unlocked(vma->obj, I915_MAP_WC);
+   if (IS_ERR(ptr)) {
+   i915_vma_put(vma);
+   return ERR_CAST(ptr);
+   }
+
+   memset(ptr, POISON_INUSE, vma->size);
+   i915_gem_object_flush_map(vma->obj);
+   i915_gem_object_unpin_map(vma->obj);
+
+   return vma;
+}
+
 static int __lrc_isolation(struct intel_engine_cs *engine, u32 poison)
 {
u32 *sema = memset32(engine->status_page.addr + 1000, 0, 1);
@@ -1413,13 +1437,13 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
goto err_A;
}
 
-   ref[0] = create_user_vma(A->vm, SZ_64K);
+   ref[0] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(ref[0])) {
err = PTR_ERR(ref[0]);
goto err_B;
}
 
-   ref[1] = create_user_vma(A->vm, SZ_64K);
+   ref[1] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(ref[1])) {
err = PTR_ERR(ref[1]);
goto err_ref0;
@@ -1441,13 +1465,13 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
}
i915_request_put(rq);
 
-   result[0] = create_user_vma(A->vm, SZ_64K);
+   result[0] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(result[0])) {
err = PTR_ERR(result[0]);
goto err_ref1;
}
 
-   result[1] = create_user_vma(A->vm, SZ_64K);
+   result[1] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(result[1])) {
err = PTR_ERR(result[1]);
goto err_result0;
-- 
2.20.1



[PATCH v4 1/4] drm/i915/gt: Explicitly clear BB_OFFSET for new contexts

2022-05-02 Thread Ramalingam C
From: Chris Wilson 

Even though the initial protocontext we load onto HW has the register
cleared, by the time we save it into the default image, BB_OFFSET has
had the enable bit set. Reclear BB_OFFSET for each new context.

Testcase: igt/i915_selftests/gt_lrc

v2:
  Extend it for gen8.
v3:
  BB_OFFSET is recorded per engine from Gen9 onwards
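
For reference, a minimal sketch of the context-image layout this relies on
(the offset/value pairing described in the comment is an assumption drawn
from the hunk below, not new behaviour):

	/*
	 * Hedged sketch: the LRI payload in the context image stores
	 * (offset, value) pairs, so lrc_ring_bb_offset() returns the
	 * dword holding the register offset and the value lives one
	 * dword later. Clearing the value also clears the enable bit.
	 */
	loc = lrc_ring_bb_offset(engine);	/* e.g. 0x70 on Gen12 */
	if (loc != -1)
		regs[loc + 1] = 0;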

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_engine_regs.h |  1 +
 drivers/gpu/drm/i915/gt/intel_lrc.c | 20 
 drivers/gpu/drm/i915/gt/selftest_lrc.c  |  5 +
 3 files changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_regs.h 
b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
index 75a0c55c5aa5..8c65f3a7acfb 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
@@ -109,6 +109,7 @@
 #define RING_SBBSTATE(base)_MMIO((base) + 0x118) /* hsw+ */
 #define RING_SBBADDR_UDW(base) _MMIO((base) + 0x11c) /* gen8+ 
*/
 #define RING_BBADDR(base)  _MMIO((base) + 0x140)
+#define RING_BB_OFFSET(base)   _MMIO((base) + 0x158)
 #define RING_BBADDR_UDW(base)  _MMIO((base) + 0x168) /* gen8+ 
*/
 #define CCID(base) _MMIO((base) + 0x180)
 #define   CCID_EN  BIT(0)
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c 
b/drivers/gpu/drm/i915/gt/intel_lrc.c
index eec73c66406c..ee8ab7470a62 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -662,6 +662,21 @@ static int lrc_ring_mi_mode(const struct intel_engine_cs 
*engine)
return -1;
 }
 
+static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
+{
+   if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
+   return 0x80;
+   else if (GRAPHICS_VER(engine->i915) >= 12)
+   return 0x70;
+   else if (GRAPHICS_VER(engine->i915) >= 9)
+   return 0x64;
+   else if (GRAPHICS_VER(engine->i915) >= 8 &&
+engine->class == RENDER_CLASS)
+   return 0xc4;
+   else
+   return -1;
+}
+
 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 {
if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
@@ -768,6 +783,7 @@ static void init_common_regs(u32 * const regs,
 bool inhibit)
 {
u32 ctl;
+   int loc;
 
ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
@@ -779,6 +795,10 @@ static void init_common_regs(u32 * const regs,
regs[CTX_CONTEXT_CONTROL] = ctl;
 
regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
+
+   loc = lrc_ring_bb_offset(engine);
+   if (loc != -1)
+   regs[loc + 1] = 0;
 }
 
 static void init_wa_bb_regs(u32 * const regs,
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 8b2c11dbe354..c4bd4e1ac5ef 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -357,6 +357,11 @@ static int live_lrc_fixed(void *arg)
lrc_ring_cmd_buf_cctl(engine),
"RING_CMD_BUF_CCTL"
},
+   {
+   i915_mmio_reg_offset(RING_BB_OFFSET(engine->mmio_base)),
+   lrc_ring_bb_offset(engine),
+   "RING_BB_OFFSET"
+   },
{ },
}, *t;
u32 *hw;
-- 
2.20.1



[PATCH v4 0/4] lrc selftest fixes

2022-05-02 Thread Ramalingam C
Few bug fixes for lrc selftest.

v4:
  Gen8 doesn't have per-engine recording of BB_OFFSET [Chris]

Chris Wilson (4):
  drm/i915/gt: Explicitly clear BB_OFFSET for new contexts
  drm/i915/selftests: Check for incomplete LRI from the context image
  drm/i915/selftest: Always cancel semaphore on error
  drm/i915/selftest: Clear the output buffers before GPU writes

 drivers/gpu/drm/i915/gt/intel_engine_regs.h |   1 +
 drivers/gpu/drm/i915/gt/intel_lrc.c |  20 
 drivers/gpu/drm/i915/gt/selftest_lrc.c  | 115 
 3 files changed, 116 insertions(+), 20 deletions(-)

-- 
2.20.1



[PATCH v3 4/4] drm/i915/selftest: Clear the output buffers before GPU writes

2022-04-29 Thread Ramalingam C
From: Chris Wilson 

When testing whether we can get the GPU to leak information about
non-privileged state, we first need to ensure that the output buffer is
set to a known value as the HW may opt to skip the write into memory for
a non-privileged read of a sensitive register. We chose POISON_INUSE (0x5a)
so that it is both non-zero and distinct from the poison values used during
the test.

v2:
  Use i915_gem_object_pin_map_unlocked

Reported-by: CQ Tang 
Signed-off-by: Chris Wilson 
Cc: CQ Tang 
cc: Joonas Lahtinen 
Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 32 ++
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 51e4b7092d4f..9c8e8321c633 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -1346,6 +1346,30 @@ static int compare_isolation(struct intel_engine_cs 
*engine,
return err;
 }
 
+static struct i915_vma *
+create_result_vma(struct i915_address_space *vm, unsigned long sz)
+{
+   struct i915_vma *vma;
+   void *ptr;
+
+   vma = create_user_vma(vm, sz);
+   if (IS_ERR(vma))
+   return vma;
+
+   /* Set the results to a known value distinct from the poison */
+   ptr = i915_gem_object_pin_map_unlocked(vma->obj, I915_MAP_WC);
+   if (IS_ERR(ptr)) {
+   i915_vma_put(vma);
+   return ERR_CAST(ptr);
+   }
+
+   memset(ptr, POISON_INUSE, vma->size);
+   i915_gem_object_flush_map(vma->obj);
+   i915_gem_object_unpin_map(vma->obj);
+
+   return vma;
+}
+
 static int __lrc_isolation(struct intel_engine_cs *engine, u32 poison)
 {
u32 *sema = memset32(engine->status_page.addr + 1000, 0, 1);
@@ -1364,13 +1388,13 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
goto err_A;
}
 
-   ref[0] = create_user_vma(A->vm, SZ_64K);
+   ref[0] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(ref[0])) {
err = PTR_ERR(ref[0]);
goto err_B;
}
 
-   ref[1] = create_user_vma(A->vm, SZ_64K);
+   ref[1] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(ref[1])) {
err = PTR_ERR(ref[1]);
goto err_ref0;
@@ -1392,13 +1416,13 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
}
i915_request_put(rq);
 
-   result[0] = create_user_vma(A->vm, SZ_64K);
+   result[0] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(result[0])) {
err = PTR_ERR(result[0]);
goto err_ref1;
}
 
-   result[1] = create_user_vma(A->vm, SZ_64K);
+   result[1] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(result[1])) {
err = PTR_ERR(result[1]);
goto err_result0;
-- 
2.20.1



[PATCH v3 3/4] drm/i915/selftest: Always cancel semaphore on error

2022-04-29 Thread Ramalingam C
From: Chris Wilson 

Ensure that we always signal the semaphore when timing out, so that if it
happens to be stuck waiting for the semaphore we will quickly recover
without having to wait for a reset.

Reported-by: CQ Tang 
Signed-off-by: Chris Wilson 
Cc: CQ Tang 
cc: Joonas Lahtinen 
Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 684a63de156a..51e4b7092d4f 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -1411,18 +1411,17 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
}
 
err = poison_registers(B, poison, sema);
-   if (err) {
-   WRITE_ONCE(*sema, -1);
-   i915_request_put(rq);
-   goto err_result1;
-   }
-
-   if (i915_request_wait(rq, 0, HZ / 2) < 0) {
-   i915_request_put(rq);
+   if (err == 0 && i915_request_wait(rq, 0, HZ / 2) < 0) {
+   pr_err("%s(%s): wait for results timed out\n",
+  __func__, engine->name);
err = -ETIME;
-   goto err_result1;
}
+
+   /* Always cancel the semaphore wait, just in case the GPU gets stuck */
+   WRITE_ONCE(*sema, -1);
i915_request_put(rq);
+   if (err)
+   goto err_result1;
 
err = compare_isolation(engine, ref, result, A, poison);
 
-- 
2.20.1



[PATCH v3 2/4] drm/i915/selftests: Check for incomplete LRI from the context image

2022-04-29 Thread Ramalingam C
From: Chris Wilson 

In order to keep the context image parser simple, we assume that all
commands follow a similar format. A few, especially non-MI commands on
the render engines, have fixed lengths not encoded in a length field.
This caused us to incorrectly skip over 3D state commands, and start
interpreting context data as instructions. Eventually, as Daniele
discovered, this would lead us to find additional LRI as part of the data
and mistakenly add invalid LRI commands to the context probes.

Stop parsing after we see the first !MI command, as we know we will have
seen all the context registers by that point. (Mostly true for all gen
so far, though the render context does have LRI after the first page
that we have been ignoring so far. It would be useful to extract those
as well so that we have the full list of user accessible registers.)

Similarly, emit a warning if we do try to emit an invalid zero-length
LRI.
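
For context, a minimal sketch of the command-header split the parser
assumes (the local names below are illustrative only):

	u32 header = hw[dw];

	/* bits 31:29 select the client; 0 is the MI client */
	bool is_mi = (header >> INSTR_CLIENT_SHIFT) == INSTR_MI_CLIENT;

	/* for MI commands, bits 28:23 hold the opcode; LRI is 0x22 */
	bool is_lri = (header & GENMASK(31, 23)) == MI_INSTR(0x22, 0);

	/* variable-length MI commands encode their payload in the low bits */
	u32 num_dw = header & GENMASK(7, 0);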

Reported-by: Daniele Ceraolo Spurio 
Signed-off-by: Chris Wilson 
Cc: Daniele Ceraolo Spurio 
Signed-off-by: Ramalingam C 
Acked-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 61 +++---
 1 file changed, 54 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 33f22f17e358..684a63de156a 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -27,6 +27,9 @@
 #define NUM_GPR 16
 #define NUM_GPR_DW (NUM_GPR * 2) /* each GPR is 2 dwords */
 
+#define LRI_HEADER MI_INSTR(0x22, 0)
+#define LRI_LENGTH_MASK GENMASK(7, 0)
+
 static struct i915_vma *create_scratch(struct intel_gt *gt)
 {
return __vm_create_scratch_for_read_pinned(&gt->ggtt->vm, PAGE_SIZE);
@@ -180,7 +183,7 @@ static int live_lrc_layout(void *arg)
continue;
}
 
-   if ((lri & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
+   if ((lri & GENMASK(31, 23)) != LRI_HEADER) {
pr_err("%s: Expected LRI command at dword %d, 
found %08x\n",
   engine->name, dw, lri);
err = -EINVAL;
@@ -945,18 +948,40 @@ store_context(struct intel_context *ce, struct i915_vma 
*scratch)
hw = defaults;
hw += LRC_STATE_OFFSET / sizeof(*hw);
do {
-   u32 len = hw[dw] & 0x7f;
+   u32 len = hw[dw] & LRI_LENGTH_MASK;
+
+   /*
+* Keep it simple, skip parsing complex commands
+*
+* At present, there are no more MI_LOAD_REGISTER_IMM
+* commands after the first 3D state command. Rather
+* than include a table (see i915_cmd_parser.c) of all
+* the possible commands and their instruction lengths
+* (or mask for variable length instructions), assume
+* we have gathered the complete list of registers and
+* bail out.
+*/
+   if ((hw[dw] >> INSTR_CLIENT_SHIFT) != INSTR_MI_CLIENT)
+   break;
 
if (hw[dw] == 0) {
dw++;
continue;
}
 
-   if ((hw[dw] & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
+   if ((hw[dw] & GENMASK(31, 23)) != LRI_HEADER) {
+   /* Assume all other MI commands match LRI length mask */
dw += len + 2;
continue;
}
 
+   if (!len) {
+   pr_err("%s: invalid LRI found in context image\n",
+  ce->engine->name);
+   igt_hexdump(defaults, PAGE_SIZE);
+   break;
+   }
+
dw++;
len = (len + 1) / 2;
while (len--) {
@@ -1108,18 +1133,29 @@ static struct i915_vma *load_context(struct 
intel_context *ce, u32 poison)
hw = defaults;
hw += LRC_STATE_OFFSET / sizeof(*hw);
do {
-   u32 len = hw[dw] & 0x7f;
+   u32 len = hw[dw] & LRI_LENGTH_MASK;
+
+   /* For simplicity, break parsing at the first complex command */
+   if ((hw[dw] >> INSTR_CLIENT_SHIFT) != INSTR_MI_CLIENT)
+   break;
 
if (hw[dw] == 0) {
dw++;
continue;
}
 
-   if ((hw[dw] & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
+   if ((hw[dw] & GENMASK(31, 23)) != LRI_HEADER) {
dw += len + 2;
continue;
}
 
+   if (!len) {
+   pr_err("%s: invalid LRI found in context image\n",
+  ce->engine->name);
+   igt_hexdump(defaults, PAGE_SIZE);
+   break;
+   }

[PATCH v3 1/4] drm/i915/gt: Explicitly clear BB_OFFSET for new contexts

2022-04-29 Thread Ramalingam C
From: Chris Wilson 

Even though the initial protocontext we load onto HW has the register
cleared, by the time we save it into the default image, BB_OFFSET has
had the enable bit set. Reclear BB_OFFSET for each new context.

Testcase: igt/i915_selftests/gt_lrc

v2:
  Extend it for gen8.

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_engine_regs.h |  1 +
 drivers/gpu/drm/i915/gt/intel_lrc.c | 19 +++
 drivers/gpu/drm/i915/gt/selftest_lrc.c  |  5 +
 3 files changed, 25 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_regs.h 
b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
index 594a629cb28f..d4b02d36d2a6 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
@@ -109,6 +109,7 @@
 #define RING_SBBSTATE(base)_MMIO((base) + 0x118) /* hsw+ */
 #define RING_SBBADDR_UDW(base) _MMIO((base) + 0x11c) /* gen8+ 
*/
 #define RING_BBADDR(base)  _MMIO((base) + 0x140)
+#define RING_BB_OFFSET(base)   _MMIO((base) + 0x158)
 #define RING_BBADDR_UDW(base)  _MMIO((base) + 0x168) /* gen8+ 
*/
 #define CCID(base) _MMIO((base) + 0x180)
 #define   CCID_EN  BIT(0)
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c 
b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 3f83a9038e13..5f6479dadea7 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -662,6 +662,20 @@ static int lrc_ring_mi_mode(const struct intel_engine_cs 
*engine)
return -1;
 }
 
+static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
+{
+   if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
+   return 0x80;
+   else if (GRAPHICS_VER(engine->i915) >= 12)
+   return 0x70;
+   else if (GRAPHICS_VER(engine->i915) >= 9)
+   return 0x64;
+   else if (GRAPHICS_VER(engine->i915) >= 8)
+   return 0xc4;
+   else
+   return -1;
+}
+
 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 {
if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
@@ -768,6 +782,7 @@ static void init_common_regs(u32 * const regs,
 bool inhibit)
 {
u32 ctl;
+   int loc;
 
ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
@@ -779,6 +794,10 @@ static void init_common_regs(u32 * const regs,
regs[CTX_CONTEXT_CONTROL] = ctl;
 
regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
+
+   loc = lrc_ring_bb_offset(engine);
+   if (loc != -1)
+   regs[loc + 1] = 0;
 }
 
 static void init_wa_bb_regs(u32 * const regs,
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 6ba52ef1acb8..33f22f17e358 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -323,6 +323,11 @@ static int live_lrc_fixed(void *arg)
lrc_ring_cmd_buf_cctl(engine),
"RING_CMD_BUF_CCTL"
},
+   {
+   i915_mmio_reg_offset(RING_BB_OFFSET(engine->mmio_base)),
+   lrc_ring_bb_offset(engine),
+   "RING_BB_OFFSET"
+   },
{ },
}, *t;
u32 *hw;
-- 
2.20.1



[PATCH v3 0/4] lrc selftest fixes

2022-04-29 Thread Ramalingam C
Few bug fixes for lrc selftest.

v3:
  Extending the first patch for gen8

Chris Wilson (4):
  drm/i915/gt: Explicitly clear BB_OFFSET for new contexts
  drm/i915/selftests: Check for incomplete LRI from the context image
  drm/i915/selftest: Always cancel semaphore on error
  drm/i915/selftest: Clear the output buffers before GPU writes

 drivers/gpu/drm/i915/gt/intel_engine_regs.h |   1 +
 drivers/gpu/drm/i915/gt/intel_lrc.c |  19 
 drivers/gpu/drm/i915/gt/selftest_lrc.c  | 115 
 3 files changed, 115 insertions(+), 20 deletions(-)

-- 
2.20.1



[PATCH v2 4/4] uapi/drm/i915: Document memory residency and Flat-CCS capability of obj

2022-04-25 Thread Ramalingam C
Capture the impact of an object's memory region preference list on its
memory residency and on its Flat-CCS capability.

v2:
  Fix the Flat-CCS capability of an obj with {lmem, smem} preference
  list [Thomas]
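
As an aside (not part of this patch), a hedged userspace sketch of creating
an lmem-only object so that Flat-CCS compression remains usable. Includes
and error handling are elided; fd, ret and obj_size are assumed to exist,
and the structs are as declared in include/uapi/drm/i915_drm.h:

	struct drm_i915_gem_memory_class_instance region = {
		.memory_class = I915_MEMORY_CLASS_DEVICE,
		.memory_instance = 0,
	};
	struct drm_i915_gem_create_ext_memory_regions regions_ext = {
		.base.name = I915_GEM_CREATE_EXT_MEMORY_REGIONS,
		.num_regions = 1,
		.regions = (uintptr_t)&region,
	};
	struct drm_i915_gem_create_ext create = {
		.size = obj_size,	/* rounded up by the kernel */
		.extensions = (uintptr_t)&regions_ext,
	};

	/* lmem-only placement: the object is never migrated to smem */
	ret = ioctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &create);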

Signed-off-by: Ramalingam C 
cc: Matthew Auld 
cc: Thomas Hellstrom 
---
 include/uapi/drm/i915_drm.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 35ca528803fd..ad191ed6547c 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -3393,6 +3393,24 @@ struct drm_i915_gem_create_ext {
  * At which point we get the object handle in _i915_gem_create_ext.handle,
  * along with the final object size in _i915_gem_create_ext.size, which
  * should account for any rounding up, if required.
+ *
+ * Objects with multiple memory regions in the preference list will be backed
+ * by one of the regions mentioned in that list. Though I915 tries to honour
+ * the order of the memory regions in the preference list, the backing region
+ * is ultimately selected based on the memory pressure of the regions.
+ *
+ * Userspace has no means of knowing the backing region for such objects.
+ *
+ * On Flat-CCS capable HW, compression is supported for objects residing
+ * in I915_MEMORY_CLASS_DEVICE. When such a (compressed) object has another
+ * memory class in its preference list and is migrated (by I915, due to
+ * memory constraints) to a non I915_MEMORY_CLASS_DEVICE region, then I915
+ * needs to decompress the content. But I915 doesn't have the required
+ * information to decompress userspace compressed objects.
+ *
+ * So I915 supports Flat-CCS only on objects that can reside solely in
+ * I915_MEMORY_CLASS_DEVICE regions.
  */
 struct drm_i915_gem_create_ext_memory_regions {
/** @base: Extension link. See struct i915_user_extension. */
-- 
2.20.1



[PATCH v2 2/4] drm/i915/gt: optimize the ccs_sz calculation per chunk

2022-04-25 Thread Ramalingam C
Calculate the ccs_sz that needs to be emitted based on the src and dst
pages emitted per chunk, and handle the return value of emit_pte for
the ccs pages.
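
A short sketch of the emit_pte() contract this change leans on (return
value semantics as exercised in the hunk below):

	/*
	 * Hedged sketch: emit_pte() is taken to return the number of
	 * bytes of PTEs actually emitted, or a negative errno. A short
	 * emission for the CCS window is a hard error, since the CCS
	 * pages must be mapped in full.
	 */
	ccs_sz = GET_CCS_BYTES(i915, len);	/* e.g. len = SZ_8M -> 32K */
	err = emit_pte(rq, &it_ccs, ccs_cache_level, false,
		       ccs_is_src ? src_offset : dst_offset, ccs_sz);
	if (err < 0)
		goto out_rq;
	if (err < ccs_sz) {
		err = -EINVAL;
		goto out_rq;
	}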

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 36 +
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 29d761da02c4..463a6a14b5f9 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -647,17 +647,9 @@ static int scatter_list_length(struct scatterlist *sg)
 
 static void
 calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
-  int *src_sz, int *ccs_sz, u32 bytes_to_cpy,
-  u32 ccs_bytes_to_cpy)
+  int *src_sz, u32 bytes_to_cpy, u32 ccs_bytes_to_cpy)
 {
if (ccs_bytes_to_cpy) {
-   /*
-* We can only copy the ccs data corresponding to
-* the CHUNK_SZ of lmem which is
-* GET_CCS_BYTES(i915, CHUNK_SZ))
-*/
-   *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ));
-
if (!src_is_lmem)
/*
 * When CHUNK_SZ is passed all the pages upto CHUNK_SZ
@@ -713,10 +705,10 @@ intel_context_migrate_copy(struct intel_context *ce,
struct drm_i915_private *i915 = ce->engine->i915;
u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
enum i915_cache_level ccs_cache_level;
-   int src_sz, dst_sz, ccs_sz;
u32 src_offset, dst_offset;
u8 src_access, dst_access;
struct i915_request *rq;
+   int src_sz, dst_sz;
bool ccs_is_src;
int err;
 
@@ -770,7 +762,7 @@ intel_context_migrate_copy(struct intel_context *ce,
}
 
do {
-   int len;
+   int len, ccs_sz;
 
rq = i915_request_create(ce);
if (IS_ERR(rq)) {
@@ -797,7 +789,7 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
 
-   calculate_chunk_sz(i915, src_is_lmem, &src_sz, &ccs_sz,
+   calculate_chunk_sz(i915, src_is_lmem, &src_sz,
   bytes_to_cpy, ccs_bytes_to_cpy);
 
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
@@ -835,33 +827,29 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
 
+   ccs_sz = GET_CCS_BYTES(i915, len);
err = emit_pte(rq, &it_ccs, ccs_cache_level, false,
   ccs_is_src ? src_offset : dst_offset,
   ccs_sz);
+   if (err < 0)
+   goto out_rq;
+   if (err < ccs_sz) {
+   err = -EINVAL;
+   goto out_rq;
+   }
 
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
goto out_rq;
 
-   /*
-* Using max of src_sz and dst_sz, as we need to
-* pass the lmem size corresponding to the ccs
-* blocks we need to handle.
-*/
-   ccs_sz = max_t(int, ccs_is_src ? ccs_sz : src_sz,
-  ccs_is_src ? dst_sz : ccs_sz);
-
err = emit_copy_ccs(rq, dst_offset, dst_access,
-   src_offset, src_access, ccs_sz);
+   src_offset, src_access, len);
if (err)
goto out_rq;
 
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
goto out_rq;
-
-   /* Converting back to ccs bytes */
-   ccs_sz = GET_CCS_BYTES(rq->engine->i915, ccs_sz);
ccs_bytes_to_cpy -= ccs_sz;
}
 
-- 
2.20.1



[PATCH v2 3/4] drm/i915/gt: Document the eviction of the Flat-CCS objects

2022-04-25 Thread Ramalingam C
Capture the eviction details for Flat-CCS capable lmem objects.

v2:
  Fix the Flat-CCS capability of lmem obj with smem residency
  possibility [Thomas]

Signed-off-by: Ramalingam C 
cc: Thomas Hellstrom 
cc: Matthew Auld 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 463a6a14b5f9..930e0fd9795f 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -485,16 +485,21 @@ static bool wa_1209644611_applies(int ver, u32 size)
  * And CCS data can be copied in and out of CCS region through
  * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
  *
- * When we exhaust the lmem, if the object's placements support smem, then we can
- * directly decompress the compressed lmem object into smem and start using it
- * from smem itself.
+ * I915 supports Flat-CCS on lmem only objects. When an object has smem in
+ * its preference list, then under memory pressure i915 needs to migrate the
+ * lmem content into smem. If the lmem object is Flat-CCS compressed by
+ * userspace, then i915 needs to decompress it. But I915 lacks the required
+ * information for such decompression. Hence I915 supports Flat-CCS only on
+ * lmem only objects.
  *
- * But when we need to swapout the compressed lmem object into a smem region
- * though objects' placement doesn't support smem, then we copy the lmem content
- * as it is into smem region along with ccs data (using XY_CTRL_SURF_COPY_BLT).
- * When the object is referred, lmem content will be swaped in along with
- * restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at corresponding
- * location.
+ * When we exhaust the lmem, a Flat-CCS capable object's lmem backing memory
+ * can be temporarily evicted to smem, along with the auxiliary CCS state,
+ * where it can be potentially swapped-out at a later point, if required.
+ * If userspace later touches the evicted pages, then we always move the
+ * backing memory back to lmem, which includes restoring the saved CCS state,
+ * and potentially performing any required swap-in.
+ *
+ * Lmem objects with smem in their placement list, such as {lmem, smem}, are
+ * treated as non Flat-CCS capable objects for this migration.
  */
 
 static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
-- 
2.20.1



[PATCH v2 1/4] drm/i915/gt: GEM_BUG_ON unexpected NULL at scatterlist walking

2022-04-25 Thread Ramalingam C
While locating the start of the ccs scatterlist within the smem
scatterlist, the latter has to be the size of the lmem object plus the
corresponding ccs data size. Report a bug if the scatterlist terminates
before that length.
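
For a concrete feel of the expected length, assuming the 1:256
main-to-CCS byte ratio used by GET_CCS_BYTES() elsewhere in this series:

	u64 lmem_sz = SZ_64M;
	u64 ccs_sz = DIV_ROUND_UP(lmem_sz, 256);  /* 256 KiB of CCS state */
	u64 total = lmem_sz + ccs_sz;  /* length get_ccs_sg_sgt() must walk */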

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 9d552f30b627..29d761da02c4 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -687,6 +687,12 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 
bytes_to_cpy)
bytes_to_cpy -= len;
 
it->sg = __sg_next(it->sg);
+
+   /*
+* The scatterlist is supposed to be the size of
+* bytes_to_cpy + GET_CCS_BYTES(bytes_to_cpy).
+*/
+   GEM_BUG_ON(!it->sg);
it->dma = sg_dma_address(it->sg);
it->max = it->dma + sg_dma_len(it->sg);
} while (bytes_to_cpy);
-- 
2.20.1



[PATCH v2 0/4] Flat-CCS eviction enhancements

2022-04-25 Thread Ramalingam C
Flat-CCS eviction enhancements

v2: Correcting the memory residency requirement for flat-ccs capability
[Thomas]

Ramalingam C (4):
  drm/i915/gt: GEM_BUG_ON unexpected NULL at scatterlist walking
  drm/i915/gt: optimize the ccs_sz calculation per chunk
  drm/i915/gt: Document the eviction of the Flat-CCS objects
  uapi/drm/i915: Document memory residency and Flat-CCS capability of
obj

 drivers/gpu/drm/i915/gt/intel_migrate.c | 65 -
 include/uapi/drm/i915_drm.h | 18 +++
 2 files changed, 50 insertions(+), 33 deletions(-)

-- 
2.20.1



[PATCH 3/3] drm/i915/gt: Clear SET_PREDICATE_RESULT prior to executing the ring

2022-04-25 Thread Ramalingam C
From: Chris Wilson 

Userspace may leave predication enabled upon return from the batch
buffer, which has the consequent of preventing all operation from the
ring from being executed, including all the synchronisation, coherency
control, arbitration and user signaling. This is more than just a local
gpu hang in one client, as the user has the ability to prevent the
kernel from applying critical workarounds and can cause a full GT reset.

We could simply execute MI_SET_PREDICATE upon return from the user
batch, but this has the repercussion of modifying the user's context
state. Instead, we opt to execute a fixup batch which by mixing
predicated operations can determine the state of the
SET_PREDICATE_RESULT register and restore it prior to the next userspace
batch. This allows us to protect the kernel's ring without changing the
uABI.

Suggested-by: Zbigniew Kempczynski 
Signed-off-by: Chris Wilson 
Cc: Zbigniew Kempczynski 
Cc: Thomas Hellstrom 
Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/gen8_engine_cs.c  | 54 +
 drivers/gpu/drm/i915/gt/gen8_engine_cs.h  |  7 ++
 drivers/gpu/drm/i915/gt/intel_engine_regs.h   |  1 +
 .../drm/i915/gt/intel_execlists_submission.c  | 15 +++-
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h  |  2 +
 drivers/gpu/drm/i915/gt/intel_lrc.c   | 75 ++-
 drivers/gpu/drm/i915/gt/intel_lrc.h   |  5 ++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  2 +
 8 files changed, 137 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
index 9529c5455bc3..3e13960615bd 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -5,6 +5,7 @@
 
 #include "gen8_engine_cs.h"
 #include "i915_drv.h"
+#include "intel_engine_regs.h"
 #include "intel_gpu_commands.h"
 #include "intel_lrc.h"
 #include "intel_ring.h"
@@ -385,6 +386,59 @@ int gen8_emit_init_breadcrumb(struct i915_request *rq)
return 0;
 }
 
+static int __gen125_emit_bb_start(struct i915_request *rq,
+ u64 offset, u32 len,
+ const unsigned int flags,
+ u32 arb)
+{
+   struct intel_context *ce = rq->context;
+   u32 wa_offset = lrc_indirect_bb(ce);
+   u32 *cs;
+
+   cs = intel_ring_begin(rq, 12);
+   if (IS_ERR(cs))
+   return PTR_ERR(cs);
+
+   *cs++ = MI_ARB_ON_OFF | arb;
+
+   *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
+   MI_SRM_LRM_GLOBAL_GTT |
+   MI_LRI_LRM_CS_MMIO;
+   *cs++ = i915_mmio_reg_offset(RING_PREDICATE_RESULT(0));
+   *cs++ = wa_offset + DG2_PREDICATE_RESULT_WA;
+   *cs++ = 0;
+
+   *cs++ = MI_BATCH_BUFFER_START_GEN8 |
+   (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
+   *cs++ = lower_32_bits(offset);
+   *cs++ = upper_32_bits(offset);
+
+   /* Fixup stray MI_SET_PREDICATE as it prevents us executing the ring */
+   *cs++ = MI_BATCH_BUFFER_START_GEN8;
+   *cs++ = wa_offset + DG2_PREDICATE_RESULT_BB;
+   *cs++ = 0;
+
+   *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+   intel_ring_advance(rq, cs);
+
+   return 0;
+}
+
+int gen125_emit_bb_start_noarb(struct i915_request *rq,
+  u64 offset, u32 len,
+  const unsigned int flags)
+{
+   return __gen125_emit_bb_start(rq, offset, len, flags, MI_ARB_DISABLE);
+}
+
+int gen125_emit_bb_start(struct i915_request *rq,
+u64 offset, u32 len,
+const unsigned int flags)
+{
+   return __gen125_emit_bb_start(rq, offset, len, flags, MI_ARB_ENABLE);
+}
+
 int gen8_emit_bb_start_noarb(struct i915_request *rq,
 u64 offset, u32 len,
 const unsigned int flags)
diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
index 107ab42539ab..32e3d2b831bb 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
@@ -31,6 +31,13 @@ int gen8_emit_bb_start(struct i915_request *rq,
   u64 offset, u32 len,
   const unsigned int flags);
 
+int gen125_emit_bb_start_noarb(struct i915_request *rq,
+  u64 offset, u32 len,
+  const unsigned int flags);
+int gen125_emit_bb_start(struct i915_request *rq,
+u64 offset, u32 len,
+const unsigned int flags);
+
 u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs);
 u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_regs.h 
b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
index 1dab554bf640..75a0c55c5aa5 100644
--- a/driver

[PATCH 2/3] drm/i915/selftests: Skip poisoning SET_PREDICATE_RESULT on dg2

2022-04-25 Thread Ramalingam C
From: Chris Wilson 

When predication is enabled all commands barring a few (such as MI_BB_END)
are nop'ed. If we accidentally enable predication while poisoning the
context, not only is the rest of the poisoning skipped (thus disabling
the test), but the closing instructions of the poison request are
nop'ed. Not only do we then not signal the waiting context, but we even
prevent re-enabling arbitration and the GPU will not perform a context
switch at the end of the request.

Cc: Joonas Lahtinen 
Suggested-by: CQ Tang 
Signed-off-by: Chris Wilson 
Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_engine_regs.h |  1 +
 drivers/gpu/drm/i915/gt/selftest_lrc.c  | 17 -
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_regs.h 
b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
index 594a629cb28f..1dab554bf640 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
@@ -193,6 +193,7 @@
 #define RING_TIMESTAMP_UDW(base)   _MMIO((base) + 0x358 + 4)
 #define RING_CONTEXT_STATUS_PTR(base)  _MMIO((base) + 0x3a0)
 #define RING_CTX_TIMESTAMP(base)   _MMIO((base) + 0x3a8) /* gen8+ 
*/
+#define RING_PREDICATE_RESULT(base)_MMIO((base) + 0x3b8)
 #define RING_FORCE_TO_NONPRIV(base, i) _MMIO(((base) + 0x4D0) + (i) * 
4)
 #define   RING_FORCE_TO_NONPRIV_ADDRESS_MASK   REG_GENMASK(25, 2)
 #define   RING_FORCE_TO_NONPRIV_ACCESS_RW  (0 << 28)/* CFL+ & Gen11+ */
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 8dc7b88cdca0..8b2c11dbe354 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -945,6 +945,19 @@ create_user_vma(struct i915_address_space *vm, unsigned 
long size)
return vma;
 }
 
+static u32 safe_poison(u32 offset, u32 poison)
+{
+   /*
+* Do not enable predication as it will nop all subsequent commands,
+* not only disabling the tests (by preventing all the other SRM) but
+* also preventing the arbitration events at the end of the request.
+*/
+   if (offset == i915_mmio_reg_offset(RING_PREDICATE_RESULT(0)))
+   poison &= ~REG_BIT(0);
+
+   return poison;
+}
+
 static struct i915_vma *
 store_context(struct intel_context *ce, struct i915_vma *scratch)
 {
@@ -1154,7 +1167,9 @@ static struct i915_vma *load_context(struct intel_context 
*ce, u32 poison)
*cs++ = MI_LOAD_REGISTER_IMM(len);
while (len--) {
*cs++ = hw[dw];
-   *cs++ = poison;
+   *cs++ = safe_poison(hw[dw] & get_lri_mask(ce->engine,
+ MI_LRI_LRM_CS_MMIO),
+   poison);
dw += 2;
}
} while (dw < PAGE_SIZE / sizeof(u32) &&
-- 
2.20.1



[PATCH 1/3] drm/i915/xehpsdv/dg1/tgl: Fix issue with LRI relative addressing

2022-04-25 Thread Ramalingam C
From: Akeem G Abodunrin 

When bit 19 of the MI_LOAD_REGISTER_IMM instruction opcode is set on tgl+
devices, HW does not care about certain register address offsets, but
instead checks the following for valid address ranges on specific engines:
RCS && CCS: BITS(0 - 10)
BCS: BITS(0 - 11)
VECS && VCS: BITS(0 - 13)
Also, tgl+ now supports relative addressing for the BCS engine. So this
patch fixes an issue with the live_gt_lrc selftest, which fails due to a
mismatch between the LRC register layout generated during init and the
HW default register offsets.
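
A tiny worked example of the masked comparison this enables (the offset
values below are made up for illustration):

	u32 lri_mask = 0x07ff;		/* RCS/CCS on Gen12+ */
	u32 hw_offset = 0x2244;		/* absolute encoding (hypothetical) */
	u32 sw_offset = 0x0244;		/* relative encoding (hypothetical) */

	/* the two differ only above bit 10, so HW treats them as equal */
	GEM_BUG_ON(((hw_offset ^ sw_offset) & lri_mask) != 0);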

Signed-off-by: Akeem G Abodunrin 
cc: Prathap Kumar Valsan 
Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 36 +-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 6ba52ef1acb8..8dc7b88cdca0 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -128,6 +128,27 @@ static int context_flush(struct intel_context *ce, long 
timeout)
return err;
 }
 
+static int get_lri_mask(struct intel_engine_cs *engine, u32 lri)
+{
+   if ((lri & MI_LRI_LRM_CS_MMIO) == 0)
+   return ~0u;
+
+   if (GRAPHICS_VER(engine->i915) < 12)
+   return 0xfff;
+
+   switch (engine->class) {
+   default:
+   case RENDER_CLASS:
+   case COMPUTE_CLASS:
+   return 0x07ff;
+   case COPY_ENGINE_CLASS:
+   return 0x0fff;
+   case VIDEO_DECODE_CLASS:
+   case VIDEO_ENHANCEMENT_CLASS:
+   return 0x3fff;
+   }
+}
+
 static int live_lrc_layout(void *arg)
 {
struct intel_gt *gt = arg;
@@ -167,6 +188,7 @@ static int live_lrc_layout(void *arg)
dw = 0;
do {
u32 lri = READ_ONCE(hw[dw]);
+   u32 lri_mask;
 
if (lri == 0) {
dw++;
@@ -194,6 +216,18 @@ static int live_lrc_layout(void *arg)
break;
}
 
+   /*
+* When bit 19 of MI_LOAD_REGISTER_IMM instruction
+* opcode is set on Gen12+ devices, HW does not
+* care about certain register address offsets, and
+* instead checks the following for valid address
+* ranges on specific engines:
+* RCS && CCS: BITS(0 - 10)
+* BCS: BITS(0 - 11)
+* VECS && VCS: BITS(0 - 13)
+*/
+   lri_mask = get_lri_mask(engine, lri);
+
lri &= 0x7f;
lri++;
dw++;
@@ -201,7 +235,7 @@ static int live_lrc_layout(void *arg)
while (lri) {
u32 offset = READ_ONCE(hw[dw]);
 
-   if (offset != lrc[dw]) {
+   if ((offset ^ lrc[dw]) & lri_mask) {
pr_err("%s: Different registers found 
at dword %d, expected %x, found %x\n",
   engine->name, dw, offset, 
lrc[dw]);
err = -EINVAL;
-- 
2.20.1



[PATCH 0/3] Handle predicate programming

2022-04-25 Thread Ramalingam C
Userspace can leave SET_PREDICATE_RESULT active at the end of their
batch, causing all the kernel operations from the ring to be noop'ed.
This includes workarounds for memory corruption on dg2, as well as the
usual synchronisation, arbitration, coherency and signaling. The latter
can be used to cause system-wide hangs, prevent TLB invalidates, and
cause runtime-pm leakage due to a never-signaled fence which escapes
hangcheck as the context does run.

To avoid the issues caused by allowing userspace to disable kernel
execution, we explicitly clear SET_PREDICATE_RESULT but not before
recording whether predication was active. By tracking if predication was
active at the end of the batch, we can restore it immediately prior to
executing the users next batch, preserving the status of the user's
predication.

And also LRI relative addressing is fixed as part of this series.

Akeem G Abodunrin (1):
  drm/i915/xehpsdv/dg1/tgl: Fix issue with LRI relative addressing

Chris Wilson (2):
  drm/i915/selftests: Skip poisoning SET_PREDICATE_RESULT on dg2
  drm/i915/gt: Clear SET_PREDICATE_RESULT prior to executing the ring

 drivers/gpu/drm/i915/gt/gen8_engine_cs.c  | 54 +
 drivers/gpu/drm/i915/gt/gen8_engine_cs.h  |  7 ++
 drivers/gpu/drm/i915/gt/intel_engine_regs.h   |  2 +
 .../drm/i915/gt/intel_execlists_submission.c  | 15 +++-
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h  |  2 +
 drivers/gpu/drm/i915/gt/intel_lrc.c   | 75 ++-
 drivers/gpu/drm/i915/gt/intel_lrc.h   |  5 ++
 drivers/gpu/drm/i915/gt/selftest_lrc.c| 53 -
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  2 +
 9 files changed, 189 insertions(+), 26 deletions(-)

-- 
2.20.1



[PATCH v2 4/4] drm/i915/selftest: Clear the output buffers before GPU writes

2022-04-25 Thread Ramalingam C
From: Chris Wilson 

When testing whether we can get the GPU to leak information about
non-privileged state, we first need to ensure that the output buffer is
set to a known value as the HW may opt to skip the write into memory for
a non-privileged read of a sensitive register. We chose POISON_INUSE (0x5a)
so that it is both non-zero and distinct from the poison values used during
the test.

v2:
  Use i915_gem_object_pin_map_unlocked

Reported-by: CQ Tang 
Signed-off-by: Chris Wilson 
Cc: CQ Tang 
cc: Joonas Lahtinen 
Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 32 ++
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 51e4b7092d4f..9c8e8321c633 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -1346,6 +1346,30 @@ static int compare_isolation(struct intel_engine_cs 
*engine,
return err;
 }
 
+static struct i915_vma *
+create_result_vma(struct i915_address_space *vm, unsigned long sz)
+{
+   struct i915_vma *vma;
+   void *ptr;
+
+   vma = create_user_vma(vm, sz);
+   if (IS_ERR(vma))
+   return vma;
+
+   /* Set the results to a known value distinct from the poison */
+   ptr = i915_gem_object_pin_map_unlocked(vma->obj, I915_MAP_WC);
+   if (IS_ERR(ptr)) {
+   i915_vma_put(vma);
+   return ERR_CAST(ptr);
+   }
+
+   memset(ptr, POISON_INUSE, vma->size);
+   i915_gem_object_flush_map(vma->obj);
+   i915_gem_object_unpin_map(vma->obj);
+
+   return vma;
+}
+
 static int __lrc_isolation(struct intel_engine_cs *engine, u32 poison)
 {
u32 *sema = memset32(engine->status_page.addr + 1000, 0, 1);
@@ -1364,13 +1388,13 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
goto err_A;
}
 
-   ref[0] = create_user_vma(A->vm, SZ_64K);
+   ref[0] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(ref[0])) {
err = PTR_ERR(ref[0]);
goto err_B;
}
 
-   ref[1] = create_user_vma(A->vm, SZ_64K);
+   ref[1] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(ref[1])) {
err = PTR_ERR(ref[1]);
goto err_ref0;
@@ -1392,13 +1416,13 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
}
i915_request_put(rq);
 
-   result[0] = create_user_vma(A->vm, SZ_64K);
+   result[0] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(result[0])) {
err = PTR_ERR(result[0]);
goto err_ref1;
}
 
-   result[1] = create_user_vma(A->vm, SZ_64K);
+   result[1] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(result[1])) {
err = PTR_ERR(result[1]);
goto err_result0;
-- 
2.20.1



[PATCH v2 3/4] drm/i915/selftest: Always cancel semaphore on error

2022-04-25 Thread Ramalingam C
From: Chris Wilson 

Ensure that we always signal the semaphore when timing out, so that if it
happens to be stuck waiting for the semaphore we will quickly recover
without having to wait for a reset.

Reported-by: CQ Tang 
Signed-off-by: Chris Wilson 
Cc: CQ Tang 
cc: Joonas Lahtinen 
Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 684a63de156a..51e4b7092d4f 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -1411,18 +1411,17 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
}
 
err = poison_registers(B, poison, sema);
-   if (err) {
-   WRITE_ONCE(*sema, -1);
-   i915_request_put(rq);
-   goto err_result1;
-   }
-
-   if (i915_request_wait(rq, 0, HZ / 2) < 0) {
-   i915_request_put(rq);
+   if (err == 0 && i915_request_wait(rq, 0, HZ / 2) < 0) {
+   pr_err("%s(%s): wait for results timed out\n",
+  __func__, engine->name);
err = -ETIME;
-   goto err_result1;
}
+
+   /* Always cancel the semaphore wait, just in case the GPU gets stuck */
+   WRITE_ONCE(*sema, -1);
i915_request_put(rq);
+   if (err)
+   goto err_result1;
 
err = compare_isolation(engine, ref, result, A, poison);
 
-- 
2.20.1



[PATCH v2 2/4] drm/i915/selftests: Check for incomplete LRI from the context image

2022-04-25 Thread Ramalingam C
From: Chris Wilson 

In order to keep the context image parser simple, we assume that all
commands follow a similar format. A few, especially non-MI commands on
the render engines, have fixed lengths not encoded in a length field.
This caused us to incorrectly skip over 3D state commands, and start
interpreting context data as instructions. Eventually, as Daniele
discovered, this would lead us to find additional LRI as part of the data
and mistakenly add invalid LRI commands to the context probes.

Stop parsing after we see the first !MI command, as we know we will have
seen all the context registers by that point. (Mostly true for all gen so far,
though the render context does have LRI after the first page that we
have been ignoring so far. It would be useful to extract those as well
so that we have the full list of user accessible registers.)

Similarly, emit a warning if we do try to emit an invalid zero-length
LRI.

Reported-by: Daniele Ceraolo Spurio 
Signed-off-by: Chris Wilson 
Cc: Daniele Ceraolo Spurio 
Signed-off-by: Ramalingam C 
Acked-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 61 +++---
 1 file changed, 54 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 33f22f17e358..684a63de156a 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -27,6 +27,9 @@
 #define NUM_GPR 16
 #define NUM_GPR_DW (NUM_GPR * 2) /* each GPR is 2 dwords */
 
+#define LRI_HEADER MI_INSTR(0x22, 0)
+#define LRI_LENGTH_MASK GENMASK(7, 0)
+
 static struct i915_vma *create_scratch(struct intel_gt *gt)
 {
return __vm_create_scratch_for_read_pinned(&gt->ggtt->vm, PAGE_SIZE);
@@ -180,7 +183,7 @@ static int live_lrc_layout(void *arg)
continue;
}
 
-   if ((lri & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
+   if ((lri & GENMASK(31, 23)) != LRI_HEADER) {
pr_err("%s: Expected LRI command at dword %d, 
found %08x\n",
   engine->name, dw, lri);
err = -EINVAL;
@@ -945,18 +948,40 @@ store_context(struct intel_context *ce, struct i915_vma 
*scratch)
hw = defaults;
hw += LRC_STATE_OFFSET / sizeof(*hw);
do {
-   u32 len = hw[dw] & 0x7f;
+   u32 len = hw[dw] & LRI_LENGTH_MASK;
+
+   /*
+* Keep it simple, skip parsing complex commands
+*
+* At present, there are no more MI_LOAD_REGISTER_IMM
+* commands after the first 3D state command. Rather
+* than include a table (see i915_cmd_parser.c) of all
+* the possible commands and their instruction lengths
+* (or mask for variable length instructions), assume
+* we have gathered the complete list of registers and
+* bail out.
+*/
+   if ((hw[dw] >> INSTR_CLIENT_SHIFT) != INSTR_MI_CLIENT)
+   break;
 
if (hw[dw] == 0) {
dw++;
continue;
}
 
-   if ((hw[dw] & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
+   if ((hw[dw] & GENMASK(31, 23)) != LRI_HEADER) {
+   /* Assume all other MI commands match LRI length mask */
dw += len + 2;
continue;
}
 
+   if (!len) {
+   pr_err("%s: invalid LRI found in context image\n",
+  ce->engine->name);
+   igt_hexdump(defaults, PAGE_SIZE);
+   break;
+   }
+
dw++;
len = (len + 1) / 2;
while (len--) {
@@ -1108,18 +1133,29 @@ static struct i915_vma *load_context(struct 
intel_context *ce, u32 poison)
hw = defaults;
hw += LRC_STATE_OFFSET / sizeof(*hw);
do {
-   u32 len = hw[dw] & 0x7f;
+   u32 len = hw[dw] & LRI_LENGTH_MASK;
+
+   /* For simplicity, break parsing at the first complex command */
+   if ((hw[dw] >> INSTR_CLIENT_SHIFT) != INSTR_MI_CLIENT)
+   break;
 
if (hw[dw] == 0) {
dw++;
continue;
}
 
-   if ((hw[dw] & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
+   if ((hw[dw] & GENMASK(31, 23)) != LRI_HEADER) {
dw += len + 2;
continue;
}
 
+   if (!len) {
+   pr_err("%s: invalid LRI found in context image\n",
+  ce->engine->name);
+   igt_hexdump(defaults, PAGE_SIZE);
+   break;
+   }

[PATCH v2 1/4] drm/i915/gt: Explicitly clear BB_OFFSET for new contexts

2022-04-25 Thread Ramalingam C
From: Chris Wilson 

Even though the initial protocontext we load onto HW has the register
cleared, by the time we save it into the default image, BB_OFFSET has
had the enable bit set. Reclear BB_OFFSET for each new context.

Testcase: igt/i915_selftests/gt_lrc

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_engine_regs.h |  1 +
 drivers/gpu/drm/i915/gt/intel_lrc.c | 17 +
 drivers/gpu/drm/i915/gt/selftest_lrc.c  |  5 +
 3 files changed, 23 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_regs.h 
b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
index 594a629cb28f..d4b02d36d2a6 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
@@ -109,6 +109,7 @@
 #define RING_SBBSTATE(base)_MMIO((base) + 0x118) /* hsw+ */
 #define RING_SBBADDR_UDW(base) _MMIO((base) + 0x11c) /* gen8+ 
*/
 #define RING_BBADDR(base)  _MMIO((base) + 0x140)
+#define RING_BB_OFFSET(base)   _MMIO((base) + 0x158)
 #define RING_BBADDR_UDW(base)  _MMIO((base) + 0x168) /* gen8+ 
*/
 #define CCID(base) _MMIO((base) + 0x180)
 #define   CCID_EN  BIT(0)
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c 
b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 3f83a9038e13..63f0b44084cf 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -662,6 +662,18 @@ static int lrc_ring_mi_mode(const struct intel_engine_cs 
*engine)
return -1;
 }
 
+static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
+{
+   if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
+   return 0x80;
+   else if (GRAPHICS_VER(engine->i915) >= 12)
+   return 0x70;
+   else if (GRAPHICS_VER(engine->i915) >= 9)
+   return 0x64;
+   else
+   return -1;
+}
+
 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 {
if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
@@ -768,6 +780,7 @@ static void init_common_regs(u32 * const regs,
 bool inhibit)
 {
u32 ctl;
+   int loc;
 
ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
@@ -779,6 +792,10 @@ static void init_common_regs(u32 * const regs,
regs[CTX_CONTEXT_CONTROL] = ctl;
 
regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
+
+   loc = lrc_ring_bb_offset(engine);
+   if (loc != -1)
+   regs[loc + 1] = 0;
 }
 
 static void init_wa_bb_regs(u32 * const regs,
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 6ba52ef1acb8..33f22f17e358 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -323,6 +323,11 @@ static int live_lrc_fixed(void *arg)
lrc_ring_cmd_buf_cctl(engine),
"RING_CMD_BUF_CCTL"
},
+   {
+   i915_mmio_reg_offset(RING_BB_OFFSET(engine->mmio_base)),
+   lrc_ring_bb_offset(engine),
+   "RING_BB_OFFSET"
+   },
{ },
}, *t;
u32 *hw;
-- 
2.20.1



[PATCH v2 0/4] lrc selftest fixes

2022-04-25 Thread Ramalingam C
Few bug fixes for lrc selftest.

Resending the reviewed patches for CI feedback.

Chris Wilson (4):
  drm/i915/gt: Explicitly clear BB_OFFSET for new contexts
  drm/i915/selftests: Check for incomplete LRI from the context image
  drm/i915/selftest: Always cancel semaphore on error
  drm/i915/selftest: Clear the output buffers before GPU writes

 drivers/gpu/drm/i915/gt/intel_engine_regs.h |   1 +
 drivers/gpu/drm/i915/gt/intel_lrc.c |  17 +++
 drivers/gpu/drm/i915/gt/selftest_lrc.c  | 115 
 3 files changed, 113 insertions(+), 20 deletions(-)

-- 
2.20.1



Re: [PATCH 3/4] drm/i915/gt: Extend doc on Flat-CCS obj eviction

2022-04-22 Thread Ramalingam C
On 2022-04-21 at 19:07:29 +0530, Hellstrom, Thomas wrote:
> On Thu, 2022-04-21 at 17:08 +0530, Ramalingam C wrote:
> > Capture the eviction details for Flat-CCS capable lmem only objects
> > and
> > lmem objects with smem residency. This also captures the impact of
> > eviction on  object's memory residency and Flat-CCS compression
> > state.
> >
> > Signed-off-by: Ramalingam C 
> > ---
> >  drivers/gpu/drm/i915/gt/intel_migrate.c | 36 ++-
> > --
> >  1 file changed, 27 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > index 463a6a14b5f9..9d0d18950e76 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -485,16 +485,34 @@ static bool wa_1209644611_applies(int ver, u32
> > size)
> >   * And CCS data can be copied in and out of CCS region through
> >   * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
> >   *
> > - * When we exhaust the lmem, if the object's placements support
> > smem, then we can
> > - * directly decompress the compressed lmem object into smem and
> > start using it
> > - * from smem itself.
> > + * when we exhaust the lmem, we need to handle two types of flat-ccs
> > capable
> > + * objects for its eviction.
> > + *   1) lmem only objects
> > + *   2) lmem objects with smem residency option
> >   *
> > - * But when we need to swapout the compressed lmem object into a
> > smem region
> > - * though objects' placement doesn't support smem, then we copy the
> > lmem content
> > - * as it is into smem region along with ccs data (using
> > XY_CTRL_SURF_COPY_BLT).
> > - * When the object is referred, lmem content will be swaped in along
> > with
> > - * restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at
> > corresponding
> > - * location.
> > + * 1) lmem only objects:
> > + *
> > + * lmem backing memory can be temporarily evicted to smem, along
> > with the
> > + * auxiliary CCS state, where it can be potentially swapped-out at a
> > later point,
> > + * if required. If userspace later touches the evicted pages, then
> > we always move
> > + * the backing memory back to lmem, which includes restoring the
> > saved CCS state,
> > + * and potentially performing any required swap-in.
> > + *
> > + * In this scenario, objects' backing memory class and Flat-CCS
> > state doesn't
> > + * change.
> > + *
> > + * 2) lmem objects with smem residency option
> > + *
> > + * Lmem object with smem region in it's placement list, will be
> > migrated into
> > + * smem  by decompressing the content. I915 doesn't handle this kind
> > of
> > + * migration for Flat-CCS compressed objects yet.
> > + *
> > + * In this scenario, objects' backing memory class and Flat-CCS
> > state changed,
> > + * and userspace is not aware of it.
> > + *
> > + * In summary, when a userspace wants to be sure about the objects
> > memory
> > + * residency and flat-ccs compression state, then placement list
> > can't have
> > + * the lmem and smem together. Instead, object has to be lmem
> > resident only.
> 
> For 2) I was under the impression that with flat CCS, these objects
> need to be always uncompressed, since the kernel doesn't have the
> needed information to decompress / compress. Or has this been changed
> recently?
Sorry, I had overlooked the lack of inputs required for decompression
in the kernel. So yes, we can't support compression on lmem objects
with {lmem, smem} as placement preferences. I will update the
documentation accordingly.

Ram.
> 
> /Thomas
> 
> 
> 
> 
> >   */
> >
> >  static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
> 


Re: [PATCH 1/4] drm/i915/gt: GEM_BUG_ON unexpected NULL at scatterlist walking

2022-04-21 Thread Ramalingam C
On 2022-04-21 at 18:57:59 +0530, Hellstrom, Thomas wrote:
> On Thu, 2022-04-21 at 17:08 +0530, Ramalingam C wrote:
> > While locating the start of ccs scatterlist in smem scatterlist, that
> > has
> > to be the size of lmem obj size + corresponding ccs data size. Report
> > bug
> > if scatterlist terminate before that length.
> >
> > Signed-off-by: Ramalingam C 
> > ---
> >  drivers/gpu/drm/i915/gt/intel_migrate.c | 6 ++
> >  1 file changed, 6 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > index 9d552f30b627..29d761da02c4 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -687,6 +687,12 @@ static void get_ccs_sg_sgt(struct sgt_dma *it,
> > u32 bytes_to_cpy)
> > bytes_to_cpy -= len;
> >
> 
> 
> > it->sg = __sg_next(it->sg);
> 
> If bytes_to_cpy == 0 here, couldn't it->sg be NULL then?
Hi,

bytes_to_cpy is the lmem size and the scatterlist is the length of
bytes_to_cpy + GET_CCS_BYTES(bytes_to_cpy). So this should not be NULL.

When bytes_to_cpy reduces to zero, it->sg points at the start of the
scatterlist for the ccs data.

Ram.
> 
> > +
> > +   /*
> > +* scatterlist supposed to be the size of
> > +* bytes_to_cpy + GET_CCS_BYTES(bytes_to_copy).
> > +*/
> > +   GEM_BUG_ON(!it->sg);
> > it->dma = sg_dma_address(it->sg);
> > it->max = it->dma + sg_dma_len(it->sg);
> > } while (bytes_to_cpy);
> 
> /Thomas
> 


[PATCH 2/4] drm/i915/gt: optimize the ccs_sz calculation per chunk

2022-04-21 Thread Ramalingam C
Calculate the ccs_sz that needs to be emitted based on the src and dst
pages emitted per chunk, and handle the return value of emit_pte for
the ccs pages.

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 36 +
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 29d761da02c4..463a6a14b5f9 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -647,17 +647,9 @@ static int scatter_list_length(struct scatterlist *sg)
 
 static void
 calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
-  int *src_sz, int *ccs_sz, u32 bytes_to_cpy,
-  u32 ccs_bytes_to_cpy)
+  int *src_sz, u32 bytes_to_cpy, u32 ccs_bytes_to_cpy)
 {
if (ccs_bytes_to_cpy) {
-   /*
-* We can only copy the ccs data corresponding to
-* the CHUNK_SZ of lmem which is
-* GET_CCS_BYTES(i915, CHUNK_SZ))
-*/
-   *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ));
-
if (!src_is_lmem)
/*
 * When CHUNK_SZ is passed all the pages upto CHUNK_SZ
@@ -713,10 +705,10 @@ intel_context_migrate_copy(struct intel_context *ce,
struct drm_i915_private *i915 = ce->engine->i915;
u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
enum i915_cache_level ccs_cache_level;
-   int src_sz, dst_sz, ccs_sz;
u32 src_offset, dst_offset;
u8 src_access, dst_access;
struct i915_request *rq;
+   int src_sz, dst_sz;
bool ccs_is_src;
int err;
 
@@ -770,7 +762,7 @@ intel_context_migrate_copy(struct intel_context *ce,
}
 
do {
-   int len;
+   int len, ccs_sz;
 
rq = i915_request_create(ce);
if (IS_ERR(rq)) {
@@ -797,7 +789,7 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
 
-   calculate_chunk_sz(i915, src_is_lmem, &src_sz, &ccs_sz,
+   calculate_chunk_sz(i915, src_is_lmem, &src_sz,
   bytes_to_cpy, ccs_bytes_to_cpy);
 
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
@@ -835,33 +827,29 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
 
+   ccs_sz = GET_CCS_BYTES(i915, len);
err = emit_pte(rq, &it_ccs, ccs_cache_level, false,
   ccs_is_src ? src_offset : dst_offset,
   ccs_sz);
+   if (err < 0)
+   goto out_rq;
+   if (err < ccs_sz) {
+   err = -EINVAL;
+   goto out_rq;
+   }
 
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
goto out_rq;
 
-   /*
-* Using max of src_sz and dst_sz, as we need to
-* pass the lmem size corresponding to the ccs
-* blocks we need to handle.
-*/
-   ccs_sz = max_t(int, ccs_is_src ? ccs_sz : src_sz,
-  ccs_is_src ? dst_sz : ccs_sz);
-
err = emit_copy_ccs(rq, dst_offset, dst_access,
-   src_offset, src_access, ccs_sz);
+   src_offset, src_access, len);
if (err)
goto out_rq;
 
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
goto out_rq;
-
-   /* Converting back to ccs bytes */
-   ccs_sz = GET_CCS_BYTES(rq->engine->i915, ccs_sz);
ccs_bytes_to_cpy -= ccs_sz;
}
 
-- 
2.20.1
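
As a rough standalone sketch of the per-chunk arithmetic after this patch
(names loosely mirror the patch; the 20M object size is an arbitrary
example):

#include <stdio.h>

#define CHUNK_SZ (8u << 20)		/* 8M, as in intel_migrate.c */
#define NUM_BYTES_PER_CCS_BYTE 256

int main(void)
{
	unsigned int bytes_to_cpy = 20u << 20;	/* e.g. a 20M lmem object */

	while (bytes_to_cpy) {
		unsigned int len = bytes_to_cpy < CHUNK_SZ ?
				   bytes_to_cpy : CHUNK_SZ;
		/* ccs_sz now follows the pages actually emitted for this
		 * chunk instead of always assuming a full CHUNK_SZ */
		unsigned int ccs_sz = (len + NUM_BYTES_PER_CCS_BYTE - 1) /
				      NUM_BYTES_PER_CCS_BYTE;

		printf("chunk: %u bytes main memory, %u bytes ccs\n",
		       len, ccs_sz);
		bytes_to_cpy -= len;
	}
	return 0;
}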



[PATCH 3/4] drm/i915/gt: Extend doc on Flat-CCS obj eviction

2022-04-21 Thread Ramalingam C
Capture the eviction details for Flat-CCS capable lmem-only objects and
lmem objects with smem residency. This also captures the impact of
eviction on the object's memory residency and Flat-CCS compression
state.

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 36 ++---
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 463a6a14b5f9..9d0d18950e76 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -485,16 +485,34 @@ static bool wa_1209644611_applies(int ver, u32 size)
  * And CCS data can be copied in and out of CCS region through
  * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
  *
- * When we exhaust the lmem, if the object's placements support smem, then we can
- * directly decompress the compressed lmem object into smem and start using it
- * from smem itself.
+ * When we exhaust the lmem, we need to handle two types of Flat-CCS capable
+ * objects for eviction:
+ *   1) lmem-only objects
+ *   2) lmem objects with an smem residency option
 *
- * But when we need to swapout the compressed lmem object into a smem region
- * though objects' placement doesn't support smem, then we copy the lmem content
- * as it is into smem region along with ccs data (using XY_CTRL_SURF_COPY_BLT).
- * When the object is referred, lmem content will be swapped in along with
- * restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at corresponding
- * location.
+ * 1) lmem-only objects:
+ *
+ * The lmem backing memory can be temporarily evicted to smem, along with the
+ * auxiliary CCS state, where it can potentially be swapped out at a later point,
+ * if required. If userspace later touches the evicted pages, then we always move
+ * the backing memory back to lmem, which includes restoring the saved CCS state,
+ * and potentially performing any required swap-in.
+ *
+ * In this scenario, the object's backing memory class and Flat-CCS state don't
+ * change.
+ *
+ * 2) lmem objects with an smem residency option:
+ *
+ * An lmem object with an smem region in its placement list will be migrated into
+ * smem by decompressing the content. i915 doesn't handle this kind of
+ * migration for Flat-CCS compressed objects yet.
+ *
+ * In this scenario, the object's backing memory class and Flat-CCS state change,
+ * and userspace is not aware of it.
+ *
+ * In summary, when userspace wants to be sure about the object's memory
+ * residency and Flat-CCS compression state, the placement list can't have
+ * lmem and smem together. Instead, the object has to be lmem resident only.
  */
 
 static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
-- 
2.20.1



[PATCH 4/4] uapi/drm/i915: Update the placement list impact on obj residency

2022-04-21 Thread Ramalingam C
An object created with a list of memory classes as placement preferences
can be backed by any memory class of the list, as per the kernel's
migration policy for memory-constrained situations. Userspace won't be
notified of the memory residency change in this scenario.

Also, Flat-CCS compression is supported only on objects of
I915_MEMORY_CLASS_DEVICE. When a Flat-CCS compressed object migrates out
of I915_MEMORY_CLASS_DEVICE due to memory constraints, the content will
be decompressed without notifying userspace.

Record these details in the kernel documentation.

Signed-off-by: Ramalingam C 
---
 include/uapi/drm/i915_drm.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 35ca528803fd..8b25dd6a157a 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -3393,6 +3393,20 @@ struct drm_i915_gem_create_ext {
  * At which point we get the object handle in _i915_gem_create_ext.handle,
  * along with the final object size in _i915_gem_create_ext.size, which
  * should account for any rounding up, if required.
+ *
+ * If an object is created with a list of memory classes as its placement
+ * preference, the kernel could use any memory class of the list as the
+ * backing storage, based on memory availability. Under memory pressure the
+ * kernel could migrate the object's content from one memory class to
+ * another, given in the placement list.
+ *
+ * With a placement preference list, userspace can't be sure about the
+ * object's memory residency.
+ *
+ * Flat-CCS compression is supported only for objects of
+ * I915_MEMORY_CLASS_DEVICE. If the object has other placement preferences,
+ * and if the content is migrated (by the kernel due to memory constraints)
+ * to a memory class other than I915_MEMORY_CLASS_DEVICE, the object content
+ * will be decompressed by the kernel. Userspace will be ignorant of this
+ * Flat-CCS state change.
  */
 struct drm_i915_gem_create_ext_memory_regions {
/** @base: Extension link. See struct i915_user_extension. */
-- 
2.20.1
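
To illustrate the documented behaviour from the userspace side, an object
whose memory residency and Flat-CCS state must stay stable is created with
a single I915_MEMORY_CLASS_DEVICE placement. A hedged sketch (error
handling and drm fd setup omitted; the helper name and object size are
arbitrary):

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int create_lmem_only(int fd, uint64_t size, uint32_t *handle)
{
	/* one placement only: the kernel can never migrate this object
	 * to smem, so its class and compression state can't change */
	struct drm_i915_gem_memory_class_instance region = {
		.memory_class = I915_MEMORY_CLASS_DEVICE,
		.memory_instance = 0,
	};
	struct drm_i915_gem_create_ext_memory_regions regions = {
		.base = { .name = I915_GEM_CREATE_EXT_MEMORY_REGIONS },
		.num_regions = 1,
		.regions = (uintptr_t)&region,
	};
	struct drm_i915_gem_create_ext create = {
		.size = size,
		.extensions = (uintptr_t)&regions,
	};
	int ret = ioctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &create);

	if (!ret)
		*handle = create.handle;
	return ret;
}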



[PATCH 0/4] Flat-CCS eviction enhancements

2022-04-21 Thread Ramalingam C
Flat-CCS eviction enhancements.

Ramalingam C (4):
  drm/i915/gt: GEM_BUG_ON unexpected NULL at scatterlist walking
  drm/i915/gt: optimize the ccs_sz calculation per chunk
  drm/i915/gt: Extend doc on Flat-CCS obj eviction
  uapi/drm/i915: Update the placement list impact on obj residency

 drivers/gpu/drm/i915/gt/intel_migrate.c | 78 ++---
 include/uapi/drm/i915_drm.h | 14 +
 2 files changed, 59 insertions(+), 33 deletions(-)

-- 
2.20.1



[PATCH 1/4] drm/i915/gt: GEM_BUG_ON unexpected NULL at scatterlist walking

2022-04-21 Thread Ramalingam C
While locating the start of the ccs scatterlist in the smem scatterlist,
the smem scatterlist has to be the size of the lmem obj plus the
corresponding ccs data size. Report a bug if the scatterlist terminates
before that length.

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 9d552f30b627..29d761da02c4 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -687,6 +687,12 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
bytes_to_cpy -= len;
 
it->sg = __sg_next(it->sg);
+
+   /*
+* The scatterlist is supposed to be the size of
+* bytes_to_cpy + GET_CCS_BYTES(bytes_to_cpy).
+*/
+   GEM_BUG_ON(!it->sg);
it->dma = sg_dma_address(it->sg);
it->max = it->dma + sg_dma_len(it->sg);
} while (bytes_to_cpy);
-- 
2.20.1



Re: [CI 4/4] drm/i915/selftests: tweak the misaligned_case

2022-04-21 Thread Ramalingam C
On 2022-04-20 at 19:16:13 +0100, Matthew Auld wrote:
> The compact-pt layout restrictions should only apply to the ppGTT. Also
> make this play nice on platforms that only have the 64K GTT restriction,
> and not the compact-pt thing.
> 
> Signed-off-by: Matthew Auld 
> Cc: Thomas Hellström 
> Cc: Nirmoy Das 
> Cc: Ramalingam C 
Looks good to me.

Reviewed-by: Ramalingam C 

> ---
>  drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c 
> b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> index bccc49a8ab5e..8633bec18fa7 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> @@ -1112,10 +1112,16 @@ static int misaligned_case(struct i915_address_space 
> *vm, struct intel_memory_re
>   expected_vma_size = round_up(size, 1 << 
> (ffs(vma->resource->page_sizes_gtt) - 1));
>   expected_node_size = expected_vma_size;
>  
> - if (NEEDS_COMPACT_PT(vm->i915) && i915_gem_object_is_lmem(obj)) {
> - /* compact-pt should expand lmem node to 2MB */
> + if (HAS_64K_PAGES(vm->i915) && i915_gem_object_is_lmem(obj)) {
> + /*
> +  * The compact-pt should expand lmem node to 2MB for the ppGTT,
> +  * for all other cases we should only expect 64K.
> +  */
>   expected_vma_size = round_up(size, I915_GTT_PAGE_SIZE_64K);
> - expected_node_size = round_up(size, I915_GTT_PAGE_SIZE_2M);
> + if (NEEDS_COMPACT_PT(vm->i915) && !i915_is_ggtt(vm))
> + expected_node_size = round_up(size, 
> I915_GTT_PAGE_SIZE_2M);
> + else
> + expected_node_size = round_up(size, 
> I915_GTT_PAGE_SIZE_64K);
>   }
>  
>   if (vma->size != expected_vma_size || vma->node.size != 
> expected_node_size) {
> -- 
> 2.34.1
> 
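
Restating the expectation this patch encodes as a tiny standalone helper (a
sketch with stand-in names; round_up assumes power-of-two alignments, like
the kernel macro):

#define SZ_64K (64ul << 10)
#define SZ_2M  (2ul << 20)

static unsigned long round_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

/* expected GTT node size for an lmem vma on a HAS_64K_PAGES platform */
static unsigned long expected_node_size(unsigned long size,
					int needs_compact_pt, int is_ggtt)
{
	if (needs_compact_pt && !is_ggtt)
		return round_up(size, SZ_2M);	/* compact-pt ppGTT only */
	return round_up(size, SZ_64K);		/* plain 64K restriction */
}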


Re: [CI 3/4] drm/i915/selftests: fixup min_alignment usage

2022-04-21 Thread Ramalingam C
On 2022-04-20 at 19:16:12 +0100, Matthew Auld wrote:
> Trying to cast the region id into the region type doesn't work too well,
> since the i915_vm_min_alignment() won't give us the correct value for
> the stolen-lmem case.
> 
> Signed-off-by: Matthew Auld 
Looks good to me.

Reviewed-by: Ramalingam C 

> Cc: Thomas Hellström 
> Cc: Nirmoy Das 
> Cc: Ramalingam C 
> ---
>  drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c 
> b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> index 5c9bfa409ff5..bccc49a8ab5e 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> @@ -1150,7 +1150,7 @@ static int misaligned_pin(struct i915_address_space *vm,
>   flags |= PIN_GLOBAL;
>  
>   for_each_memory_region(mr, vm->i915, id) {
> - u64 min_alignment = i915_vm_min_alignment(vm, (enum 
> intel_memory_type)id);
> + u64 min_alignment = i915_vm_min_alignment(vm, mr->type);
>   u64 size = min_alignment;
>   u64 addr = round_down(hole_start + (hole_size / 2), 
> min_alignment);
>  
> -- 
> 2.34.1
> 


Re: [PATCH v7 7/9] drm/ttm: Add a parameter to add extra pages into ttm_tt

2022-04-13 Thread Ramalingam C
On 2022-04-13 at 11:28:28 +0300, Joonas Lahtinen wrote:
> (+ Tvrtko and Jani)
> 
> Quoting Ramalingam C (2022-04-02 06:02:38)
> > On 2022-04-01 at 16:31:19 +0200, Christian König wrote:
> > > It would be nicer to push this through drm-misc-next, but the intel branch
> > > works for me as well.
> > Hi Christian
> > 
> > I have pushed this patch into drm-misc-next.
> 
> I've now backmerged drm-next containing this commit to drm-intel-gt-next
> in order to unblock merging the rest of the series.
> 
> > Regards,
> > Ram.
> > > 
> > > Regards,
> > > Christian.
> > > 
> > > Am 01.04.22 um 16:28 schrieb Ramalingam C:
> > > > Christian, Joonas and vivi
> > > > 
> > > > Once the premerge results are greeen, if this patch can be merged into
> > > > drm-intel-gt-next along with other patches could you please ack the
> > > > request to merge into drm-intel-gt-next?
> 
> For future reference, when in doubt who are the right ones to handle,
> add all the maintainers and wait for them to reply before proceeding.
> 
> Then we can avoid some unnecessary churn where there are more
> straightforward options like here: merge via drm-intel-gt-next as
> nobody else needs the new functions yet.
Sure, Joonas! Thank you for backmerging!

Ram
> 
> Regards, Joonas
> 
> > > > Thanks
> > > > Ram
> > > > 
> > > > On 2022-04-01 at 18:07:49 +0530, Ramalingam C wrote:
> > > > > Add a parameter called "extra_pages" for ttm_tt_init, to indicate that
> > > > > the driver needs extra pages in the ttm_tt.
> > > > > 
> > > > > v2:
> > > > >Used imperative wording [Thomas and Christian]
> > > > > 
> > > > > Signed-off-by: Ramalingam C 
> > > > > cc: Christian Koenig 
> > > > > cc: Hellstrom Thomas 
> > > > > Reviewed-by: Thomas Hellstrom 
> > > > > Reviewed-by: Christian Konig 
> > > > > Reviewed-by: Nirmoy Das 
> > > > > ---
> > > > >   drivers/gpu/drm/drm_gem_vram_helper.c  |  2 +-
> > > > >   drivers/gpu/drm/i915/gem/i915_gem_ttm.c|  2 +-
> > > > >   drivers/gpu/drm/qxl/qxl_ttm.c  |  2 +-
> > > > >   drivers/gpu/drm/ttm/ttm_agp_backend.c  |  2 +-
> > > > >   drivers/gpu/drm/ttm/ttm_tt.c   | 12 +++-
> > > > >   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c |  2 +-
> > > > >   include/drm/ttm/ttm_tt.h   |  4 +++-
> > > > >   7 files changed, 15 insertions(+), 11 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c 
> > > > > b/drivers/gpu/drm/drm_gem_vram_helper.c
> > > > > index dc7f938bfff2..123045b58fec 100644
> > > > > --- a/drivers/gpu/drm/drm_gem_vram_helper.c
> > > > > +++ b/drivers/gpu/drm/drm_gem_vram_helper.c
> > > > > @@ -867,7 +867,7 @@ static struct ttm_tt 
> > > > > *bo_driver_ttm_tt_create(struct ttm_buffer_object *bo,
> > > > >   if (!tt)
> > > > >   return NULL;
> > > > > - ret = ttm_tt_init(tt, bo, page_flags, ttm_cached);
> > > > > + ret = ttm_tt_init(tt, bo, page_flags, ttm_cached, 0);
> > > > >   if (ret < 0)
> > > > >   goto err_ttm_tt_init;
> > > > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
> > > > > b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > > > > index c40aca99442f..a878910a563c 100644
> > > > > --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > > > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > > > > @@ -293,7 +293,7 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
> > > > > ttm_buffer_object *bo,
> > > > >   i915_tt->is_shmem = true;
> > > > >   }
> > > > > - ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching);
> > > > > + ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, 0);
> > > > >   if (ret)
> > > > >   goto err_free;
> > > > > diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c 
> > > > > b/drivers/gpu/drm/qxl/qxl_ttm.c
> > > > > index 95df5750f47f..9ba871bd19b1 100644
> > > > > --- a/drivers/gpu/drm/qxl/qxl_ttm.c
> > > > > +++ b/drivers/gpu/drm/qxl/qxl_
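
After this change the init signature grows one trailing argument; a driver
that needs no extra backing pages simply passes 0. A minimal in-tree style
sketch (not compile-tested here; "aux_bytes" is a made-up per-object
quantity, not a real field):

#include <linux/math.h>
#include <linux/mm.h>
#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_tt.h>

static int example_tt_init(struct ttm_tt *tt, struct ttm_buffer_object *bo,
			   uint32_t page_flags, unsigned long aux_bytes)
{
	/* room for auxiliary data at the tail of the tt; 0 keeps the
	 * old behaviour */
	unsigned long extra_pages = DIV_ROUND_UP(aux_bytes, PAGE_SIZE);

	return ttm_tt_init(tt, bo, page_flags, ttm_cached, extra_pages);
}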

[PATCH v9 7/9] drm/i915/selftest_migrate: Check CCS meta data clear

2022-04-05 Thread Ramalingam C
Extend the live migrate selftest to verify the ccs surface clearing
during the clear of a Flat-CCS capable lmem obj.

v2:
  Look at right places for ccs data [Thomas]

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_migrate.c | 250 ++---
 1 file changed, 222 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c 
b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index b5da8b8cd039..8cd9a22054f3 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -132,6 +132,124 @@ static int copy(struct intel_migrate *migrate,
return err;
 }
 
+static int intel_context_copy_ccs(struct intel_context *ce,
+ const struct i915_deps *deps,
+ struct scatterlist *sg,
+ enum i915_cache_level cache_level,
+ bool write_to_ccs,
+ struct i915_request **out)
+{
+   u8 src_access = write_to_ccs ? DIRECT_ACCESS : INDIRECT_ACCESS;
+   u8 dst_access = write_to_ccs ? INDIRECT_ACCESS : DIRECT_ACCESS;
+   struct sgt_dma it = sg_sgt(sg);
+   struct i915_request *rq;
+   u32 offset;
+   int err;
+
+   GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
+   *out = NULL;
+
+   GEM_BUG_ON(ce->ring->size < SZ_64K);
+
+   offset = 0;
+   if (HAS_64K_PAGES(ce->engine->i915))
+   offset = CHUNK_SZ;
+
+   do {
+   int len;
+
+   rq = i915_request_create(ce);
+   if (IS_ERR(rq)) {
+   err = PTR_ERR(rq);
+   goto out_ce;
+   }
+
+   if (deps) {
+   err = i915_request_await_deps(rq, deps);
+   if (err)
+   goto out_rq;
+
+   if (rq->engine->emit_init_breadcrumb) {
+   err = rq->engine->emit_init_breadcrumb(rq);
+   if (err)
+   goto out_rq;
+   }
+
+   deps = NULL;
+   }
+
+   /* The PTE updates + clear must not be interrupted. */
+   err = emit_no_arbitration(rq);
+   if (err)
+   goto out_rq;
+
+   len = emit_pte(rq, &it, cache_level, true, offset, CHUNK_SZ);
+   if (len <= 0) {
+   err = len;
+   goto out_rq;
+   }
+
+   err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+   if (err)
+   goto out_rq;
+
+   err = emit_copy_ccs(rq, offset, dst_access,
+   offset, src_access, len);
+   if (err)
+   goto out_rq;
+
+   err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+
+   /* Arbitration is re-enabled between requests. */
+out_rq:
+   if (*out)
+   i915_request_put(*out);
+   *out = i915_request_get(rq);
+   i915_request_add(rq);
+   if (err || !it.sg || !sg_dma_len(it.sg))
+   break;
+
+   cond_resched();
+   } while (1);
+
+out_ce:
+   return err;
+}
+
+static int
+intel_migrate_ccs_copy(struct intel_migrate *m,
+  struct i915_gem_ww_ctx *ww,
+  const struct i915_deps *deps,
+  struct scatterlist *sg,
+  enum i915_cache_level cache_level,
+  bool write_to_ccs,
+  struct i915_request **out)
+{
+   struct intel_context *ce;
+   int err;
+
+   *out = NULL;
+   if (!m->context)
+   return -ENODEV;
+
+   ce = intel_migrate_create_context(m);
+   if (IS_ERR(ce))
+   ce = intel_context_get(m->context);
+   GEM_BUG_ON(IS_ERR(ce));
+
+   err = intel_context_pin_ww(ce, ww);
+   if (err)
+   goto out;
+
+   err = intel_context_copy_ccs(ce, deps, sg, cache_level,
+write_to_ccs, out);
+
+   intel_context_unpin(ce);
+out:
+   intel_context_put(ce);
+   return err;
+}
+
 static int clear(struct intel_migrate *migrate,
 int (*fn)(struct intel_migrate *migrate,
   struct i915_gem_ww_ctx *ww,
@@ -144,7 +262,8 @@ static int clear(struct intel_migrate *migrate,
struct drm_i915_gem_object *obj;
struct i915_request *rq;
struct i915_gem_ww_ctx ww;
-   u32 *vaddr;
+   u32 *vaddr, val = 0;
+   bool ccs_cap = false;
int err = 0;
int i;
 
@@ -155,7 +274,12 @@ static int clear(struct intel_migrate *migrate,
  

[PATCH v9 9/9] drm/i915/migrate: Evict and restore the flatccs capable lmem obj

2022-04-05 Thread Ramalingam C
When we are swapping out a local memory obj on a flat-ccs capable platform,
we need to capture the ccs data too along with the main memory, and we need
to restore it when we are swapping in the content.

When an lmem object is swapped into a smem obj, the smem obj will have the
extra pages required to hold the ccs data corresponding to the lmem main
memory. So the main memory of the lmem will be copied into the initial
pages of the smem, and then the ccs data corresponding to the main memory
will be copied to the subsequent pages of the smem. The ccs data is 1/256
of the lmem size.

Swapin happens exactly in the reverse order. First the main memory of the
lmem is restored from the smem's initial pages, and then the ccs data is
restored from the subsequent pages of the smem.

Extracting and restoring the CCS data is done through a special cmd called
XY_CTRL_SURF_COPY_BLT.

v2: Fixing the ccs handling
v3: Handle the ccs data at same loop as main memory [Thomas]
v4: changes for emit_copy_ccs
v5: handle non-flat-ccs scenario

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 164 +++-
 1 file changed, 160 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 5dec1df40e0e..9d552f30b627 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -633,6 +633,65 @@ static int emit_copy(struct i915_request *rq,
return 0;
 }
 
+static int scatter_list_length(struct scatterlist *sg)
+{
+   int len = 0;
+
+   while (sg && sg_dma_len(sg)) {
+   len += sg_dma_len(sg);
+   sg = sg_next(sg);
+   };
+
+   return len;
+}
+
+static void
+calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
+  int *src_sz, int *ccs_sz, u32 bytes_to_cpy,
+  u32 ccs_bytes_to_cpy)
+{
+   if (ccs_bytes_to_cpy) {
+   /*
+* We can only copy the ccs data corresponding to
+* the CHUNK_SZ of lmem which is
+* GET_CCS_BYTES(i915, CHUNK_SZ))
+*/
+   *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ));
+
+   if (!src_is_lmem)
+   /*
+* When CHUNK_SZ is passed all the pages up to CHUNK_SZ
+* will be taken for the blt. On a Flat-ccs supported
+* platform the smem obj will have more pages than required
+* for main memory, hence limit it to the required size
+* for main memory.
+*/
+   *src_sz = min_t(int, bytes_to_cpy, CHUNK_SZ);
+   } else { /* ccs handling is not required */
+   *src_sz = CHUNK_SZ;
+   }
+}
+
+static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
+{
+   u32 len;
+
+   do {
+   GEM_BUG_ON(!it->sg || !sg_dma_len(it->sg));
+   len = it->max - it->dma;
+   if (len > bytes_to_cpy) {
+   it->dma += bytes_to_cpy;
+   break;
+   }
+
+   bytes_to_cpy -= len;
+
+   it->sg = __sg_next(it->sg);
+   it->dma = sg_dma_address(it->sg);
+   it->max = it->dma + sg_dma_len(it->sg);
+   } while (bytes_to_cpy);
+}
+
 int
 intel_context_migrate_copy(struct intel_context *ce,
   const struct i915_deps *deps,
@@ -644,9 +703,15 @@ intel_context_migrate_copy(struct intel_context *ce,
   bool dst_is_lmem,
   struct i915_request **out)
 {
-   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst), it_ccs;
+   struct drm_i915_private *i915 = ce->engine->i915;
+   u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
+   enum i915_cache_level ccs_cache_level;
+   int src_sz, dst_sz, ccs_sz;
u32 src_offset, dst_offset;
+   u8 src_access, dst_access;
struct i915_request *rq;
+   bool ccs_is_src;
int err;
 
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -655,6 +720,38 @@ intel_context_migrate_copy(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   src_sz = scatter_list_length(src);
+   bytes_to_cpy = src_sz;
+
+   if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) {
+   src_access = !src_is_lmem && dst_is_lmem;
+   dst_access = !src_access;
+
+   dst_sz = scatter_list_length(dst);
+   if (src_is_lmem) {
+   it_ccs = it_dst;
+   ccs_cache_level = dst_cache_level;
+   ccs_is_src = false;
+   } else if (dst_is_l

[PATCH v9 8/9] drm/i915/gem: Add extra pages in ttm_tt for ccs data

2022-04-05 Thread Ramalingam C
On Xe-HP and later devices, dedicated compression control state (CCS)
stored in local memory is used for each surface, to support the
3D and media compression formats.

The memory required for the CCS of the entire local memory is 1/256 of
the local memory size. So before the kernel boots, the required memory
is reserved for the CCS data and a secure register is programmed with
the CCS base address.

So when an object is allocated in local memory, we don't need to
explicitly allocate the space for the ccs data. But when the obj is
evicted into smem, extra space is needed in smem to hold the compression
related data along with the obj, i.e. obj_size + (obj_size/256).

Hence when smem pages are allocated for an obj with an lmem placement
possibility, we allocate them with the extra pages required for the ccs
data for the obj size.

v2:
  Used imperative wording [Thomas]
v3:
  Inflate the pages only when obj's placement is lmem only
v4:
  GEM_BUG_ON if the ttm->num_pages > obj page size [Thomas]

Signed-off-by: Ramalingam C 
cc: Christian Koenig 
cc: Hellstrom Thomas 
Reviewed-by: Thomas Hellstrom 
Reviewed-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 30 -
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index a878910a563c..4c25d9b2f138 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -20,6 +20,7 @@
 #include "gem/i915_gem_ttm.h"
 #include "gem/i915_gem_ttm_move.h"
 #include "gem/i915_gem_ttm_pm.h"
+#include "gt/intel_gpu_commands.h"
 
 #define I915_TTM_PRIO_PURGE 0
 #define I915_TTM_PRIO_NO_PAGES  1
@@ -265,12 +266,33 @@ static const struct i915_refct_sgt_ops tt_rsgt_ops = {
.release = i915_ttm_tt_release
 };
 
+static inline bool
+i915_gem_object_needs_ccs_pages(struct drm_i915_gem_object *obj)
+{
+   bool lmem_placement = false;
+   int i;
+
+   for (i = 0; i < obj->mm.n_placements; i++) {
+   /* Compression is not allowed for the objects with smem placement */
+   if (obj->mm.placements[i]->type == INTEL_MEMORY_SYSTEM)
+   return false;
+   if (!lmem_placement &&
+   obj->mm.placements[i]->type == INTEL_MEMORY_LOCAL)
+   lmem_placement = true;
+   }
+
+   return lmem_placement;
+}
+
 static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
 uint32_t page_flags)
 {
+   struct drm_i915_private *i915 = container_of(bo->bdev, typeof(*i915),
+bdev);
struct ttm_resource_manager *man =
ttm_manager_type(bo->bdev, bo->resource->mem_type);
struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
+   unsigned long ccs_pages = 0;
enum ttm_caching caching;
struct i915_ttm_tt *i915_tt;
int ret;
@@ -293,7 +315,12 @@ static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
i915_tt->is_shmem = true;
}
 
-   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, 0);
+   if (HAS_FLAT_CCS(i915) && i915_gem_object_needs_ccs_pages(obj))
+   ccs_pages = DIV_ROUND_UP(DIV_ROUND_UP(bo->base.size,
+ NUM_BYTES_PER_CCS_BYTE),
+PAGE_SIZE);
+
+   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, ccs_pages);
if (ret)
goto err_free;
 
@@ -773,6 +800,7 @@ static int __i915_ttm_get_pages(struct drm_i915_gem_object *obj,
i915_sg_dma_sizes(rsgt->table.sgl));
}
 
+   GEM_BUG_ON(bo->ttm && ((obj->base.size >> PAGE_SHIFT) < bo->ttm->num_pages));
i915_ttm_adjust_lru(obj);
return ret;
 }
-- 
2.20.1
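
As a worked example of the ccs_pages calculation above (standalone
arithmetic, not kernel code; the 1 GiB object size is illustrative):

#include <stdio.h>

#define NUM_BYTES_PER_CCS_BYTE 256
#define PAGE_SIZE 4096ul
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long size = 1ul << 30;	/* a 1 GiB lmem-only object */
	unsigned long ccs_bytes = DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE);
	unsigned long ccs_pages = DIV_ROUND_UP(ccs_bytes, PAGE_SIZE);

	/* 1 GiB / 256 = 4 MiB of CCS -> 1024 extra 4K pages in smem */
	printf("%lu ccs bytes, %lu extra pages\n", ccs_bytes, ccs_pages);
	return 0;
}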



[PATCH v9 6/9] drm/i915/selftest_migrate: Consider the possible roundup of size

2022-04-05 Thread Ramalingam C
Consider the possible round-up that happens due to the obj size being
aligned to min_page_size during the obj allocation.

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_migrate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c 
b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index c9c4f391c5cc..b5da8b8cd039 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -152,6 +152,9 @@ static int clear(struct intel_migrate *migrate,
if (IS_ERR(obj))
return 0;
 
+   /* Consider the rounded up memory too */
+   sz = obj->base.size;
+
for_i915_gem_ww(&ww, err, true) {
err = i915_gem_object_lock(obj, &ww);
if (err)
-- 
2.20.1
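
A quick illustration of the round-up being accounted for (standalone
arithmetic; the 4K request and 64K min_page_size are example values):

#include <stdio.h>

#define SZ_64K (64ul << 10)
#define round_up(x, a) ((((x) + (a) - 1) / (a)) * (a))

int main(void)
{
	unsigned long requested = 4096;
	/* a region with a 64K min_page_size rounds the allocation up,
	 * so the test must use obj->base.size, not the requested sz */
	unsigned long allocated = round_up(requested, SZ_64K);

	printf("requested %lu, obj->base.size %lu\n", requested, allocated);
	return 0;
}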



[PATCH v9 5/9] drm/i915/gt: Clear compress metadata for Flat-ccs objects

2022-04-05 Thread Ramalingam C
Xe-HP and later devices support Flat CCS, which reserves a portion of
the device memory to store compression metadata. During the clearing of
a device memory buffer object we also need to clear the associated
CCS buffer.

XY_CTRL_SURF_COPY_BLT is a BLT cmd used for reading and writing the
ccs surface of lmem. So on Flat-CCS capable platforms we use
XY_CTRL_SURF_COPY_BLT to clear the CCS metadata.

v2: Fixed issues with platform naming [Lucas]
v3: Rebased [Ram]
Used the round_up funcs [Bob]
v4: Fixed ccs blk calculation [Ram]
Added Kdoc on flat-ccs.
v5: GENMASK is used [Matt]
mocs fix [Matt]
Comments Fix [Matt]
Flush address programming [Ram]
v6: FLUSH_DW is fixed
Few coding style fix
v7: Adopting the XY_FAST_COLOR_BLT [Thomas]
v8: XY_CTRL_SURF_COPY_BLT for ccs clearing.
v9: emit_copy_ccs is used.
v10: ctrl_surf cmds are filled in caller itself. [Thomas]
 only one ctrl surf cmd is used as size of lmem is <=8M [Thomas]

Signed-off-by: Ramalingam C 
Signed-off-by: Ayaz A Siddiqui 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  16 +++
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 137 ++-
 2 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index d1b8c23f7a9e..724ab069ddb6 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -154,8 +154,10 @@
 #define   MI_FLUSH_DW_PROTECTED_MEM_EN (1 << 22)
 #define   MI_FLUSH_DW_STORE_INDEX  (1<<21)
 #define   MI_INVALIDATE_TLB(1<<18)
+#define   MI_FLUSH_DW_CCS  (1<<16)
 #define   MI_FLUSH_DW_OP_STOREDW   (1<<14)
 #define   MI_FLUSH_DW_OP_MASK  (3<<14)
+#define   MI_FLUSH_DW_LLC  (1<<9)
 #define   MI_FLUSH_DW_NOTIFY   (1<<8)
 #define   MI_INVALIDATE_BSD(1<<7)
 #define   MI_FLUSH_DW_USE_GTT  (1<<2)
@@ -204,6 +206,20 @@
 #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3))
 #define GFX_OP_DRAWRECT_INFO_I965  ((0x7900<<16)|0x2)
 
+#define XY_CTRL_SURF_INSTR_SIZE5
+#define MI_FLUSH_DW_SIZE   3
+#define XY_CTRL_SURF_COPY_BLT  ((2 << 29) | (0x48 << 22) | 3)
+#define   SRC_ACCESS_TYPE_SHIFT21
+#define   DST_ACCESS_TYPE_SHIFT20
+#define   CCS_SIZE_MASK0x3FF
+#define   CCS_SIZE_SHIFT   8
+#define   XY_CTRL_SURF_MOCS_MASK   GENMASK(31, 25)
+#define   NUM_CCS_BYTES_PER_BLOCK  256
+#define   NUM_BYTES_PER_CCS_BYTE   256
+#define   NUM_CCS_BLKS_PER_XFER1024
+#define   INDIRECT_ACCESS  0
+#define   DIRECT_ACCESS1
+
 #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
 #define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 6378d4450e1a..5dec1df40e0e 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -17,6 +17,8 @@ struct insert_pte_data {
 
 #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
 
+#define GET_CCS_BYTES(i915, size)  (HAS_FLAT_CCS(i915) ? \
+DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0)
 static bool engine_supports_migration(struct intel_engine_cs *engine)
 {
if (!engine)
@@ -467,6 +469,123 @@ static bool wa_1209644611_applies(int ver, u32 size)
return height % 4 == 3 && height <= 8;
 }
 
+/**
+ * DOC: Flat-CCS - Memory compression for Local memory
+ *
+ * On Xe-HP and later devices, we use dedicated compression control state (CCS)
+ * stored in local memory for each surface, to support the 3D and media
+ * compression formats.
+ *
+ * The memory required for the CCS of the entire local memory is 1/256 of the
+ * local memory size. So before the kernel boot, the required memory is reserved
+ * for the CCS data and a secure register will be programmed with the CCS base
+ * address.
+ *
+ * Flat CCS data needs to be cleared when a lmem object is allocated.
+ * And CCS data can be copied in and out of CCS region through
+ * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
+ *
+ * When we exhaust the lmem, if the object's placements support smem, then we can
+ * directly decompress the compressed lmem object into smem and start using it
+ * from smem itself.
+ *
+ * But when we need to swapout the compressed lmem object into a smem region
+ * though objects' placement doesn't support smem, then we copy the lmem content
+ * as it is into smem region along with ccs data (using XY_CTRL_SURF_
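
The block arithmetic behind the "only one ctrl surf cmd" note in the
changelog, as standalone C (illustrative; the defines mirror the ones
added in intel_gpu_commands.h above):

#include <stdio.h>

#define SZ_8M (8ul << 20)
#define NUM_BYTES_PER_CCS_BYTE 256
#define NUM_CCS_BYTES_PER_BLOCK 256
#define NUM_CCS_BLKS_PER_XFER 1024
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long ccs_bytes = DIV_ROUND_UP(SZ_8M, NUM_BYTES_PER_CCS_BYTE);
	unsigned long ccs_blks = DIV_ROUND_UP(ccs_bytes,
					      NUM_CCS_BYTES_PER_BLOCK);

	/* an 8M chunk has 32K of CCS = 128 blocks, well under the
	 * 1024-block limit of one XY_CTRL_SURF_COPY_BLT */
	printf("%lu ccs bytes -> %lu blocks (max %u per xfer)\n",
	       ccs_bytes, ccs_blks, NUM_CCS_BLKS_PER_XFER);
	return 0;
}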

[PATCH v9 2/9] drm/i915/gt: Use XY_FAST_COLOR_BLT to clear obj on graphics ver 12+

2022-04-05 Thread Ramalingam C
Use the faster XY_FAST_COLOR_BLT cmd on graphics version 12 and above
for clearing (zeroing out) the pages of a newly allocated object.

XY_FAST_COLOR_BLT is faster than the older XY_COLOR_BLT.

v2:
  Typo fix at title [Thomas]
v3:
  XY_FAST_COLOR_BLT is used only for FLAT_CCS capable gen12+

Signed-off-by: Ramalingam C 
Signed-off-by: Chris Wilson 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  5 +++
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 43 +---
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index 4243be030bc1..d1b8c23f7a9e 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -206,6 +206,11 @@
 
 #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
+#define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
+#define   XY_FAST_COLOR_BLT_DEPTH_32   (2 << 19)
+#define   XY_FAST_COLOR_BLT_DW 16
+#define   XY_FAST_COLOR_BLT_MOCS_MASK  GENMASK(27, 21)
+#define   XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT 31
 #define SRC_COPY_BLT_CMD   (2 << 29 | 0x43 << 22)
 #define GEN9_XY_FAST_COPY_BLT_CMD  (2 << 29 | 0x42 << 22)
 #define XY_SRC_COPY_BLT_CMD(2 << 29 | 0x53 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 9d852a570400..e81f20266f62 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -613,18 +613,51 @@ intel_context_migrate_copy(struct intel_context *ce,
return err;
 }
 
-static int emit_clear(struct i915_request *rq, u32 offset, int size, u32 value)
+static int emit_clear(struct i915_request *rq, u32 offset, int size,
+ u32 value, bool is_lmem)
 {
-   const int ver = GRAPHICS_VER(rq->engine->i915);
+   struct drm_i915_private *i915 = rq->engine->i915;
+   int mocs = rq->engine->gt->mocs.uc_index << 1;
+   const int ver = GRAPHICS_VER(i915);
+   int ring_sz;
u32 *cs;
 
GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
 
-   cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
+   if (HAS_FLAT_CCS(i915) && ver >= 12)
+   ring_sz = XY_FAST_COLOR_BLT_DW;
+   else if (ver >= 8)
+   ring_sz = 8;
+   else
+   ring_sz = 6;
+
+   cs = intel_ring_begin(rq, ring_sz);
if (IS_ERR(cs))
return PTR_ERR(cs);
 
-   if (ver >= 8) {
+   if (HAS_FLAT_CCS(i915) && ver >= 12) {
+   *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
+   (XY_FAST_COLOR_BLT_DW - 2);
+   *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) |
+   (PAGE_SIZE - 1);
+   *cs++ = 0;
+   *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+   *cs++ = offset;
+   *cs++ = rq->engine->instance;
+   *cs++ = !is_lmem << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
+   /* BG7 */
+   *cs++ = value;
+   *cs++ = 0;
+   *cs++ = 0;
+   *cs++ = 0;
+   /* BG11 */
+   *cs++ = 0;
+   *cs++ = 0;
+   /* BG13 */
+   *cs++ = 0;
+   *cs++ = 0;
+   *cs++ = 0;
+   } else if (ver >= 8) {
*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
@@ -707,7 +740,7 @@ intel_context_migrate_clear(struct intel_context *ce,
if (err)
goto out_rq;
 
-   err = emit_clear(rq, offset, len, value);
+   err = emit_clear(rq, offset, len, value, is_lmem);
 
/* Arbitration is re-enabled between requests. */
 out_rq:
-- 
2.20.1



[PATCH v9 4/9] drm/i915/gt: Pass the -EINVAL when emit_pte doesn't update any PTE

2022-04-05 Thread Ramalingam C
When emit_pte doesn't update any PTE and returns 0, interpret it
as -EINVAL.

v2:
  Add missing goto [Thomas]

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index e0f1c727662e..6378d4450e1a 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -577,7 +577,11 @@ intel_context_migrate_copy(struct intel_context *ce,
 
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
   src_offset, CHUNK_SZ);
-   if (len <= 0) {
+   if (!len) {
+   err = -EINVAL;
+   goto out_rq;
+   }
+   if (len < 0) {
err = len;
goto out_rq;
}
-- 
2.20.1



[PATCH v9 3/9] drm/i915/gt: Optimize the migration and clear loop

2022-04-05 Thread Ramalingam C
Move the static calculations out of the loops for copy and clear.

v2:
  Fix the loss of proper error code on emit_pte

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom  (v1)
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 34 -
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index e81f20266f62..e0f1c727662e 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -526,6 +526,7 @@ intel_context_migrate_copy(struct intel_context *ce,
   struct i915_request **out)
 {
struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+   u32 src_offset, dst_offset;
struct i915_request *rq;
int err;
 
@@ -535,8 +536,18 @@ intel_context_migrate_copy(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   src_offset = 0;
+   dst_offset = CHUNK_SZ;
+   if (HAS_64K_PAGES(ce->engine->i915)) {
+   src_offset = 0;
+   dst_offset = 0;
+   if (src_is_lmem)
+   src_offset = CHUNK_SZ;
+   if (dst_is_lmem)
+   dst_offset = 2 * CHUNK_SZ;
+   }
+
do {
-   u32 src_offset, dst_offset;
int len;
 
rq = i915_request_create(ce);
@@ -564,17 +575,6 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
 
-   src_offset = 0;
-   dst_offset = CHUNK_SZ;
-   if (HAS_64K_PAGES(ce->engine->i915)) {
-   src_offset = 0;
-   dst_offset = 0;
-   if (src_is_lmem)
-   src_offset = CHUNK_SZ;
-   if (dst_is_lmem)
-   dst_offset = 2 * CHUNK_SZ;
-   }
-
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
   src_offset, CHUNK_SZ);
if (len <= 0) {
@@ -690,6 +690,7 @@ intel_context_migrate_clear(struct intel_context *ce,
 {
struct sgt_dma it = sg_sgt(sg);
struct i915_request *rq;
+   u32 offset;
int err;
 
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -697,8 +698,11 @@ intel_context_migrate_clear(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   offset = 0;
+   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
+   offset = CHUNK_SZ;
+
do {
-   u32 offset;
int len;
 
rq = i915_request_create(ce);
@@ -726,10 +730,6 @@ intel_context_migrate_clear(struct intel_context *ce,
if (err)
goto out_rq;
 
-   offset = 0;
-   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
-   offset = CHUNK_SZ;
-
len = emit_pte(rq, &it, cache_level, is_lmem, offset, CHUNK_SZ);
if (len <= 0) {
err = len;
-- 
2.20.1



[PATCH v9 1/9] drm/i915/gt: use engine instance directly for offset

2022-04-05 Thread Ramalingam C
To make it uniform across copy and clear, use the engine instance directly
to calculate the offset when forming the cmd for emit_clear.

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 950fd6da146c..9d852a570400 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -613,15 +613,13 @@ intel_context_migrate_copy(struct intel_context *ce,
return err;
 }
 
-static int emit_clear(struct i915_request *rq, u64 offset, int size, u32 value)
+static int emit_clear(struct i915_request *rq, u32 offset, int size, u32 value)
 {
const int ver = GRAPHICS_VER(rq->engine->i915);
u32 *cs;
 
GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
 
-   offset += (u64)rq->engine->instance << 32;
-
cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
if (IS_ERR(cs))
return PTR_ERR(cs);
@@ -631,17 +629,16 @@ static int emit_clear(struct i915_request *rq, u64 offset, int size, u32 value)
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-   *cs++ = lower_32_bits(offset);
-   *cs++ = upper_32_bits(offset);
+   *cs++ = offset;
+   *cs++ = rq->engine->instance;
*cs++ = value;
*cs++ = MI_NOOP;
} else {
-   GEM_BUG_ON(upper_32_bits(offset));
*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-   *cs++ = lower_32_bits(offset);
+   *cs++ = offset;
*cs++ = value;
}
 
-- 
2.20.1
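
The offset packing this patch makes uniform, sketched as plain arithmetic
(standalone C with illustrative values; the hardware consumes the low and
high dwords as separate cmd fields):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t offset = 0x100000;	/* low dword: offset in the window */
	uint32_t instance = 1;		/* high dword: engine instance */
	uint64_t addr = ((uint64_t)instance << 32) | offset;

	/* emitting offset then instance is equivalent to splitting the
	 * old u64 into lower_32_bits()/upper_32_bits() */
	printf("addr = 0x%llx\n", (unsigned long long)addr);
	return 0;
}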



[PATCH v9 0/9] drm/i915/ttm: Evict and restore of compressed object

2022-04-05 Thread Ramalingam C
On Xe-HP and later devices, we use dedicated compression control
state (CCS) stored in local memory for each surface, to support
the 3D and media compression formats.

The memory required for the CCS of the entire local memory is
1/256 of the local memory size. So before the kernel
boots, the required memory is reserved for the CCS data and a
secure register will be programmed with the CCS base address.

So when we allocate an object in local memory we don't need to explicitly
allocate the space for the ccs data. But when we evict the obj into smem,
to hold the compression related data along with the obj we need smem
space of obj_size + (obj_size/256).

Hence when we create smem for an obj with an lmem placement possibility we
create it with the extra space.

When we are swapping out a local memory obj on a flat-ccs capable platform,
we need to capture the ccs data too along with the main memory, and we need
to restore it when we are swapping in the content.

When an lmem object is swapped into a smem obj, the smem obj will have the
extra pages required to hold the ccs data corresponding to the lmem main
memory. So the main memory of the lmem will be copied into the initial
pages of the smem, and then the ccs data corresponding to the main memory
will be copied to the subsequent pages of the smem.

Swapin happens exactly in the reverse order. First the main memory of the
lmem is restored from the smem's initial pages, and then the ccs data is
restored from the subsequent pages of the smem.

Extracting and restoring the CCS data is done through a special cmd called
XY_CTRL_SURF_COPY_BLT.

v8 and v9:
  New patch for return value fix
  Fix a return error code

Test-with: 20220405141050.16037-1-ramalinga...@intel.com

Ramalingam C (9):
  drm/i915/gt: use engine instance directly for offset
  drm/i915/gt: Use XY_FAST_COLOR_BLT to clear obj on graphics ver 12+
  drm/i915/gt: Optimize the migration and clear loop
  drm/i915/gt: Pass the -EINVAL when emit_pte doesn't update any PTE
  drm/i915/gt: Clear compress metadata for Flat-ccs objects
  drm/i915/selftest_migrate: Consider the possible roundup of size
  drm/i915/selftest_migrate: Check CCS meta data clear
  drm/i915/gem: Add extra pages in ttm_tt for ccs data
  drm/i915/migrate: Evict and restore the flatccs capable lmem obj

 drivers/gpu/drm/i915/gem/i915_gem_ttm.c  |  30 +-
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  21 +
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 387 +--
 drivers/gpu/drm/i915/gt/selftest_migrate.c   | 253 ++--
 4 files changed, 631 insertions(+), 60 deletions(-)

-- 
2.20.1



[PATCH v8 6/9] drm/i915/selftest_migrate: Consider the possible roundup of size

2022-04-05 Thread Ramalingam C
Consider the possible round-up that happens due to the obj size being
aligned to min_page_size during the obj allocation.

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_migrate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c 
b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index c9c4f391c5cc..b5da8b8cd039 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -152,6 +152,9 @@ static int clear(struct intel_migrate *migrate,
if (IS_ERR(obj))
return 0;
 
+   /* Consider the rounded up memory too */
+   sz = obj->base.size;
+
for_i915_gem_ww(&ww, err, true) {
err = i915_gem_object_lock(obj, &ww);
if (err)
-- 
2.20.1



[PATCH v8 9/9] drm/i915/migrate: Evict and restore the flatccs capable lmem obj

2022-04-05 Thread Ramalingam C
When we are swapping out a local memory obj on a flat-ccs capable platform,
we need to capture the ccs data too along with the main memory, and we need
to restore it when we are swapping in the content.

When an lmem object is swapped into a smem obj, the smem obj will have the
extra pages required to hold the ccs data corresponding to the lmem main
memory. So the main memory of the lmem will be copied into the initial
pages of the smem, and then the ccs data corresponding to the main memory
will be copied to the subsequent pages of the smem. The ccs data is 1/256
of the lmem size.

Swapin happens exactly in the reverse order. First the main memory of the
lmem is restored from the smem's initial pages, and then the ccs data is
restored from the subsequent pages of the smem.

Extracting and restoring the CCS data is done through a special cmd called
XY_CTRL_SURF_COPY_BLT.

v2: Fixing the ccs handling
v3: Handle the ccs data at same loop as main memory [Thomas]
v4: changes for emit_copy_ccs
v5: handle non-flat-ccs scenario

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 164 +++-
 1 file changed, 160 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 2446ff70ce45..dd7b89589f20 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -633,6 +633,65 @@ static int emit_copy(struct i915_request *rq,
return 0;
 }
 
+static int scatter_list_length(struct scatterlist *sg)
+{
+   int len = 0;
+
+   while (sg && sg_dma_len(sg)) {
+   len += sg_dma_len(sg);
+   sg = sg_next(sg);
+   };
+
+   return len;
+}
+
+static void
+calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
+  int *src_sz, int *ccs_sz, u32 bytes_to_cpy,
+  u32 ccs_bytes_to_cpy)
+{
+   if (ccs_bytes_to_cpy) {
+   /*
+* We can only copy the ccs data corresponding to
+* the CHUNK_SZ of lmem which is
+* GET_CCS_BYTES(i915, CHUNK_SZ))
+*/
+   *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ));
+
+   if (!src_is_lmem)
+   /*
+* When CHUNK_SZ is passed all the pages up to CHUNK_SZ
+* will be taken for the blt. On a Flat-ccs supported
+* platform the smem obj will have more pages than required
+* for main memory, hence limit it to the required size
+* for main memory.
+*/
+   *src_sz = min_t(int, bytes_to_cpy, CHUNK_SZ);
+   } else { /* ccs handling is not required */
+   *src_sz = CHUNK_SZ;
+   }
+}
+
+static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
+{
+   u32 len;
+
+   do {
+   GEM_BUG_ON(!it->sg || !sg_dma_len(it->sg));
+   len = it->max - it->dma;
+   if (len > bytes_to_cpy) {
+   it->dma += bytes_to_cpy;
+   break;
+   }
+
+   bytes_to_cpy -= len;
+
+   it->sg = __sg_next(it->sg);
+   it->dma = sg_dma_address(it->sg);
+   it->max = it->dma + sg_dma_len(it->sg);
+   } while (bytes_to_cpy);
+}
+
 int
 intel_context_migrate_copy(struct intel_context *ce,
   const struct i915_deps *deps,
@@ -644,9 +703,15 @@ intel_context_migrate_copy(struct intel_context *ce,
   bool dst_is_lmem,
   struct i915_request **out)
 {
-   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst), it_ccs;
+   struct drm_i915_private *i915 = ce->engine->i915;
+   u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
+   enum i915_cache_level ccs_cache_level;
+   int src_sz, dst_sz, ccs_sz;
u32 src_offset, dst_offset;
+   u8 src_access, dst_access;
struct i915_request *rq;
+   bool ccs_is_src;
int err;
 
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -655,6 +720,38 @@ intel_context_migrate_copy(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   src_sz = scatter_list_length(src);
+   bytes_to_cpy = src_sz;
+
+   if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) {
+   src_access = !src_is_lmem && dst_is_lmem;
+   dst_access = !src_access;
+
+   dst_sz = scatter_list_length(dst);
+   if (src_is_lmem) {
+   it_ccs = it_dst;
+   ccs_cache_level = dst_cache_level;
+   ccs_is_src = false;
+   } else if (dst_is_l

[PATCH v8 8/9] drm/i915/gem: Add extra pages in ttm_tt for ccs data

2022-04-05 Thread Ramalingam C
On Xe-HP and later devices, dedicated compression control state (CCS)
stored in local memory is used for each surface, to support the
3D and media compression formats.

The memory required for the CCS of the entire local memory is 1/256 of
the local memory size. So before the kernel boots, the required memory
is reserved for the CCS data and a secure register is programmed with
the CCS base address.

So when an object is allocated in local memory, we don't need to
explicitly allocate the space for the ccs data. But when the obj is
evicted into smem, extra space is needed in smem to hold the compression
related data along with the obj, i.e. obj_size + (obj_size/256).

Hence when smem pages are allocated for an obj with an lmem placement
possibility, we allocate them with the extra pages required for the ccs
data for the obj size.

v2:
  Used imperative wording [Thomas]
v3:
  Inflate the pages only when obj's placement is lmem only
v4:
  GEM_BUG_ON if the ttm->num_pages > obj page size [Thomas]

Signed-off-by: Ramalingam C 
cc: Christian Koenig 
cc: Hellstrom Thomas 
Reviewed-by: Thomas Hellstrom 
Reviewed-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 30 -
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index a878910a563c..4c25d9b2f138 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -20,6 +20,7 @@
 #include "gem/i915_gem_ttm.h"
 #include "gem/i915_gem_ttm_move.h"
 #include "gem/i915_gem_ttm_pm.h"
+#include "gt/intel_gpu_commands.h"
 
 #define I915_TTM_PRIO_PURGE 0
 #define I915_TTM_PRIO_NO_PAGES  1
@@ -265,12 +266,33 @@ static const struct i915_refct_sgt_ops tt_rsgt_ops = {
.release = i915_ttm_tt_release
 };
 
+static inline bool
+i915_gem_object_needs_ccs_pages(struct drm_i915_gem_object *obj)
+{
+   bool lmem_placement = false;
+   int i;
+
+   for (i = 0; i < obj->mm.n_placements; i++) {
+   /* Compression is not allowed for the objects with smem placement */
+   if (obj->mm.placements[i]->type == INTEL_MEMORY_SYSTEM)
+   return false;
+   if (!lmem_placement &&
+   obj->mm.placements[i]->type == INTEL_MEMORY_LOCAL)
+   lmem_placement = true;
+   }
+
+   return lmem_placement;
+}
+
 static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
 uint32_t page_flags)
 {
+   struct drm_i915_private *i915 = container_of(bo->bdev, typeof(*i915),
+bdev);
struct ttm_resource_manager *man =
ttm_manager_type(bo->bdev, bo->resource->mem_type);
struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
+   unsigned long ccs_pages = 0;
enum ttm_caching caching;
struct i915_ttm_tt *i915_tt;
int ret;
@@ -293,7 +315,12 @@ static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
i915_tt->is_shmem = true;
}
 
-   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, 0);
+   if (HAS_FLAT_CCS(i915) && i915_gem_object_needs_ccs_pages(obj))
+   ccs_pages = DIV_ROUND_UP(DIV_ROUND_UP(bo->base.size,
+ NUM_BYTES_PER_CCS_BYTE),
+PAGE_SIZE);
+
+   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, ccs_pages);
if (ret)
goto err_free;
 
@@ -773,6 +800,7 @@ static int __i915_ttm_get_pages(struct drm_i915_gem_object *obj,
i915_sg_dma_sizes(rsgt->table.sgl));
}
 
+   GEM_BUG_ON(bo->ttm && ((obj->base.size >> PAGE_SHIFT) < bo->ttm->num_pages));
i915_ttm_adjust_lru(obj);
return ret;
 }
-- 
2.20.1



[PATCH v8 7/9] drm/i915/selftest_migrate: Check CCS meta data clear

2022-04-05 Thread Ramalingam C
Extend the live migrate selftest to verify the ccs surface clearing
during the clear of a Flat-CCS capable lmem obj.

v2:
  Look at right places for ccs data [Thomas]

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_migrate.c | 250 ++---
 1 file changed, 222 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c 
b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index b5da8b8cd039..8cd9a22054f3 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -132,6 +132,124 @@ static int copy(struct intel_migrate *migrate,
return err;
 }
 
+static int intel_context_copy_ccs(struct intel_context *ce,
+ const struct i915_deps *deps,
+ struct scatterlist *sg,
+ enum i915_cache_level cache_level,
+ bool write_to_ccs,
+ struct i915_request **out)
+{
+   u8 src_access = write_to_ccs ? DIRECT_ACCESS : INDIRECT_ACCESS;
+   u8 dst_access = write_to_ccs ? INDIRECT_ACCESS : DIRECT_ACCESS;
+   struct sgt_dma it = sg_sgt(sg);
+   struct i915_request *rq;
+   u32 offset;
+   int err;
+
+   GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
+   *out = NULL;
+
+   GEM_BUG_ON(ce->ring->size < SZ_64K);
+
+   offset = 0;
+   if (HAS_64K_PAGES(ce->engine->i915))
+   offset = CHUNK_SZ;
+
+   do {
+   int len;
+
+   rq = i915_request_create(ce);
+   if (IS_ERR(rq)) {
+   err = PTR_ERR(rq);
+   goto out_ce;
+   }
+
+   if (deps) {
+   err = i915_request_await_deps(rq, deps);
+   if (err)
+   goto out_rq;
+
+   if (rq->engine->emit_init_breadcrumb) {
+   err = rq->engine->emit_init_breadcrumb(rq);
+   if (err)
+   goto out_rq;
+   }
+
+   deps = NULL;
+   }
+
+   /* The PTE updates + clear must not be interrupted. */
+   err = emit_no_arbitration(rq);
+   if (err)
+   goto out_rq;
+
+   len = emit_pte(rq, &it, cache_level, true, offset, CHUNK_SZ);
+   if (len <= 0) {
+   err = len;
+   goto out_rq;
+   }
+
+   err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+   if (err)
+   goto out_rq;
+
+   err = emit_copy_ccs(rq, offset, dst_access,
+   offset, src_access, len);
+   if (err)
+   goto out_rq;
+
+   err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+
+   /* Arbitration is re-enabled between requests. */
+out_rq:
+   if (*out)
+   i915_request_put(*out);
+   *out = i915_request_get(rq);
+   i915_request_add(rq);
+   if (err || !it.sg || !sg_dma_len(it.sg))
+   break;
+
+   cond_resched();
+   } while (1);
+
+out_ce:
+   return err;
+}
+
+static int
+intel_migrate_ccs_copy(struct intel_migrate *m,
+  struct i915_gem_ww_ctx *ww,
+  const struct i915_deps *deps,
+  struct scatterlist *sg,
+  enum i915_cache_level cache_level,
+  bool write_to_ccs,
+  struct i915_request **out)
+{
+   struct intel_context *ce;
+   int err;
+
+   *out = NULL;
+   if (!m->context)
+   return -ENODEV;
+
+   ce = intel_migrate_create_context(m);
+   if (IS_ERR(ce))
+   ce = intel_context_get(m->context);
+   GEM_BUG_ON(IS_ERR(ce));
+
+   err = intel_context_pin_ww(ce, ww);
+   if (err)
+   goto out;
+
+   err = intel_context_copy_ccs(ce, deps, sg, cache_level,
+write_to_ccs, out);
+
+   intel_context_unpin(ce);
+out:
+   intel_context_put(ce);
+   return err;
+}
+
 static int clear(struct intel_migrate *migrate,
 int (*fn)(struct intel_migrate *migrate,
   struct i915_gem_ww_ctx *ww,
@@ -144,7 +262,8 @@ static int clear(struct intel_migrate *migrate,
struct drm_i915_gem_object *obj;
struct i915_request *rq;
struct i915_gem_ww_ctx ww;
-   u32 *vaddr;
+   u32 *vaddr, val = 0;
+   bool ccs_cap = false;
int err = 0;
int i;
 
@@ -155,7 +274,12 @@ static int clear(struct intel_migrate *migrate,
  

[PATCH v8 4/9] drm/i915/gt: Pass the -EINVAL when emit_pte doesn't update any PTE

2022-04-05 Thread Ramalingam C
When emit_pte doesn't update any PTE and returns 0, interpret it
as -EINVAL.

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index e0f1c727662e..f9f3b0e7ed87 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -577,7 +577,9 @@ intel_context_migrate_copy(struct intel_context *ce,
 
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
   src_offset, CHUNK_SZ);
-   if (len <= 0) {
+   if (!len)
+   err = -EINVAL;
+   if (len < 0) {
err = len;
goto out_rq;
}
-- 
2.20.1



[PATCH v8 5/9] drm/i915/gt: Clear compress metadata for Flat-ccs objects

2022-04-05 Thread Ramalingam C
Xe-HP and later devices support Flat CCS, which reserves a portion of
the device memory to store compression metadata. During the clearing of
a device memory buffer object we also need to clear the associated
CCS buffer.

XY_CTRL_SURF_COPY_BLT is a BLT cmd used for reading and writing the
ccs surface of an lmem object. So on Flat-CCS capable platforms we use
XY_CTRL_SURF_COPY_BLT to clear the CCS metadata.

v2: Fixed issues with platform naming [Lucas]
v3: Rebased [Ram]
Used the round_up funcs [Bob]
v4: Fixed ccs blk calculation [Ram]
Added Kdoc on flat-ccs.
v5: GENMASK is used [Matt]
mocs fix [Matt]
Comments Fix [Matt]
Flush address programming [Ram]
v6: FLUSH_DW is fixed
Few coding style fix
v7: Adopting the XY_FAST_COLOR_BLT [Thomas]
v8: XY_CTRL_SURF_COPY_BLT for ccs clearing.
v9: emit_copy_ccs is used.
v10: ctrl_surf cmds are filled in caller itself. [Thomas]
 only one ctrl surf cmd is used as size of lmem is <=8M [Thomas]

Signed-off-by: Ramalingam C 
Signed-off-by: Ayaz A Siddiqui 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  16 +++
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 137 ++-
 2 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index d1b8c23f7a9e..724ab069ddb6 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -154,8 +154,10 @@
 #define   MI_FLUSH_DW_PROTECTED_MEM_EN (1 << 22)
 #define   MI_FLUSH_DW_STORE_INDEX  (1<<21)
 #define   MI_INVALIDATE_TLB(1<<18)
+#define   MI_FLUSH_DW_CCS  (1<<16)
 #define   MI_FLUSH_DW_OP_STOREDW   (1<<14)
 #define   MI_FLUSH_DW_OP_MASK  (3<<14)
+#define   MI_FLUSH_DW_LLC  (1<<9)
 #define   MI_FLUSH_DW_NOTIFY   (1<<8)
 #define   MI_INVALIDATE_BSD(1<<7)
 #define   MI_FLUSH_DW_USE_GTT  (1<<2)
@@ -204,6 +206,20 @@
 #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3))
 #define GFX_OP_DRAWRECT_INFO_I965  ((0x7900<<16)|0x2)
 
+#define XY_CTRL_SURF_INSTR_SIZE5
+#define MI_FLUSH_DW_SIZE   3
+#define XY_CTRL_SURF_COPY_BLT  ((2 << 29) | (0x48 << 22) | 3)
+#define   SRC_ACCESS_TYPE_SHIFT21
+#define   DST_ACCESS_TYPE_SHIFT20
+#define   CCS_SIZE_MASK0x3FF
+#define   CCS_SIZE_SHIFT   8
+#define   XY_CTRL_SURF_MOCS_MASK   GENMASK(31, 25)
+#define   NUM_CCS_BYTES_PER_BLOCK  256
+#define   NUM_BYTES_PER_CCS_BYTE   256
+#define   NUM_CCS_BLKS_PER_XFER1024
+#define   INDIRECT_ACCESS  0
+#define   DIRECT_ACCESS1
+
 #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
 #define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index f9f3b0e7ed87..2446ff70ce45 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -17,6 +17,8 @@ struct insert_pte_data {
 
 #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
 
+#define GET_CCS_BYTES(i915, size)  (HAS_FLAT_CCS(i915) ? \
+DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0)
 static bool engine_supports_migration(struct intel_engine_cs *engine)
 {
if (!engine)
@@ -467,6 +469,123 @@ static bool wa_1209644611_applies(int ver, u32 size)
return height % 4 == 3 && height <= 8;
 }
 
+/**
+ * DOC: Flat-CCS - Memory compression for Local memory
+ *
+ * On Xe-HP and later devices, we use dedicated compression control state (CCS)
+ * stored in local memory for each surface, to support the 3D and media
+ * compression formats.
+ *
+ * The memory required for the CCS of the entire local memory is 1/256 of the
+ * local memory size. So before the kernel boot, the required memory is reserved
+ * for the CCS data and a secure register will be programmed with the CCS base
+ * address.
+ *
+ * Flat CCS data needs to be cleared when a lmem object is allocated.
+ * And CCS data can be copied in and out of CCS region through
+ * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
+ *
+ * When we exhaust the lmem, if the object's placements support smem, then we can
+ * directly decompress the compressed lmem object into smem and start using it
+ * from smem itself.
+ *
+ * But when we need to swapout the compressed lmem object into a smem region
+ * though objects' placement doesn't support smem, then we copy the lmem content
+ * as it is into smem region along with ccs data (using XY_CTRL_SURF_
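
A sketch of how such a ctrl-surf transfer can be assembled from the
defines this patch adds (illustrative only; the names mirror the hunk
above, and the five-dword layout follows XY_CTRL_SURF_INSTR_SIZE):

        static u32 *ctrl_surf_copy(u32 *cs, u32 src_offset, u32 dst_offset,
                                   u8 src_access, u8 dst_access,
                                   int mocs, u32 num_ccs_blks, u32 instance)
        {
                *cs++ = XY_CTRL_SURF_COPY_BLT |
                        src_access << SRC_ACCESS_TYPE_SHIFT |
                        dst_access << DST_ACCESS_TYPE_SHIFT |
                        ((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
                *cs++ = src_offset;
                *cs++ = instance | FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
                *cs++ = dst_offset;
                *cs++ = instance | FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
                return cs;
        }

where num_ccs_blks would be DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
NUM_CCS_BYTES_PER_BLOCK), capped at NUM_CCS_BLKS_PER_XFER per
instruction.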

[PATCH v8 3/9] drm/i915/gt: Optimize the migration and clear loop

2022-04-05 Thread Ramalingam C
Move the static calculations out of the loops for copy and clear.

v2:
  Fix the loss of proper error code on emit_pte

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom  (v1)
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 34 -
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index e81f20266f62..e0f1c727662e 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -526,6 +526,7 @@ intel_context_migrate_copy(struct intel_context *ce,
   struct i915_request **out)
 {
struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+   u32 src_offset, dst_offset;
struct i915_request *rq;
int err;
 
@@ -535,8 +536,18 @@ intel_context_migrate_copy(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   src_offset = 0;
+   dst_offset = CHUNK_SZ;
+   if (HAS_64K_PAGES(ce->engine->i915)) {
+   src_offset = 0;
+   dst_offset = 0;
+   if (src_is_lmem)
+   src_offset = CHUNK_SZ;
+   if (dst_is_lmem)
+   dst_offset = 2 * CHUNK_SZ;
+   }
+
do {
-   u32 src_offset, dst_offset;
int len;
 
rq = i915_request_create(ce);
@@ -564,17 +575,6 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
 
-   src_offset = 0;
-   dst_offset = CHUNK_SZ;
-   if (HAS_64K_PAGES(ce->engine->i915)) {
-   src_offset = 0;
-   dst_offset = 0;
-   if (src_is_lmem)
-   src_offset = CHUNK_SZ;
-   if (dst_is_lmem)
-   dst_offset = 2 * CHUNK_SZ;
-   }
-
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
   src_offset, CHUNK_SZ);
if (len <= 0) {
@@ -690,6 +690,7 @@ intel_context_migrate_clear(struct intel_context *ce,
 {
struct sgt_dma it = sg_sgt(sg);
struct i915_request *rq;
+   u32 offset;
int err;
 
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -697,8 +698,11 @@ intel_context_migrate_clear(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   offset = 0;
+   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
+   offset = CHUNK_SZ;
+
do {
-   u32 offset;
int len;
 
rq = i915_request_create(ce);
@@ -726,10 +730,6 @@ intel_context_migrate_clear(struct intel_context *ce,
if (err)
goto out_rq;
 
-   offset = 0;
-   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
-   offset = CHUNK_SZ;
-
len = emit_pte(rq, &it, cache_level, is_lmem, offset, CHUNK_SZ);
if (len <= 0) {
err = len;
-- 
2.20.1
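
An illustrative helper (not in the patch) capturing the window layout
the hoisted offsets select inside the migrate VM; the assumption is
that on 64K-page platforms lmem and smem PTEs may not share a window,
so each mapping type gets its own CHUNK_SZ-sized slot:

        static u32 migrate_window_offset(bool has_64k_pages, bool is_lmem,
                                         bool is_dst)
        {
                if (!has_64k_pages)
                        return is_dst ? CHUNK_SZ : 0; /* src at 0, dst above */
                if (!is_lmem)
                        return 0;                 /* smem shares window 0 */
                return is_dst ? 2 * CHUNK_SZ : CHUNK_SZ; /* lmem windows */
        }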



[PATCH v8 1/9] drm/i915/gt: use engine instance directly for offset

2022-04-05 Thread Ramalingam C
To make it uniform across copy and clear, use the engine instance directly
to calculate the offset when forming the cmd for emit_clear.

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 950fd6da146c..9d852a570400 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -613,15 +613,13 @@ intel_context_migrate_copy(struct intel_context *ce,
return err;
 }
 
-static int emit_clear(struct i915_request *rq, u64 offset, int size, u32 value)
+static int emit_clear(struct i915_request *rq, u32 offset, int size, u32 value)
 {
const int ver = GRAPHICS_VER(rq->engine->i915);
u32 *cs;
 
GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
 
-   offset += (u64)rq->engine->instance << 32;
-
cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
if (IS_ERR(cs))
return PTR_ERR(cs);
@@ -631,17 +629,16 @@ static int emit_clear(struct i915_request *rq, u64 
offset, int size, u32 value)
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-   *cs++ = lower_32_bits(offset);
-   *cs++ = upper_32_bits(offset);
+   *cs++ = offset;
+   *cs++ = rq->engine->instance;
*cs++ = value;
*cs++ = MI_NOOP;
} else {
-   GEM_BUG_ON(upper_32_bits(offset));
*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-   *cs++ = lower_32_bits(offset);
+   *cs++ = offset;
*cs++ = value;
}
 
-- 
2.20.1
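
Both encodings produce the same presumed address; a side-by-side sketch
(illustrative) of what the patch changes:

        /* Before: fold the instance into a u64, then split it again. */
        u64 addr = offset | ((u64)rq->engine->instance << 32);
        *cs++ = lower_32_bits(addr);
        *cs++ = upper_32_bits(addr);

        /* After: emit the two halves directly, no u64 round-trip. */
        *cs++ = offset;
        *cs++ = rq->engine->instance;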



[PATCH v8 2/9] drm/i915/gt: Use XY_FAST_COLOR_BLT to clear obj on graphics ver 12+

2022-04-05 Thread Ramalingam C
Use the faster XY_FAST_COLOR_BLT cmd on graphics version 12 and newer
for clearing (zeroing out) the pages of the newly allocated object.

XY_FAST_COLOR_BLT is faster than the older XY_COLOR_BLT.

v2:
  Typo fix at title [Thomas]
v3:
  XY_FAST_COLOR_BLT is used only for FLAT_CCS capable gen12+

Signed-off-by: Ramalingam C 
Signed-off-by: Chris Wilson 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  5 +++
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 43 +---
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index 4243be030bc1..d1b8c23f7a9e 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -206,6 +206,11 @@
 
 #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
+#define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
+#define   XY_FAST_COLOR_BLT_DEPTH_32   (2 << 19)
+#define   XY_FAST_COLOR_BLT_DW 16
+#define   XY_FAST_COLOR_BLT_MOCS_MASK  GENMASK(27, 21)
+#define   XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT 31
 #define SRC_COPY_BLT_CMD   (2 << 29 | 0x43 << 22)
 #define GEN9_XY_FAST_COPY_BLT_CMD  (2 << 29 | 0x42 << 22)
 #define XY_SRC_COPY_BLT_CMD(2 << 29 | 0x53 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 9d852a570400..e81f20266f62 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -613,18 +613,51 @@ intel_context_migrate_copy(struct intel_context *ce,
return err;
 }
 
-static int emit_clear(struct i915_request *rq, u32 offset, int size, u32 value)
+static int emit_clear(struct i915_request *rq, u32 offset, int size,
+ u32 value, bool is_lmem)
 {
-   const int ver = GRAPHICS_VER(rq->engine->i915);
+   struct drm_i915_private *i915 = rq->engine->i915;
+   int mocs = rq->engine->gt->mocs.uc_index << 1;
+   const int ver = GRAPHICS_VER(i915);
+   int ring_sz;
u32 *cs;
 
GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
 
-   cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
+   if (HAS_FLAT_CCS(i915) && ver >= 12)
+   ring_sz = XY_FAST_COLOR_BLT_DW;
+   else if (ver >= 8)
+   ring_sz = 8;
+   else
+   ring_sz = 6;
+
+   cs = intel_ring_begin(rq, ring_sz);
if (IS_ERR(cs))
return PTR_ERR(cs);
 
-   if (ver >= 8) {
+   if (HAS_FLAT_CCS(i915) && ver >= 12) {
+   *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
+   (XY_FAST_COLOR_BLT_DW - 2);
+   *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) |
+   (PAGE_SIZE - 1);
+   *cs++ = 0;
+   *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+   *cs++ = offset;
+   *cs++ = rq->engine->instance;
+   *cs++ = !is_lmem << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
+   /* BG7 */
+   *cs++ = value;
+   *cs++ = 0;
+   *cs++ = 0;
+   *cs++ = 0;
+   /* BG11 */
+   *cs++ = 0;
+   *cs++ = 0;
+   /* BG13 */
+   *cs++ = 0;
+   *cs++ = 0;
+   *cs++ = 0;
+   } else if (ver >= 8) {
*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
@@ -707,7 +740,7 @@ intel_context_migrate_clear(struct intel_context *ce,
if (err)
goto out_rq;
 
-   err = emit_clear(rq, offset, len, value);
+   err = emit_clear(rq, offset, len, value, is_lmem);
 
/* Arbitration is re-enabled between requests. */
 out_rq:
-- 
2.20.1
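
A quick sanity sketch (not in the patch) of why the GEM_BUG_ON on the
clear size can never fire for chunked clears, assuming the blt height
field is what limits us to S16_MAX pages:

        /* CHUNK_SZ = SZ_8M, so 8M >> 12 = 2048 pages per emit_clear(),
         * comfortably below S16_MAX (32767). */
        BUILD_BUG_ON(CHUNK_SZ >> PAGE_SHIFT > S16_MAX);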



[PATCH v8 0/9] drm/i915/ttm: Evict and restore of compressed object

2022-04-05 Thread Ramalingam C
On Xe-HP and later devices, we use dedicated compression control
state (CCS) stored in local memory for each surface, to support
the 3D and media compression formats.

The memory required for the CCS of the entire local memory is
1/256 of the local memory size. So before the kernel
boot, the required memory is reserved for the CCS data and a
secure register will be programmed with the CCS base address

So when we allocate an object in local memory we don't need to explicitly
allocate the space for ccs data. But when we evict the obj into the smem,
to hold the compression related data along with the obj we need smem
space of obj_size + (obj_size/256).

Hence when we create smem for an obj with lmem placement possibility we
create it with the extra space.

When we are swapping out the local memory obj on a flat-ccs capable platform,
we need to capture the ccs data too along with the main memory, and we need
to restore it when we are swapping in the content.

When lmem object is swapped into a smem obj, smem obj will
have the extra pages required to hold the ccs data corresponding to the
lmem main memory. So main memory of lmem will be copied into the initial
pages of the smem and then ccs data corresponding to the main memory
will be copied to the subsequent pages of smem.

Swapin happens exactly in reverse order. First main memory of lmem is
restored from the smem's initial pages and the ccs data will be restored
from the subsequent pages of smem.

Extracting and restoring the CCS data is done through a special cmd called
XY_CTRL_SURF_COPY_BLT
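
A sketch of the sizing rule described above (illustrative; obj_size is a
stand-in for the object's byte size): the smem backing store must carry
the main memory plus 1/256th of it for the CCS data.

        unsigned long main_pages =
                PAGE_ALIGN(obj_size) >> PAGE_SHIFT;
        unsigned long ccs_pages =
                DIV_ROUND_UP(DIV_ROUND_UP(obj_size, NUM_BYTES_PER_CCS_BYTE),
                             PAGE_SIZE);
        unsigned long smem_pages = main_pages + ccs_pages;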

v8:
  New patch for return value fix
  Fix a return error code

Test-with: 20220405141050.16037-1-ramalinga...@intel.com

Ramalingam C (9):
  drm/i915/gt: use engine instance directly for offset
  drm/i915/gt: Use XY_FAST_COLOR_BLT to clear obj on graphics ver 12+
  drm/i915/gt: Optimize the migration and clear loop
  drm/i915/gt: Pass the -EINVAL when emit_pte doesn't update any PTE
  drm/i915/gt: Clear compress metadata for Flat-ccs objects
  drm/i915/selftest_migrate: Consider the possible roundup of size
  drm/i915/selftest_migrate: Check CCS meta data clear
  drm/i915/gem: Add extra pages in ttm_tt for ccs data
  drm/i915/migrate: Evict and restore the flatccs capable lmem obj

 drivers/gpu/drm/i915/gem/i915_gem_ttm.c  |  30 +-
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  21 +
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 385 +--
 drivers/gpu/drm/i915/gt/selftest_migrate.c   | 253 ++--
 4 files changed, 629 insertions(+), 60 deletions(-)

-- 
2.20.1



Re: [PATCH v7 3/9] drm/i915/gt: Optimize the migration and clear loop

2022-04-05 Thread Ramalingam C
On 2022-03-29 at 18:53:42 +0530, Balasubramani Vivekanandan wrote:
> On 29.03.2022 00:37, Ramalingam C wrote:
> > Move the static calculations out of the loops for copy and clear.
> > 
> > Signed-off-by: Ramalingam C 
> > Reviewed-by: Thomas Hellström 
> > ---
> >  drivers/gpu/drm/i915/gt/intel_migrate.c | 44 -
> >  1 file changed, 21 insertions(+), 23 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
> > b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > index 17dd372a47d1..ec9a9e7cb388 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -526,6 +526,7 @@ intel_context_migrate_copy(struct intel_context *ce,
> >struct i915_request **out)
> >  {
> > struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
> > +   u32 src_offset, dst_offset;
> > struct i915_request *rq;
> > int err;
> >  
> > @@ -534,8 +535,20 @@ intel_context_migrate_copy(struct intel_context *ce,
> >  
> > GEM_BUG_ON(ce->ring->size < SZ_64K);
> >  
> > +   src_offset = 0;
> > +   dst_offset = CHUNK_SZ;
> > +   if (HAS_64K_PAGES(ce->engine->i915)) {
> > +   GEM_BUG_ON(!src_is_lmem && !dst_is_lmem);
> > +
> > +   src_offset = 0;
> > +   dst_offset = 0;
> > +   if (src_is_lmem)
> > +   src_offset = CHUNK_SZ;
> > +   if (dst_is_lmem)
> > +   dst_offset = 2 * CHUNK_SZ;
> > +   }
> > +
> > do {
> > -   u32 src_offset, dst_offset;
> > int len;
> >  
> > rq = i915_request_create(ce);
> > @@ -563,19 +576,6 @@ intel_context_migrate_copy(struct intel_context *ce,
> > if (err)
> > goto out_rq;
> >  
> > -   src_offset = 0;
> > -   dst_offset = CHUNK_SZ;
> > -   if (HAS_64K_PAGES(ce->engine->i915)) {
> > -   GEM_BUG_ON(!src_is_lmem && !dst_is_lmem);
> > -
> > -   src_offset = 0;
> > -   dst_offset = 0;
> > -   if (src_is_lmem)
> > -   src_offset = CHUNK_SZ;
> > -   if (dst_is_lmem)
> > -   dst_offset = 2 * CHUNK_SZ;
> > -   }
> > -
> > len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
> >src_offset, CHUNK_SZ);
> > if (len <= 0) {
> > @@ -585,12 +585,10 @@ intel_context_migrate_copy(struct intel_context *ce,
> >  
> > err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
> >dst_offset, len);
> > -   if (err < 0)
> > -   goto out_rq;
> > -   if (err < len) {
> > +   if (err < len)
> > err = -EINVAL;
> > +   if (err < 0)
> > goto out_rq;
> > -   }
> With this change, for the case 0 < err < len, now the code does not
> reach `goto out_rq`.

With this change, flow will land in out_rq for all err < len.
But just now I am noticing that we are overriding all error codes
with -EINVAL. I will fix that.

Ram.
> Is it the expected behavior? If yes, can you please add some details
> regarding this change in the commit description.
> 
> Regards,
> Bala
> >  
> > err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
> > if (err)
> > @@ -691,6 +689,7 @@ intel_context_migrate_clear(struct intel_context *ce,
> >  {
> > struct sgt_dma it = sg_sgt(sg);
> > struct i915_request *rq;
> > +   u32 offset;
> > int err;
> >  
> > GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
> > @@ -698,8 +697,11 @@ intel_context_migrate_clear(struct intel_context *ce,
> >  
> > GEM_BUG_ON(ce->ring->size < SZ_64K);
> >  
> > +   offset = 0;
> > +   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
> > +   offset = CHUNK_SZ;
> > +
> > do {
> > -   u32 offset;
> > int len;
> >  
> > rq = i915_request_create(ce);
> > @@ -727,10 +729,6 @@ intel_context_migrate_clear(struct intel_context *ce,
> > if (err)
> > goto out_rq;
> >  
> > -   offset = 0;
> > -   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
> > -   offset = CHUNK_SZ;
> > -
> > len = emit_pte(rq, &it, cache_level, is_lmem, offset, CHUNK_SZ);
> > if (len <= 0) {
> > err = len;
> > -- 
> > 2.20.1
> > 
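
One way to keep the hoisting while preserving the original error codes,
per the discussion above (a sketch, not the posted patch):

        err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
                       dst_offset, len);
        if (err < 0)
                goto out_rq;    /* real errors propagate unchanged */
        if (err < len) {
                err = -EINVAL;  /* short PTE emission is invalid */
                goto out_rq;
        }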


Re: [PATCH v7 7/9] drm/ttm: Add a parameter to add extra pages into ttm_tt

2022-04-01 Thread Ramalingam C
On 2022-04-01 at 16:31:19 +0200, Christian König wrote:
> I would be nicer to push this through drm-misc-next, but the intel branch
> works for me as well.
Hi Christian

I have pushed this patch into drm-misc-next.

Regards,
Ram.
> 
> Regards,
> Christian.
> 
> Am 01.04.22 um 16:28 schrieb Ramalingam C:
> > Christian, Joonas and vivi
> > 
> > Once the premerge results are green, if this patch can be merged into
> > drm-intel-gt-next along with other patches could you please ack the
> > request to merge into drm-intel-gt-next?
> > 
> > Thanks
> > Ram
> > 
> > On 2022-04-01 at 18:07:49 +0530, Ramalingam C wrote:
> > > Add a parameter called "extra_pages" for ttm_tt_init, to indicate that
> > > driver needs extra pages in ttm_tt.
> > > 
> > > v2:
> > >Used imperative wording [Thomas and Christian]
> > > 
> > > Signed-off-by: Ramalingam C 
> > > cc: Christian Koenig 
> > > cc: Hellstrom Thomas 
> > > Reviewed-by: Thomas Hellstrom 
> > > Reviewed-by: Christian Konig 
> > > Reviewed-by: Nirmoy Das 
> > > ---
> > >   drivers/gpu/drm/drm_gem_vram_helper.c  |  2 +-
> > >   drivers/gpu/drm/i915/gem/i915_gem_ttm.c|  2 +-
> > >   drivers/gpu/drm/qxl/qxl_ttm.c  |  2 +-
> > >   drivers/gpu/drm/ttm/ttm_agp_backend.c  |  2 +-
> > >   drivers/gpu/drm/ttm/ttm_tt.c   | 12 +++-
> > >   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c |  2 +-
> > >   include/drm/ttm/ttm_tt.h   |  4 +++-
> > >   7 files changed, 15 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c 
> > > b/drivers/gpu/drm/drm_gem_vram_helper.c
> > > index dc7f938bfff2..123045b58fec 100644
> > > --- a/drivers/gpu/drm/drm_gem_vram_helper.c
> > > +++ b/drivers/gpu/drm/drm_gem_vram_helper.c
> > > @@ -867,7 +867,7 @@ static struct ttm_tt *bo_driver_ttm_tt_create(struct 
> > > ttm_buffer_object *bo,
> > >   if (!tt)
> > >   return NULL;
> > > - ret = ttm_tt_init(tt, bo, page_flags, ttm_cached);
> > > + ret = ttm_tt_init(tt, bo, page_flags, ttm_cached, 0);
> > >   if (ret < 0)
> > >   goto err_ttm_tt_init;
> > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
> > > b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > > index c40aca99442f..a878910a563c 100644
> > > --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > > @@ -293,7 +293,7 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
> > > ttm_buffer_object *bo,
> > >   i915_tt->is_shmem = true;
> > >   }
> > > - ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching);
> > > + ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, 0);
> > >   if (ret)
> > >   goto err_free;
> > > diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c
> > > index 95df5750f47f..9ba871bd19b1 100644
> > > --- a/drivers/gpu/drm/qxl/qxl_ttm.c
> > > +++ b/drivers/gpu/drm/qxl/qxl_ttm.c
> > > @@ -113,7 +113,7 @@ static struct ttm_tt *qxl_ttm_tt_create(struct 
> > > ttm_buffer_object *bo,
> > >   ttm = kzalloc(sizeof(struct ttm_tt), GFP_KERNEL);
> > >   if (ttm == NULL)
> > >   return NULL;
> > > - if (ttm_tt_init(ttm, bo, page_flags, ttm_cached)) {
> > > + if (ttm_tt_init(ttm, bo, page_flags, ttm_cached, 0)) {
> > >   kfree(ttm);
> > >   return NULL;
> > >   }
> > > diff --git a/drivers/gpu/drm/ttm/ttm_agp_backend.c 
> > > b/drivers/gpu/drm/ttm/ttm_agp_backend.c
> > > index 6ddc16f0fe2b..d27691f2e451 100644
> > > --- a/drivers/gpu/drm/ttm/ttm_agp_backend.c
> > > +++ b/drivers/gpu/drm/ttm/ttm_agp_backend.c
> > > @@ -134,7 +134,7 @@ struct ttm_tt *ttm_agp_tt_create(struct 
> > > ttm_buffer_object *bo,
> > >   agp_be->mem = NULL;
> > >   agp_be->bridge = bridge;
> > > - if (ttm_tt_init(&agp_be->ttm, bo, page_flags, ttm_write_combined)) {
> > > + if (ttm_tt_init(&agp_be->ttm, bo, page_flags, ttm_write_combined, 0)) {
> > >   kfree(agp_be);
> > >   return NULL;
> > >   }
> > > diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
> > > ind

Re: [PATCH v7 7/9] drm/ttm: Add a parameter to add extra pages into ttm_tt

2022-04-01 Thread Ramalingam C
Christian, Joonas and vivi

Once the premerge results are green, if this patch can be merged into
drm-intel-gt-next along with other patches could you please ack the
request to merge into drm-intel-gt-next?

Thanks
Ram

On 2022-04-01 at 18:07:49 +0530, Ramalingam C wrote:
> Add a parameter called "extra_pages" for ttm_tt_init, to indicate that
> driver needs extra pages in ttm_tt.
> 
> v2:
>   Used imperative wording [Thomas and Christian]
> 
> Signed-off-by: Ramalingam C 
> cc: Christian Koenig 
> cc: Hellstrom Thomas 
> Reviewed-by: Thomas Hellstrom 
> Reviewed-by: Christian Konig 
> Reviewed-by: Nirmoy Das 
> ---
>  drivers/gpu/drm/drm_gem_vram_helper.c  |  2 +-
>  drivers/gpu/drm/i915/gem/i915_gem_ttm.c|  2 +-
>  drivers/gpu/drm/qxl/qxl_ttm.c  |  2 +-
>  drivers/gpu/drm/ttm/ttm_agp_backend.c  |  2 +-
>  drivers/gpu/drm/ttm/ttm_tt.c   | 12 +++-
>  drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c |  2 +-
>  include/drm/ttm/ttm_tt.h   |  4 +++-
>  7 files changed, 15 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c 
> b/drivers/gpu/drm/drm_gem_vram_helper.c
> index dc7f938bfff2..123045b58fec 100644
> --- a/drivers/gpu/drm/drm_gem_vram_helper.c
> +++ b/drivers/gpu/drm/drm_gem_vram_helper.c
> @@ -867,7 +867,7 @@ static struct ttm_tt *bo_driver_ttm_tt_create(struct 
> ttm_buffer_object *bo,
>   if (!tt)
>   return NULL;
>  
> - ret = ttm_tt_init(tt, bo, page_flags, ttm_cached);
> + ret = ttm_tt_init(tt, bo, page_flags, ttm_cached, 0);
>   if (ret < 0)
>   goto err_ttm_tt_init;
>  
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index c40aca99442f..a878910a563c 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -293,7 +293,7 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
> ttm_buffer_object *bo,
>   i915_tt->is_shmem = true;
>   }
>  
> - ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching);
> + ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, 0);
>   if (ret)
>   goto err_free;
>  
> diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c
> index 95df5750f47f..9ba871bd19b1 100644
> --- a/drivers/gpu/drm/qxl/qxl_ttm.c
> +++ b/drivers/gpu/drm/qxl/qxl_ttm.c
> @@ -113,7 +113,7 @@ static struct ttm_tt *qxl_ttm_tt_create(struct 
> ttm_buffer_object *bo,
>   ttm = kzalloc(sizeof(struct ttm_tt), GFP_KERNEL);
>   if (ttm == NULL)
>   return NULL;
> - if (ttm_tt_init(ttm, bo, page_flags, ttm_cached)) {
> + if (ttm_tt_init(ttm, bo, page_flags, ttm_cached, 0)) {
>   kfree(ttm);
>   return NULL;
>   }
> diff --git a/drivers/gpu/drm/ttm/ttm_agp_backend.c 
> b/drivers/gpu/drm/ttm/ttm_agp_backend.c
> index 6ddc16f0fe2b..d27691f2e451 100644
> --- a/drivers/gpu/drm/ttm/ttm_agp_backend.c
> +++ b/drivers/gpu/drm/ttm/ttm_agp_backend.c
> @@ -134,7 +134,7 @@ struct ttm_tt *ttm_agp_tt_create(struct ttm_buffer_object 
> *bo,
>   agp_be->mem = NULL;
>   agp_be->bridge = bridge;
>  
> - if (ttm_tt_init(&agp_be->ttm, bo, page_flags, ttm_write_combined)) {
> + if (ttm_tt_init(&agp_be->ttm, bo, page_flags, ttm_write_combined, 0)) {
>   kfree(agp_be);
>   return NULL;
>   }
> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
> index d234aab800a0..1a66d9fc589a 100644
> --- a/drivers/gpu/drm/ttm/ttm_tt.c
> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
> @@ -134,9 +134,10 @@ void ttm_tt_destroy(struct ttm_device *bdev, struct 
> ttm_tt *ttm)
>  static void ttm_tt_init_fields(struct ttm_tt *ttm,
>  struct ttm_buffer_object *bo,
>  uint32_t page_flags,
> -enum ttm_caching caching)
> +enum ttm_caching caching,
> +unsigned long extra_pages)
>  {
> - ttm->num_pages = PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT;
> + ttm->num_pages = (PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT) + extra_pages;
>   ttm->caching = ttm_cached;
>   ttm->page_flags = page_flags;
>   ttm->dma_address = NULL;
> @@ -146,9 +147,10 @@ static void ttm_tt_init_fields(struct ttm_tt *ttm,
>  }
>  
>  int ttm_tt_init(struct ttm_tt *ttm, struct ttm_buffer_object *bo,
> - uint32_t page_flags, enum ttm_caching caching)
> + uint32_t page_flags, enum ttm_caching caching,
> + unsigned long extra_pages)
>  

[PATCH v7 9/9] drm/i915/migrate: Evict and restore the flatccs capable lmem obj

2022-04-01 Thread Ramalingam C
When we are swapping out the local memory obj on a flat-ccs capable platform,
we need to capture the ccs data too along with the main memory, and we need
to restore it when we are swapping in the content.

When lmem object is swapped into a smem obj, smem obj will
have the extra pages required to hold the ccs data corresponding to the
lmem main memory. So main memory of lmem will be copied into the initial
pages of the smem and then ccs data corresponding to the main memory
will be copied to the subsequent pages of smem. ccs data is 1/256 of
lmem size.

Swapin happens exactly in reverse order. First main memory of lmem is
restored from the smem's initial pages and the ccs data will be restored
from the subsequent pages of smem.

Extracting and restoring the CCS data is done through a special cmd called
XY_CTRL_SURF_COPY_BLT
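
A sketch of the iterator setup this implies (mirroring the hunk below,
illustrative): when an lmem obj is swapped out, the smem sg list carries
the main memory first and the CCS pages after it, so a second iterator
is advanced past the main-memory bytes to point at the CCS portion
before the copy loop starts.

        struct sgt_dma it_ccs = it_dst;         /* start of the smem sg */
        get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);  /* skip main-memory bytes */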

v2: Fixing the ccs handling
v3: Handle the ccs data at same loop as main memory [Thomas]
v4: changes for emit_copy_ccs
v5: handle non-flat-ccs scenario

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 164 +++-
 1 file changed, 160 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index a44f2d29da4e..ec417c84600b 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -633,6 +633,65 @@ static int emit_copy(struct i915_request *rq,
return 0;
 }
 
+static int scatter_list_length(struct scatterlist *sg)
+{
+   int len = 0;
+
+   while (sg && sg_dma_len(sg)) {
+   len += sg_dma_len(sg);
+   sg = sg_next(sg);
+   };
+
+   return len;
+}
+
+static void
+calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
+  int *src_sz, int *ccs_sz, u32 bytes_to_cpy,
+  u32 ccs_bytes_to_cpy)
+{
+   if (ccs_bytes_to_cpy) {
+   /*
+* We can only copy the ccs data corresponding to
+* the CHUNK_SZ of lmem which is
+* GET_CCS_BYTES(i915, CHUNK_SZ))
+*/
+   *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ));
+
+   if (!src_is_lmem)
+   /*
+* When CHUNK_SZ is passed, all the pages up to CHUNK_SZ
+* will be taken for the blt. On Flat-ccs supported
+* platforms the smem obj will have more pages than required
+* for main memory, hence limit it to the required size
+* for main memory
+*/
+   *src_sz = min_t(int, bytes_to_cpy, CHUNK_SZ);
+   } else { /* ccs handling is not required */
+   *src_sz = CHUNK_SZ;
+   }
+}
+
+static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
+{
+   u32 len;
+
+   do {
+   GEM_BUG_ON(!it->sg || !sg_dma_len(it->sg));
+   len = it->max - it->dma;
+   if (len > bytes_to_cpy) {
+   it->dma += bytes_to_cpy;
+   break;
+   }
+
+   bytes_to_cpy -= len;
+
+   it->sg = __sg_next(it->sg);
+   it->dma = sg_dma_address(it->sg);
+   it->max = it->dma + sg_dma_len(it->sg);
+   } while (bytes_to_cpy);
+}
+
 int
 intel_context_migrate_copy(struct intel_context *ce,
   const struct i915_deps *deps,
@@ -644,9 +703,15 @@ intel_context_migrate_copy(struct intel_context *ce,
   bool dst_is_lmem,
   struct i915_request **out)
 {
-   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst), it_ccs;
+   struct drm_i915_private *i915 = ce->engine->i915;
+   u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
+   enum i915_cache_level ccs_cache_level;
+   int src_sz, dst_sz, ccs_sz;
u32 src_offset, dst_offset;
+   u8 src_access, dst_access;
struct i915_request *rq;
+   bool ccs_is_src;
int err;
 
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -655,6 +720,38 @@ intel_context_migrate_copy(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   src_sz = scatter_list_length(src);
+   bytes_to_cpy = src_sz;
+
+   if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) {
+   src_access = !src_is_lmem && dst_is_lmem;
+   dst_access = !src_access;
+
+   dst_sz = scatter_list_length(dst);
+   if (src_is_lmem) {
+   it_ccs = it_dst;
+   ccs_cache_level = dst_cache_level;
+   ccs_is_src = false;
+   } else if (dst_is_l

[PATCH v7 8/9] drm/i915/gem: Add extra pages in ttm_tt for ccs data

2022-04-01 Thread Ramalingam C
On Xe-HP and later devices, dedicated compression control state (CCS)
stored in local memory is used for each surface, to support the
3D and media compression formats.

The memory required for the CCS of the entire local memory is 1/256 of
the local memory size. So before the kernel boot, the required memory
is reserved for the CCS data and a secure register will be programmed
with the CCS base address

So when an object is allocated in local memory, we don't need to explicitly
allocate the space for ccs data. But when the obj is evicted into the
smem, to hold the compression related data along with the obj, extra space
is needed in smem, i.e. obj_size + (obj_size/256).

Hence when smem pages are allocated for an obj with an lmem placement
possibility, we create them with the extra pages required for the ccs data
of the obj size.
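
A worked example of the 1/256 rule (illustrative): a 1 GiB lmem-only
object needs 1 GiB / 256 = 4 MiB of CCS backing, i.e. 1024 extra 4 KiB
pages in its smem ttm_tt.

        ccs_pages = DIV_ROUND_UP(DIV_ROUND_UP(SZ_1G, NUM_BYTES_PER_CCS_BYTE),
                                 PAGE_SIZE);    /* = 1024 */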

v2:
  Used imperative wording [Thomas]
v3:
  Inflate the pages only when obj's placement is lmem only
v4:
  GEM_BUG_ON if the ttm->num_pages > obj page size [Thomas]

Signed-off-by: Ramalingam C 
cc: Christian Koenig 
cc: Hellstrom Thomas 
Reviewed-by: Thomas Hellstrom 
Reviewed-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 30 -
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index a878910a563c..4c25d9b2f138 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -20,6 +20,7 @@
 #include "gem/i915_gem_ttm.h"
 #include "gem/i915_gem_ttm_move.h"
 #include "gem/i915_gem_ttm_pm.h"
+#include "gt/intel_gpu_commands.h"
 
 #define I915_TTM_PRIO_PURGE 0
 #define I915_TTM_PRIO_NO_PAGES  1
@@ -265,12 +266,33 @@ static const struct i915_refct_sgt_ops tt_rsgt_ops = {
.release = i915_ttm_tt_release
 };
 
+static inline bool
+i915_gem_object_needs_ccs_pages(struct drm_i915_gem_object *obj)
+{
+   bool lmem_placement = false;
+   int i;
+
+   for (i = 0; i < obj->mm.n_placements; i++) {
+   /* Compression is not allowed for the objects with smem placement */
+   if (obj->mm.placements[i]->type == INTEL_MEMORY_SYSTEM)
+   return false;
+   if (!lmem_placement &&
+   obj->mm.placements[i]->type == INTEL_MEMORY_LOCAL)
+   lmem_placement = true;
+   }
+
+   return lmem_placement;
+}
+
 static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
 uint32_t page_flags)
 {
+   struct drm_i915_private *i915 = container_of(bo->bdev, typeof(*i915),
+bdev);
struct ttm_resource_manager *man =
ttm_manager_type(bo->bdev, bo->resource->mem_type);
struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
+   unsigned long ccs_pages = 0;
enum ttm_caching caching;
struct i915_ttm_tt *i915_tt;
int ret;
@@ -293,7 +315,12 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
ttm_buffer_object *bo,
i915_tt->is_shmem = true;
}
 
-   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, 0);
+   if (HAS_FLAT_CCS(i915) && i915_gem_object_needs_ccs_pages(obj))
+   ccs_pages = DIV_ROUND_UP(DIV_ROUND_UP(bo->base.size,
+ NUM_BYTES_PER_CCS_BYTE),
+PAGE_SIZE);
+
+   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, ccs_pages);
if (ret)
goto err_free;
 
@@ -773,6 +800,7 @@ static int __i915_ttm_get_pages(struct drm_i915_gem_object 
*obj,
i915_sg_dma_sizes(rsgt->table.sgl));
}
 
+   GEM_BUG_ON(bo->ttm && ((obj->base.size >> PAGE_SHIFT) < bo->ttm->num_pages));
i915_ttm_adjust_lru(obj);
return ret;
 }
-- 
2.20.1



[PATCH v7 7/9] drm/ttm: Add a parameter to add extra pages into ttm_tt

2022-04-01 Thread Ramalingam C
Add a parameter called "extra_pages" for ttm_tt_init, to indicate that
driver needs extra pages in ttm_tt.

v2:
  Used imperative wording [Thomas and Christian]

Signed-off-by: Ramalingam C 
cc: Christian Koenig 
cc: Hellstrom Thomas 
Reviewed-by: Thomas Hellstrom 
Reviewed-by: Christian Konig 
Reviewed-by: Nirmoy Das 
---
 drivers/gpu/drm/drm_gem_vram_helper.c  |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c|  2 +-
 drivers/gpu/drm/qxl/qxl_ttm.c  |  2 +-
 drivers/gpu/drm/ttm/ttm_agp_backend.c  |  2 +-
 drivers/gpu/drm/ttm/ttm_tt.c   | 12 +++-
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c |  2 +-
 include/drm/ttm/ttm_tt.h   |  4 +++-
 7 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c 
b/drivers/gpu/drm/drm_gem_vram_helper.c
index dc7f938bfff2..123045b58fec 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -867,7 +867,7 @@ static struct ttm_tt *bo_driver_ttm_tt_create(struct 
ttm_buffer_object *bo,
if (!tt)
return NULL;
 
-   ret = ttm_tt_init(tt, bo, page_flags, ttm_cached);
+   ret = ttm_tt_init(tt, bo, page_flags, ttm_cached, 0);
if (ret < 0)
goto err_ttm_tt_init;
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index c40aca99442f..a878910a563c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -293,7 +293,7 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
ttm_buffer_object *bo,
i915_tt->is_shmem = true;
}
 
-   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching);
+   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, 0);
if (ret)
goto err_free;
 
diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c
index 95df5750f47f..9ba871bd19b1 100644
--- a/drivers/gpu/drm/qxl/qxl_ttm.c
+++ b/drivers/gpu/drm/qxl/qxl_ttm.c
@@ -113,7 +113,7 @@ static struct ttm_tt *qxl_ttm_tt_create(struct 
ttm_buffer_object *bo,
ttm = kzalloc(sizeof(struct ttm_tt), GFP_KERNEL);
if (ttm == NULL)
return NULL;
-   if (ttm_tt_init(ttm, bo, page_flags, ttm_cached)) {
+   if (ttm_tt_init(ttm, bo, page_flags, ttm_cached, 0)) {
kfree(ttm);
return NULL;
}
diff --git a/drivers/gpu/drm/ttm/ttm_agp_backend.c 
b/drivers/gpu/drm/ttm/ttm_agp_backend.c
index 6ddc16f0fe2b..d27691f2e451 100644
--- a/drivers/gpu/drm/ttm/ttm_agp_backend.c
+++ b/drivers/gpu/drm/ttm/ttm_agp_backend.c
@@ -134,7 +134,7 @@ struct ttm_tt *ttm_agp_tt_create(struct ttm_buffer_object 
*bo,
agp_be->mem = NULL;
agp_be->bridge = bridge;
 
-   if (ttm_tt_init(&agp_be->ttm, bo, page_flags, ttm_write_combined)) {
+   if (ttm_tt_init(&agp_be->ttm, bo, page_flags, ttm_write_combined, 0)) {
kfree(agp_be);
return NULL;
}
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index d234aab800a0..1a66d9fc589a 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -134,9 +134,10 @@ void ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt 
*ttm)
 static void ttm_tt_init_fields(struct ttm_tt *ttm,
   struct ttm_buffer_object *bo,
   uint32_t page_flags,
-  enum ttm_caching caching)
+  enum ttm_caching caching,
+  unsigned long extra_pages)
 {
-   ttm->num_pages = PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT;
+   ttm->num_pages = (PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT) + extra_pages;
ttm->caching = ttm_cached;
ttm->page_flags = page_flags;
ttm->dma_address = NULL;
@@ -146,9 +147,10 @@ static void ttm_tt_init_fields(struct ttm_tt *ttm,
 }
 
 int ttm_tt_init(struct ttm_tt *ttm, struct ttm_buffer_object *bo,
-   uint32_t page_flags, enum ttm_caching caching)
+   uint32_t page_flags, enum ttm_caching caching,
+   unsigned long extra_pages)
 {
-   ttm_tt_init_fields(ttm, bo, page_flags, caching);
+   ttm_tt_init_fields(ttm, bo, page_flags, caching, extra_pages);
 
if (ttm_tt_alloc_page_directory(ttm)) {
pr_err("Failed allocating page table\n");
@@ -180,7 +182,7 @@ int ttm_sg_tt_init(struct ttm_tt *ttm, struct 
ttm_buffer_object *bo,
 {
int ret;
 
-   ttm_tt_init_fields(ttm, bo, page_flags, caching);
+   ttm_tt_init_fields(ttm, bo, page_flags, caching, 0);
 
if (page_flags & TTM_TT_FLAG_EXTERNAL)
ret = ttm_sg_tt_alloc_page_directory(ttm);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c 
b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index b84ecc6d6611..4e3938e62c08 100

[PATCH v7 6/9] drm/i915/selftest_migrate: Check CCS meta data clear

2022-04-01 Thread Ramalingam C
Extend the live migrate selftest to verify the ccs surface clearing
during the clearing of a Flat-CCS capable lmem obj.
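
A sketch of the round-trip the selftest performs with the new helper
(hypothetical wrapper; the real test wires this up with its own objects
and a CPU map for verification): DIRECT_ACCESS targets the main memory
pages, INDIRECT_ACCESS targets the CCS surface backing them.

        static int ccs_roundtrip(struct intel_migrate *m,
                                 struct i915_gem_ww_ctx *ww,
                                 struct scatterlist *sg,
                                 enum i915_cache_level cache_level,
                                 struct i915_request **rq)
        {
                int err;

                /* Fill the CCS surface from the main memory pages... */
                err = intel_migrate_ccs_copy(m, ww, NULL, sg, cache_level,
                                             true, rq);
                if (err)
                        return err;

                /* ...then read it back out for verification. */
                return intel_migrate_ccs_copy(m, ww, NULL, sg, cache_level,
                                              false, rq);
        }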

v2:
  Look at right places for ccs data [Thomas]

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_migrate.c | 250 ++---
 1 file changed, 222 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c 
b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index b5da8b8cd039..8cd9a22054f3 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -132,6 +132,124 @@ static int copy(struct intel_migrate *migrate,
return err;
 }
 
+static int intel_context_copy_ccs(struct intel_context *ce,
+ const struct i915_deps *deps,
+ struct scatterlist *sg,
+ enum i915_cache_level cache_level,
+ bool write_to_ccs,
+ struct i915_request **out)
+{
+   u8 src_access = write_to_ccs ? DIRECT_ACCESS : INDIRECT_ACCESS;
+   u8 dst_access = write_to_ccs ? INDIRECT_ACCESS : DIRECT_ACCESS;
+   struct sgt_dma it = sg_sgt(sg);
+   struct i915_request *rq;
+   u32 offset;
+   int err;
+
+   GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
+   *out = NULL;
+
+   GEM_BUG_ON(ce->ring->size < SZ_64K);
+
+   offset = 0;
+   if (HAS_64K_PAGES(ce->engine->i915))
+   offset = CHUNK_SZ;
+
+   do {
+   int len;
+
+   rq = i915_request_create(ce);
+   if (IS_ERR(rq)) {
+   err = PTR_ERR(rq);
+   goto out_ce;
+   }
+
+   if (deps) {
+   err = i915_request_await_deps(rq, deps);
+   if (err)
+   goto out_rq;
+
+   if (rq->engine->emit_init_breadcrumb) {
+   err = rq->engine->emit_init_breadcrumb(rq);
+   if (err)
+   goto out_rq;
+   }
+
+   deps = NULL;
+   }
+
+   /* The PTE updates + clear must not be interrupted. */
+   err = emit_no_arbitration(rq);
+   if (err)
+   goto out_rq;
+
len = emit_pte(rq, &it, cache_level, true, offset, CHUNK_SZ);
+   if (len <= 0) {
+   err = len;
+   goto out_rq;
+   }
+
+   err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+   if (err)
+   goto out_rq;
+
+   err = emit_copy_ccs(rq, offset, dst_access,
+   offset, src_access, len);
+   if (err)
+   goto out_rq;
+
+   err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+
+   /* Arbitration is re-enabled between requests. */
+out_rq:
+   if (*out)
+   i915_request_put(*out);
+   *out = i915_request_get(rq);
+   i915_request_add(rq);
+   if (err || !it.sg || !sg_dma_len(it.sg))
+   break;
+
+   cond_resched();
+   } while (1);
+
+out_ce:
+   return err;
+}
+
+static int
+intel_migrate_ccs_copy(struct intel_migrate *m,
+  struct i915_gem_ww_ctx *ww,
+  const struct i915_deps *deps,
+  struct scatterlist *sg,
+  enum i915_cache_level cache_level,
+  bool write_to_ccs,
+  struct i915_request **out)
+{
+   struct intel_context *ce;
+   int err;
+
+   *out = NULL;
+   if (!m->context)
+   return -ENODEV;
+
+   ce = intel_migrate_create_context(m);
+   if (IS_ERR(ce))
+   ce = intel_context_get(m->context);
+   GEM_BUG_ON(IS_ERR(ce));
+
+   err = intel_context_pin_ww(ce, ww);
+   if (err)
+   goto out;
+
+   err = intel_context_copy_ccs(ce, deps, sg, cache_level,
+write_to_ccs, out);
+
+   intel_context_unpin(ce);
+out:
+   intel_context_put(ce);
+   return err;
+}
+
 static int clear(struct intel_migrate *migrate,
 int (*fn)(struct intel_migrate *migrate,
   struct i915_gem_ww_ctx *ww,
@@ -144,7 +262,8 @@ static int clear(struct intel_migrate *migrate,
struct drm_i915_gem_object *obj;
struct i915_request *rq;
struct i915_gem_ww_ctx ww;
-   u32 *vaddr;
+   u32 *vaddr, val = 0;
+   bool ccs_cap = false;
int err = 0;
int i;
 
@@ -155,7 +274,12 @@ static int clear(struct intel_migrate *migrate,
  

[PATCH v7 5/9] drm/i915/selftest_migrate: Consider the possible roundup of size

2022-04-01 Thread Ramalingam C
Consider the possible round-up of the obj size due to alignment to
min_page_size during the obj allocation.

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/selftest_migrate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c 
b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index c9c4f391c5cc..b5da8b8cd039 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -152,6 +152,9 @@ static int clear(struct intel_migrate *migrate,
if (IS_ERR(obj))
return 0;
 
+   /* Consider the rounded up memory too */
+   sz = obj->base.size;
+
for_i915_gem_ww(&ww, err, true) {
err = i915_gem_object_lock(obj, &ww);
if (err)
-- 
2.20.1



[PATCH v7 4/9] drm/i915/gt: Clear compress metadata for Flat-ccs objects

2022-04-01 Thread Ramalingam C
Xe-HP and later devices support Flat CCS, which reserves a portion of
the device memory to store compression metadata. During the clearing of
a device memory buffer object we also need to clear the associated
CCS buffer.

XY_CTRL_SURF_COPY_BLT is a BLT cmd used for reading and writing the
ccs surface of an lmem object. So on Flat-CCS capable platforms we use
XY_CTRL_SURF_COPY_BLT to clear the CCS metadata.

v2: Fixed issues with platform naming [Lucas]
v3: Rebased [Ram]
Used the round_up funcs [Bob]
v4: Fixed ccs blk calculation [Ram]
Added Kdoc on flat-ccs.
v5: GENMASK is used [Matt]
mocs fix [Matt]
Comments Fix [Matt]
Flush address programming [Ram]
v6: FLUSH_DW is fixed
Few coding style fix
v7: Adopting the XY_FAST_COLOR_BLT [Thomas]
v8: XY_CTRL_SURF_COPY_BLT for ccs clearing.
v9: emit_copy_ccs is used.
v10: ctrl_surf cmds are filled in caller itself. [Thomas]
 only one ctrl surf cmd is used as size of lmem is <=8M [Thomas]

Signed-off-by: Ramalingam C 
Signed-off-by: Ayaz A Siddiqui 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  16 +++
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 137 ++-
 2 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index d1b8c23f7a9e..724ab069ddb6 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -154,8 +154,10 @@
 #define   MI_FLUSH_DW_PROTECTED_MEM_EN (1 << 22)
 #define   MI_FLUSH_DW_STORE_INDEX  (1<<21)
 #define   MI_INVALIDATE_TLB(1<<18)
+#define   MI_FLUSH_DW_CCS  (1<<16)
 #define   MI_FLUSH_DW_OP_STOREDW   (1<<14)
 #define   MI_FLUSH_DW_OP_MASK  (3<<14)
+#define   MI_FLUSH_DW_LLC  (1<<9)
 #define   MI_FLUSH_DW_NOTIFY   (1<<8)
 #define   MI_INVALIDATE_BSD(1<<7)
 #define   MI_FLUSH_DW_USE_GTT  (1<<2)
@@ -204,6 +206,20 @@
 #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3))
 #define GFX_OP_DRAWRECT_INFO_I965  ((0x7900<<16)|0x2)
 
+#define XY_CTRL_SURF_INSTR_SIZE5
+#define MI_FLUSH_DW_SIZE   3
+#define XY_CTRL_SURF_COPY_BLT  ((2 << 29) | (0x48 << 22) | 3)
+#define   SRC_ACCESS_TYPE_SHIFT21
+#define   DST_ACCESS_TYPE_SHIFT20
+#define   CCS_SIZE_MASK0x3FF
+#define   CCS_SIZE_SHIFT   8
+#define   XY_CTRL_SURF_MOCS_MASK   GENMASK(31, 25)
+#define   NUM_CCS_BYTES_PER_BLOCK  256
+#define   NUM_BYTES_PER_CCS_BYTE   256
+#define   NUM_CCS_BLKS_PER_XFER1024
+#define   INDIRECT_ACCESS  0
+#define   DIRECT_ACCESS1
+
 #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
 #define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 580b4cf1efa2..a44f2d29da4e 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -17,6 +17,8 @@ struct insert_pte_data {
 
 #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
 
+#define GET_CCS_BYTES(i915, size)  (HAS_FLAT_CCS(i915) ? \
+DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0)
 static bool engine_supports_migration(struct intel_engine_cs *engine)
 {
if (!engine)
@@ -467,6 +469,123 @@ static bool wa_1209644611_applies(int ver, u32 size)
return height % 4 == 3 && height <= 8;
 }
 
+/**
+ * DOC: Flat-CCS - Memory compression for Local memory
+ *
+ * On Xe-HP and later devices, we use dedicated compression control state (CCS)
+ * stored in local memory for each surface, to support the 3D and media
+ * compression formats.
+ *
+ * The memory required for the CCS of the entire local memory is 1/256 of the
+ * local memory size. So before the kernel boot, the required memory is reserved
+ * for the CCS data and a secure register will be programmed with the CCS base
+ * address.
+ *
+ * Flat CCS data needs to be cleared when a lmem object is allocated.
+ * And CCS data can be copied in and out of CCS region through
+ * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
+ *
+ * When we exhaust the lmem, if the object's placements support smem, then we can
+ * directly decompress the compressed lmem object into smem and start using it
+ * from smem itself.
+ *
+ * But when we need to swapout the compressed lmem object into a smem region
+ * though objects' placement doesn't support smem, then we copy the lmem content
+ * as it is into smem region along with ccs data (using XY_CTRL_SURF_

[PATCH v7 3/9] drm/i915/gt: Optimize the migration and clear loop

2022-04-01 Thread Ramalingam C
Move the static calculations out of the loops for copy and clear.

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 40 -
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index e81f20266f62..580b4cf1efa2 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -526,6 +526,7 @@ intel_context_migrate_copy(struct intel_context *ce,
   struct i915_request **out)
 {
struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+   u32 src_offset, dst_offset;
struct i915_request *rq;
int err;
 
@@ -535,8 +536,18 @@ intel_context_migrate_copy(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   src_offset = 0;
+   dst_offset = CHUNK_SZ;
+   if (HAS_64K_PAGES(ce->engine->i915)) {
+   src_offset = 0;
+   dst_offset = 0;
+   if (src_is_lmem)
+   src_offset = CHUNK_SZ;
+   if (dst_is_lmem)
+   dst_offset = 2 * CHUNK_SZ;
+   }
+
do {
-   u32 src_offset, dst_offset;
int len;
 
rq = i915_request_create(ce);
@@ -564,17 +575,6 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
 
-   src_offset = 0;
-   dst_offset = CHUNK_SZ;
-   if (HAS_64K_PAGES(ce->engine->i915)) {
-   src_offset = 0;
-   dst_offset = 0;
-   if (src_is_lmem)
-   src_offset = CHUNK_SZ;
-   if (dst_is_lmem)
-   dst_offset = 2 * CHUNK_SZ;
-   }
-
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
   src_offset, CHUNK_SZ);
if (len <= 0) {
@@ -584,12 +584,10 @@ intel_context_migrate_copy(struct intel_context *ce,
 
err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
   dst_offset, len);
-   if (err < 0)
-   goto out_rq;
-   if (err < len) {
+   if (err < len)
err = -EINVAL;
+   if (err < 0)
goto out_rq;
-   }
 
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
@@ -690,6 +688,7 @@ intel_context_migrate_clear(struct intel_context *ce,
 {
struct sgt_dma it = sg_sgt(sg);
struct i915_request *rq;
+   u32 offset;
int err;
 
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -697,8 +696,11 @@ intel_context_migrate_clear(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   offset = 0;
+   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
+   offset = CHUNK_SZ;
+
do {
-   u32 offset;
int len;
 
rq = i915_request_create(ce);
@@ -726,10 +728,6 @@ intel_context_migrate_clear(struct intel_context *ce,
if (err)
goto out_rq;
 
-   offset = 0;
-   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
-   offset = CHUNK_SZ;
-
len = emit_pte(rq, &it, cache_level, is_lmem, offset, CHUNK_SZ);
if (len <= 0) {
err = len;
-- 
2.20.1



[PATCH v7 2/9] drm/i915/gt: Use XY_FAST_COLOR_BLT to clear obj on graphics ver 12+

2022-04-01 Thread Ramalingam C
Use the faster XY_FAST_COLOR_BLT cmd on graphics version 12 and newer
for clearing (zeroing out) the pages of the newly allocated object.

XY_FAST_COLOR_BLT is faster than the older XY_COLOR_BLT.

v2:
  Typo fix at title [Thomas]
v3:
  XY_FAST_COLOR_BLT is used only for FLAT_CCS capable gen12+

Signed-off-by: Ramalingam C 
Signed-off-by: Chris Wilson 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  5 +++
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 43 +---
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index 4243be030bc1..d1b8c23f7a9e 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -206,6 +206,11 @@
 
 #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
+#define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
+#define   XY_FAST_COLOR_BLT_DEPTH_32   (2 << 19)
+#define   XY_FAST_COLOR_BLT_DW 16
+#define   XY_FAST_COLOR_BLT_MOCS_MASK  GENMASK(27, 21)
+#define   XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT 31
 #define SRC_COPY_BLT_CMD   (2 << 29 | 0x43 << 22)
 #define GEN9_XY_FAST_COPY_BLT_CMD  (2 << 29 | 0x42 << 22)
 #define XY_SRC_COPY_BLT_CMD(2 << 29 | 0x53 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 9d852a570400..e81f20266f62 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -613,18 +613,51 @@ intel_context_migrate_copy(struct intel_context *ce,
return err;
 }
 
-static int emit_clear(struct i915_request *rq, u32 offset, int size, u32 value)
+static int emit_clear(struct i915_request *rq, u32 offset, int size,
+ u32 value, bool is_lmem)
 {
-   const int ver = GRAPHICS_VER(rq->engine->i915);
+   struct drm_i915_private *i915 = rq->engine->i915;
+   int mocs = rq->engine->gt->mocs.uc_index << 1;
+   const int ver = GRAPHICS_VER(i915);
+   int ring_sz;
u32 *cs;
 
GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
 
-   cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
+   if (HAS_FLAT_CCS(i915) && ver >= 12)
+   ring_sz = XY_FAST_COLOR_BLT_DW;
+   else if (ver >= 8)
+   ring_sz = 8;
+   else
+   ring_sz = 6;
+
+   cs = intel_ring_begin(rq, ring_sz);
if (IS_ERR(cs))
return PTR_ERR(cs);
 
-   if (ver >= 8) {
+   if (HAS_FLAT_CCS(i915) && ver >= 12) {
+   *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
+   (XY_FAST_COLOR_BLT_DW - 2);
+   *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) |
+   (PAGE_SIZE - 1);
+   *cs++ = 0;
+   *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+   *cs++ = offset;
+   *cs++ = rq->engine->instance;
+   *cs++ = !is_lmem << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
+   /* BG7 */
+   *cs++ = value;
+   *cs++ = 0;
+   *cs++ = 0;
+   *cs++ = 0;
+   /* BG11 */
+   *cs++ = 0;
+   *cs++ = 0;
+   /* BG13 */
+   *cs++ = 0;
+   *cs++ = 0;
+   *cs++ = 0;
+   } else if (ver >= 8) {
*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
@@ -707,7 +740,7 @@ intel_context_migrate_clear(struct intel_context *ce,
if (err)
goto out_rq;
 
-   err = emit_clear(rq, offset, len, value);
+   err = emit_clear(rq, offset, len, value, is_lmem);
 
/* Arbitration is re-enabled between requests. */
 out_rq:
-- 
2.20.1



[PATCH v7 1/9] drm/i915/gt: use engine instance directly for offset

2022-04-01 Thread Ramalingam C
To make it uniform across copy and clear, use the engine instance directly
to calculate the offset when forming the cmd for emit_clear.

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 950fd6da146c..9d852a570400 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -613,15 +613,13 @@ intel_context_migrate_copy(struct intel_context *ce,
return err;
 }
 
-static int emit_clear(struct i915_request *rq, u64 offset, int size, u32 value)
+static int emit_clear(struct i915_request *rq, u32 offset, int size, u32 value)
 {
const int ver = GRAPHICS_VER(rq->engine->i915);
u32 *cs;
 
GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
 
-   offset += (u64)rq->engine->instance << 32;
-
cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
if (IS_ERR(cs))
return PTR_ERR(cs);
@@ -631,17 +629,16 @@ static int emit_clear(struct i915_request *rq, u64 
offset, int size, u32 value)
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-   *cs++ = lower_32_bits(offset);
-   *cs++ = upper_32_bits(offset);
+   *cs++ = offset;
+   *cs++ = rq->engine->instance;
*cs++ = value;
*cs++ = MI_NOOP;
} else {
-   GEM_BUG_ON(upper_32_bits(offset));
*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-   *cs++ = lower_32_bits(offset);
+   *cs++ = offset;
*cs++ = value;
}
 
-- 
2.20.1



[PATCH v7 0/9] drm/i915/ttm: Evict and restore of compressed object

2022-04-01 Thread Ramalingam C
On Xe-HP and later devices, we use dedicated compression control
state (CCS) stored in local memory for each surface, to support
the 3D and media compression formats.

The memory required for the CCS of the entire local memory is
1/256 of the local memory size. So before the kernel
boot, the required memory is reserved for the CCS data and a
secure register will be programmed with the CCS base address

So when we allocate an object in local memory we don't need to explicitly
allocate the space for ccs data. But when we evict the obj into the smem,
to hold the compression related data along with the obj we need smem
space of obj_size + (obj_size/256).

Hence when we create smem for an obj with lmem placement possibility we
create it with the extra space.

When we are swapping out the local memory obj on a flat-ccs capable
platform, we need to capture the ccs data too along with the main memory,
and we need to restore it when we are swapping in the content.

When an lmem object is swapped into a smem obj, the smem obj will
have the extra pages required to hold the ccs data corresponding to the
lmem main memory. So the main memory of the lmem obj will be copied into
the initial pages of the smem and then the ccs data corresponding to the
main memory will be copied to the subsequent pages of smem.

Swapin happens exactly in reverse order. First the main memory of the
lmem obj is restored from the smem's initial pages and then the ccs data
is restored from the subsequent pages of smem.
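
In code terms, the smem backing store for an evicted lmem object is laid
out roughly as below. This is only a sketch: the names are illustrative,
and the actual plumbing in this series goes through the ttm/i915 helpers.

	/* Sketch: smem pages backing an evicted lmem object. */
	static unsigned long evicted_obj_pages(unsigned long obj_size)
	{
		unsigned long main_pages = obj_size >> PAGE_SHIFT;
		unsigned long ccs_pages =
			DIV_ROUND_UP(DIV_ROUND_UP(obj_size, 256), PAGE_SIZE);

		/*
		 * Pages [0, main_pages) hold the main memory contents and
		 * pages [main_pages, main_pages + ccs_pages) hold the ccs
		 * data, in that order; swapin reads them back likewise.
		 */
		return main_pages + ccs_pages;
	}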

Extracting and restoring the CCS data is done through a special cmd called
XY_CTRL_SURF_COPY_BLT

v7:
  GEM_BUG_ON is added to catch inflated pages being handed out through
get_pages
  Optimised emit_copy_clear
  Engine index is directly used for the upper 32 bits of offset
  Use XY_FAST_COLOR_BLT only for FLAT_CCS capable platforms

Resending with an updated igt version to test with.

Test-with: 20220401074527.15709-2-ramalinga...@intel.com

Ramalingam C (9):
  drm/i915/gt: use engine instance directly for offset
  drm/i915/gt: Use XY_FAST_COLOR_BLT to clear obj on graphics ver 12+
  drm/i915/gt: Optimize the migration and clear loop
  drm/i915/gt: Clear compress metadata for Flat-ccs objects
  drm/i915/selftest_migrate: Consider the possible roundup of size
  drm/i915/selftest_migrate: Check CCS meta data clear
  drm/ttm: Add a parameter to add extra pages into ttm_tt
  drm/i915/gem: Add extra pages in ttm_tt for ccs data
  drm/i915/migrate: Evict and restore the flatccs capable lmem obj

 drivers/gpu/drm/drm_gem_vram_helper.c|   2 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c  |  30 +-
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  21 +
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 387 +--
 drivers/gpu/drm/i915/gt/selftest_migrate.c   | 253 ++--
 drivers/gpu/drm/qxl/qxl_ttm.c|   2 +-
 drivers/gpu/drm/ttm/ttm_agp_backend.c|   2 +-
 drivers/gpu/drm/ttm/ttm_tt.c |  12 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c   |   2 +-
 include/drm/ttm/ttm_tt.h |   4 +-
 10 files changed, 642 insertions(+), 73 deletions(-)

-- 
2.20.1



[PATCH v7 3/9] drm/i915/gt: Optimize the migration and clear loop

2022-03-28 Thread Ramalingam C
Move the static calculations out of the loops for copy and clear.

Signed-off-by: Ramalingam C 
Reviewed-by: Thomas Hellström 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 44 -
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 17dd372a47d1..ec9a9e7cb388 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -526,6 +526,7 @@ intel_context_migrate_copy(struct intel_context *ce,
   struct i915_request **out)
 {
struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+   u32 src_offset, dst_offset;
struct i915_request *rq;
int err;
 
@@ -534,8 +535,20 @@ intel_context_migrate_copy(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   src_offset = 0;
+   dst_offset = CHUNK_SZ;
+   if (HAS_64K_PAGES(ce->engine->i915)) {
+   GEM_BUG_ON(!src_is_lmem && !dst_is_lmem);
+
+   src_offset = 0;
+   dst_offset = 0;
+   if (src_is_lmem)
+   src_offset = CHUNK_SZ;
+   if (dst_is_lmem)
+   dst_offset = 2 * CHUNK_SZ;
+   }
+
do {
-   u32 src_offset, dst_offset;
int len;
 
rq = i915_request_create(ce);
@@ -563,19 +576,6 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
 
-   src_offset = 0;
-   dst_offset = CHUNK_SZ;
-   if (HAS_64K_PAGES(ce->engine->i915)) {
-   GEM_BUG_ON(!src_is_lmem && !dst_is_lmem);
-
-   src_offset = 0;
-   dst_offset = 0;
-   if (src_is_lmem)
-   src_offset = CHUNK_SZ;
-   if (dst_is_lmem)
-   dst_offset = 2 * CHUNK_SZ;
-   }
-
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
   src_offset, CHUNK_SZ);
if (len <= 0) {
@@ -585,12 +585,10 @@ intel_context_migrate_copy(struct intel_context *ce,
 
err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
   dst_offset, len);
-   if (err < 0)
-   goto out_rq;
-   if (err < len) {
+   if (err < len)
err = -EINVAL;
+   if (err < 0)
goto out_rq;
-   }
 
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
@@ -691,6 +689,7 @@ intel_context_migrate_clear(struct intel_context *ce,
 {
struct sgt_dma it = sg_sgt(sg);
struct i915_request *rq;
+   u32 offset;
int err;
 
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -698,8 +697,11 @@ intel_context_migrate_clear(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   offset = 0;
+   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
+   offset = CHUNK_SZ;
+
do {
-   u32 offset;
int len;
 
rq = i915_request_create(ce);
@@ -727,10 +729,6 @@ intel_context_migrate_clear(struct intel_context *ce,
if (err)
goto out_rq;
 
-   offset = 0;
-   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
-   offset = CHUNK_SZ;
-
len = emit_pte(rq, &it, cache_level, is_lmem, offset, CHUNK_SZ);
if (len <= 0) {
err = len;
-- 
2.20.1



[PATCH v7 9/9] drm/i915/migrate: Evict and restore the flatccs capable lmem obj

2022-03-28 Thread Ramalingam C
When we are swapping out the local memory obj on a flat-ccs capable
platform, we need to capture the ccs data too along with the main memory,
and we need to restore it when we are swapping in the content.

When an lmem object is swapped into a smem obj, the smem obj will
have the extra pages required to hold the ccs data corresponding to the
lmem main memory. So the main memory of the lmem obj will be copied into
the initial pages of the smem and then the ccs data corresponding to the
main memory will be copied to the subsequent pages of smem. The ccs data
is 1/256 of the lmem size.

Swapin happens exactly in reverse order. First the main memory of the
lmem obj is restored from the smem's initial pages and then the ccs data
is restored from the subsequent pages of smem.

Extracting and restoring the CCS data is done through a special cmd called
XY_CTRL_SURF_COPY_BLT
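
Condensed into code, each pass of the copy loop moves up to one CHUNK_SZ
window of main memory plus the matching slice of ccs data. The sizing
below is a summary of calculate_chunk_sz() from the patch, not the full
loop:

	src_sz = CHUNK_SZ;                              /* main memory window */
	ccs_sz = min_t(int, ccs_bytes_to_cpy,
		       GET_CCS_BYTES(i915, CHUNK_SZ));  /* its ccs slice */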

v2: Fixing the ccs handling
v3: Handle the ccs data in the same loop as the main memory [Thomas]
v4: changes for emit_copy_ccs
v5: handle non-flat-ccs scenario

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 164 +++-
 1 file changed, 160 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 0657d33fedac..0b44e3785eed 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -633,6 +633,65 @@ static int emit_copy(struct i915_request *rq,
return 0;
 }
 
+static int scatter_list_length(struct scatterlist *sg)
+{
+   int len = 0;
+
+   while (sg && sg_dma_len(sg)) {
+   len += sg_dma_len(sg);
+   sg = sg_next(sg);
+   };
+
+   return len;
+}
+
+static void
+calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
+  int *src_sz, int *ccs_sz, u32 bytes_to_cpy,
+  u32 ccs_bytes_to_cpy)
+{
+   if (ccs_bytes_to_cpy) {
+   /*
+* We can only copy the ccs data corresponding to
+* the CHUNK_SZ of lmem which is
+* GET_CCS_BYTES(i915, CHUNK_SZ))
+*/
+   *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, 
CHUNK_SZ));
+
+   if (!src_is_lmem)
+   /*
+* When CHUNK_SZ is passed all the pages upto CHUNK_SZ
+* will be taken for the blt. in Flat-ccs supported
+* platform Smem obj will have more pages than required
+* for main meory hence limit it to the required size
+* for main memory
+*/
+   *src_sz = min_t(int, bytes_to_cpy, CHUNK_SZ);
+   } else { /* ccs handling is not required */
+   *src_sz = CHUNK_SZ;
+   }
+}
+
+static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
+{
+   u32 len;
+
+   do {
+   GEM_BUG_ON(!it->sg || !sg_dma_len(it->sg));
+   len = it->max - it->dma;
+   if (len > bytes_to_cpy) {
+   it->dma += bytes_to_cpy;
+   break;
+   }
+
+   bytes_to_cpy -= len;
+
+   it->sg = __sg_next(it->sg);
+   it->dma = sg_dma_address(it->sg);
+   it->max = it->dma + sg_dma_len(it->sg);
+   } while (bytes_to_cpy);
+}
+
 int
 intel_context_migrate_copy(struct intel_context *ce,
   const struct i915_deps *deps,
@@ -644,9 +703,15 @@ intel_context_migrate_copy(struct intel_context *ce,
   bool dst_is_lmem,
   struct i915_request **out)
 {
-   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst), it_ccs;
+   struct drm_i915_private *i915 = ce->engine->i915;
+   u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
+   enum i915_cache_level ccs_cache_level;
+   int src_sz, dst_sz, ccs_sz;
u32 src_offset, dst_offset;
+   u8 src_access, dst_access;
struct i915_request *rq;
+   bool ccs_is_src;
int err;
 
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -654,6 +719,38 @@ intel_context_migrate_copy(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   src_sz = scatter_list_length(src);
+   bytes_to_cpy = src_sz;
+
+   if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) {
+   src_access = !src_is_lmem && dst_is_lmem;
+   dst_access = !src_access;
+
+   dst_sz = scatter_list_length(dst);
+   if (src_is_lmem) {
+   it_ccs = it_dst;
+   ccs_cache_level = dst_cache_level;
+   ccs_is_src = false;
+   } else if (dst_is_lmem) {
+   bytes_to_c

[PATCH v7 7/9] drm/ttm: Add a parameter to add extra pages into ttm_tt

2022-03-28 Thread Ramalingam C
Add a parameter called "extra_pages" to ttm_tt_init, to indicate that the
driver needs extra pages in the ttm_tt.
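
Every caller converted by this patch passes 0 for the new parameter. As a
usage sketch, patch 8/9 of this series then passes the number of pages
needed for the ccs backing store:

	ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, ccs_pages);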

v2:
  Used imperative wording [Thomas and Christian]

Signed-off-by: Ramalingam C 
cc: Christian Koenig 
cc: Hellstrom Thomas 
Reviewed-by: Thomas Hellstrom 
Reviewed-by: Christian Konig 
Reviewed-by: Nirmoy Das 
---
 drivers/gpu/drm/drm_gem_vram_helper.c  |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c|  2 +-
 drivers/gpu/drm/qxl/qxl_ttm.c  |  2 +-
 drivers/gpu/drm/ttm/ttm_agp_backend.c  |  2 +-
 drivers/gpu/drm/ttm/ttm_tt.c   | 12 +++-
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c |  2 +-
 include/drm/ttm/ttm_tt.h   |  4 +++-
 7 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c 
b/drivers/gpu/drm/drm_gem_vram_helper.c
index dc7f938bfff2..123045b58fec 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -867,7 +867,7 @@ static struct ttm_tt *bo_driver_ttm_tt_create(struct 
ttm_buffer_object *bo,
if (!tt)
return NULL;
 
-   ret = ttm_tt_init(tt, bo, page_flags, ttm_cached);
+   ret = ttm_tt_init(tt, bo, page_flags, ttm_cached, 0);
if (ret < 0)
goto err_ttm_tt_init;
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index e4a06fcf741a..3b9f99c765c4 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -290,7 +290,7 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
ttm_buffer_object *bo,
i915_tt->is_shmem = true;
}
 
-   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching);
+   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, 0);
if (ret)
goto err_free;
 
diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c
index 95df5750f47f..9ba871bd19b1 100644
--- a/drivers/gpu/drm/qxl/qxl_ttm.c
+++ b/drivers/gpu/drm/qxl/qxl_ttm.c
@@ -113,7 +113,7 @@ static struct ttm_tt *qxl_ttm_tt_create(struct 
ttm_buffer_object *bo,
ttm = kzalloc(sizeof(struct ttm_tt), GFP_KERNEL);
if (ttm == NULL)
return NULL;
-   if (ttm_tt_init(ttm, bo, page_flags, ttm_cached)) {
+   if (ttm_tt_init(ttm, bo, page_flags, ttm_cached, 0)) {
kfree(ttm);
return NULL;
}
diff --git a/drivers/gpu/drm/ttm/ttm_agp_backend.c 
b/drivers/gpu/drm/ttm/ttm_agp_backend.c
index 6ddc16f0fe2b..d27691f2e451 100644
--- a/drivers/gpu/drm/ttm/ttm_agp_backend.c
+++ b/drivers/gpu/drm/ttm/ttm_agp_backend.c
@@ -134,7 +134,7 @@ struct ttm_tt *ttm_agp_tt_create(struct ttm_buffer_object 
*bo,
agp_be->mem = NULL;
agp_be->bridge = bridge;
 
-   if (ttm_tt_init(&agp_be->ttm, bo, page_flags, ttm_write_combined)) {
+   if (ttm_tt_init(&agp_be->ttm, bo, page_flags, ttm_write_combined, 0)) {
kfree(agp_be);
return NULL;
}
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index d234aab800a0..1a66d9fc589a 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -134,9 +134,10 @@ void ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt 
*ttm)
 static void ttm_tt_init_fields(struct ttm_tt *ttm,
   struct ttm_buffer_object *bo,
   uint32_t page_flags,
-  enum ttm_caching caching)
+  enum ttm_caching caching,
+  unsigned long extra_pages)
 {
-   ttm->num_pages = PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT;
+   ttm->num_pages = (PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT) + 
extra_pages;
ttm->caching = ttm_cached;
ttm->page_flags = page_flags;
ttm->dma_address = NULL;
@@ -146,9 +147,10 @@ static void ttm_tt_init_fields(struct ttm_tt *ttm,
 }
 
 int ttm_tt_init(struct ttm_tt *ttm, struct ttm_buffer_object *bo,
-   uint32_t page_flags, enum ttm_caching caching)
+   uint32_t page_flags, enum ttm_caching caching,
+   unsigned long extra_pages)
 {
-   ttm_tt_init_fields(ttm, bo, page_flags, caching);
+   ttm_tt_init_fields(ttm, bo, page_flags, caching, extra_pages);
 
if (ttm_tt_alloc_page_directory(ttm)) {
pr_err("Failed allocating page table\n");
@@ -180,7 +182,7 @@ int ttm_sg_tt_init(struct ttm_tt *ttm, struct 
ttm_buffer_object *bo,
 {
int ret;
 
-   ttm_tt_init_fields(ttm, bo, page_flags, caching);
+   ttm_tt_init_fields(ttm, bo, page_flags, caching, 0);
 
if (page_flags & TTM_TT_FLAG_EXTERNAL)
ret = ttm_sg_tt_alloc_page_directory(ttm);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c 
b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index b84ecc6d6611..4e3938e62c08 100

[PATCH v7 8/9] drm/i915/gem: Add extra pages in ttm_tt for ccs data

2022-03-28 Thread Ramalingam C
On Xe-HP and later devices, dedicated compression control state (CCS)
stored in local memory is used for each surface, to support the
3D and media compression formats.

The memory required for the CCS of the entire local memory is 1/256 of
the local memory size. So before the kernel boots, the required memory
is reserved for the CCS data and a secure register will be programmed
with the CCS base address.

So when an object is allocated in local memory, we don't need to
explicitly allocate the space for the ccs data. But when the obj is
evicted into smem, extra space is needed in smem to hold the compression
related data along with the obj, i.e. obj_size + (obj_size/256).

Hence when smem pages are allocated for an obj with a possible lmem
placement, we create them with the extra pages required for the ccs data
for the obj size.
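
As a worked example (illustrative numbers): a 64 MiB lmem-capable object
needs 64 MiB / 256 = 256 KiB of ccs backing, i.e. 64 extra 4 KiB pages on
top of its 16384 main memory pages:

	/* 64 MiB / 256 = 256 KiB of ccs data -> 64 extra pages */
	ccs_pages = DIV_ROUND_UP(DIV_ROUND_UP(bo->base.size,
					      NUM_BYTES_PER_CCS_BYTE),
				 PAGE_SIZE);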

v2:
  Used imperative wording [Thomas]
v3:
  Inflate the pages only when obj's placement is lmem only
v4:
  GEM_BUG_ON if the ttm->num_pages > obj page size [Thomas]

Signed-off-by: Ramalingam C 
cc: Christian Koenig 
cc: Hellstrom Thomas 
Reviewed-by: Thomas Hellstrom 
Reviewed-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 30 -
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index 3b9f99c765c4..5b34fe8ea9d7 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -20,6 +20,7 @@
 #include "gem/i915_gem_ttm.h"
 #include "gem/i915_gem_ttm_move.h"
 #include "gem/i915_gem_ttm_pm.h"
+#include "gt/intel_gpu_commands.h"
 
 #define I915_TTM_PRIO_PURGE 0
 #define I915_TTM_PRIO_NO_PAGES  1
@@ -262,12 +263,33 @@ static const struct i915_refct_sgt_ops tt_rsgt_ops = {
.release = i915_ttm_tt_release
 };
 
+static inline bool
+i915_gem_object_needs_ccs_pages(struct drm_i915_gem_object *obj)
+{
+   bool lmem_placement = false;
+   int i;
+
+   for (i = 0; i < obj->mm.n_placements; i++) {
+   /* Compression is not allowed for the objects with smem 
placement */
+   if (obj->mm.placements[i]->type == INTEL_MEMORY_SYSTEM)
+   return false;
+   if (!lmem_placement &&
+   obj->mm.placements[i]->type == INTEL_MEMORY_LOCAL)
+   lmem_placement = true;
+   }
+
+   return lmem_placement;
+}
+
 static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
 uint32_t page_flags)
 {
+   struct drm_i915_private *i915 = container_of(bo->bdev, typeof(*i915),
+bdev);
struct ttm_resource_manager *man =
ttm_manager_type(bo->bdev, bo->resource->mem_type);
struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
+   unsigned long ccs_pages = 0;
enum ttm_caching caching;
struct i915_ttm_tt *i915_tt;
int ret;
@@ -290,7 +312,12 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
ttm_buffer_object *bo,
i915_tt->is_shmem = true;
}
 
-   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, 0);
+   if (HAS_FLAT_CCS(i915) && i915_gem_object_needs_ccs_pages(obj))
+   ccs_pages = DIV_ROUND_UP(DIV_ROUND_UP(bo->base.size,
+ NUM_BYTES_PER_CCS_BYTE),
+PAGE_SIZE);
+
+   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, ccs_pages);
if (ret)
goto err_free;
 
@@ -770,6 +797,7 @@ static int __i915_ttm_get_pages(struct drm_i915_gem_object 
*obj,
i915_sg_dma_sizes(rsgt->table.sgl));
}
 
+   GEM_BUG_ON(bo->ttm && ((obj->base.size >> PAGE_SHIFT) < 
bo->ttm->num_pages));
i915_ttm_adjust_lru(obj);
return ret;
 }
-- 
2.20.1



[PATCH v7 6/9] drm/i915/selftest_migrate: Check CCS meta data clear

2022-03-28 Thread Ramalingam C
Extend the live migrate selftest to verify the ccs surface clearing
during the clear of a Flat-CCS capable lmem obj.
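
The check is built on a new helper that can move data either into or out
of the ccs surface. A hedged sketch of the readback side, using the
helper added below; the exact object fields passed here are assumptions
for illustration:

	/* read the ccs surface back into the obj's pages for inspection */
	err = intel_migrate_ccs_copy(migrate, &ww, NULL,
				     obj->mm.pages->sgl, obj->cache_level,
				     false /* write_to_ccs */, &rq);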

v2:
  Look at right places for ccs data [Thomas]

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/selftest_migrate.c | 250 ++---
 1 file changed, 222 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c 
b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index b5da8b8cd039..8cd9a22054f3 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -132,6 +132,124 @@ static int copy(struct intel_migrate *migrate,
return err;
 }
 
+static int intel_context_copy_ccs(struct intel_context *ce,
+ const struct i915_deps *deps,
+ struct scatterlist *sg,
+ enum i915_cache_level cache_level,
+ bool write_to_ccs,
+ struct i915_request **out)
+{
+   u8 src_access = write_to_ccs ? DIRECT_ACCESS : INDIRECT_ACCESS;
+   u8 dst_access = write_to_ccs ? INDIRECT_ACCESS : DIRECT_ACCESS;
+   struct sgt_dma it = sg_sgt(sg);
+   struct i915_request *rq;
+   u32 offset;
+   int err;
+
+   GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
+   *out = NULL;
+
+   GEM_BUG_ON(ce->ring->size < SZ_64K);
+
+   offset = 0;
+   if (HAS_64K_PAGES(ce->engine->i915))
+   offset = CHUNK_SZ;
+
+   do {
+   int len;
+
+   rq = i915_request_create(ce);
+   if (IS_ERR(rq)) {
+   err = PTR_ERR(rq);
+   goto out_ce;
+   }
+
+   if (deps) {
+   err = i915_request_await_deps(rq, deps);
+   if (err)
+   goto out_rq;
+
+   if (rq->engine->emit_init_breadcrumb) {
+   err = rq->engine->emit_init_breadcrumb(rq);
+   if (err)
+   goto out_rq;
+   }
+
+   deps = NULL;
+   }
+
+   /* The PTE updates + clear must not be interrupted. */
+   err = emit_no_arbitration(rq);
+   if (err)
+   goto out_rq;
+
len = emit_pte(rq, &it, cache_level, true, offset, CHUNK_SZ);
+   if (len <= 0) {
+   err = len;
+   goto out_rq;
+   }
+
+   err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+   if (err)
+   goto out_rq;
+
+   err = emit_copy_ccs(rq, offset, dst_access,
+   offset, src_access, len);
+   if (err)
+   goto out_rq;
+
+   err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+
+   /* Arbitration is re-enabled between requests. */
+out_rq:
+   if (*out)
+   i915_request_put(*out);
+   *out = i915_request_get(rq);
+   i915_request_add(rq);
+   if (err || !it.sg || !sg_dma_len(it.sg))
+   break;
+
+   cond_resched();
+   } while (1);
+
+out_ce:
+   return err;
+}
+
+static int
+intel_migrate_ccs_copy(struct intel_migrate *m,
+  struct i915_gem_ww_ctx *ww,
+  const struct i915_deps *deps,
+  struct scatterlist *sg,
+  enum i915_cache_level cache_level,
+  bool write_to_ccs,
+  struct i915_request **out)
+{
+   struct intel_context *ce;
+   int err;
+
+   *out = NULL;
+   if (!m->context)
+   return -ENODEV;
+
+   ce = intel_migrate_create_context(m);
+   if (IS_ERR(ce))
+   ce = intel_context_get(m->context);
+   GEM_BUG_ON(IS_ERR(ce));
+
+   err = intel_context_pin_ww(ce, ww);
+   if (err)
+   goto out;
+
+   err = intel_context_copy_ccs(ce, deps, sg, cache_level,
+write_to_ccs, out);
+
+   intel_context_unpin(ce);
+out:
+   intel_context_put(ce);
+   return err;
+}
+
 static int clear(struct intel_migrate *migrate,
 int (*fn)(struct intel_migrate *migrate,
   struct i915_gem_ww_ctx *ww,
@@ -144,7 +262,8 @@ static int clear(struct intel_migrate *migrate,
struct drm_i915_gem_object *obj;
struct i915_request *rq;
struct i915_gem_ww_ctx ww;
-   u32 *vaddr;
+   u32 *vaddr, val = 0;
+   bool ccs_cap = false;
int err = 0;
int i;
 
@@ -155,7 +274,12 @@ static int clear(struct intel_migrate *migrate,
/* Co

[PATCH v7 5/9] drm/i915/selftest_migrate: Consider the possible roundup of size

2022-03-28 Thread Ramalingam C
Consider the possible round-up of the obj size to the min_page_size
alignment during the obj allocation.

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/selftest_migrate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c 
b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index c9c4f391c5cc..b5da8b8cd039 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -152,6 +152,9 @@ static int clear(struct intel_migrate *migrate,
if (IS_ERR(obj))
return 0;
 
+   /* Consider the rounded up memory too */
+   sz = obj->base.size;
+
for_i915_gem_ww(&ww, err, true) {
        err = i915_gem_object_lock(obj, &ww);
if (err)
-- 
2.20.1



[PATCH v7 4/9] drm/i915/gt: Clear compress metadata for Flat-ccs objects

2022-03-28 Thread Ramalingam C
Xe-HP and later devices support Flat CCS, which reserves a portion of
the device memory to store compression metadata. During the clearing of
a device memory buffer object we also need to clear the associated
CCS buffer.

XY_CTRL_SURF_COPY_BLT is a BLT cmd used for reading and writing the
ccs surface of an lmem object. So on Flat-CCS capable platforms we use
XY_CTRL_SURF_COPY_BLT to clear the CCS metadata.
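
For reference, one control-surface blit boils down to five dwords. The
sketch below is assembled from the defines this patch adds; the
MI_FLUSH_DW emissions around it and the num_ccs_blks computation are
elided:

	*cs++ = XY_CTRL_SURF_COPY_BLT |
		src_access << SRC_ACCESS_TYPE_SHIFT |
		dst_access << DST_ACCESS_TYPE_SHIFT |
		((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
	*cs++ = src_offset;
	*cs++ = rq->engine->instance |
		FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
	*cs++ = dst_offset;
	*cs++ = rq->engine->instance |
		FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);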

v2: Fixed issues with platform naming [Lucas]
v3: Rebased [Ram]
Used the round_up funcs [Bob]
v4: Fixed ccs blk calculation [Ram]
Added Kdoc on flat-ccs.
v5: GENMASK is used [Matt]
mocs fix [Matt]
Comments Fix [Matt]
Flush address programming [Ram]
v6: FLUSH_DW is fixed
Few coding style fix
v7: Adopting the XY_FAST_COLOR_BLT [Thomas]
v8: XY_CTRL_SURF_COPY_BLT for ccs clearing.
v9: emit_copy_ccs is used.
v10: ctrl_surf cmds are filled in caller itself. [Thomas]
 only one ctrl surf cmd is used as size of lmem is <=8M [Thomas]

Signed-off-by: Ramalingam C 
Signed-off-by: Ayaz A Siddiqui 
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  16 +++
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 137 ++-
 2 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index 925e55b6a94f..372ef4c3ce2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -153,8 +153,10 @@
 #define   MI_FLUSH_DW_PROTECTED_MEM_EN (1 << 22)
 #define   MI_FLUSH_DW_STORE_INDEX  (1<<21)
 #define   MI_INVALIDATE_TLB(1<<18)
+#define   MI_FLUSH_DW_CCS  (1<<16)
 #define   MI_FLUSH_DW_OP_STOREDW   (1<<14)
 #define   MI_FLUSH_DW_OP_MASK  (3<<14)
+#define   MI_FLUSH_DW_LLC  (1<<9)
 #define   MI_FLUSH_DW_NOTIFY   (1<<8)
 #define   MI_INVALIDATE_BSD(1<<7)
 #define   MI_FLUSH_DW_USE_GTT  (1<<2)
@@ -203,6 +205,20 @@
 #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3))
 #define GFX_OP_DRAWRECT_INFO_I965  ((0x7900<<16)|0x2)
 
+#define XY_CTRL_SURF_INSTR_SIZE        5
+#define MI_FLUSH_DW_SIZE               3
+#define XY_CTRL_SURF_COPY_BLT          ((2 << 29) | (0x48 << 22) | 3)
+#define   SRC_ACCESS_TYPE_SHIFT        21
+#define   DST_ACCESS_TYPE_SHIFT        20
+#define   CCS_SIZE_MASK                0x3FF
+#define   CCS_SIZE_SHIFT               8
+#define   XY_CTRL_SURF_MOCS_MASK       GENMASK(31, 25)
+#define   NUM_CCS_BYTES_PER_BLOCK      256
+#define   NUM_BYTES_PER_CCS_BYTE       256
+#define   NUM_CCS_BLKS_PER_XFER        1024
+#define   INDIRECT_ACCESS              0
+#define   DIRECT_ACCESS                1
+
 #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
 #define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index ec9a9e7cb388..0657d33fedac 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -17,6 +17,8 @@ struct insert_pte_data {
 
 #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
 
+#define GET_CCS_BYTES(i915, size)  (HAS_FLAT_CCS(i915) ? \
+DIV_ROUND_UP(size, 
NUM_BYTES_PER_CCS_BYTE) : 0)
 static bool engine_supports_migration(struct intel_engine_cs *engine)
 {
if (!engine)
@@ -467,6 +469,123 @@ static bool wa_1209644611_applies(int ver, u32 size)
return height % 4 == 3 && height <= 8;
 }
 
+/**
+ * DOC: Flat-CCS - Memory compression for Local memory
+ *
+ * On Xe-HP and later devices, we use dedicated compression control state (CCS)
+ * stored in local memory for each surface, to support the 3D and media
+ * compression formats.
+ *
+ * The memory required for the CCS of the entire local memory is 1/256 of the
+ * local memory size. So before the kernel boot, the required memory is 
reserved
+ * for the CCS data and a secure register will be programmed with the CCS base
+ * address.
+ *
+ * Flat CCS data needs to be cleared when a lmem object is allocated.
+ * And CCS data can be copied in and out of CCS region through
+ * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
+ *
+ * When we exhaust the lmem, if the object's placements support smem, then we 
can
+ * directly decompress the compressed lmem object into smem and start using it
+ * from smem itself.
+ *
+ * But when we need to swapout the compressed lmem object into a smem region
+ * though objects' placement doesn't support smem, then we copy the lmem 
content
+ * as it is into smem region along with ccs data (using XY_CTRL_SURF_COPY_BLT).
+ * When the object 

[PATCH v7 1/9] drm/i915/gt: use engine instance directly for offset

2022-03-28 Thread Ramalingam C
To make it uniform across copy and clear, use the engine instance directly
for the upper half of the offset when forming the cmd in emit_clear.

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 20444d6ceb3c..9e6c98a17441 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -614,15 +614,13 @@ intel_context_migrate_copy(struct intel_context *ce,
return err;
 }
 
-static int emit_clear(struct i915_request *rq, u64 offset, int size, u32 value)
+static int emit_clear(struct i915_request *rq, u32 offset, int size, u32 value)
 {
const int ver = GRAPHICS_VER(rq->engine->i915);
u32 *cs;
 
GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
 
-   offset += (u64)rq->engine->instance << 32;
-
cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
if (IS_ERR(cs))
return PTR_ERR(cs);
@@ -632,17 +630,16 @@ static int emit_clear(struct i915_request *rq, u64 
offset, int size, u32 value)
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-   *cs++ = lower_32_bits(offset);
-   *cs++ = upper_32_bits(offset);
+   *cs++ = offset;
+   *cs++ = rq->engine->instance;
*cs++ = value;
*cs++ = MI_NOOP;
} else {
-   GEM_BUG_ON(upper_32_bits(offset));
*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
-   *cs++ = lower_32_bits(offset);
+   *cs++ = offset;
*cs++ = value;
}
 
-- 
2.20.1



[PATCH v7 2/9] drm/i915/gt: Use XY_FAST_COLOR_BLT to clear obj on graphics ver 12+

2022-03-28 Thread Ramalingam C
Use the faster XY_FAST_COLOR_BLT cmd on graphics version 12 and newer
for clearing (zeroing out) the pages of the newly allocated object.

XY_FAST_COLOR_BLT is faster than the older XY_COLOR_BLT.

v2:
  Typo fix at title [Thomas]
v3:
  XY_FAST_COLOR_BLT is used only for FLAT_CCS capable gen12+

Signed-off-by: Ramalingam C 
Signed-off-by: Chris Wilson 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  5 +++
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 43 +---
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index d112ffd56418..925e55b6a94f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -205,6 +205,11 @@
 
 #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
+#define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
+#define   XY_FAST_COLOR_BLT_DEPTH_32   (2 << 19)
+#define   XY_FAST_COLOR_BLT_DW 16
+#define   XY_FAST_COLOR_BLT_MOCS_MASK  GENMASK(27, 21)
+#define   XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT 31
 #define SRC_COPY_BLT_CMD   (2 << 29 | 0x43 << 22)
 #define GEN9_XY_FAST_COPY_BLT_CMD  (2 << 29 | 0x42 << 22)
 #define XY_SRC_COPY_BLT_CMD(2 << 29 | 0x53 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 9e6c98a17441..17dd372a47d1 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -614,18 +614,51 @@ intel_context_migrate_copy(struct intel_context *ce,
return err;
 }
 
-static int emit_clear(struct i915_request *rq, u32 offset, int size, u32 value)
+static int emit_clear(struct i915_request *rq, u32 offset, int size,
+ u32 value, bool is_lmem)
 {
-   const int ver = GRAPHICS_VER(rq->engine->i915);
+   struct drm_i915_private *i915 = rq->engine->i915;
+   int mocs = rq->engine->gt->mocs.uc_index << 1;
+   const int ver = GRAPHICS_VER(i915);
+   int ring_sz;
u32 *cs;
 
GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
 
-   cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
+   if (HAS_FLAT_CCS(i915) && ver >= 12)
+   ring_sz = XY_FAST_COLOR_BLT_DW;
+   else if (ver >= 8)
+   ring_sz = 8;
+   else
+   ring_sz = 6;
+
+   cs = intel_ring_begin(rq, ring_sz);
if (IS_ERR(cs))
return PTR_ERR(cs);
 
-   if (ver >= 8) {
+   if (HAS_FLAT_CCS(i915) && ver >= 12) {
+   *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
+   (XY_FAST_COLOR_BLT_DW - 2);
+   *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) |
+   (PAGE_SIZE - 1);
+   *cs++ = 0;
+   *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+   *cs++ = offset;
+   *cs++ = rq->engine->instance;
+   *cs++ = !is_lmem << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
+   /* BG7 */
+   *cs++ = value;
+   *cs++ = 0;
+   *cs++ = 0;
+   *cs++ = 0;
+   /* BG11 */
+   *cs++ = 0;
+   *cs++ = 0;
+   /* BG13 */
+   *cs++ = 0;
+   *cs++ = 0;
+   *cs++ = 0;
+   } else if (ver >= 8) {
*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
@@ -708,7 +741,7 @@ intel_context_migrate_clear(struct intel_context *ce,
if (err)
goto out_rq;
 
-   err = emit_clear(rq, offset, len, value);
+   err = emit_clear(rq, offset, len, value, is_lmem);
 
/* Arbitration is re-enabled between requests. */
 out_rq:
-- 
2.20.1



[PATCH v7 0/9] drm/i915/ttm: Evict and restore of compressed object

2022-03-28 Thread Ramalingam C
On Xe-HP and later devices, we use dedicated compression control
state (CCS) stored in local memory for each surface, to support
the 3D and media compression formats.

The memory required for the CCS of the entire local memory is
1/256 of the local memory size. So before the kernel
boots, the required memory is reserved for the CCS data and a
secure register will be programmed with the CCS base address.

So when we allocate an object in local memory we don't need to explicitly
allocate the space for the ccs data. But when we evict the obj into smem,
we need smem space of obj_size + (obj_size/256) to hold the compression
related data along with the obj.

Hence when we create smem for an obj with a possible lmem placement, we
create it with the extra space.

When we are swapping out the local memory obj on a flat-ccs capable
platform, we need to capture the ccs data too along with the main memory,
and we need to restore it when we are swapping in the content.

When an lmem object is swapped into a smem obj, the smem obj will
have the extra pages required to hold the ccs data corresponding to the
lmem main memory. So the main memory of the lmem obj will be copied into
the initial pages of the smem and then the ccs data corresponding to the
main memory will be copied to the subsequent pages of smem.

Swapin happens exactly in reverse order. First the main memory of the
lmem obj is restored from the smem's initial pages and then the ccs data
is restored from the subsequent pages of smem.

Extracting and restoring the CCS data is done through a special cmd called
XY_CTRL_SURF_COPY_BLT

v7:
  GEM_BUG_ON is added to catch inflated pages being handed out through
get_pages
  Optimised emit_copy_clear
  Engine index is directly used for the upper 32 bits of offset
  Use XY_FAST_COLOR_BLT only for FLAT_CCS capable platforms

Test-with: 20220314051432.15785-1-ramalinga...@intel.com

Ramalingam C (9):
  drm/i915/gt: use engine instance directly for offset
  drm/i915/gt: Use XY_FAST_COLOR_BLT to clear obj on graphics ver 12+
  drm/i915/gt: Optimize the migration and clear loop
  drm/i915/gt: Clear compress metadata for Flat-ccs objects
  drm/i915/selftest_migrate: Consider the possible roundup of size
  drm/i915/selftest_migrate: Check CCS meta data clear
  drm/ttm: Add a parameter to add extra pages into ttm_tt
  drm/i915/gem: Add extra pages in ttm_tt for ccs data
  drm/i915/migrate: Evict and restore the flatccs capable lmem obj

 drivers/gpu/drm/drm_gem_vram_helper.c|   2 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c  |  30 +-
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  21 +
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 391 +--
 drivers/gpu/drm/i915/gt/selftest_migrate.c   | 253 ++--
 drivers/gpu/drm/qxl/qxl_ttm.c|   2 +-
 drivers/gpu/drm/ttm/ttm_agp_backend.c|   2 +-
 drivers/gpu/drm/ttm/ttm_tt.c |  12 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c   |   2 +-
 include/drm/ttm/ttm_tt.h |   4 +-
 10 files changed, 644 insertions(+), 75 deletions(-)

-- 
2.20.1



Re: [Intel-gfx] [PATCH v5 3/9] drm/i915/gt: Clear compress metadata for Flat-ccs objects

2022-03-28 Thread Ramalingam C
On 2022-03-24 at 17:14:53 +0100, Thomas Hellström (Intel) wrote:
> Hi, Ram
> 
> On 3/21/22 23:44, Ramalingam C wrote:
> > Xe-HP and latest devices support Flat CCS which reserved a portion of
> > the device memory to store compression metadata, during the clearing of
> > device memory buffer object we also need to clear the associated
> > CCS buffer.
> > 
> > XY_CTRL_SURF_COPY_BLT is a BLT cmd used for reading and writing the
> > ccs surface of a lmem memory. So on Flat-CCS capable platform we use
> > XY_CTRL_SURF_COPY_BLT  to clear the CCS meta data.
> > 
> > v2: Fixed issues with platform naming [Lucas]
> > v3: Rebased [Ram]
> >  Used the round_up funcs [Bob]
> > v4: Fixed ccs blk calculation [Ram]
> >  Added Kdoc on flat-ccs.
> > v5: GENMASK is used [Matt]
> >  mocs fix [Matt]
> >  Comments Fix [Matt]
> >  Flush address programming [Ram]
> > v6: FLUSH_DW is fixed
> >  Few coding style fix
> > v7: Adopting the XY_FAST_COLOR_BLT (Thomas]
> > v8: XY_CTRL_SURF_COPY_BLT for ccs clearing.
> > v9: emit_copy_ccs is used.
> > 
> > Signed-off-by: Ramalingam C 
> > Signed-off-by: Ayaz A Siddiqui 
> > ---
> >   drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  15 ++
> >   drivers/gpu/drm/i915/gt/intel_migrate.c  | 164 ++-
> >   2 files changed, 175 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
> > b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> > index 925e55b6a94f..6b4eb7927ec7 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> > @@ -153,8 +153,10 @@
> >   #define   MI_FLUSH_DW_PROTECTED_MEM_EN (1 << 22)
> >   #define   MI_FLUSH_DW_STORE_INDEX (1<<21)
> >   #define   MI_INVALIDATE_TLB   (1<<18)
> > +#define   MI_FLUSH_DW_CCS  (1<<16)
> >   #define   MI_FLUSH_DW_OP_STOREDW  (1<<14)
> >   #define   MI_FLUSH_DW_OP_MASK (3<<14)
> > +#define   MI_FLUSH_DW_LLC  (1<<9)
> >   #define   MI_FLUSH_DW_NOTIFY  (1<<8)
> >   #define   MI_INVALIDATE_BSD   (1<<7)
> >   #define   MI_FLUSH_DW_USE_GTT (1<<2)
> > @@ -203,6 +205,19 @@
> >   #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3))
> >   #define GFX_OP_DRAWRECT_INFO_I965  ((0x7900<<16)|0x2)
> > +#define XY_CTRL_SURF_INSTR_SIZE        5
> > +#define MI_FLUSH_DW_SIZE               3
> > +#define XY_CTRL_SURF_COPY_BLT          ((2 << 29) | (0x48 << 22) | 3)
> > +#define   SRC_ACCESS_TYPE_SHIFT        21
> > +#define   DST_ACCESS_TYPE_SHIFT        20
> > +#define   CCS_SIZE_MASK                GENMASK(17, 8)
> > +#define   XY_CTRL_SURF_MOCS_MASK       GENMASK(31, 25)
> > +#define   NUM_CCS_BYTES_PER_BLOCK      256
> > +#define   NUM_BYTES_PER_CCS_BYTE       256
> > +#define   NUM_CCS_BLKS_PER_XFER        1024
> > +#define   INDIRECT_ACCESS              0
> > +#define   DIRECT_ACCESS                1
> > +
> >   #define COLOR_BLT_CMD (2 << 29 | 0x40 << 22 | (5 - 2))
> >   #define XY_COLOR_BLT_CMD  (2 << 29 | 0x50 << 22)
> >   #define XY_FAST_COLOR_BLT_CMD (2 << 29 | 0x44 << 22)
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
> > b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > index b656685a486d..39a5f8ae664d 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -16,7 +16,8 @@ struct insert_pte_data {
> >   };
> >   #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
> > -
> > +#define GET_CCS_BYTES(i915, size)  (HAS_FLAT_CCS(i915) ? \
> > +DIV_ROUND_UP(size, 
> > NUM_BYTES_PER_CCS_BYTE) : 0)
> >   static bool engine_supports_migration(struct intel_engine_cs *engine)
> >   {
> > if (!engine)
> > @@ -467,6 +468,145 @@ static bool wa_1209644611_applies(int ver, u32 size)
> > return height % 4 == 3 && height <= 8;
> >   }
> > +/**
> > + * DOC: Flat-CCS - Memory compression for Local memory
> > + *
> > + * On Xe-HP and later devices, we use dedicated compression control state 
> > (CCS)
> > + * stored in local memory for each surface, to support the 3D and media
> > + * compression formats.
> > + *
> > + * The memory required for the CCS of the entire l

Re: [PATCH v5 8/9] drm/i915/gem: Add extra pages in ttm_tt for ccs data

2022-03-28 Thread Ramalingam C
On 2022-03-24 at 17:28:08 +0100, Thomas Hellström wrote:
> 
> On 3/21/22 23:44, Ramalingam C wrote:
> > On Xe-HP and later devices, dedicated compression control state (CCS)
> > stored in local memory is used for each surface, to support the
> > 3D and media compression formats.
> > 
> > The memory required for the CCS of the entire local memory is 1/256 of
> > the local memory size. So before the kernel boot, the required memory
> > is reserved for the CCS data and a secure register will be programmed
> > with the CCS base address
> > 
> > So when an object is allocated in local memory, dont need to explicitly
> > allocate the space for ccs data. But when the obj is evicted into the
> > smem, to hold the compression related data along with the obj extra space
> > is needed in smem. i.e obj_size + (obj_size/256).
> > 
> > Hence when a smem pages are allocated for an obj with lmem placement
> > possibility we create with the extra pages required for the ccs data for
> > the obj size.
> > 
> > v2:
> >Used imperative wording [Thomas]
> > v3:
> >Inflate the pages only when obj's placement is lmem only
> > 
> > Signed-off-by: Ramalingam C 
> > cc: Christian Koenig 
> > cc: Hellstrom Thomas 
> > Reviewed-by: Thomas Hellstrom 
> > Reviewed-by: Nirmoy Das 
> > ---
> >   drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 29 -
> >   1 file changed, 28 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
> > b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > index 3b9f99c765c4..0305a150b9d4 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > @@ -20,6 +20,7 @@
> >   #include "gem/i915_gem_ttm.h"
> >   #include "gem/i915_gem_ttm_move.h"
> >   #include "gem/i915_gem_ttm_pm.h"
> > +#include "gt/intel_gpu_commands.h"
> >   #define I915_TTM_PRIO_PURGE 0
> >   #define I915_TTM_PRIO_NO_PAGES  1
> > @@ -262,12 +263,33 @@ static const struct i915_refct_sgt_ops tt_rsgt_ops = {
> > .release = i915_ttm_tt_release
> >   };
> > +static inline bool
> > +i915_gem_object_needs_ccs_pages(struct drm_i915_gem_object *obj)
> > +{
> > +   bool lmem_placement = false;
> > +   int i;
> > +
> > +   for (i = 0; i < obj->mm.n_placements; i++) {
> > +   /* Compression is not allowed for the objects with smem 
> > placement */
> > +   if (obj->mm.placements[i]->type == INTEL_MEMORY_SYSTEM)
> > +   return false;
> > +   if (!lmem_placement &&
> > +   obj->mm.placements[i]->type == INTEL_MEMORY_LOCAL)
> > +   lmem_placement = true;
> > +   }
> > +
> > +   return lmem_placement;
> > +}
> > +
> >   static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
> >  uint32_t page_flags)
> >   {
> > +   struct drm_i915_private *i915 = container_of(bo->bdev, typeof(*i915),
> > +bdev);
> > struct ttm_resource_manager *man =
> > ttm_manager_type(bo->bdev, bo->resource->mem_type);
> > struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
> > +   unsigned long ccs_pages = 0;
> > enum ttm_caching caching;
> > struct i915_ttm_tt *i915_tt;
> > int ret;
> > @@ -290,7 +312,12 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
> > ttm_buffer_object *bo,
> > i915_tt->is_shmem = true;
> > }
> > -   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, 0);
> > +   if (HAS_FLAT_CCS(i915) && i915_gem_object_needs_ccs_pages(obj))
> > +   ccs_pages = DIV_ROUND_UP(DIV_ROUND_UP(bo->base.size,
> > + NUM_BYTES_PER_CCS_BYTE),
> > +PAGE_SIZE);
> > +
> > +   ret = ttm_tt_init(&i915_tt->ttm, bo, page_flags, caching, ccs_pages);
> > if (ret)
> > goto err_free;
> 
> Since we need to respin could we add (in __i915_ttm_get_pages())
> 
> /* Verify that gem never sees inflated system pages. Keep that local to ttm
> */GEM_BUG_ON(bo->ttm && ((obj->base.size >> PAGE_SHIFT) <
> bo->ttm->num_pages))
Adding this GEM_BUG_ON in the next version.

Ram
> 
> /Thomas
> 
> 
> 


Re: [Intel-gfx] [PATCH v5 6/9] drm/i915/gt: offset handling for multiple copy engines

2022-03-28 Thread Ramalingam C
On 2022-03-24 at 17:20:28 +0100, Thomas Hellström (Intel) wrote:
> 
> On 3/21/22 23:44, Ramalingam C wrote:
> > Handle the src and dst chunk offsets for different instances of the copy
> > engines.
> > 
> > Signed-off-by: Ramalingam C 
> > ---
> >   drivers/gpu/drm/i915/gt/intel_migrate.c | 3 +++
> >   1 file changed, 3 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
> > b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > index 39a5f8ae664d..5f6341f91622 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -614,6 +614,9 @@ static int emit_copy(struct i915_request *rq,
> > u32 instance = rq->engine->instance;
> > u32 *cs;
> > +   src_offset += (u64)rq->engine->instance << 32;
> > +   dst_offset += (u64)rq->engine->instance << 32;
> > +
> 
> Again, these are nops since the offsets are 32-bit.
> 
> Also the instance selection is already handled in the functon, so I think
> this patch can be dropped.

Thanks. Dropped this patch, and made copy and clear uniform with respect
to the engine index handling for the offset.

Ram.
> 
> 
> > cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
> > if (IS_ERR(cs))
> > return PTR_ERR(cs);

