On ke, 2017-03-29 at 16:56 +0100, Chris Wilson wrote:
> The major scaling bottleneck in execbuffer is the processing of the
> execobjects. Creating an auxiliary list is inefficient when compared to
> using the execobject array we already have allocated.
> 
> Reservation is then split into phases. As we lookup up the VMA, we
> try and bind it back into active location. Only if that fails, do we add
> it to the unbound list for phase 2. In phase 2, we try and add all those
> objects that could not fit into their previous location, with fallback
> to retrying all objects and evicting the VM in case of severe
> fragmentation. (This is the same as before, except that phase 1 is now
> done inline with looking up the VMA to avoid an iteration over the
> execobject array. In the ideal case, we eliminate the separate reservation
> phase). During the reservation phase, we only evict from the VM between
> passes (rather than currently as we try to fit every new VMA). In
> testing with Unreal Engine's Atlantis demo which stresses the eviction
> logic on gen7 class hardware, this speed up the framerate by a factor of
> 2.
> 
> The second loop amalgamation is between move_to_gpu and move_to_active.
> As we always submit the request, even if incomplete, we can use the
> current request to track active VMA as we perform the flushes and
> synchronisation required.
> 
> The next big advancement is to avoid copying back to the user any
> execobjects and relocations that are not changed.
> 
> v2: Add a Theory of Operation spiel.
> v3: Fall back to slow relocations in preparation for flushing userptrs.
> 
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>

<SNIP>

>  struct i915_execbuffer {
>       struct drm_i915_private *i915;
>       struct drm_file *file;
> @@ -63,19 +180,24 @@ struct i915_execbuffer {
>       struct i915_address_space *vm;
>       struct i915_vma *batch;
>       struct drm_i915_gem_request *request;
> -     u32 batch_start_offset;
> -     u32 batch_len;
> -     unsigned int dispatch_flags;
> -     struct drm_i915_gem_exec_object2 shadow_exec_entry;
> -     bool need_relocs;
> -     struct list_head vmas;
> +     unsigned int buffer_count;
> +     struct list_head unbound;
> +     struct list_head relocs;
>       struct reloc_cache {
>               struct drm_mm_node node;
>               unsigned long vaddr;
>               unsigned int page;
>               bool use_64bit_reloc : 1;
> +             bool has_llc : 1;
> +             bool has_fence : 1;
> +             bool needs_unfenced : 1;
>       } reloc_cache;
> -     int lut_mask;
> +     u64 invalid_flags;
> +     u32 context_flags;
> +     u32 dispatch_flags;
> +     u32 batch_start_offset;
> +     u32 batch_len;
> +     int lut_size;
>       struct hlist_head *buckets;

Please document (new) members.

> +static inline u64 gen8_noncanonical_addr(u64 address)
> +{
> +     return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);

GENMASK_ULL

> @@ -106,71 +256,302 @@ eb_create(struct i915_execbuffer *eb)
>                               return -ENOMEM;
>               }
>  
> -             eb->lut_mask = size;
> +             eb->lut_size = size;
>       } else {
> -             eb->lut_mask = -eb->args->buffer_count;
> +             eb->lut_size = -eb->buffer_count;

Document the negative meaning in the doc mentioned above.

> +static bool
> +eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
> +              const struct i915_vma *vma)
> +{
> +     if ((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0)

Let's try to stick to one convention; we gave up on == NULL too, so use
!(a & FOO).

<SNIP>

> +     if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
> +         (vma->node.start + vma->node.size - 1) >> 32)

upper_32_bits for clarity?

> +static void
> +eb_pin_vma(struct i915_execbuffer *eb,
> +        struct drm_i915_gem_exec_object2 *entry,
> +        struct i915_vma *vma)
> +{
> +     u64 flags;
> +
> +     flags = vma->node.start;

I'd be more comfortable if some mask was applied here.

Or at least GEM_BUG_ON(flags & BAD_FLAGS);

> +static inline void
> +eb_unreserve_vma(struct i915_vma *vma,
> +              struct drm_i915_gem_exec_object2 *entry)
>  {
> -     struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -
> -     __eb_unreserve_vma(vma, entry);
> -     entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
> +     if (entry->flags & __EXEC_OBJECT_HAS_PIN) {

I'd treat the if more as an early return, but I guess you have more
code coming.

> +static int
> +eb_add_vma(struct i915_execbuffer *eb,
> > +      struct drm_i915_gem_exec_object2 *entry,
> > +      struct i915_vma *vma)
>  {
> > -   struct i915_vma *vma;
> > +   int ret;
>  
> > -   list_for_each_entry(vma, &eb->vmas, exec_link) {
> > -           eb_unreserve_vma(vma);
> > -           i915_vma_put(vma);
> > -           vma->exec_entry = NULL;
> > +   GEM_BUG_ON(i915_vma_is_closed(vma));
> +
> +     if ((eb->args->flags & __EXEC_VALIDATED) == 0) {

Smells like this should be extracted into a helper function.

<SNIP>

>       }
>  
> -     if (eb->lut_mask >= 0)
> -             memset(eb->buckets, 0,
> -                    sizeof(struct hlist_head) << eb->lut_mask);
> +     vma->exec_entry = entry;
> +     entry->rsvd2 = (uintptr_t)vma;

Umm, there was a helper introduced in some patch.

<SNIP>

Could you add a comment explaining why we do this?

> +     if ((entry->flags & EXEC_OBJECT_PINNED) == 0)
> +             entry->flags |= eb->context_flags;
> +

<SNIP>

> +
> +static int
> +eb_reserve_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
> +{

<SNIP>
 
> -     i915_vma_get(vma);
> -     __exec_to_vma(&eb->exec[i]) = (uintptr_t)vma;
> -     return true;
> +     if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
> +             ret = i915_vma_get_fence(vma);
> +             if (ret)
> +                     return ret;
> +
> +             if (i915_vma_pin_fence(vma))
> +                     entry->flags |= __EXEC_OBJECT_HAS_FENCE;
> +     }

Smells like duplicate code with eb_vma_pin.

<SNIP> 
 
 
> static int
> eb_lookup_vmas(struct i915_execbuffer *eb)
> {

<SNIP>

> -     if (ht_needs_resize(eb->ctx)) {
> -             eb->ctx->vma_lut.ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
> -             queue_work(system_highpri_wq, &eb->ctx->vma_lut.resize);
> -     }
> +     if (eb->ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS) {
> +             struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;

You could hoist the lut variable; it's used quite a few times.

> @@ -616,16 +1048,15 @@ relocate_entry(struct drm_i915_gem_object *obj,
>               goto repeat;
>       }
>  
> -     return 0;
> +     return gen8_canonical_addr(target->node.start) | 1;

Magic bit.

> +static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
>  {
>  #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
> -     struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
> -     struct drm_i915_gem_relocation_entry __user *user_relocs;
> -     struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -     int remain, ret = 0;
> -
> -     user_relocs = u64_to_user_ptr(entry->relocs_ptr);
> +     struct drm_i915_gem_relocation_entry stack[N_RELOC(512)];
> +     struct drm_i915_gem_relocation_entry __user *urelocs;
> +     const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> +     unsigned int remain;
>  
> +     urelocs = u64_to_user_ptr(entry->relocs_ptr);
>       remain = entry->relocation_count;
> -     while (remain) {
> -             struct drm_i915_gem_relocation_entry *r = stack_reloc;
> -             unsigned long unwritten;
> -             unsigned int count;
> +     if (unlikely(remain > ULONG_MAX / sizeof(*urelocs)))

How about N_RELOC(ULONG_MAX)?

> @@ -732,66 +1164,66 @@ static int eb_relocate_vma(struct i915_vma *vma, 
> struct i915_execbuffer *eb)
>                * this is bad and so lockdep complains vehemently.
>                */
>               pagefault_disable();
> -             unwritten = __copy_from_user_inatomic(r, user_relocs, 
> count*sizeof(r[0]));
> -             pagefault_enable();
> -             if (unlikely(unwritten)) {
> -                     ret = -EFAULT;
> +             if (__copy_from_user_inatomic(r, urelocs, count*sizeof(r[0]))) {
> +                     pagefault_enable();
> +                     remain = -EFAULT;
>                       goto out;
>               }
> +             pagefault_enable();

Why dupe the pagefault_enable?

>  
> +             remain -= count;
>               do {
> -                     u64 offset = r->presumed_offset;
> +                     u64 offset = eb_relocate_entry(eb, vma, r);
>  
> -                     ret = eb_relocate_entry(vma, eb, r);
> -                     if (ret)
> +                     if (likely(offset == 0)) {

Sparse not complaining?

> +                     } else if ((s64)offset < 0) {
> +                             remain = (s64)offset;
>                               goto out;
> -
> -                     if (r->presumed_offset != offset) {
> +                     } else {
> +                             /* Note that reporting an error now
> +                              * leaves everything in an inconsistent
> +                              * state as we have *already* changed
> +                              * the relocation value inside the
> +                              * object. As we have not changed the
> +                              * reloc.presumed_offset or will not
> +                              * change the execobject.offset, on the
> +                              * call we may not rewrite the value
> +                              * inside the object, leaving it
> +                              * dangling and causing a GPU hang.
> +                              */
>                               pagefault_disable();
> -                             unwritten = __put_user(r->presumed_offset,
> -                                                    
> &user_relocs->presumed_offset);
> +                             __put_user(offset & ~1,

Magic.

> +                                        &urelocs[r-stack].presumed_offset);

There's the comment, so maybe worth if (__put_user()) DRM_DEBUG?

>                               pagefault_enable();

<SNIP>

> +static int check_relocations(const struct drm_i915_gem_exec_object2 *entry)
> {

<SNIP>
> +     const unsigned long relocs_max =
> +             ULONG_MAX / sizeof(struct drm_i915_gem_relocation_entry);

Could be a #define; this is used above too.

<SNIP>

> +     return __get_user(c, end - 1);

What's the point in this final check?

>  }
>  
> -static bool
> -need_reloc_mappable(struct i915_vma *vma)
> +static int
> +eb_copy_relocations(const struct i915_execbuffer *eb)
>  {
> -     struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -
> -     if (entry->relocation_count == 0)
> -             return false;
> -
> -     if (!i915_vma_is_ggtt(vma))
> -             return false;
> +     const unsigned int count = eb->buffer_count;
> +     unsigned int i;
> +     int ret;
>  
> -     /* See also use_cpu_reloc() */
> -     if (HAS_LLC(to_i915(vma->obj->base.dev)))
> -             return false;
> +     for (i = 0; i < count; i++) {
> +             struct drm_i915_gem_relocation_entry __user *urelocs;
> +             struct drm_i915_gem_relocation_entry *relocs;
> +             unsigned int nreloc = eb->exec[i].relocation_count, j;

No hidden variables in assignment lines.
 
> -static bool
> -eb_vma_misplaced(struct i915_vma *vma)
> -{
> -     struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> +             urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
> +             size = nreloc * sizeof(*relocs);
>  
> -     WARN_ON(entry->flags & __EXEC_OBJECT_NEEDS_MAP &&
> -             !i915_vma_is_ggtt(vma));
> +             relocs = drm_malloc_gfp(size, 1, GFP_TEMPORARY);
> +             if (!relocs) {
> +                     drm_free_large(relocs);
> +                     ret = -ENOMEM;
> +                     goto err;
> +             }
>  
> -     if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
> -             return true;
> +             /* copy_from_user is limited to 4GiB */
> +             j = 0;
> +             do {
> +                     u32 len = min_t(u64, 1ull<<31, size);

BIT_ULL(31)

> +static void eb_export_fence(struct drm_i915_gem_object *obj,
> +                         struct drm_i915_gem_request *req,
> +                         unsigned int flags)
> +{
> +     struct reservation_object *resv = obj->resv;
> +
> +     /* Ignore errors from failing to allocate the new fence, we can't
> +      * handle an error right now. Worst case should be missed
> +      * synchronisation leading to rendering corruption.
> +      */

Worth a DRM_DEBUG here?

> @@ -1155,10 +1524,33 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
>               }
>  
>               ret = i915_gem_request_await_object
> -                     (eb->request, obj, vma->exec_entry->flags & 
> EXEC_OBJECT_WRITE);
> +                     (eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
>               if (ret)
>                       return ret;
> +
> +skip_flushes:
> +             obj->base.write_domain = 0;
> +             if (entry->flags & EXEC_OBJECT_WRITE) {
> +                     obj->base.read_domains = 0;
> +                     if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
> +                             obj->cache_dirty = true;
> +                     intel_fb_obj_invalidate(obj, ORIGIN_CS);
> +             }
> +             obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
> +
> +             i915_vma_move_to_active(vma, eb->request, entry->flags);
> +             __eb_unreserve_vma(vma, entry);
> +             vma->exec_entry = NULL;

This seems like a bugfix hunk lost to refactoring patch.

> @@ -1377,16 +1629,16 @@ i915_reset_gen7_sol_offsets(struct 
> drm_i915_gem_request *req)
>               return -EINVAL;
>       }
>  
> -     cs = intel_ring_begin(req, 4 * 3);
> +     cs = intel_ring_begin(req, 4 * 2 + 2);
>       if (IS_ERR(cs))
>               return PTR_ERR(cs);
>  
> +     *cs++ = MI_LOAD_REGISTER_IMM(4);
>       for (i = 0; i < 4; i++) {
> -             *cs++ = MI_LOAD_REGISTER_IMM(1);
>               *cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i));
>               *cs++ = 0;
>       }
> -
> +     *cs++ = MI_NOOP;
>       intel_ring_advance(req, cs);

Again a lost hunk.

>  
> >     return 0;
> @@ -1422,10 +1674,11 @@ static struct i915_vma *eb_parse(struct 
> i915_execbuffer *eb, bool is_master)
>               goto out;
>  
>       vma->exec_entry =
> -             memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
> +             memset(&eb->exec[eb->buffer_count++],
> +                    0, sizeof(*vma->exec_entry));
>       vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
> -     i915_gem_object_get(shadow_batch_obj);
> -     list_add_tail(&vma->exec_link, &eb->vmas);
> +     vma->exec_entry->rsvd2 = (uintptr_t)vma;

Use the helper.

> @@ -1901,56 +2135,63 @@ i915_gem_execbuffer2(struct drm_device *dev, void 
> *data,
>                    struct drm_file *file)
>  {
>       struct drm_i915_gem_execbuffer2 *args = data;
> -     struct drm_i915_gem_exec_object2 *exec2_list = NULL;
> +     struct drm_i915_gem_exec_object2 *exec2_list;
>       int ret;
>  
>       if (args->buffer_count < 1 ||
> -         args->buffer_count > UINT_MAX / sizeof(*exec2_list)) {
> +         args->buffer_count >= UINT_MAX / sizeof(*exec2_list) - 1) {
>               DRM_DEBUG("execbuf2 with %d buffers\n", args->buffer_count);
>               return -EINVAL;
>       }
>  
> -     exec2_list = drm_malloc_gfp(args->buffer_count,
> +     if (!i915_gem_check_execbuffer(args))
> +             return -EINVAL;
> +
> +     exec2_list = drm_malloc_gfp(args->buffer_count + 1,

The "+ 1" is very unintuitive without a comment.

With the comments addressed, this is (90%):

Reviewed-by: Joonas Lahtinen <joonas.lahti...@linux.intel.com>

I'm pretty darn sure I ain't reviewing this again without some very
specific changelog and inter-diff.

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to