With a little complexity to handle cmds straddling page boundaries, we
can completely avoid having to vmap the batch and the shadow batch
objects whilst running the command parser.

On ivb i7-3720MQ:

x11perf -dot before 54.3M, after 53.2M (max 203M)
glxgears before 7110 fps, after 7300 fps (max 7860 fps)

Before:
Time to blt 16384 bytes x      1:        12.400µs, 1.2GiB/s
Time to blt 16384 bytes x   4096:         3.055µs, 5.0GiB/s

After:
Time to blt 16384 bytes x      1:         8.600µs, 1.8GiB/s
Time to blt 16384 bytes x   4096:         2.456µs, 6.2GiB/s

Removing the vmap is mostly a win, except that we lose in a few cases
where the batch size is greater than a page, due to the extra complexity
(loss of a simple, cache-efficient large copy, plus the page-boundary
handling).

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 297 +++++++++++++++++----------------
 1 file changed, 150 insertions(+), 147 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c 
b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 9605ff8f2fcd..60b30b4165d4 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -818,100 +818,6 @@ static bool valid_reg(const u32 *table, int count, u32 
addr)
        return false;
 }
 
-static u32 *vmap_batch(struct drm_i915_gem_object *obj,
-                      unsigned start, unsigned len)
-{
-       int i;
-       void *addr = NULL;
-       struct sg_page_iter sg_iter;
-       int first_page = start >> PAGE_SHIFT;
-       int last_page = (len + start + 4095) >> PAGE_SHIFT;
-       int npages = last_page - first_page;
-       struct page **pages;
-
-       pages = drm_malloc_ab(npages, sizeof(*pages));
-       if (pages == NULL) {
-               DRM_DEBUG_DRIVER("Failed to get space for pages\n");
-               goto finish;
-       }
-
-       i = 0;
-       for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 
first_page) {
-               pages[i++] = sg_page_iter_page(&sg_iter);
-               if (i == npages)
-                       break;
-       }
-
-       addr = vmap(pages, i, 0, PAGE_KERNEL);
-       if (addr == NULL) {
-               DRM_DEBUG_DRIVER("Failed to vmap pages\n");
-               goto finish;
-       }
-
-finish:
-       if (pages)
-               drm_free_large(pages);
-       return (u32*)addr;
-}
-
-/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
-static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
-                      struct drm_i915_gem_object *src_obj,
-                      u32 batch_start_offset,
-                      u32 batch_len)
-{
-       int needs_clflush = 0;
-       void *src_base, *src;
-       void *dst = NULL;
-       int ret;
-
-       if (batch_len > dest_obj->base.size ||
-           batch_len + batch_start_offset > src_obj->base.size)
-               return ERR_PTR(-E2BIG);
-
-       if (WARN_ON(dest_obj->pages_pin_count == 0))
-               return ERR_PTR(-ENODEV);
-
-       ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
-       if (ret) {
-               DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
-               return ERR_PTR(ret);
-       }
-
-       src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
-       if (!src_base) {
-               DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
-               ret = -ENOMEM;
-               goto unpin_src;
-       }
-
-       ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
-       if (ret) {
-               DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
-               goto unmap_src;
-       }
-
-       dst = vmap_batch(dest_obj, 0, batch_len);
-       if (!dst) {
-               DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
-               ret = -ENOMEM;
-               goto unmap_src;
-       }
-
-       src = src_base + offset_in_page(batch_start_offset);
-       if (needs_clflush)
-               drm_clflush_virt_range(src, batch_len);
-
-       memcpy(dst, src, batch_len);
-
-unmap_src:
-       vunmap(src_base);
-unpin_src:
-       i915_gem_object_unpin_pages(src_obj);
-
-       return ret ? ERR_PTR(ret) : dst;
-}
-
 /**
  * i915_needs_cmd_parser() - should a given ring use software command parsing?
  * @ring: the ring in question
@@ -1046,16 +952,34 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
                    u32 batch_len,
                    bool is_master)
 {
-       u32 *cmd, *batch_base, *batch_end;
+       u32 tmp[128];
+       struct sg_page_iter src_iter, dst_iter;
+       const struct drm_i915_cmd_descriptor *desc;
+       int needs_clflush = 0;
+       void *src, *dst;
+       unsigned in, out;
+       u32 *buf, partial = 0, length;
        struct drm_i915_cmd_descriptor default_desc = { 0 };
        bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
        int ret = 0;
 
-       batch_base = copy_batch(shadow_batch_obj, batch_obj,
-                               batch_start_offset, batch_len);
-       if (IS_ERR(batch_base)) {
-               DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
-               return PTR_ERR(batch_base);
+       if (batch_len > shadow_batch_obj->base.size ||
+           batch_len + batch_start_offset > batch_obj->base.size)
+               return -E2BIG;
+
+       if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
+               return -ENODEV;
+
+       ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
+       if (ret) {
+               DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
+               return ret;
+       }
+
+       ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
+       if (ret) {
+               DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
+               goto unpin;
        }
 
        /*
@@ -1063,54 +987,136 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
         * large or larger and copy_batch() will write MI_NOPs to the extra
         * space. Parsing should be faster in some cases this way.
         */
-       batch_end = batch_base + (batch_len / sizeof(*batch_end));
+       ret = -EINVAL;
+
+       __sg_page_iter_start(&dst_iter,
+                            shadow_batch_obj->pages->sgl,
+                            shadow_batch_obj->pages->nents,
+                            0);
+       __sg_page_iter_next(&dst_iter);
+       dst = kmap_atomic(sg_page_iter_page(&dst_iter));
+
+       in = offset_in_page(batch_start_offset);
+       out = 0;
+       for_each_sg_page(batch_obj->pages->sgl,
+                        &src_iter,
+                        batch_obj->pages->nents,
+                        batch_start_offset >> PAGE_SHIFT) {
+               u32 this, i, j, k;
+               u32 *cmd, *page_end, *batch_end;
+
+               this = batch_len;
+               if (this > PAGE_SIZE - in)
+                       this = PAGE_SIZE - in;
+
+               src = kmap_atomic(sg_page_iter_page(&src_iter));
+               if (needs_clflush)
+                       drm_clflush_virt_range(src + in, this);
+
+               i = this;
+               j = in;
+               do {
+                       /* We keep dst around so that we do not blow
+                        * the CPU caches immediately after the copy (due
+                        * to the kunmap_atomic(dst) doing a TLB flush.
+                        */
+                       if (out == PAGE_SIZE) {
+                               __sg_page_iter_next(&dst_iter);
+                               kunmap_atomic(dst);
+                               dst = kmap_atomic(sg_page_iter_page(&dst_iter));
+                               out = 0;
+                       }
 
-       cmd = batch_base;
-       while (cmd < batch_end) {
-               const struct drm_i915_cmd_descriptor *desc;
-               u32 length;
+                       k = i;
+                       if (k > PAGE_SIZE - out)
+                               k = PAGE_SIZE - out;
+                       if (k == PAGE_SIZE)
+                               copy_page(dst, src);
+                       else
+                               memcpy(dst + out, src + j, k);
+
+                       out += k;
+                       j += k;
+                       i -= k;
+               } while (i);
+
+               cmd = src + in;
+               page_end = (void *)cmd + this;
+               batch_end = (void *)cmd + batch_len;
+
+               if (partial) {
+                       memcpy(tmp + partial, cmd, (length - partial)*4);
+                       cmd -= partial;
+                       partial = 0;
+                       buf = tmp;
+                       goto check;
+               }
 
-               if (*cmd == MI_BATCH_BUFFER_END)
-                       break;
+               do {
+                       if (*cmd == MI_BATCH_BUFFER_END) {
+                               ret = 0;
+                               goto unmap_src;
+                       }
 
-               desc = find_cmd(ring, *cmd, &default_desc);
-               if (!desc) {
-                       DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
-                                        *cmd);
-                       ret = -EINVAL;
-                       break;
-               }
+                       desc = find_cmd(ring, *cmd, &default_desc);
+                       if (!desc) {
+                               DRM_DEBUG_DRIVER("CMD: Unrecognized command: 
0x%08X\n",
+                                                *cmd);
+                               goto unmap_src;
+                       }
 
-               /*
-                * If the batch buffer contains a chained batch, return an
-                * error that tells the caller to abort and dispatch the
-                * workload as a non-secure batch.
-                */
-               if (desc->cmd.value == MI_BATCH_BUFFER_START) {
-                       ret = -EACCES;
-                       break;
-               }
+                       /*
+                        * If the batch buffer contains a chained batch, return 
an
+                        * error that tells the caller to abort and dispatch the
+                        * workload as a non-secure batch.
+                        */
+                       if (desc->cmd.value == MI_BATCH_BUFFER_START) {
+                               ret = -EACCES;
+                               goto unmap_src;
+                       }
 
-               if (desc->flags & CMD_DESC_FIXED)
-                       length = desc->length.fixed;
-               else
-                       length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
-
-               if ((batch_end - cmd) < length) {
-                       DRM_DEBUG_DRIVER("CMD: Command length exceeds batch 
length: 0x%08X length=%u batchlen=%td\n",
-                                        *cmd,
-                                        length,
-                                        batch_end - cmd);
-                       ret = -EINVAL;
-                       break;
-               }
+                       if (desc->flags & CMD_DESC_FIXED)
+                               length = desc->length.fixed;
+                       else
+                               length = ((*cmd & desc->length.mask) + 
LENGTH_BIAS);
+
+                       if (cmd + length > page_end) {
+                               if (length + cmd > batch_end) {
+                                       DRM_DEBUG_DRIVER("CMD: Command length 
exceeds batch length: 0x%08X length=%u batchlen=%td\n",
+                                                        *cmd, length, 
batch_end - cmd);
+                                       goto unmap_src;
+                               }
+
+                               if (WARN_ON(length > sizeof(tmp)/4)) {
+                                       ret = -ENODEV;
+                                       goto unmap_src;
+                               }
+
+                               partial = page_end - cmd;
+                               memcpy(tmp, cmd, partial*4);
+                               break;
+                       }
+
+                       buf = cmd;
+check:
+                       if (!check_cmd(ring, desc, buf, is_master, 
&oacontrol_set))
+                               goto unmap_src;
 
-               if (!check_cmd(ring, desc, cmd, is_master, &oacontrol_set)) {
-                       ret = -EINVAL;
+                       cmd += length;
+               } while (cmd < page_end);
+
+               kunmap_atomic(src);
+
+               batch_len -= this;
+               if (batch_len == 0)
                        break;
-               }
 
-               cmd += length;
+               in = 0;
+               continue;
+
+unmap_src:
+               kunmap_atomic(src);
+               goto unmap_dst;
        }
 
        if (oacontrol_set) {
@@ -1118,13 +1124,10 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
                ret = -EINVAL;
        }
 
-       if (cmd >= batch_end) {
-               DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE 
cmd!\n");
-               ret = -EINVAL;
-       }
-
-       vunmap(batch_base);
-
+unmap_dst:
+       kunmap_atomic(dst);
+unpin:
+       i915_gem_object_unpin_pages(batch_obj);
        return ret;
 }
 
-- 
2.1.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to