Re: [Intel-gfx] [PATCH 24/27] drm/i915: Multi-BB execbuf

2021-09-30 Thread Matthew Brost
On Fri, Aug 20, 2021 at 03:44:43PM -0700, Matthew Brost wrote:

Did a review offline with John Harrison, adding notes for what we found. 

> Allow multiple batch buffers to be submitted in a single execbuf IOCTL
> after a context has been configured with the 'set_parallel' extension.
> The number batches is implicit based on the contexts configuration.
> 
> This is implemented with a series of loops. First a loop is used to find
> all the batches, a loop to pin all the HW contexts, a loop to generate
> all the requests, a loop to submit all the requests, a loop to commit
> all the requests, and finally a loop to tie the requests to the VMAs
> they touch.

Clarify these steps a bit, also tieing requests to the VMAs is the 2nd to last
step with commiting requests to be the last.

> 
> A composite fence is also created for the also the generated requests to
> return to the user and to stick in dma resv slots.
>

Add a comment saying there should be no change in behavior for existing IOCTLs
expect if throttling because to space in the ring, the wait is done with the VMA
locks being held rather than dropping the locks and putting to the slow path.

> IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071=1
> media UMD: link to come
> 
> Signed-off-by: Matthew Brost 
> ---
>  .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 765 --
>  drivers/gpu/drm/i915/gt/intel_context.h   |   8 +-
>  drivers/gpu/drm/i915/gt/intel_context_types.h |  12 +
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   2 +
>  drivers/gpu/drm/i915/i915_request.h   |   9 +
>  drivers/gpu/drm/i915/i915_vma.c   |  21 +-
>  drivers/gpu/drm/i915/i915_vma.h   |  13 +-
>  7 files changed, 573 insertions(+), 257 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 8290bdadd167..481978974627 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -244,17 +244,23 @@ struct i915_execbuffer {
>   struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
>   struct eb_vma *vma;
>  
> - struct intel_engine_cs *engine; /** engine to queue the request to */
> + struct intel_gt *gt; /* gt for the execbuf */
>   struct intel_context *context; /* logical state for the request */
>   struct i915_gem_context *gem_context; /** caller's context */
>  
> - struct i915_request *request; /** our request to build */
> - struct eb_vma *batch; /** identity of the batch obj/vma */

Line wrap of these comments.

> + struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; /** our 
> requests to build */
> + struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; /** identity of the 
> batch obj/vma */
>   struct i915_vma *trampoline; /** trampoline used for chaining */
>  
> + /** used for excl fence in dma_resv objects when > 1 BB submitted */
> + struct dma_fence *composite_fence;
> +
>   /** actual size of execobj[] as we may extend it for the cmdparser */
>   unsigned int buffer_count;
>  
> + /* number of batches in execbuf IOCTL */
> + unsigned int num_batches;
> +
>   /** list of vma not yet bound during reservation phase */
>   struct list_head unbound;
>  
> @@ -281,7 +287,7 @@ struct i915_execbuffer {
>  
>   u64 invalid_flags; /** Set of execobj.flags that are invalid */
>  
> - u64 batch_len; /** Length of batch within object */
> + u64 batch_len[MAX_ENGINE_INSTANCE + 1]; /** Length of batch within 
> object */
>   u32 batch_start_offset; /** Location within object of batch */
>   u32 batch_flags; /** Flags composed for emit_bb_start() */
>   struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch 
> buffer */
> @@ -299,14 +305,13 @@ struct i915_execbuffer {
>  };
>  
>  static int eb_parse(struct i915_execbuffer *eb);
> -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb,
> -   bool throttle);
> +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
>  static void eb_unpin_engine(struct i915_execbuffer *eb);
>  
>  static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
>  {
> - return intel_engine_requires_cmd_parser(eb->engine) ||
> - (intel_engine_using_cmd_parser(eb->engine) &&
> + return intel_engine_requires_cmd_parser(eb->context->engine) ||
> + (intel_engine_using_cmd_parser(eb->context->engine) &&
>eb->args->batch_len);
>  }
>  
> @@ -544,11 +549,21 @@ eb_validate_vma(struct i915_execbuffer *eb,
>   return 0;
>  }
>  
> -static void
> +static inline bool
> +is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx)
> +{
> + return eb->args->flags & I915_EXEC_BATCH_FIRST ?
> + buffer_idx < eb->num_batches :
> + buffer_idx >= eb->args->buffer_count - 

Re: [Intel-gfx] [PATCH 24/27] drm/i915: Multi-BB execbuf

2021-08-29 Thread kernel test robot
Hi Matthew,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-intel/for-linux-next]
[also build test WARNING on drm-tip/drm-tip drm-exynos/exynos-drm-next 
next-20210827]
[cannot apply to tegra-drm/drm/tegra/for-next linus/master drm/drm-next v5.14]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: x86_64-rhel-8.3-kselftests (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
# apt-get install sparse
# sparse version: v0.6.3-348-gf0e6938b-dirty
# 
https://github.com/0day-ci/linux/commit/7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348
git checkout 7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc
# save the attached .config to linux build tree
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir 
ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 


sparse warnings: (new ones prefixed by >>)
>> drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c:608:21: sparse: sparse: Using 
>> plain integer as NULL pointer

vim +608 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c

   548  
   549  static int
   550  eb_add_vma(struct i915_execbuffer *eb,
   551 unsigned int *current_batch,
   552 unsigned int i,
   553 struct i915_vma *vma)
   554  {
   555  struct drm_i915_private *i915 = eb->i915;
   556  struct drm_i915_gem_exec_object2 *entry = >exec[i];
   557  struct eb_vma *ev = >vma[i];
   558  
   559  ev->vma = vma;
   560  ev->exec = entry;
   561  ev->flags = entry->flags;
   562  
   563  if (eb->lut_size > 0) {
   564  ev->handle = entry->handle;
   565  hlist_add_head(>node,
   566 >buckets[hash_32(entry->handle,
   567  eb->lut_size)]);
   568  }
   569  
   570  if (entry->relocation_count)
   571  list_add_tail(>reloc_link, >relocs);
   572  
   573  /*
   574   * SNA is doing fancy tricks with compressing batch buffers, 
which leads
   575   * to negative relocation deltas. Usually that works out ok 
since the
   576   * relocate address is still positive, except when the batch is 
placed
   577   * very low in the GTT. Ensure this doesn't happen.
   578   *
   579   * Note that actual hangs have only been observed on gen7, but 
for
   580   * paranoia do it everywhere.
   581   */
   582  if (is_batch_buffer(eb, i)) {
   583  if (entry->relocation_count &&
   584  !(ev->flags & EXEC_OBJECT_PINNED))
   585  ev->flags |= __EXEC_OBJECT_NEEDS_BIAS;
   586  if (eb->reloc_cache.has_fence)
   587  ev->flags |= EXEC_OBJECT_NEEDS_FENCE;
   588  
   589  eb->batches[*current_batch] = ev;
   590  
   591  if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) {
   592  drm_dbg(>drm,
   593  "Attempting to use self-modifying batch 
buffer\n");
   594  return -EINVAL;
   595  }
   596  
   597  if (range_overflows_t(u64,
   598eb->batch_start_offset,
   599eb->args->batch_len,
   600ev->vma->size)) {
   601  drm_dbg(>drm, "Attempting to use 
out-of-bounds batch\n");
   602  return -EINVAL;
   603  }
   604  
   605  if (eb->args->batch_len == 0)
   606  eb->batch_len[*current_batch] = ev->vma->size -
   607  eb->batch_start_offset;
 > 608  if (unlikely(eb->batch_len == 0)) { /* impossible! */
   609  drm_dbg(>drm, "Invalid batch length\n");
   610  return -EINVAL;
   611  }
   612  
   613  ++*current_batch;
   614  }
   615  
   616  return 0;
   617  }
   618  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


Re: [Intel-gfx] [PATCH 24/27] drm/i915: Multi-BB execbuf

2021-08-21 Thread kernel test robot
Hi Matthew,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-intel/for-linux-next]
[also build test WARNING on drm-tip/drm-tip drm-exynos/exynos-drm-next 
next-20210820]
[cannot apply to tegra-drm/drm/tegra/for-next linus/master drm/drm-next 
v5.14-rc6]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: x86_64-buildonly-randconfig-r002-20210821 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 
9e9d70591e72fc6762b4b9a226b68ed1307419bf)
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348
git checkout 7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c:608:20: warning: comparison 
>> of array 'eb->batch_len' equal to a null pointer is always false 
>> [-Wtautological-pointer-compare]
   if (unlikely(eb->batch_len == 0)) { /* impossible! */
^~
   include/linux/compiler.h:78:42: note: expanded from macro 'unlikely'
   # define unlikely(x)__builtin_expect(!!(x), 0)
   ^
   1 warning generated.


vim +608 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c

   548  
   549  static int
   550  eb_add_vma(struct i915_execbuffer *eb,
   551 unsigned int *current_batch,
   552 unsigned int i,
   553 struct i915_vma *vma)
   554  {
   555  struct drm_i915_private *i915 = eb->i915;
   556  struct drm_i915_gem_exec_object2 *entry = >exec[i];
   557  struct eb_vma *ev = >vma[i];
   558  
   559  ev->vma = vma;
   560  ev->exec = entry;
   561  ev->flags = entry->flags;
   562  
   563  if (eb->lut_size > 0) {
   564  ev->handle = entry->handle;
   565  hlist_add_head(>node,
   566 >buckets[hash_32(entry->handle,
   567  eb->lut_size)]);
   568  }
   569  
   570  if (entry->relocation_count)
   571  list_add_tail(>reloc_link, >relocs);
   572  
   573  /*
   574   * SNA is doing fancy tricks with compressing batch buffers, 
which leads
   575   * to negative relocation deltas. Usually that works out ok 
since the
   576   * relocate address is still positive, except when the batch is 
placed
   577   * very low in the GTT. Ensure this doesn't happen.
   578   *
   579   * Note that actual hangs have only been observed on gen7, but 
for
   580   * paranoia do it everywhere.
   581   */
   582  if (is_batch_buffer(eb, i)) {
   583  if (entry->relocation_count &&
   584  !(ev->flags & EXEC_OBJECT_PINNED))
   585  ev->flags |= __EXEC_OBJECT_NEEDS_BIAS;
   586  if (eb->reloc_cache.has_fence)
   587  ev->flags |= EXEC_OBJECT_NEEDS_FENCE;
   588  
   589  eb->batches[*current_batch] = ev;
   590  
   591  if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) {
   592  drm_dbg(>drm,
   593  "Attempting to use self-modifying batch 
buffer\n");
   594  return -EINVAL;
   595  }
   596  
   597  if (range_overflows_t(u64,
   598eb->batch_start_offset,
   599eb->args->batch_len,
   600ev->vma->size)) {
   601  drm_dbg(>drm, "Attempting to use 
out-of-bounds batch\n");
   602  return -EINVAL;
   603  }
   604  
   605  if (eb->args->batch_len == 0)
   606  eb->batch_len[*current_batch] = ev->vma->size -
   607  

[PATCH 24/27] drm/i915: Multi-BB execbuf

2021-08-20 Thread Matthew Brost
Allow multiple batch buffers to be submitted in a single execbuf IOCTL
after a context has been configured with the 'set_parallel' extension.
The number batches is implicit based on the contexts configuration.

This is implemented with a series of loops. First a loop is used to find
all the batches, a loop to pin all the HW contexts, a loop to generate
all the requests, a loop to submit all the requests, a loop to commit
all the requests, and finally a loop to tie the requests to the VMAs
they touch.

A composite fence is also created for the also the generated requests to
return to the user and to stick in dma resv slots.

IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071=1
media UMD: link to come

Signed-off-by: Matthew Brost 
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 765 --
 drivers/gpu/drm/i915/gt/intel_context.h   |   8 +-
 drivers/gpu/drm/i915/gt/intel_context_types.h |  12 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   2 +
 drivers/gpu/drm/i915/i915_request.h   |   9 +
 drivers/gpu/drm/i915/i915_vma.c   |  21 +-
 drivers/gpu/drm/i915/i915_vma.h   |  13 +-
 7 files changed, 573 insertions(+), 257 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 8290bdadd167..481978974627 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -244,17 +244,23 @@ struct i915_execbuffer {
struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
struct eb_vma *vma;
 
-   struct intel_engine_cs *engine; /** engine to queue the request to */
+   struct intel_gt *gt; /* gt for the execbuf */
struct intel_context *context; /* logical state for the request */
struct i915_gem_context *gem_context; /** caller's context */
 
-   struct i915_request *request; /** our request to build */
-   struct eb_vma *batch; /** identity of the batch obj/vma */
+   struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; /** our 
requests to build */
+   struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; /** identity of the 
batch obj/vma */
struct i915_vma *trampoline; /** trampoline used for chaining */
 
+   /** used for excl fence in dma_resv objects when > 1 BB submitted */
+   struct dma_fence *composite_fence;
+
/** actual size of execobj[] as we may extend it for the cmdparser */
unsigned int buffer_count;
 
+   /* number of batches in execbuf IOCTL */
+   unsigned int num_batches;
+
/** list of vma not yet bound during reservation phase */
struct list_head unbound;
 
@@ -281,7 +287,7 @@ struct i915_execbuffer {
 
u64 invalid_flags; /** Set of execobj.flags that are invalid */
 
-   u64 batch_len; /** Length of batch within object */
+   u64 batch_len[MAX_ENGINE_INSTANCE + 1]; /** Length of batch within 
object */
u32 batch_start_offset; /** Location within object of batch */
u32 batch_flags; /** Flags composed for emit_bb_start() */
struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch 
buffer */
@@ -299,14 +305,13 @@ struct i915_execbuffer {
 };
 
 static int eb_parse(struct i915_execbuffer *eb);
-static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb,
- bool throttle);
+static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
 static void eb_unpin_engine(struct i915_execbuffer *eb);
 
 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
 {
-   return intel_engine_requires_cmd_parser(eb->engine) ||
-   (intel_engine_using_cmd_parser(eb->engine) &&
+   return intel_engine_requires_cmd_parser(eb->context->engine) ||
+   (intel_engine_using_cmd_parser(eb->context->engine) &&
 eb->args->batch_len);
 }
 
@@ -544,11 +549,21 @@ eb_validate_vma(struct i915_execbuffer *eb,
return 0;
 }
 
-static void
+static inline bool
+is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx)
+{
+   return eb->args->flags & I915_EXEC_BATCH_FIRST ?
+   buffer_idx < eb->num_batches :
+   buffer_idx >= eb->args->buffer_count - eb->num_batches;
+}
+
+static int
 eb_add_vma(struct i915_execbuffer *eb,
-  unsigned int i, unsigned batch_idx,
+  unsigned int *current_batch,
+  unsigned int i,
   struct i915_vma *vma)
 {
+   struct drm_i915_private *i915 = eb->i915;
struct drm_i915_gem_exec_object2 *entry = >exec[i];
struct eb_vma *ev = >vma[i];
 
@@ -575,15 +590,41 @@ eb_add_vma(struct i915_execbuffer *eb,
 * Note that actual hangs have only been observed on gen7, but for
 * paranoia do it everywhere.
 */
-   if (i == batch_idx) {
+   if (is_batch_buffer(eb, i)) {
if (entry->relocation_count &&