Re: [Intel-gfx] [PATCH 21/26] drm/i915: Multi-BB execbuf

2021-10-12 Thread Matthew Brost
On Tue, Oct 12, 2021 at 02:22:41PM -0700, John Harrison wrote:
> On 10/4/2021 15:06, Matthew Brost wrote:
> > Allow multiple batch buffers to be submitted in a single execbuf IOCTL
> > after a context has been configured with the 'set_parallel' extension.
> > The number batches is implicit based on the contexts configuration.
> > 
> > This is implemented with a series of loops. First a loop is used to find
> > all the batches, a loop to pin all the HW contexts, a loop to create all
> > the requests, a loop to submit (emit BB start, etc...) all the requests,
> > a loop to tie the requests to the VMAs they touch, and finally a loop to
> > commit the requests to the backend.
> > 
> > A composite fence is also created for the generated requests to return
> > to the user and to stick in dma resv slots.
> > 
> > No behavior from the existing IOCTL should be changed aside from when
> > throttling because the ring for a context is full, wait on the request
> throttling because the ring for -> throttling the ring because
> 
> full, wait -> full. In this situation, i915 will now wait
> 

Yep.

> > while holding the object locks.
> , previously it would have dropped the locks for the wait.
> 
> And maybe explain why this change is necessary?
>

We could drop the lock but it would make the code way more complicated
and IMO simpler code far out weighs the potential benefit of dropping
the lock. The dropping of the lock probably was a premature optimization
that landed in the code without any data backing it up that it helped in
any meaningful way. I can add a comment stating this.

> 
> > 
> > IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071=1
> > media UMD: https://github.com/intel/media-driver/pull/1252
> > 
> > v2:
> >   (Matthew Brost)
> >- Return proper error value if i915_request_create fails
> > v3:
> >   (John Harrison)
> >- Add comment explaining create / add order loops + locking
> >- Update commit message explaining different in IOCTL behavior
> >- Line wrap some comments
> >- eb_add_request returns void
> >- Return -EINVAL rather triggering BUG_ON if cmd parser used
> >   (Checkpatch)
> >- Check eb->batch_len[*current_batch]
> > 
> > Signed-off-by: Matthew Brost 
> > ---
> >   .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 793 --
> >   drivers/gpu/drm/i915/gt/intel_context.h   |   8 +-
> >   drivers/gpu/drm/i915/gt/intel_context_types.h |  10 +
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   2 +
> >   drivers/gpu/drm/i915/i915_request.h   |   9 +
> >   drivers/gpu/drm/i915/i915_vma.c   |  21 +-
> >   drivers/gpu/drm/i915/i915_vma.h   |  13 +-
> >   7 files changed, 599 insertions(+), 257 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
> > b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > index 2f2434b52317..5c7fb6f68bbb 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > @@ -244,17 +244,25 @@ struct i915_execbuffer {
> > struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
> > struct eb_vma *vma;
> > -   struct intel_engine_cs *engine; /** engine to queue the request to */
> > +   struct intel_gt *gt; /* gt for the execbuf */
> > struct intel_context *context; /* logical state for the request */
> > struct i915_gem_context *gem_context; /** caller's context */
> > -   struct i915_request *request; /** our request to build */
> > -   struct eb_vma *batch; /** identity of the batch obj/vma */
> > +   /** our requests to build */
> > +   struct i915_request *requests[MAX_ENGINE_INSTANCE + 1];
> > +   /** identity of the batch obj/vma */
> > +   struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1];
> > struct i915_vma *trampoline; /** trampoline used for chaining */
> > +   /** used for excl fence in dma_resv objects when > 1 BB submitted */
> > +   struct dma_fence *composite_fence;
> > +
> > /** actual size of execobj[] as we may extend it for the cmdparser */
> > unsigned int buffer_count;
> > +   /* number of batches in execbuf IOCTL */
> > +   unsigned int num_batches;
> > +
> > /** list of vma not yet bound during reservation phase */
> > struct list_head unbound;
> > @@ -281,7 +289,8 @@ struct i915_execbuffer {
> > u64 invalid_flags; /** Set of execobj.flags that are invalid */
> > -   u64 batch_len; /** Length of batch within object */
> > +   /** Length of batch within object */
> > +   u64 batch_len[MAX_ENGINE_INSTANCE + 1];
> > u32 batch_start_offset; /** Location within object of batch */
> > u32 batch_flags; /** Flags composed for emit_bb_start() */
> > struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch 
> > buffer */
> > @@ -299,14 +308,13 @@ struct i915_execbuffer {
> >   };
> >   static int eb_parse(struct i915_execbuffer *eb);
> > -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb,
> > - 

Re: [Intel-gfx] [PATCH 21/26] drm/i915: Multi-BB execbuf

2021-10-12 Thread John Harrison

On 10/4/2021 15:06, Matthew Brost wrote:

Allow multiple batch buffers to be submitted in a single execbuf IOCTL
after a context has been configured with the 'set_parallel' extension.
The number batches is implicit based on the contexts configuration.

This is implemented with a series of loops. First a loop is used to find
all the batches, a loop to pin all the HW contexts, a loop to create all
the requests, a loop to submit (emit BB start, etc...) all the requests,
a loop to tie the requests to the VMAs they touch, and finally a loop to
commit the requests to the backend.

A composite fence is also created for the generated requests to return
to the user and to stick in dma resv slots.

No behavior from the existing IOCTL should be changed aside from when
throttling because the ring for a context is full, wait on the request

throttling because the ring for -> throttling the ring because

full, wait -> full. In this situation, i915 will now wait


while holding the object locks.

, previously it would have dropped the locks for the wait.

And maybe explain why this change is necessary?




IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071=1
media UMD: https://github.com/intel/media-driver/pull/1252

v2:
  (Matthew Brost)
   - Return proper error value if i915_request_create fails
v3:
  (John Harrison)
   - Add comment explaining create / add order loops + locking
   - Update commit message explaining different in IOCTL behavior
   - Line wrap some comments
   - eb_add_request returns void
   - Return -EINVAL rather triggering BUG_ON if cmd parser used
  (Checkpatch)
   - Check eb->batch_len[*current_batch]

Signed-off-by: Matthew Brost 
---
  .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 793 --
  drivers/gpu/drm/i915/gt/intel_context.h   |   8 +-
  drivers/gpu/drm/i915/gt/intel_context_types.h |  10 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   2 +
  drivers/gpu/drm/i915/i915_request.h   |   9 +
  drivers/gpu/drm/i915/i915_vma.c   |  21 +-
  drivers/gpu/drm/i915/i915_vma.h   |  13 +-
  7 files changed, 599 insertions(+), 257 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 2f2434b52317..5c7fb6f68bbb 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -244,17 +244,25 @@ struct i915_execbuffer {
struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
struct eb_vma *vma;
  
-	struct intel_engine_cs *engine; /** engine to queue the request to */

+   struct intel_gt *gt; /* gt for the execbuf */
struct intel_context *context; /* logical state for the request */
struct i915_gem_context *gem_context; /** caller's context */
  
-	struct i915_request *request; /** our request to build */

-   struct eb_vma *batch; /** identity of the batch obj/vma */
+   /** our requests to build */
+   struct i915_request *requests[MAX_ENGINE_INSTANCE + 1];
+   /** identity of the batch obj/vma */
+   struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1];
struct i915_vma *trampoline; /** trampoline used for chaining */
  
+	/** used for excl fence in dma_resv objects when > 1 BB submitted */

+   struct dma_fence *composite_fence;
+
/** actual size of execobj[] as we may extend it for the cmdparser */
unsigned int buffer_count;
  
+	/* number of batches in execbuf IOCTL */

+   unsigned int num_batches;
+
/** list of vma not yet bound during reservation phase */
struct list_head unbound;
  
@@ -281,7 +289,8 @@ struct i915_execbuffer {
  
  	u64 invalid_flags; /** Set of execobj.flags that are invalid */
  
-	u64 batch_len; /** Length of batch within object */

+   /** Length of batch within object */
+   u64 batch_len[MAX_ENGINE_INSTANCE + 1];
u32 batch_start_offset; /** Location within object of batch */
u32 batch_flags; /** Flags composed for emit_bb_start() */
struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch 
buffer */
@@ -299,14 +308,13 @@ struct i915_execbuffer {
  };
  
  static int eb_parse(struct i915_execbuffer *eb);

-static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb,
- bool throttle);
+static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
  static void eb_unpin_engine(struct i915_execbuffer *eb);
  
  static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)

  {
-   return intel_engine_requires_cmd_parser(eb->engine) ||
-   (intel_engine_using_cmd_parser(eb->engine) &&
+   return intel_engine_requires_cmd_parser(eb->context->engine) ||
+   (intel_engine_using_cmd_parser(eb->context->engine) &&
 eb->args->batch_len);
  }
  
@@ -544,11 +552,21 @@ eb_validate_vma(struct i915_execbuffer *eb,

return 0;
  

Re: [Intel-gfx] [PATCH 21/26] drm/i915: Multi-BB execbuf

2021-10-06 Thread Matthew Brost
On Mon, Oct 04, 2021 at 03:06:32PM -0700, Matthew Brost wrote:
> Allow multiple batch buffers to be submitted in a single execbuf IOCTL
> after a context has been configured with the 'set_parallel' extension.
> The number batches is implicit based on the contexts configuration.
> 
> This is implemented with a series of loops. First a loop is used to find
> all the batches, a loop to pin all the HW contexts, a loop to create all
> the requests, a loop to submit (emit BB start, etc...) all the requests,
> a loop to tie the requests to the VMAs they touch, and finally a loop to
> commit the requests to the backend.
> 
> A composite fence is also created for the generated requests to return
> to the user and to stick in dma resv slots.
> 
> No behavior from the existing IOCTL should be changed aside from when
> throttling because the ring for a context is full, wait on the request
> while holding the object locks.
> 
> IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071=1
> media UMD: https://github.com/intel/media-driver/pull/1252
> 
> v2:
>  (Matthew Brost)
>   - Return proper error value if i915_request_create fails
> v3:
>  (John Harrison)
>   - Add comment explaining create / add order loops + locking
>   - Update commit message explaining different in IOCTL behavior
>   - Line wrap some comments
>   - eb_add_request returns void
>   - Return -EINVAL rather triggering BUG_ON if cmd parser used
>  (Checkpatch)
>   - Check eb->batch_len[*current_batch]
> 
> Signed-off-by: Matthew Brost 
> ---
>  .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 793 --
>  drivers/gpu/drm/i915/gt/intel_context.h   |   8 +-
>  drivers/gpu/drm/i915/gt/intel_context_types.h |  10 +
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   2 +
>  drivers/gpu/drm/i915/i915_request.h   |   9 +
>  drivers/gpu/drm/i915/i915_vma.c   |  21 +-
>  drivers/gpu/drm/i915/i915_vma.h   |  13 +-
>  7 files changed, 599 insertions(+), 257 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 2f2434b52317..5c7fb6f68bbb 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -244,17 +244,25 @@ struct i915_execbuffer {
>   struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
>   struct eb_vma *vma;
>  
> - struct intel_engine_cs *engine; /** engine to queue the request to */
> + struct intel_gt *gt; /* gt for the execbuf */
>   struct intel_context *context; /* logical state for the request */
>   struct i915_gem_context *gem_context; /** caller's context */
>  
> - struct i915_request *request; /** our request to build */
> - struct eb_vma *batch; /** identity of the batch obj/vma */
> + /** our requests to build */
> + struct i915_request *requests[MAX_ENGINE_INSTANCE + 1];
> + /** identity of the batch obj/vma */
> + struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1];
>   struct i915_vma *trampoline; /** trampoline used for chaining */
>  
> + /** used for excl fence in dma_resv objects when > 1 BB submitted */
> + struct dma_fence *composite_fence;
> +
>   /** actual size of execobj[] as we may extend it for the cmdparser */
>   unsigned int buffer_count;
>  
> + /* number of batches in execbuf IOCTL */
> + unsigned int num_batches;
> +
>   /** list of vma not yet bound during reservation phase */
>   struct list_head unbound;
>  
> @@ -281,7 +289,8 @@ struct i915_execbuffer {
>  
>   u64 invalid_flags; /** Set of execobj.flags that are invalid */
>  
> - u64 batch_len; /** Length of batch within object */
> + /** Length of batch within object */
> + u64 batch_len[MAX_ENGINE_INSTANCE + 1];
>   u32 batch_start_offset; /** Location within object of batch */
>   u32 batch_flags; /** Flags composed for emit_bb_start() */
>   struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch 
> buffer */
> @@ -299,14 +308,13 @@ struct i915_execbuffer {
>  };
>  
>  static int eb_parse(struct i915_execbuffer *eb);
> -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb,
> -   bool throttle);
> +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
>  static void eb_unpin_engine(struct i915_execbuffer *eb);
>  
>  static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
>  {
> - return intel_engine_requires_cmd_parser(eb->engine) ||
> - (intel_engine_using_cmd_parser(eb->engine) &&
> + return intel_engine_requires_cmd_parser(eb->context->engine) ||
> + (intel_engine_using_cmd_parser(eb->context->engine) &&
>eb->args->batch_len);
>  }
>  
> @@ -544,11 +552,21 @@ eb_validate_vma(struct i915_execbuffer *eb,
>   return 0;
>  }
>  
> -static void
> +static inline bool
> +is_batch_buffer(struct 

Re: [Intel-gfx] [PATCH 21/26] drm/i915: Multi-BB execbuf

2021-10-05 Thread Matthew Brost
On Mon, Oct 04, 2021 at 03:06:32PM -0700, Matthew Brost wrote:
> Allow multiple batch buffers to be submitted in a single execbuf IOCTL
> after a context has been configured with the 'set_parallel' extension.
> The number batches is implicit based on the contexts configuration.
> 
> This is implemented with a series of loops. First a loop is used to find
> all the batches, a loop to pin all the HW contexts, a loop to create all
> the requests, a loop to submit (emit BB start, etc...) all the requests,
> a loop to tie the requests to the VMAs they touch, and finally a loop to
> commit the requests to the backend.
> 
> A composite fence is also created for the generated requests to return
> to the user and to stick in dma resv slots.
> 
> No behavior from the existing IOCTL should be changed aside from when
> throttling because the ring for a context is full, wait on the request
> while holding the object locks.
> 
> IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071=1
> media UMD: https://github.com/intel/media-driver/pull/1252
> 
> v2:
>  (Matthew Brost)
>   - Return proper error value if i915_request_create fails
> v3:
>  (John Harrison)
>   - Add comment explaining create / add order loops + locking
>   - Update commit message explaining different in IOCTL behavior
>   - Line wrap some comments
>   - eb_add_request returns void
>   - Return -EINVAL rather triggering BUG_ON if cmd parser used
>  (Checkpatch)
>   - Check eb->batch_len[*current_batch]
> 
> Signed-off-by: Matthew Brost 
> ---
>  .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 793 --
>  drivers/gpu/drm/i915/gt/intel_context.h   |   8 +-
>  drivers/gpu/drm/i915/gt/intel_context_types.h |  10 +
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   2 +
>  drivers/gpu/drm/i915/i915_request.h   |   9 +
>  drivers/gpu/drm/i915/i915_vma.c   |  21 +-
>  drivers/gpu/drm/i915/i915_vma.h   |  13 +-
>  7 files changed, 599 insertions(+), 257 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 2f2434b52317..5c7fb6f68bbb 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -244,17 +244,25 @@ struct i915_execbuffer {
>   struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
>   struct eb_vma *vma;
>  
> - struct intel_engine_cs *engine; /** engine to queue the request to */
> + struct intel_gt *gt; /* gt for the execbuf */
>   struct intel_context *context; /* logical state for the request */
>   struct i915_gem_context *gem_context; /** caller's context */
>  
> - struct i915_request *request; /** our request to build */
> - struct eb_vma *batch; /** identity of the batch obj/vma */
> + /** our requests to build */
> + struct i915_request *requests[MAX_ENGINE_INSTANCE + 1];
> + /** identity of the batch obj/vma */
> + struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1];
>   struct i915_vma *trampoline; /** trampoline used for chaining */
>  
> + /** used for excl fence in dma_resv objects when > 1 BB submitted */
> + struct dma_fence *composite_fence;
> +
>   /** actual size of execobj[] as we may extend it for the cmdparser */
>   unsigned int buffer_count;
>  
> + /* number of batches in execbuf IOCTL */
> + unsigned int num_batches;
> +
>   /** list of vma not yet bound during reservation phase */
>   struct list_head unbound;
>  
> @@ -281,7 +289,8 @@ struct i915_execbuffer {
>  
>   u64 invalid_flags; /** Set of execobj.flags that are invalid */
>  
> - u64 batch_len; /** Length of batch within object */
> + /** Length of batch within object */
> + u64 batch_len[MAX_ENGINE_INSTANCE + 1];
>   u32 batch_start_offset; /** Location within object of batch */
>   u32 batch_flags; /** Flags composed for emit_bb_start() */
>   struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch 
> buffer */
> @@ -299,14 +308,13 @@ struct i915_execbuffer {
>  };
>  
>  static int eb_parse(struct i915_execbuffer *eb);
> -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb,
> -   bool throttle);
> +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
>  static void eb_unpin_engine(struct i915_execbuffer *eb);
>  
>  static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
>  {
> - return intel_engine_requires_cmd_parser(eb->engine) ||
> - (intel_engine_using_cmd_parser(eb->engine) &&
> + return intel_engine_requires_cmd_parser(eb->context->engine) ||
> + (intel_engine_using_cmd_parser(eb->context->engine) &&
>eb->args->batch_len);
>  }
>  
> @@ -544,11 +552,21 @@ eb_validate_vma(struct i915_execbuffer *eb,
>   return 0;
>  }
>  
> -static void
> +static inline bool
> +is_batch_buffer(struct 

Re: [Intel-gfx] [PATCH 21/26] drm/i915: Multi-BB execbuf

2021-10-05 Thread kernel test robot
Hi Matthew,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-tip/drm-tip]
[cannot apply to drm-intel/for-linux-next drm-exynos/exynos-drm-next 
tegra-drm/drm/tegra/for-next linus/master airlied/drm-next v5.15-rc3 
next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20211005-061424
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
config: i386-randconfig-a004-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 
c0039de2953d15815448b4b3c3bafb45607781e0)
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/758202922dad66c1b302eb34a141961acbefe417
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20211005-061424
git checkout 758202922dad66c1b302eb34a141961acbefe417
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 
ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c:2361:6: warning: variable 
>> 'rq' is used uninitialized whenever 'if' condition is false 
>> [-Wsometimes-uninitialized]
   if (throttle)
   ^~~~
   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c:2365:6: note: uninitialized 
use occurs here
   if (rq) {
   ^~
   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c:2361:2: note: remove the 'if' 
if its condition is always true
   if (throttle)
   ^
   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c:2346:25: note: initialize the 
variable 'rq' to silence this warning
   struct i915_request *rq;
  ^
   = NULL
   1 warning generated.


vim +2361 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c

e5dadff4b09376 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-15  2341  
758202922dad66 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Matthew Brost 
2021-10-04  2342  static int eb_pin_timeline(struct i915_execbuffer *eb, struct 
intel_context *ce,
758202922dad66 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Matthew Brost 
2021-10-04  2343   bool throttle)
8f2a1057d6ec21 drivers/gpu/drm/i915/i915_gem_execbuffer.c Chris Wilson  
2019-04-25  2344  {
e5dadff4b09376 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-15  2345struct intel_timeline *tl;
758202922dad66 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Matthew Brost 
2021-10-04  2346struct i915_request *rq;
8f2a1057d6ec21 drivers/gpu/drm/i915/i915_gem_execbuffer.c Chris Wilson  
2019-04-25  2347  
a4e57f9031ccd5 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-04  2348/*
a4e57f9031ccd5 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-04  2349 * Take a local wakeref for preparing to dispatch the 
execbuf as
a4e57f9031ccd5 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-04  2350 * we expect to access the hardware fairly frequently 
in the
a4e57f9031ccd5 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-04  2351 * process, and require the engine to be kept awake 
between accesses.
a4e57f9031ccd5 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-04  2352 * Upon dispatch, we acquire another prolonged wakeref 
that we hold
a4e57f9031ccd5 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-04  2353 * until the timeline is idle, which in turn releases 
the wakeref
a4e57f9031ccd5 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-04  2354 * taken on the engine, and the parent device.
a4e57f9031ccd5 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-04  2355 */
e5dadff4b09376 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Chris Wilson  
2019-08-15  2356tl = intel_context_timeline_lock(ce);
758202922dad66 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Matthew Brost 
2021-10-04  2357if (IS_ERR(tl))
758202922dad66 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c Matthew Brost 
2021-10-04  2358return PTR_ERR(tl);
a4e57f9031ccd5 

[Intel-gfx] [PATCH 21/26] drm/i915: Multi-BB execbuf

2021-10-04 Thread Matthew Brost
Allow multiple batch buffers to be submitted in a single execbuf IOCTL
after a context has been configured with the 'set_parallel' extension.
The number batches is implicit based on the contexts configuration.

This is implemented with a series of loops. First a loop is used to find
all the batches, a loop to pin all the HW contexts, a loop to create all
the requests, a loop to submit (emit BB start, etc...) all the requests,
a loop to tie the requests to the VMAs they touch, and finally a loop to
commit the requests to the backend.

A composite fence is also created for the generated requests to return
to the user and to stick in dma resv slots.

No behavior from the existing IOCTL should be changed aside from when
throttling because the ring for a context is full, wait on the request
while holding the object locks.

IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071=1
media UMD: https://github.com/intel/media-driver/pull/1252

v2:
 (Matthew Brost)
  - Return proper error value if i915_request_create fails
v3:
 (John Harrison)
  - Add comment explaining create / add order loops + locking
  - Update commit message explaining different in IOCTL behavior
  - Line wrap some comments
  - eb_add_request returns void
  - Return -EINVAL rather triggering BUG_ON if cmd parser used
 (Checkpatch)
  - Check eb->batch_len[*current_batch]

Signed-off-by: Matthew Brost 
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 793 --
 drivers/gpu/drm/i915/gt/intel_context.h   |   8 +-
 drivers/gpu/drm/i915/gt/intel_context_types.h |  10 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   2 +
 drivers/gpu/drm/i915/i915_request.h   |   9 +
 drivers/gpu/drm/i915/i915_vma.c   |  21 +-
 drivers/gpu/drm/i915/i915_vma.h   |  13 +-
 7 files changed, 599 insertions(+), 257 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 2f2434b52317..5c7fb6f68bbb 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -244,17 +244,25 @@ struct i915_execbuffer {
struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
struct eb_vma *vma;
 
-   struct intel_engine_cs *engine; /** engine to queue the request to */
+   struct intel_gt *gt; /* gt for the execbuf */
struct intel_context *context; /* logical state for the request */
struct i915_gem_context *gem_context; /** caller's context */
 
-   struct i915_request *request; /** our request to build */
-   struct eb_vma *batch; /** identity of the batch obj/vma */
+   /** our requests to build */
+   struct i915_request *requests[MAX_ENGINE_INSTANCE + 1];
+   /** identity of the batch obj/vma */
+   struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1];
struct i915_vma *trampoline; /** trampoline used for chaining */
 
+   /** used for excl fence in dma_resv objects when > 1 BB submitted */
+   struct dma_fence *composite_fence;
+
/** actual size of execobj[] as we may extend it for the cmdparser */
unsigned int buffer_count;
 
+   /* number of batches in execbuf IOCTL */
+   unsigned int num_batches;
+
/** list of vma not yet bound during reservation phase */
struct list_head unbound;
 
@@ -281,7 +289,8 @@ struct i915_execbuffer {
 
u64 invalid_flags; /** Set of execobj.flags that are invalid */
 
-   u64 batch_len; /** Length of batch within object */
+   /** Length of batch within object */
+   u64 batch_len[MAX_ENGINE_INSTANCE + 1];
u32 batch_start_offset; /** Location within object of batch */
u32 batch_flags; /** Flags composed for emit_bb_start() */
struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch 
buffer */
@@ -299,14 +308,13 @@ struct i915_execbuffer {
 };
 
 static int eb_parse(struct i915_execbuffer *eb);
-static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb,
- bool throttle);
+static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
 static void eb_unpin_engine(struct i915_execbuffer *eb);
 
 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
 {
-   return intel_engine_requires_cmd_parser(eb->engine) ||
-   (intel_engine_using_cmd_parser(eb->engine) &&
+   return intel_engine_requires_cmd_parser(eb->context->engine) ||
+   (intel_engine_using_cmd_parser(eb->context->engine) &&
 eb->args->batch_len);
 }
 
@@ -544,11 +552,21 @@ eb_validate_vma(struct i915_execbuffer *eb,
return 0;
 }
 
-static void
+static inline bool
+is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx)
+{
+   return eb->args->flags & I915_EXEC_BATCH_FIRST ?
+   buffer_idx < eb->num_batches :
+   buffer_idx >= eb->args->buffer_count - eb->num_batches;
+}