Re: [Intel-gfx] [PATCH 24/27] drm/i915: Multi-BB execbuf
On Fri, Aug 20, 2021 at 03:44:43PM -0700, Matthew Brost wrote: Did a review offline with John Harrison, adding notes for what we found. > Allow multiple batch buffers to be submitted in a single execbuf IOCTL > after a context has been configured with the 'set_parallel' extension. > The number batches is implicit based on the contexts configuration. > > This is implemented with a series of loops. First a loop is used to find > all the batches, a loop to pin all the HW contexts, a loop to generate > all the requests, a loop to submit all the requests, a loop to commit > all the requests, and finally a loop to tie the requests to the VMAs > they touch. Clarify these steps a bit; also, tying requests to the VMAs is the 2nd to last step, with committing requests being the last. > > A composite fence is also created for the also the generated requests to > return to the user and to stick in dma resv slots. > Add a comment saying there should be no change in behavior for existing IOCTLs except if throttling due to lack of space in the ring: the wait is done with the VMA locks being held rather than dropping the locks and falling back to the slow path. 
> IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071=1 > media UMD: link to come > > Signed-off-by: Matthew Brost > --- > .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 765 -- > drivers/gpu/drm/i915/gt/intel_context.h | 8 +- > drivers/gpu/drm/i915/gt/intel_context_types.h | 12 + > .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 + > drivers/gpu/drm/i915/i915_request.h | 9 + > drivers/gpu/drm/i915/i915_vma.c | 21 +- > drivers/gpu/drm/i915/i915_vma.h | 13 +- > 7 files changed, 573 insertions(+), 257 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c > b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c > index 8290bdadd167..481978974627 100644 > --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c > +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c > @@ -244,17 +244,23 @@ struct i915_execbuffer { > struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */ > struct eb_vma *vma; > > - struct intel_engine_cs *engine; /** engine to queue the request to */ > + struct intel_gt *gt; /* gt for the execbuf */ > struct intel_context *context; /* logical state for the request */ > struct i915_gem_context *gem_context; /** caller's context */ > > - struct i915_request *request; /** our request to build */ > - struct eb_vma *batch; /** identity of the batch obj/vma */ Line wrap of these comments. 
> + struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; /** our > requests to build */ > + struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; /** identity of the > batch obj/vma */ > struct i915_vma *trampoline; /** trampoline used for chaining */ > > + /** used for excl fence in dma_resv objects when > 1 BB submitted */ > + struct dma_fence *composite_fence; > + > /** actual size of execobj[] as we may extend it for the cmdparser */ > unsigned int buffer_count; > > + /* number of batches in execbuf IOCTL */ > + unsigned int num_batches; > + > /** list of vma not yet bound during reservation phase */ > struct list_head unbound; > > @@ -281,7 +287,7 @@ struct i915_execbuffer { > > u64 invalid_flags; /** Set of execobj.flags that are invalid */ > > - u64 batch_len; /** Length of batch within object */ > + u64 batch_len[MAX_ENGINE_INSTANCE + 1]; /** Length of batch within > object */ > u32 batch_start_offset; /** Location within object of batch */ > u32 batch_flags; /** Flags composed for emit_bb_start() */ > struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch > buffer */ > @@ -299,14 +305,13 @@ struct i915_execbuffer { > }; > > static int eb_parse(struct i915_execbuffer *eb); > -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, > - bool throttle); > +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle); > static void eb_unpin_engine(struct i915_execbuffer *eb); > > static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) > { > - return intel_engine_requires_cmd_parser(eb->engine) || > - (intel_engine_using_cmd_parser(eb->engine) && > + return intel_engine_requires_cmd_parser(eb->context->engine) || > + (intel_engine_using_cmd_parser(eb->context->engine) && >eb->args->batch_len); > } > > @@ -544,11 +549,21 @@ eb_validate_vma(struct i915_execbuffer *eb, > return 0; > } > > -static void > +static inline bool > +is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx) > +{ > + return 
eb->args->flags & I915_EXEC_BATCH_FIRST ? > + buffer_idx < eb->num_batches : > + buffer_idx >= eb->args->buffer_count -
Re: [Intel-gfx] [PATCH 24/27] drm/i915: Multi-BB execbuf
Hi Matthew, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on drm-intel/for-linux-next] [also build test WARNING on drm-tip/drm-tip drm-exynos/exynos-drm-next next-20210827] [cannot apply to tegra-drm/drm/tegra/for-next linus/master drm/drm-next v5.14] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348 base: git://anongit.freedesktop.org/drm-intel for-linux-next config: x86_64-rhel-8.3-kselftests (attached as .config) compiler: gcc-9 (Debian 9.3.0-22) 9.3.0 reproduce: # apt-get install sparse # sparse version: v0.6.3-348-gf0e6938b-dirty # https://github.com/0day-ci/linux/commit/7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348 git checkout 7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc # save the attached .config to linux build tree make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=x86_64 SHELL=/bin/bash If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot sparse warnings: (new ones prefixed by >>) >> drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c:608:21: sparse: sparse: Using >> plain integer as NULL pointer vim +608 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 548 549 static int 550 eb_add_vma(struct i915_execbuffer *eb, 551 unsigned int *current_batch, 552 unsigned int i, 553 struct i915_vma *vma) 554 { 555 struct drm_i915_private *i915 = eb->i915; 556 struct drm_i915_gem_exec_object2 *entry = >exec[i]; 557 struct eb_vma *ev = >vma[i]; 558 559 ev->vma = vma; 560 ev->exec = entry; 561 ev->flags = entry->flags; 562 563 if (eb->lut_size > 0) { 564 ev->handle = 
entry->handle; 565 hlist_add_head(>node, 566 >buckets[hash_32(entry->handle, 567 eb->lut_size)]); 568 } 569 570 if (entry->relocation_count) 571 list_add_tail(>reloc_link, >relocs); 572 573 /* 574 * SNA is doing fancy tricks with compressing batch buffers, which leads 575 * to negative relocation deltas. Usually that works out ok since the 576 * relocate address is still positive, except when the batch is placed 577 * very low in the GTT. Ensure this doesn't happen. 578 * 579 * Note that actual hangs have only been observed on gen7, but for 580 * paranoia do it everywhere. 581 */ 582 if (is_batch_buffer(eb, i)) { 583 if (entry->relocation_count && 584 !(ev->flags & EXEC_OBJECT_PINNED)) 585 ev->flags |= __EXEC_OBJECT_NEEDS_BIAS; 586 if (eb->reloc_cache.has_fence) 587 ev->flags |= EXEC_OBJECT_NEEDS_FENCE; 588 589 eb->batches[*current_batch] = ev; 590 591 if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) { 592 drm_dbg(>drm, 593 "Attempting to use self-modifying batch buffer\n"); 594 return -EINVAL; 595 } 596 597 if (range_overflows_t(u64, 598eb->batch_start_offset, 599eb->args->batch_len, 600ev->vma->size)) { 601 drm_dbg(>drm, "Attempting to use out-of-bounds batch\n"); 602 return -EINVAL; 603 } 604 605 if (eb->args->batch_len == 0) 606 eb->batch_len[*current_batch] = ev->vma->size - 607 eb->batch_start_offset; > 608 if (unlikely(eb->batch_len == 0)) { /* impossible! */ 609 drm_dbg(>drm, "Invalid batch length\n"); 610 return -EINVAL; 611 } 612 613 ++*current_batch; 614 } 615 616 return 0; 617 } 618 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org .config.gz Description: application/gzip
Re: [Intel-gfx] [PATCH 24/27] drm/i915: Multi-BB execbuf
Hi Matthew, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on drm-intel/for-linux-next] [also build test WARNING on drm-tip/drm-tip drm-exynos/exynos-drm-next next-20210820] [cannot apply to tegra-drm/drm/tegra/for-next linus/master drm/drm-next v5.14-rc6] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348 base: git://anongit.freedesktop.org/drm-intel for-linux-next config: x86_64-buildonly-randconfig-r002-20210821 (attached as .config) compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 9e9d70591e72fc6762b4b9a226b68ed1307419bf) reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/0day-ci/linux/commit/7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348 git checkout 7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All warnings (new ones prefixed by >>): >> drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c:608:20: warning: comparison >> of array 'eb->batch_len' equal to a null pointer is always false >> [-Wtautological-pointer-compare] if (unlikely(eb->batch_len == 0)) { /* impossible! */ ^~ include/linux/compiler.h:78:42: note: expanded from macro 'unlikely' # define unlikely(x)__builtin_expect(!!(x), 0) ^ 1 warning generated. 
vim +608 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 548 549 static int 550 eb_add_vma(struct i915_execbuffer *eb, 551 unsigned int *current_batch, 552 unsigned int i, 553 struct i915_vma *vma) 554 { 555 struct drm_i915_private *i915 = eb->i915; 556 struct drm_i915_gem_exec_object2 *entry = >exec[i]; 557 struct eb_vma *ev = >vma[i]; 558 559 ev->vma = vma; 560 ev->exec = entry; 561 ev->flags = entry->flags; 562 563 if (eb->lut_size > 0) { 564 ev->handle = entry->handle; 565 hlist_add_head(>node, 566 >buckets[hash_32(entry->handle, 567 eb->lut_size)]); 568 } 569 570 if (entry->relocation_count) 571 list_add_tail(>reloc_link, >relocs); 572 573 /* 574 * SNA is doing fancy tricks with compressing batch buffers, which leads 575 * to negative relocation deltas. Usually that works out ok since the 576 * relocate address is still positive, except when the batch is placed 577 * very low in the GTT. Ensure this doesn't happen. 578 * 579 * Note that actual hangs have only been observed on gen7, but for 580 * paranoia do it everywhere. 581 */ 582 if (is_batch_buffer(eb, i)) { 583 if (entry->relocation_count && 584 !(ev->flags & EXEC_OBJECT_PINNED)) 585 ev->flags |= __EXEC_OBJECT_NEEDS_BIAS; 586 if (eb->reloc_cache.has_fence) 587 ev->flags |= EXEC_OBJECT_NEEDS_FENCE; 588 589 eb->batches[*current_batch] = ev; 590 591 if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) { 592 drm_dbg(>drm, 593 "Attempting to use self-modifying batch buffer\n"); 594 return -EINVAL; 595 } 596 597 if (range_overflows_t(u64, 598eb->batch_start_offset, 599eb->args->batch_len, 600ev->vma->size)) { 601 drm_dbg(>drm, "Attempting to use out-of-bounds batch\n"); 602 return -EINVAL; 603 } 604 605 if (eb->args->batch_len == 0) 606 eb->batch_len[*current_batch] = ev->vma->size - 607
[PATCH 24/27] drm/i915: Multi-BB execbuf
Allow multiple batch buffers to be submitted in a single execbuf IOCTL after a context has been configured with the 'set_parallel' extension. The number of batches is implicit based on the context's configuration. This is implemented with a series of loops. First a loop is used to find all the batches, a loop to pin all the HW contexts, a loop to generate all the requests, a loop to submit all the requests, a loop to commit all the requests, and finally a loop to tie the requests to the VMAs they touch. A composite fence is also created for the generated requests to return to the user and to stick in dma resv slots. IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071=1 media UMD: link to come Signed-off-by: Matthew Brost --- .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 765 -- drivers/gpu/drm/i915/gt/intel_context.h | 8 +- drivers/gpu/drm/i915/gt/intel_context_types.h | 12 + .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 + drivers/gpu/drm/i915/i915_request.h | 9 + drivers/gpu/drm/i915/i915_vma.c | 21 +- drivers/gpu/drm/i915/i915_vma.h | 13 +- 7 files changed, 573 insertions(+), 257 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 8290bdadd167..481978974627 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -244,17 +244,23 @@ struct i915_execbuffer { struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */ struct eb_vma *vma; - struct intel_engine_cs *engine; /** engine to queue the request to */ + struct intel_gt *gt; /* gt for the execbuf */ struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */ - struct i915_request *request; /** our request to build */ - struct eb_vma *batch; /** identity of the batch obj/vma */ + struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; /** our requests to build */ + struct eb_vma 
*batches[MAX_ENGINE_INSTANCE + 1]; /** identity of the batch obj/vma */ struct i915_vma *trampoline; /** trampoline used for chaining */ + /** used for excl fence in dma_resv objects when > 1 BB submitted */ + struct dma_fence *composite_fence; + /** actual size of execobj[] as we may extend it for the cmdparser */ unsigned int buffer_count; + /* number of batches in execbuf IOCTL */ + unsigned int num_batches; + /** list of vma not yet bound during reservation phase */ struct list_head unbound; @@ -281,7 +287,7 @@ struct i915_execbuffer { u64 invalid_flags; /** Set of execobj.flags that are invalid */ - u64 batch_len; /** Length of batch within object */ + u64 batch_len[MAX_ENGINE_INSTANCE + 1]; /** Length of batch within object */ u32 batch_start_offset; /** Location within object of batch */ u32 batch_flags; /** Flags composed for emit_bb_start() */ struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */ @@ -299,14 +305,13 @@ struct i915_execbuffer { }; static int eb_parse(struct i915_execbuffer *eb); -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, - bool throttle); +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle); static void eb_unpin_engine(struct i915_execbuffer *eb); static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) { - return intel_engine_requires_cmd_parser(eb->engine) || - (intel_engine_using_cmd_parser(eb->engine) && + return intel_engine_requires_cmd_parser(eb->context->engine) || + (intel_engine_using_cmd_parser(eb->context->engine) && eb->args->batch_len); } @@ -544,11 +549,21 @@ eb_validate_vma(struct i915_execbuffer *eb, return 0; } -static void +static inline bool +is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx) +{ + return eb->args->flags & I915_EXEC_BATCH_FIRST ? 
+ buffer_idx < eb->num_batches : + buffer_idx >= eb->args->buffer_count - eb->num_batches; +} + +static int eb_add_vma(struct i915_execbuffer *eb, - unsigned int i, unsigned batch_idx, + unsigned int *current_batch, + unsigned int i, struct i915_vma *vma) { + struct drm_i915_private *i915 = eb->i915; struct drm_i915_gem_exec_object2 *entry = >exec[i]; struct eb_vma *ev = >vma[i]; @@ -575,15 +590,41 @@ eb_add_vma(struct i915_execbuffer *eb, * Note that actual hangs have only been observed on gen7, but for * paranoia do it everywhere. */ - if (i == batch_idx) { + if (is_batch_buffer(eb, i)) { if (entry->relocation_count &&