Re: [Mesa-dev] [PATCH] i965/gen7: Use predicated rendering for indirect compute
On Tue, Feb 16, 2016 at 1:09 PM, Jordan Justen wrote: > On gen7 (Ivy Bridge, Haswell), we will get a GPU hang if an indirect > dispatch is used, but one of the dimensions is 0. > > Therefore we use predicated rendering on the GPGPU_WALKER command to > handle this case. > > Fixes piglit test: spec/arb_compute_shader/zero-dispatch-size > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94100 > Signed-off-by: Jordan Justen > Cc: Kenneth Graunke > Cc: Ben Widawsky > Cc: Ilia Mirkin Tested-by: Ilia Mirkin All the dEQP tests now pass instead of hanging the GPU. [shared_var*atomic is still broken, but I'm sure that's unrelated.] -ilia ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] i965/gen7: Use predicated rendering for indirect compute
On Tuesday, February 16, 2016 10:09:50 AM PST Jordan Justen wrote: > On gen7 (Ivy Bridge, Haswell), we will get a GPU hang if an indirect > dispatch is used, but one of the dimensions is 0. > > Therefore we use predicated rendering on the GPGPU_WALKER command to > handle this case. > > Fixes piglit test: spec/arb_compute_shader/zero-dispatch-size > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94100 > Signed-off-by: Jordan Justen > Cc: Kenneth Graunke > Cc: Ben Widawsky > Cc: Ilia Mirkin > --- > src/mesa/drivers/dri/i965/brw_compute.c | 104 ++ +- > src/mesa/drivers/dri/i965/brw_defines.h | 1 + > 2 files changed, 91 insertions(+), 14 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/ i965/brw_compute.c > index d9f181a..bbb8ce3 100644 > --- a/src/mesa/drivers/dri/i965/brw_compute.c > +++ b/src/mesa/drivers/dri/i965/brw_compute.c > @@ -35,6 +35,92 @@ > > > static void > +brw_prepare_indirect_gpgpu_walker(struct brw_context *brw) static functions don't need the "brw_" prefix. (Similarly, in core Mesa, we don't use the "_mesa_" prefix for static functions...this helps identify them.) > +{ > + GLintptr indirect_offset = brw->compute.num_work_groups_offset; > + drm_intel_bo *bo = brw->compute.num_work_groups_bo; > + > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo, > + I915_GEM_DOMAIN_VERTEX, 0, > + indirect_offset + 0); > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo, > + I915_GEM_DOMAIN_VERTEX, 0, > + indirect_offset + 4); > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMZ, bo, > + I915_GEM_DOMAIN_VERTEX, 0, > + indirect_offset + 8); > + > + if (brw->gen > 7) > + return; > + > + /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 > +*/ As Ben mentioned, I do prefer one line comments, but it's up to you... > + BEGIN_BATCH(7); > + OUT_BATCH(MI_LOAD_REGISTER_IMM | (7 - 2)); > + OUT_BATCH(MI_PREDICATE_SRC0 + 4); > + OUT_BATCH(0u); > + OUT_BATCH(MI_PREDICATE_SRC1 + 0); > + OUT_BATCH(0u); > + OUT_BATCH(MI_PREDICATE_SRC1 + 4); > + OUT_BATCH(0u); > + ADVANCE_BATCH(); > + > + /* Load compute_dispatch_indirect_x_size into SRC0 > +*/ > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > + I915_GEM_DOMAIN_INSTRUCTION, 0, > + indirect_offset + 0); > + > + /* predicate = (compute_dispatch_indirect_x_size == 0); > +*/ > + BEGIN_BATCH(1); > + OUT_BATCH(GEN7_MI_PREDICATE | > + MI_PREDICATE_LOADOP_LOAD | > + MI_PREDICATE_COMBINEOP_SET | > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > + ADVANCE_BATCH(); > + > + /* Load compute_dispatch_indirect_y_size into SRC0 > +*/ > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > + I915_GEM_DOMAIN_INSTRUCTION, 0, > + indirect_offset + 4); > + > + /* predicate |= (compute_dispatch_indirect_y_size == 0); > +*/ > + BEGIN_BATCH(1); > + OUT_BATCH(GEN7_MI_PREDICATE | > + MI_PREDICATE_LOADOP_LOAD | > + MI_PREDICATE_COMBINEOP_OR | > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > + ADVANCE_BATCH(); > + > + /* Load compute_dispatch_indirect_z_size into SRC0 > +*/ > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > + I915_GEM_DOMAIN_INSTRUCTION, 0, > + indirect_offset + 8); > + > + /* predicate |= (compute_dispatch_indirect_z_size == 0); > +*/ > + BEGIN_BATCH(1); > + OUT_BATCH(GEN7_MI_PREDICATE | > + MI_PREDICATE_LOADOP_LOAD | > + MI_PREDICATE_COMBINEOP_OR | > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > + ADVANCE_BATCH(); > + > + /* predicate = !predicate; > +*/ > + BEGIN_BATCH(1); > + OUT_BATCH(GEN7_MI_PREDICATE | > + MI_PREDICATE_LOADOP_LOADINV | > + MI_PREDICATE_COMBINEOP_OR | > + MI_PREDICATE_COMPAREOP_FALSE); > + ADVANCE_BATCH(); > +} > + > +static void > brw_emit_gpgpu_walker(struct brw_context *brw) > { > const struct brw_cs_prog_data *prog_data = brw->cs.prog_data; > @@ -45,20 +131,10 @@ brw_emit_gpgpu_walker(struct brw_context *brw) > if (brw->compute.num_work_groups_bo == NULL) { >indirect_flag = 0; > } else { > - GLintptr indirect_offset = brw->compute.num_work_groups_offset; > - drm_intel_bo *bo = brw->compute.num_work_groups_bo; > - > - indirect_flag = GEN7_GPGPU_INDIRECT_PARAMETER_ENABLE; > - > - brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo, > -I915_GEM_DOMAIN_VERTEX, 0, > -indirect_offset + 0); > - brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo, > -I915_GEM_DOMAIN_VERTEX, 0, > -
Re: [Mesa-dev] [PATCH] i965/gen7: Use predicated rendering for indirect compute
On Tue, Feb 16, 2016 at 12:21:02PM -0800, Jordan Justen wrote: > On 2016-02-16 12:03:10, Ben Widawsky wrote: > > On Tue, Feb 16, 2016 at 10:09:50AM -0800, Jordan Justen wrote: > > > On gen7 (Ivy Bridge, Haswell), we will get a GPU hang if an indirect > > > dispatch is used, but one of the dimensions is 0. > > > > > > Therefore we use predicated rendering on the GPGPU_WALKER command to > > > handle this case. > > > > > > Fixes piglit test: spec/arb_compute_shader/zero-dispatch-size > > > > > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94100 > > > Signed-off-by: Jordan Justen > > > Cc: Kenneth Graunke > > > Cc: Ben Widawsky > > > Cc: Ilia Mirkin > > > --- > > > src/mesa/drivers/dri/i965/brw_compute.c | 104 > > > +++- > > > src/mesa/drivers/dri/i965/brw_defines.h | 1 + > > > 2 files changed, 91 insertions(+), 14 deletions(-) > > > > > > diff --git a/src/mesa/drivers/dri/i965/brw_compute.c > > > b/src/mesa/drivers/dri/i965/brw_compute.c > > > index d9f181a..bbb8ce3 100644 > > > --- a/src/mesa/drivers/dri/i965/brw_compute.c > > > +++ b/src/mesa/drivers/dri/i965/brw_compute.c > > > @@ -35,6 +35,92 @@ > > > > > > > > > static void > > > +brw_prepare_indirect_gpgpu_walker(struct brw_context *brw) > > > +{ > > > > Just FYI: > > There is a blurb in the predicate text: > > To ensure the memory sources of the MI_LOAD_REGISTER_MEM commands are > > coherent > > with previous 3D_PIPECONTROL store-DWord operations, software can use the > > new > > Pipe Control Flush Enable bit in the PIPE_CONTROL command. > > > > I suppose it's never the case that we'll be writing these with > > PIPE_CONTROL, so > > it's safe to ignore this. > > > > On irc it sounded like you didn't think the flush was required. I'm > going to stick with that unless you tell me otherwise. > > The LRM is coming from a user BO, and they may have set the values by > mapping the buffer and writing it from the CPU, or by writing it from > a shader (for example SSBO). > The note seems to imply that DW writes with a pipe control can be deferred and so you need to flush are previously pipe controls... again, yeah, I think we're safe. > > > + GLintptr indirect_offset = brw->compute.num_work_groups_offset; > > > + drm_intel_bo *bo = brw->compute.num_work_groups_bo; > > > + > > > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo, > > > + I915_GEM_DOMAIN_VERTEX, 0, > > > + indirect_offset + 0); > > > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo, > > > + I915_GEM_DOMAIN_VERTEX, 0, > > > + indirect_offset + 4); > > > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMZ, bo, > > > + I915_GEM_DOMAIN_VERTEX, 0, > > > + indirect_offset + 8); > > > + > > > + if (brw->gen > 7) > > > + return; > > > + > > > + /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 > > > +*/ > > > + BEGIN_BATCH(7); > > > + OUT_BATCH(MI_LOAD_REGISTER_IMM | (7 - 2)); > > > + OUT_BATCH(MI_PREDICATE_SRC0 + 4); > > > + OUT_BATCH(0u); > > > + OUT_BATCH(MI_PREDICATE_SRC1 + 0); > > > + OUT_BATCH(0u); > > > + OUT_BATCH(MI_PREDICATE_SRC1 + 4); > > > + OUT_BATCH(0u); > > > + ADVANCE_BATCH(); > > > + > > > + /* Load compute_dispatch_indirect_x_size into SRC0 > > > +*/ > > > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > > > + I915_GEM_DOMAIN_INSTRUCTION, 0, > > > + indirect_offset + 0); > > > + > > > + /* predicate = (compute_dispatch_indirect_x_size == 0); > > > +*/ > > > + BEGIN_BATCH(1); > > > + OUT_BATCH(GEN7_MI_PREDICATE | > > > + MI_PREDICATE_LOADOP_LOAD | > > > + MI_PREDICATE_COMBINEOP_SET | > > > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > > > + ADVANCE_BATCH(); > > > + > > > + /* Load compute_dispatch_indirect_y_size into SRC0 > > > +*/ > > > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > > > + I915_GEM_DOMAIN_INSTRUCTION, 0, > > > + indirect_offset + 4); > > > + > > > + /* predicate |= (compute_dispatch_indirect_y_size == 0); > > > +*/ > > > + BEGIN_BATCH(1); > > > + OUT_BATCH(GEN7_MI_PREDICATE | > > > + MI_PREDICATE_LOADOP_LOAD | > > > + MI_PREDICATE_COMBINEOP_OR | > > > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > > > + ADVANCE_BATCH(); > > > + > > > + /* Load compute_dispatch_indirect_z_size into SRC0 > > > +*/ > > > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > > > + I915_GEM_DOMAIN_INSTRUCTION, 0, > > > + indirect_offset + 8); > > > + > > > + /* predicate |= (compute_dispatch_indirect_z_size == 0); > > > +*/ > > > + BEGIN_BATCH(1); > > > + OUT_BATCH(GEN7_MI_PREDICATE | > > > + MI_PREDICATE_LOADOP_LOAD | > > > + MI_P
Re: [Mesa-dev] [PATCH] i965/gen7: Use predicated rendering for indirect compute
On 2016-02-16 12:03:10, Ben Widawsky wrote: > On Tue, Feb 16, 2016 at 10:09:50AM -0800, Jordan Justen wrote: > > On gen7 (Ivy Bridge, Haswell), we will get a GPU hang if an indirect > > dispatch is used, but one of the dimensions is 0. > > > > Therefore we use predicated rendering on the GPGPU_WALKER command to > > handle this case. > > > > Fixes piglit test: spec/arb_compute_shader/zero-dispatch-size > > > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94100 > > Signed-off-by: Jordan Justen > > Cc: Kenneth Graunke > > Cc: Ben Widawsky > > Cc: Ilia Mirkin > > --- > > src/mesa/drivers/dri/i965/brw_compute.c | 104 > > +++- > > src/mesa/drivers/dri/i965/brw_defines.h | 1 + > > 2 files changed, 91 insertions(+), 14 deletions(-) > > > > diff --git a/src/mesa/drivers/dri/i965/brw_compute.c > > b/src/mesa/drivers/dri/i965/brw_compute.c > > index d9f181a..bbb8ce3 100644 > > --- a/src/mesa/drivers/dri/i965/brw_compute.c > > +++ b/src/mesa/drivers/dri/i965/brw_compute.c > > @@ -35,6 +35,92 @@ > > > > > > static void > > +brw_prepare_indirect_gpgpu_walker(struct brw_context *brw) > > +{ > > Just FYI: > There is a blurb in the predicate text: > To ensure the memory sources of the MI_LOAD_REGISTER_MEM commands are coherent > with previous 3D_PIPECONTROL store-DWord operations, software can use the new > Pipe Control Flush Enable bit in the PIPE_CONTROL command. > > I suppose it's never the case that we'll be writing these with PIPE_CONTROL, > so > it's safe to ignore this. > On irc it sounded like you didn't think the flush was required. I'm going to stick with that unless you tell me otherwise. The LRM is coming from a user BO, and they may have set the values by mapping the buffer and writing it from the CPU, or by writing it from a shader (for example SSBO). > > + GLintptr indirect_offset = brw->compute.num_work_groups_offset; > > + drm_intel_bo *bo = brw->compute.num_work_groups_bo; > > + > > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo, > > + I915_GEM_DOMAIN_VERTEX, 0, > > + indirect_offset + 0); > > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo, > > + I915_GEM_DOMAIN_VERTEX, 0, > > + indirect_offset + 4); > > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMZ, bo, > > + I915_GEM_DOMAIN_VERTEX, 0, > > + indirect_offset + 8); > > + > > + if (brw->gen > 7) > > + return; > > + > > + /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 > > +*/ > > + BEGIN_BATCH(7); > > + OUT_BATCH(MI_LOAD_REGISTER_IMM | (7 - 2)); > > + OUT_BATCH(MI_PREDICATE_SRC0 + 4); > > + OUT_BATCH(0u); > > + OUT_BATCH(MI_PREDICATE_SRC1 + 0); > > + OUT_BATCH(0u); > > + OUT_BATCH(MI_PREDICATE_SRC1 + 4); > > + OUT_BATCH(0u); > > + ADVANCE_BATCH(); > > + > > + /* Load compute_dispatch_indirect_x_size into SRC0 > > +*/ > > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > > + I915_GEM_DOMAIN_INSTRUCTION, 0, > > + indirect_offset + 0); > > + > > + /* predicate = (compute_dispatch_indirect_x_size == 0); > > +*/ > > + BEGIN_BATCH(1); > > + OUT_BATCH(GEN7_MI_PREDICATE | > > + MI_PREDICATE_LOADOP_LOAD | > > + MI_PREDICATE_COMBINEOP_SET | > > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > > + ADVANCE_BATCH(); > > + > > + /* Load compute_dispatch_indirect_y_size into SRC0 > > +*/ > > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > > + I915_GEM_DOMAIN_INSTRUCTION, 0, > > + indirect_offset + 4); > > + > > + /* predicate |= (compute_dispatch_indirect_y_size == 0); > > +*/ > > + BEGIN_BATCH(1); > > + OUT_BATCH(GEN7_MI_PREDICATE | > > + MI_PREDICATE_LOADOP_LOAD | > > + MI_PREDICATE_COMBINEOP_OR | > > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > > + ADVANCE_BATCH(); > > + > > + /* Load compute_dispatch_indirect_z_size into SRC0 > > +*/ > > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > > + I915_GEM_DOMAIN_INSTRUCTION, 0, > > + indirect_offset + 8); > > + > > + /* predicate |= (compute_dispatch_indirect_z_size == 0); > > +*/ > > + BEGIN_BATCH(1); > > + OUT_BATCH(GEN7_MI_PREDICATE | > > + MI_PREDICATE_LOADOP_LOAD | > > + MI_PREDICATE_COMBINEOP_OR | > > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > > + ADVANCE_BATCH(); > > + > > + /* predicate = !predicate; > > +*/ > > + BEGIN_BATCH(1); > > + OUT_BATCH(GEN7_MI_PREDICATE | > > + MI_PREDICATE_LOADOP_LOADINV | > > + MI_PREDICATE_COMBINEOP_OR | > > + MI_PREDICATE_COMPAREOP_FALSE); > > + ADVANCE_BATCH(); > > +} > > I think all of your comments would fit on one line... > > Just summing up our conve
Re: [Mesa-dev] [PATCH] i965/gen7: Use predicated rendering for indirect compute
On Tue, Feb 16, 2016 at 10:09:50AM -0800, Jordan Justen wrote: > On gen7 (Ivy Bridge, Haswell), we will get a GPU hang if an indirect > dispatch is used, but one of the dimensions is 0. > > Therefore we use predicated rendering on the GPGPU_WALKER command to > handle this case. > > Fixes piglit test: spec/arb_compute_shader/zero-dispatch-size > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94100 > Signed-off-by: Jordan Justen > Cc: Kenneth Graunke > Cc: Ben Widawsky > Cc: Ilia Mirkin > --- > src/mesa/drivers/dri/i965/brw_compute.c | 104 > +++- > src/mesa/drivers/dri/i965/brw_defines.h | 1 + > 2 files changed, 91 insertions(+), 14 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_compute.c > b/src/mesa/drivers/dri/i965/brw_compute.c > index d9f181a..bbb8ce3 100644 > --- a/src/mesa/drivers/dri/i965/brw_compute.c > +++ b/src/mesa/drivers/dri/i965/brw_compute.c > @@ -35,6 +35,92 @@ > > > static void > +brw_prepare_indirect_gpgpu_walker(struct brw_context *brw) > +{ Just FYI: There is a blurb in the predicate text: To ensure the memory sources of the MI_LOAD_REGISTER_MEM commands are coherent with previous 3D_PIPECONTROL store-DWord operations, software can use the new Pipe Control Flush Enable bit in the PIPE_CONTROL command. I suppose it's never the case that we'll be writing these with PIPE_CONTROL, so it's safe to ignore this. > + GLintptr indirect_offset = brw->compute.num_work_groups_offset; > + drm_intel_bo *bo = brw->compute.num_work_groups_bo; > + > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo, > + I915_GEM_DOMAIN_VERTEX, 0, > + indirect_offset + 0); > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo, > + I915_GEM_DOMAIN_VERTEX, 0, > + indirect_offset + 4); > + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMZ, bo, > + I915_GEM_DOMAIN_VERTEX, 0, > + indirect_offset + 8); > + > + if (brw->gen > 7) > + return; > + > + /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 > +*/ > + BEGIN_BATCH(7); > + OUT_BATCH(MI_LOAD_REGISTER_IMM | (7 - 2)); > + OUT_BATCH(MI_PREDICATE_SRC0 + 4); > + OUT_BATCH(0u); > + OUT_BATCH(MI_PREDICATE_SRC1 + 0); > + OUT_BATCH(0u); > + OUT_BATCH(MI_PREDICATE_SRC1 + 4); > + OUT_BATCH(0u); > + ADVANCE_BATCH(); > + > + /* Load compute_dispatch_indirect_x_size into SRC0 > +*/ > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > + I915_GEM_DOMAIN_INSTRUCTION, 0, > + indirect_offset + 0); > + > + /* predicate = (compute_dispatch_indirect_x_size == 0); > +*/ > + BEGIN_BATCH(1); > + OUT_BATCH(GEN7_MI_PREDICATE | > + MI_PREDICATE_LOADOP_LOAD | > + MI_PREDICATE_COMBINEOP_SET | > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > + ADVANCE_BATCH(); > + > + /* Load compute_dispatch_indirect_y_size into SRC0 > +*/ > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > + I915_GEM_DOMAIN_INSTRUCTION, 0, > + indirect_offset + 4); > + > + /* predicate |= (compute_dispatch_indirect_y_size == 0); > +*/ > + BEGIN_BATCH(1); > + OUT_BATCH(GEN7_MI_PREDICATE | > + MI_PREDICATE_LOADOP_LOAD | > + MI_PREDICATE_COMBINEOP_OR | > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > + ADVANCE_BATCH(); > + > + /* Load compute_dispatch_indirect_z_size into SRC0 > +*/ > + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, > + I915_GEM_DOMAIN_INSTRUCTION, 0, > + indirect_offset + 8); > + > + /* predicate |= (compute_dispatch_indirect_z_size == 0); > +*/ > + BEGIN_BATCH(1); > + OUT_BATCH(GEN7_MI_PREDICATE | > + MI_PREDICATE_LOADOP_LOAD | > + MI_PREDICATE_COMBINEOP_OR | > + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); > + ADVANCE_BATCH(); > + > + /* predicate = !predicate; > +*/ > + BEGIN_BATCH(1); > + OUT_BATCH(GEN7_MI_PREDICATE | > + MI_PREDICATE_LOADOP_LOADINV | > + MI_PREDICATE_COMBINEOP_OR | > + MI_PREDICATE_COMPAREOP_FALSE); > + ADVANCE_BATCH(); > +} I think all of your comments would fit on one line... Just summing up our conversation, I believe you could have slightly simpler code using DELTAS_EQUAL, but it's not entirely clear. > + > +static void > brw_emit_gpgpu_walker(struct brw_context *brw) > { > const struct brw_cs_prog_data *prog_data = brw->cs.prog_data; > @@ -45,20 +131,10 @@ brw_emit_gpgpu_walker(struct brw_context *brw) > if (brw->compute.num_work_groups_bo == NULL) { >indirect_flag = 0; > } else { > - GLintptr indirect_offset = brw->compute.num_work_groups_offset; > - drm_intel_bo *bo = brw->compute.num_work_groups_bo; > - > - indirect_flag = GEN7_GPGPU_INDIRECT
[Mesa-dev] [PATCH] i965/gen7: Use predicated rendering for indirect compute
On gen7 (Ivy Bridge, Haswell), we will get a GPU hang if an indirect dispatch is used, but one of the dimensions is 0. Therefore we use predicated rendering on the GPGPU_WALKER command to handle this case. Fixes piglit test: spec/arb_compute_shader/zero-dispatch-size Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94100 Signed-off-by: Jordan Justen Cc: Kenneth Graunke Cc: Ben Widawsky Cc: Ilia Mirkin --- src/mesa/drivers/dri/i965/brw_compute.c | 104 +++- src/mesa/drivers/dri/i965/brw_defines.h | 1 + 2 files changed, 91 insertions(+), 14 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c index d9f181a..bbb8ce3 100644 --- a/src/mesa/drivers/dri/i965/brw_compute.c +++ b/src/mesa/drivers/dri/i965/brw_compute.c @@ -35,6 +35,92 @@ static void +brw_prepare_indirect_gpgpu_walker(struct brw_context *brw) +{ + GLintptr indirect_offset = brw->compute.num_work_groups_offset; + drm_intel_bo *bo = brw->compute.num_work_groups_bo; + + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo, + I915_GEM_DOMAIN_VERTEX, 0, + indirect_offset + 0); + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo, + I915_GEM_DOMAIN_VERTEX, 0, + indirect_offset + 4); + brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMZ, bo, + I915_GEM_DOMAIN_VERTEX, 0, + indirect_offset + 8); + + if (brw->gen > 7) + return; + + /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 +*/ + BEGIN_BATCH(7); + OUT_BATCH(MI_LOAD_REGISTER_IMM | (7 - 2)); + OUT_BATCH(MI_PREDICATE_SRC0 + 4); + OUT_BATCH(0u); + OUT_BATCH(MI_PREDICATE_SRC1 + 0); + OUT_BATCH(0u); + OUT_BATCH(MI_PREDICATE_SRC1 + 4); + OUT_BATCH(0u); + ADVANCE_BATCH(); + + /* Load compute_dispatch_indirect_x_size into SRC0 +*/ + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + indirect_offset + 0); + + /* predicate = (compute_dispatch_indirect_x_size == 0); +*/ + BEGIN_BATCH(1); + OUT_BATCH(GEN7_MI_PREDICATE | + MI_PREDICATE_LOADOP_LOAD | + MI_PREDICATE_COMBINEOP_SET | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); + ADVANCE_BATCH(); + + /* Load compute_dispatch_indirect_y_size into SRC0 +*/ + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + indirect_offset + 4); + + /* predicate |= (compute_dispatch_indirect_y_size == 0); +*/ + BEGIN_BATCH(1); + OUT_BATCH(GEN7_MI_PREDICATE | + MI_PREDICATE_LOADOP_LOAD | + MI_PREDICATE_COMBINEOP_OR | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); + ADVANCE_BATCH(); + + /* Load compute_dispatch_indirect_z_size into SRC0 +*/ + brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + indirect_offset + 8); + + /* predicate |= (compute_dispatch_indirect_z_size == 0); +*/ + BEGIN_BATCH(1); + OUT_BATCH(GEN7_MI_PREDICATE | + MI_PREDICATE_LOADOP_LOAD | + MI_PREDICATE_COMBINEOP_OR | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL); + ADVANCE_BATCH(); + + /* predicate = !predicate; +*/ + BEGIN_BATCH(1); + OUT_BATCH(GEN7_MI_PREDICATE | + MI_PREDICATE_LOADOP_LOADINV | + MI_PREDICATE_COMBINEOP_OR | + MI_PREDICATE_COMPAREOP_FALSE); + ADVANCE_BATCH(); +} + +static void brw_emit_gpgpu_walker(struct brw_context *brw) { const struct brw_cs_prog_data *prog_data = brw->cs.prog_data; @@ -45,20 +131,10 @@ brw_emit_gpgpu_walker(struct brw_context *brw) if (brw->compute.num_work_groups_bo == NULL) { indirect_flag = 0; } else { - GLintptr indirect_offset = brw->compute.num_work_groups_offset; - drm_intel_bo *bo = brw->compute.num_work_groups_bo; - - indirect_flag = GEN7_GPGPU_INDIRECT_PARAMETER_ENABLE; - - brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo, -I915_GEM_DOMAIN_VERTEX, 0, -indirect_offset + 0); - brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo, -I915_GEM_DOMAIN_VERTEX, 0, -indirect_offset + 4); - brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMZ, bo, -I915_GEM_DOMAIN_VERTEX, 0, -indirect_offset + 8); + indirect_flag = + GEN7_GPGPU_INDIRECT_PARAMETER_ENABLE | + ((brw->gen == 7) ? GEN7_GPGPU_PREDICATE_ENABLE : 0); + brw_prepare_indirect_gpgpu_walker(brw); } const unsigned simd_size = prog_data->simd_size; diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index