Module: Mesa Branch: main Commit: d161e3c2e2f634aba7766e5c7e9efa69b21f99db URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=d161e3c2e2f634aba7766e5c7e9efa69b21f99db
Author: Rohan Garg <rohan.g...@intel.com> Date: Mon Jun 27 14:49:30 2022 +0200 iris: Emit an EXECUTE_INDIRECT_DISPATCH when available On newer platforms (Arrow Lake and above) we can issue an EXECUTE_INDIRECT_DISPATCH that allows us to: * Skip issuing mi load/store instructions for indirect parameters Signed-off-by: Rohan Garg <rohan.g...@intel.com> Reviewed-by: Lionel Landwerlin <lionel.g.landwer...@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26421> --- src/gallium/drivers/iris/iris_state.c | 116 ++++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 33 deletions(-) diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 59ba70de95a..87c950319f3 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -8332,8 +8332,52 @@ iris_load_indirect_location(struct iris_context *ice, mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z); } +static bool iris_emit_indirect_dispatch_supported(const struct intel_device_info *devinfo) +{ + // TODO: Swizzling X and Y workgroup sizes is not supported in execute indirect dispatch + return devinfo->has_indirect_unroll; +} + #if GFX_VERx10 >= 125 +static void iris_emit_execute_indirect_dispatch(struct iris_context *ice, + struct iris_batch *batch, + const struct pipe_grid_info *grid, + const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd) +{ + const struct iris_screen *screen = batch->screen; + const struct intel_device_info *devinfo = screen->devinfo; + struct iris_compiled_shader *shader = + ice->shaders.prog[MESA_SHADER_COMPUTE]; + struct brw_stage_prog_data *prog_data = shader->prog_data; + struct brw_cs_prog_data *cs_prog_data = (void *) prog_data; + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block); + struct iris_bo *indirect = iris_resource_bo(grid->indirect); + const int dispatch_size = dispatch.simd_size / 16; + + struct GENX(COMPUTE_WALKER_BODY) 
body = {}; + body.SIMDSize = dispatch_size; + body.MessageSIMD = dispatch_size; + body.LocalXMaximum = grid->block[0] - 1; + body.LocalYMaximum = grid->block[1] - 1; + body.LocalZMaximum = grid->block[2] - 1; + body.ExecutionMask = dispatch.right_mask; + body.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0); + body.InterfaceDescriptor = idd; + + struct iris_address indirect_bo = ro_bo(indirect, grid->indirect_offset); + iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DISPATCH), ind) { + ind.PredicateEnable = + ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT; + ind.MaxCount = 1; + ind.COMPUTE_WALKER_BODY = body; + ind.ArgumentBufferStartAddress = indirect_bo; + ind.MOCS = + iris_mocs(indirect_bo.bo, &screen->isl_dev, 0); + } +} + static void iris_upload_compute_walker(struct iris_context *ice, struct iris_batch *batch, @@ -8363,42 +8407,48 @@ iris_upload_compute_walker(struct iris_context *ice, } } - if (grid->indirect) - iris_load_indirect_location(ice, batch, grid); + struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {}; + idd.KernelStartPointer = KSP(shader); + idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads; + idd.SharedLocalMemorySize = + encode_slm_size(GFX_VER, prog_data->total_shared); + idd.SamplerStatePointer = shs->sampler_table.offset; + idd.SamplerCount = encode_sampler_count(shader), + idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE]; + /* Typically set to 0 to avoid prefetching on every thread dispatch. */ + idd.BindingTableEntryCount = devinfo->verx10 == 125 ? 
+ 0 : MIN2(shader->bt.size_bytes / 4, 31); + idd.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo); + idd.NumberOfBarriers = cs_prog_data->uses_barrier; iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL); - ice->utrace.last_compute_walker = - iris_emit_dwords(batch, GENX(COMPUTE_WALKER_length)); - _iris_pack_command(batch, GENX(COMPUTE_WALKER), - ice->utrace.last_compute_walker, cw) { - cw.IndirectParameterEnable = grid->indirect; - cw.SIMDSize = dispatch.simd_size / 16; - cw.LocalXMaximum = grid->block[0] - 1; - cw.LocalYMaximum = grid->block[1] - 1; - cw.LocalZMaximum = grid->block[2] - 1; - cw.ThreadGroupIDXDimension = grid->grid[0]; - cw.ThreadGroupIDYDimension = grid->grid[1]; - cw.ThreadGroupIDZDimension = grid->grid[2]; - cw.ExecutionMask = dispatch.right_mask; - cw.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0); - - cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { - .KernelStartPointer = KSP(shader), - .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, - .SharedLocalMemorySize = - encode_slm_size(GFX_VER, prog_data->total_shared), - .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo), - .NumberOfBarriers = cs_prog_data->uses_barrier, - .SamplerStatePointer = shs->sampler_table.offset, - .SamplerCount = encode_sampler_count(shader), - .BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE], - /* Typically set to 0 to avoid prefetching on every thread dispatch. */ - .BindingTableEntryCount = devinfo->verx10 == 125 ? 
- 0 : MIN2(shader->bt.size_bytes / 4, 31), - }; - - assert(brw_cs_push_const_total_size(cs_prog_data, dispatch.threads) == 0); + if (iris_emit_indirect_dispatch_supported(devinfo) && grid->indirect) { + iris_emit_execute_indirect_dispatch(ice, batch, grid, idd); + } else { + if (grid->indirect) + iris_load_indirect_location(ice, batch, grid); + + iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL); + + ice->utrace.last_compute_walker = + iris_emit_dwords(batch, GENX(COMPUTE_WALKER_length)); + _iris_pack_command(batch, GENX(COMPUTE_WALKER), + ice->utrace.last_compute_walker, cw) { + cw.IndirectParameterEnable = grid->indirect; + cw.SIMDSize = dispatch.simd_size / 16; + cw.LocalXMaximum = grid->block[0] - 1; + cw.LocalYMaximum = grid->block[1] - 1; + cw.LocalZMaximum = grid->block[2] - 1; + cw.ThreadGroupIDXDimension = grid->grid[0]; + cw.ThreadGroupIDYDimension = grid->grid[1]; + cw.ThreadGroupIDZDimension = grid->grid[2]; + cw.ExecutionMask = dispatch.right_mask; + cw.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0); + cw.InterfaceDescriptor = idd; + + assert(brw_cs_push_const_total_size(cs_prog_data, dispatch.threads) == 0); + } } trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2]);