Module: Mesa Branch: main Commit: 338319741c3b11584188614434660d7dc800a119 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=338319741c3b11584188614434660d7dc800a119
Author: Samuel Pitoiset <[email protected]> Date: Wed Oct 25 17:46:35 2023 +0200 radv: add DGC support for mesh shader only This only implements mesh shaders with DGC because task shaders are really tricky. I will address them later. Signed-off-by: Samuel Pitoiset <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25890> --- src/amd/vulkan/radv_cmd_buffer.c | 22 +++-- src/amd/vulkan/radv_device_generated_commands.c | 117 ++++++++++++++++++++++-- src/amd/vulkan/radv_private.h | 1 + 3 files changed, 127 insertions(+), 13 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 7f33942747a..c17b646bd62 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -9032,7 +9032,8 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info } ALWAYS_INLINE static bool -radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount) +radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount, + bool dgc) { /* For direct draws, this makes sure we don't draw anything. * For indirect draws, this is necessary to prevent a GPU hang (on MEC version < 100). @@ -9090,7 +9091,8 @@ radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_ if (pc_stages) radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS); - radv_describe_draw(cmd_buffer); + if (!dgc) + radv_describe_draw(cmd_buffer); if (likely(!info->indirect)) { struct radv_cmd_state *state = &cmd_buffer->state; if (unlikely(state->last_num_instances != 1)) { @@ -9335,7 +9337,7 @@ radv_CmdDrawMeshTasksEXT(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, info.count_buffer = NULL; info.indirect = NULL; - if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1)) + if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, false)) return; if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) { @@ -9368,7 +9370,7 @@ radv_CmdDrawMeshTasksIndirectEXT(VkCommandBuffer commandBuffer, VkBuffer _buffer info.indexed = false; info.instance_count = 0; - if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount)) + if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount, false)) return; if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) { @@ -9402,7 +9404,7 @@ radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _b info.indexed = false; info.instance_count = 0; - if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount)) + if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount, false)) return; if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) { @@ -9453,6 +9455,7 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre if (compute) { radv_dgc_before_dispatch(cmd_buffer); } else { + struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline); struct radv_draw_info info; info.count = pGeneratedCommandsInfo->sequencesCount; @@ -9465,8 +9468,13 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre info.indexed = layout->indexed; info.instance_count = 0; - if (!radv_before_draw(cmd_buffer, &info, 1, true)) - return; + if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH)) { + if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, true)) + return; + } else { + if (!radv_before_draw(cmd_buffer, &info, 1, true)) + return; + } } uint32_t cmdbuf_size = radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo); diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c index 8a98ad0df72..b656ec5018d 100644 --- a/src/amd/vulkan/radv_device_generated_commands.c +++ b/src/amd/vulkan/radv_device_generated_commands.c @@ -89,8 +89,13 @@ radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layou *cmd_size += (4 + (pipeline->uses_drawid ? 10 : 5)) * 4; } } else { - /* userdata writes + instance count + non-indexed draw */ - *cmd_size += (5 + 2 + 3) * 4; + if (layout->draw_mesh_tasks) { + /* userdata writes + instance count + non-indexed draw */ + *cmd_size += (6 + 2 + (device->mesh_fast_launch_2 ? 5 : 3)) * 4; + } else { + /* userdata writes + instance count + non-indexed draw */ + *cmd_size += (5 + 2 + 3) * 4; + } } if (device->sqtt.bo) { @@ -196,6 +201,7 @@ struct radv_dgc_params { uint16_t binds_index_buffer; uint16_t vtx_base_sgpr; uint32_t max_index_count; + uint8_t draw_mesh_tasks; /* dispatch info */ uint32_t dispatch_initiator; @@ -229,6 +235,7 @@ struct radv_dgc_params { enum { DGC_USES_DRAWID = 1u << 14, DGC_USES_BASEINSTANCE = 1u << 15, + DGC_USES_GRID_SIZE = DGC_USES_BASEINSTANCE, /* Mesh shader only */ }; enum { @@ -326,6 +333,37 @@ dgc_emit_userdata_vertex(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *vtx_bas dgc_emit(b, cs, nir_vec(b, values, 5)); } +static void +dgc_emit_userdata_mesh(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *vtx_base_sgpr, nir_def *x, nir_def *y, + nir_def *z, nir_def *drawid, const struct radv_device *device) +{ + vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr); + nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE); + nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID); + + nir_push_if(b, nir_ior(b, has_grid_size, has_drawid)); + { + nir_def *pkt_cnt = nir_imm_int(b, 0); + pkt_cnt = nir_bcsel(b, has_grid_size, nir_iadd_imm(b, pkt_cnt, 3), pkt_cnt); + pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt); + + nir_def *values[6] = { + nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt), nir_iand_imm(b, vtx_base_sgpr, 0x3FFF), dgc_get_nop_packet(b, device), + dgc_get_nop_packet(b, device), dgc_get_nop_packet(b, device), dgc_get_nop_packet(b, device), + }; + + /* DrawID needs to be first if no GridSize. */ + values[2] = nir_bcsel(b, has_grid_size, x, drawid); + values[3] = nir_bcsel(b, has_grid_size, y, values[3]); + values[4] = nir_bcsel(b, has_grid_size, z, values[4]); + values[5] = nir_bcsel(b, has_drawid, drawid, values[5]); + + for (uint32_t i = 0; i < ARRAY_SIZE(values); i++) + dgc_emit(b, cs, values[i]); + } + nir_pop_if(b, NULL); +} + static void dgc_emit_sqtt_userdata(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *data) { @@ -447,6 +485,15 @@ dgc_emit_dispatch_direct(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *wg_x, n dgc_emit(b, cs, nir_vec(b, values, 5)); } +static void +dgc_emit_dispatch_mesh_direct(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z) +{ + nir_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, false)), x, y, z, + nir_imm_int(b, S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX))}; + + dgc_emit(b, cs, nir_vec(b, values, 5)); +} + static void dgc_emit_grid_size_user_sgpr(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *wg_x, nir_def *wg_y, nir_def *wg_z) @@ -1111,6 +1158,42 @@ dgc_emit_dispatch(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, ni nir_pop_if(b, 0); } +/** + * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV. + */ +static void +dgc_emit_draw_mesh_tasks(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base, + nir_def *draw_params_offset, nir_def *sequence_id, const struct radv_device *device) +{ + nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr); + nir_def *stream_offset = nir_iadd(b, draw_params_offset, stream_base); + + nir_def *draw_data = nir_load_ssbo(b, 4, 32, stream_buf, stream_offset); + nir_def *x = nir_channel(b, draw_data, 0); + nir_def *y = nir_channel(b, draw_data, 1); + nir_def *z = nir_channel(b, draw_data, 2); + + nir_push_if(b, nir_iand(b, nir_ine_imm(b, x, 0), nir_iand(b, nir_ine_imm(b, y, 0), nir_ine_imm(b, z, 0)))); + { + dgc_emit_sqtt_begin_api_marker(b, cs, ApiCmdDrawMeshTasksEXT); + dgc_emit_sqtt_marker_event(b, cs, sequence_id, EventCmdDrawMeshTasksEXT); + + dgc_emit_userdata_mesh(b, cs, vtx_base_sgpr, x, y, z, sequence_id, device); + dgc_emit_instance_count(b, cs, nir_imm_int(b, 1)); + + if (device->mesh_fast_launch_2) { + dgc_emit_dispatch_mesh_direct(b, cs, x, y, z); + } else { + nir_def *vertex_count = nir_imul(b, x, nir_imul(b, y, z)); + dgc_emit_draw_index_auto(b, cs, vertex_count); + } + + dgc_emit_sqtt_thread_trace_marker(b, cs); + dgc_emit_sqtt_end_api_marker(b, cs, ApiCmdDrawMeshTasksEXT); + } + nir_pop_if(b, NULL); +} + static nir_shader * build_dgc_prepare_shader(struct radv_device *dev) { @@ -1188,8 +1271,18 @@ build_dgc_prepare_shader(struct radv_device *dev) { nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, draw_indexed), 0)); { - dgc_emit_draw(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, draw_params_offset), sequence_id, - dev); + nir_def *draw_mesh_tasks = load_param8(&b, draw_mesh_tasks); + nir_push_if(&b, nir_ieq_imm(&b, draw_mesh_tasks, 0)); + { + dgc_emit_draw(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, draw_params_offset), sequence_id, + dev); + } + nir_push_else(&b, NULL); + { + dgc_emit_draw_mesh_tasks(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, draw_params_offset), + sequence_id, dev); + } + nir_pop_if(&b, NULL); } nir_push_else(&b, NULL); { @@ -1413,6 +1506,10 @@ radv_CreateIndirectCommandsLayoutNV(VkDevice _device, const VkIndirectCommandsLa layout->push_constant_offsets[j] = pCreateInfo->pTokens[i].offset + k * 4; } break; + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV: + layout->draw_mesh_tasks = true; + layout->draw_params_offset = pCreateInfo->pTokens[i].offset; + break; default: unreachable("Unhandled token type"); } @@ -1576,8 +1673,15 @@ radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedC if (cmd_buffer->state.graphics_pipeline->uses_drawid) vtx_base_sgpr |= DGC_USES_DRAWID; - if (cmd_buffer->state.graphics_pipeline->uses_baseinstance) - vtx_base_sgpr |= DGC_USES_BASEINSTANCE; + + if (layout->draw_mesh_tasks) { + struct radv_shader *mesh_shader = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_MESH); + if (mesh_shader->info.cs.uses_grid_size) + vtx_base_sgpr |= DGC_USES_GRID_SIZE; + } else { + if (cmd_buffer->state.graphics_pipeline->uses_baseinstance) + vtx_base_sgpr |= DGC_USES_BASEINSTANCE; + } params->draw_indexed = layout->indexed; params->draw_params_offset = layout->draw_params_offset; @@ -1587,6 +1691,7 @@ radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedC params->index_buffer_offset = layout->index_buffer_offset; params->ibo_type_32 = layout->ibo_type_32; params->ibo_type_8 = layout->ibo_type_8; + params->draw_mesh_tasks = layout->draw_mesh_tasks; if (layout->bind_vbo_mask) { uint32_t mask = vs->info.vs.vb_desc_usage_mask; diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 1f26c8df946..72477307b5e 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -3253,6 +3253,7 @@ struct radv_indirect_command_layout { bool indexed; bool binds_index_buffer; + bool draw_mesh_tasks; uint16_t draw_params_offset; uint16_t index_buffer_offset;
