Module: Mesa Branch: main Commit: 50709863ace6ccd003389b595af20536980f6a3e URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=50709863ace6ccd003389b595af20536980f6a3e
Author: Samuel Pitoiset <[email protected]> Date: Wed Jul 19 09:12:01 2023 +0200 radv: simplify the NGG vs legacy pipelinestat query path NGG is enabled by default on RDNA1-2 but the driver might fall back to legacy GS for some reason, such as XFB. On these generations, the number of primitives generated by GS needs to be emulated from the NGG shader because the hw doesn't increment the related pipelinestat counter. In order to support NGG and legacy GS with that query (remember that we can't know pipelines when starting/ending queries), we used to reserve 2x 64-bit counters to store the GDS results, and the results were accumulated. Now that legacy GS also uses GDS counters, we can simplify this path and overwrite the pipelinestat counter directly instead of having two separate counters. Signed-off-by: Samuel Pitoiset <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24231> --- src/amd/vulkan/radv_query.c | 76 +++++++++++---------------------------------- 1 file changed, 18 insertions(+), 58 deletions(-) diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c index f9d8e4e1eec..d2d0819d9bd 100644 --- a/src/amd/vulkan/radv_query.c +++ b/src/amd/vulkan/radv_query.c @@ -43,6 +43,13 @@ /* TODO: Add support for mesh/task queries on GFX11 */ static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10}; +static unsigned +radv_get_pipelinestat_query_offset(VkQueryPipelineStatisticFlagBits query) +{ + uint32_t idx = ffs(query) - 1; + return pipeline_statistics_indices[idx] * 8; +} + static unsigned radv_get_pipelinestat_query_size(struct radv_device *device) { @@ -276,25 +283,14 @@ build_pipeline_statistics_query_shader(struct radv_device *device) nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4); nir_ssa_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12); nir_ssa_def *avail_offset = nir_load_push_constant(&b, 1, 32, 
nir_imm_int(&b, 12), .range = 16); - nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20); nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); nir_ssa_def *global_id = get_global_ids(&b, 1); - nir_variable *input_stride = nir_local_variable_create(b.impl, glsl_int_type(), "input_stride"); - nir_push_if(&b, nir_ine_imm(&b, uses_gds, 0)); - { - nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2 + 8 * 2), 0x1); - } - nir_push_else(&b, NULL); - { - nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2), 0x1); - } - nir_pop_if(&b, NULL); - - nir_ssa_def *input_base = nir_imul(&b, nir_load_var(&b, input_stride), global_id); + nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2); + nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8); nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); @@ -324,23 +320,6 @@ build_pipeline_statistics_query_shader(struct radv_device *device) nir_store_var(&b, result, nir_isub(&b, end, start), 0x1); - nir_push_if(&b, - nir_iand(&b, nir_i2b(&b, uses_gds), - nir_imm_bool(&b, 1u << i == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))); - { - /* Compute the GDS result if needed. 
*/ - nir_ssa_def *gds_start_offset = nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2)); - nir_ssa_def *gds_start = nir_load_ssbo(&b, 1, 64, src_buf, gds_start_offset); - - nir_ssa_def *gds_end_offset = nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2 + 8)); - nir_ssa_def *gds_end = nir_load_ssbo(&b, 1, 64, src_buf, gds_end_offset); - - nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start); - - nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_gds_result), 0x1); - } - nir_pop_if(&b, NULL); - /* Store result */ nir_push_if(&b, result_is_64bit); @@ -1096,12 +1075,6 @@ radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo * break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: pool->stride = radv_get_pipelinestat_query_size(device) * 2; - if (pool->uses_gds) { - /* When the query pool needs GDS (for counting the number of primitives generated by a - * geometry shader with NGG), allocate 2x64-bit values for begin/end. - */ - pool->stride += 8 * 2; - } break; case VK_QUERY_TYPE_TIMESTAMP: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: @@ -1268,7 +1241,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first case VK_QUERY_TYPE_PIPELINE_STATISTICS: { unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device); const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query); - uint64_t ngg_gds_result = 0; do { available = p_atomic_read(avail_ptr); @@ -1277,14 +1249,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) result = VK_NOT_READY; - if (pool->uses_gds) { - /* Compute the result that was copied from GDS. 
*/ - const uint64_t *gds_start = (uint64_t *)(src + pipelinestat_block_size * 2); - const uint64_t *gds_stop = (uint64_t *)(src + pipelinestat_block_size * 2 + 8); - - ngg_gds_result = gds_stop[0] - gds_start[0]; - } - const uint64_t *start = (uint64_t *)src; const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size); if (flags & VK_QUERY_RESULT_64_BIT) { @@ -1294,10 +1258,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first if (pool->pipeline_stats_mask & (1u << i)) { if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) { *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]]; - - if (pool->uses_gds && (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) { - *dst += ngg_gds_result; - } } dst++; } @@ -1310,10 +1270,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first if (pool->pipeline_stats_mask & (1u << i)) { if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) { *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]]; - - if (pool->uses_gds && (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) { - *dst += ngg_gds_result; - } } dst++; } @@ -1538,7 +1494,7 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, pool->bo, dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount, flags, pool->pipeline_stats_mask, - pool->availability_offset + 4 * firstQuery, pool->uses_gds); + pool->availability_offset + 4 * firstQuery, false); break; case VK_QUERY_TYPE_TIMESTAMP: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: @@ -1762,8 +1718,6 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo radeon_emit(cs, va >> 32); break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 
- unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device); - radeon_check_space(cmd_buffer->device->ws, cs, 4); ++cmd_buffer->state.active_pipeline_queries; @@ -1778,7 +1732,10 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo radeon_emit(cs, va >> 32); if (pool->uses_gds) { - va += pipelinestat_block_size * 2; + uint32_t gs_prim_offset = + radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); + + va += gs_prim_offset; /* pipeline statistics counter for all streams */ gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PIPELINE_STAT_OFFSET, va); @@ -1919,7 +1876,10 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, cmd_buffer->gfx9_eop_bug_va); if (pool->uses_gds) { - va += pipelinestat_block_size + 8; + uint32_t gs_prim_offset = + radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT); + + va += gs_prim_offset; /* pipeline statistics counter for all streams */ gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PIPELINE_STAT_OFFSET, va);
