Module: Mesa
Branch: main
Commit: 50709863ace6ccd003389b595af20536980f6a3e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=50709863ace6ccd003389b595af20536980f6a3e

Author: Samuel Pitoiset <[email protected]>
Date:   Wed Jul 19 09:12:01 2023 +0200

radv: simplify the NGG vs legacy pipelinestat query path

NGG is enabled by default on RDNA1-2, but the driver might fall back to
legacy GS in some cases, such as XFB. On these generations, the number
of primitives generated by the GS needs to be emulated from the NGG
shader because the hw doesn't increment the related pipelinestat
counter.

In order to support both NGG and legacy GS with that query (remember
that we can't know which pipelines are used when starting/ending
queries), we used to reserve 2x 64-bit counters to store the GDS
results, which were then accumulated into the pipelinestat result.

Now that legacy GS also uses GDS counters, we can simplify this path
and overwrite the pipelinestat counter directly instead of having two
separate counters.

Signed-off-by: Samuel Pitoiset <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24231>

---

 src/amd/vulkan/radv_query.c | 76 +++++++++++----------------------------------
 1 file changed, 18 insertions(+), 58 deletions(-)

diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index f9d8e4e1eec..d2d0819d9bd 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -43,6 +43,13 @@
 /* TODO: Add support for mesh/task queries on GFX11 */
 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 
8, 9, 10};
 
+static unsigned
+radv_get_pipelinestat_query_offset(VkQueryPipelineStatisticFlagBits query)
+{
+   uint32_t idx = ffs(query) - 1;
+   return pipeline_statistics_indices[idx] * 8;
+}
+
 static unsigned
 radv_get_pipelinestat_query_size(struct radv_device *device)
 {
@@ -276,25 +283,14 @@ build_pipeline_statistics_query_shader(struct radv_device 
*device)
    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), 
.range = 4);
    nir_ssa_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 
8), .range = 12);
    nir_ssa_def *avail_offset = nir_load_push_constant(&b, 1, 32, 
nir_imm_int(&b, 12), .range = 16);
-   nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 
16), .range = 20);
 
    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
 
    nir_ssa_def *global_id = get_global_ids(&b, 1);
 
-   nir_variable *input_stride = nir_local_variable_create(b.impl, 
glsl_int_type(), "input_stride");
-   nir_push_if(&b, nir_ine_imm(&b, uses_gds, 0));
-   {
-      nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size 
* 2 + 8 * 2), 0x1);
-   }
-   nir_push_else(&b, NULL);
-   {
-      nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size 
* 2), 0x1);
-   }
-   nir_pop_if(&b, NULL);
-
-   nir_ssa_def *input_base = nir_imul(&b, nir_load_var(&b, input_stride), 
global_id);
+   nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
+   nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, 
nir_imm_int(&b, 4), .range = 8);
    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
 
@@ -324,23 +320,6 @@ build_pipeline_statistics_query_shader(struct radv_device 
*device)
 
       nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
 
-      nir_push_if(&b,
-                  nir_iand(&b, nir_i2b(&b, uses_gds),
-                           nir_imm_bool(&b, 1u << i == 
VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)));
-      {
-         /* Compute the GDS result if needed. */
-         nir_ssa_def *gds_start_offset = nir_iadd(&b, input_base, 
nir_imm_int(&b, pipelinestat_block_size * 2));
-         nir_ssa_def *gds_start = nir_load_ssbo(&b, 1, 64, src_buf, 
gds_start_offset);
-
-         nir_ssa_def *gds_end_offset = nir_iadd(&b, input_base, 
nir_imm_int(&b, pipelinestat_block_size * 2 + 8));
-         nir_ssa_def *gds_end = nir_load_ssbo(&b, 1, 64, src_buf, 
gds_end_offset);
-
-         nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
-
-         nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), 
ngg_gds_result), 0x1);
-      }
-      nir_pop_if(&b, NULL);
-
       /* Store result */
       nir_push_if(&b, result_is_64bit);
 
@@ -1096,12 +1075,6 @@ radv_create_query_pool(struct radv_device *device, const 
VkQueryPoolCreateInfo *
       break;
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
       pool->stride = radv_get_pipelinestat_query_size(device) * 2;
-      if (pool->uses_gds) {
-         /* When the query pool needs GDS (for counting the number of 
primitives generated by a
-          * geometry shader with NGG), allocate 2x64-bit values for begin/end.
-          */
-         pool->stride += 8 * 2;
-      }
       break;
    case VK_QUERY_TYPE_TIMESTAMP:
    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
@@ -1268,7 +1241,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool 
queryPool, uint32_t first
       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
          unsigned pipelinestat_block_size = 
radv_get_pipelinestat_query_size(device);
          const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + 
pool->availability_offset + 4 * query);
-         uint64_t ngg_gds_result = 0;
 
          do {
             available = p_atomic_read(avail_ptr);
@@ -1277,14 +1249,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool 
queryPool, uint32_t first
          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
             result = VK_NOT_READY;
 
-         if (pool->uses_gds) {
-            /* Compute the result that was copied from GDS. */
-            const uint64_t *gds_start = (uint64_t *)(src + 
pipelinestat_block_size * 2);
-            const uint64_t *gds_stop = (uint64_t *)(src + 
pipelinestat_block_size * 2 + 8);
-
-            ngg_gds_result = gds_stop[0] - gds_start[0];
-         }
-
          const uint64_t *start = (uint64_t *)src;
          const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
          if (flags & VK_QUERY_RESULT_64_BIT) {
@@ -1294,10 +1258,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool 
queryPool, uint32_t first
                if (pool->pipeline_stats_mask & (1u << i)) {
                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
                      *dst = stop[pipeline_statistics_indices[i]] - 
start[pipeline_statistics_indices[i]];
-
-                     if (pool->uses_gds && (1u << i) == 
VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
-                        *dst += ngg_gds_result;
-                     }
                   }
                   dst++;
                }
@@ -1310,10 +1270,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool 
queryPool, uint32_t first
                if (pool->pipeline_stats_mask & (1u << i)) {
                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
                      *dst = stop[pipeline_statistics_indices[i]] - 
start[pipeline_statistics_indices[i]];
-
-                     if (pool->uses_gds && (1u << i) == 
VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
-                        *dst += ngg_gds_result;
-                     }
                   }
                   dst++;
                }
@@ -1538,7 +1494,7 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer 
commandBuffer, VkQueryPool queryPoo
       radv_query_shader(cmd_buffer, 
&cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, 
pool->bo,
                         dst_buffer->bo, firstQuery * pool->stride, 
dst_buffer->offset + dstOffset, pool->stride, stride,
                         dst_size, queryCount, flags, pool->pipeline_stats_mask,
-                        pool->availability_offset + 4 * firstQuery, 
pool->uses_gds);
+                        pool->availability_offset + 4 * firstQuery, false);
       break;
    case VK_QUERY_TYPE_TIMESTAMP:
    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
@@ -1762,8 +1718,6 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, 
struct radv_query_pool *poo
       radeon_emit(cs, va >> 32);
       break;
    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
-      unsigned pipelinestat_block_size = 
radv_get_pipelinestat_query_size(cmd_buffer->device);
-
       radeon_check_space(cmd_buffer->device->ws, cs, 4);
 
       ++cmd_buffer->state.active_pipeline_queries;
@@ -1778,7 +1732,10 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, 
struct radv_query_pool *poo
       radeon_emit(cs, va >> 32);
 
       if (pool->uses_gds) {
-         va += pipelinestat_block_size * 2;
+         uint32_t gs_prim_offset =
+            
radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+         va += gs_prim_offset;
 
          /* pipeline statistics counter for all streams */
          gfx10_copy_gds_query(cmd_buffer, 
RADV_SHADER_QUERY_PIPELINE_STAT_OFFSET, va);
@@ -1919,7 +1876,10 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, 
struct radv_query_pool *pool,
                                  EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, 
cmd_buffer->gfx9_eop_bug_va);
 
       if (pool->uses_gds) {
-         va += pipelinestat_block_size + 8;
+         uint32_t gs_prim_offset =
+            
radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+         va += gs_prim_offset;
 
          /* pipeline statistics counter for all streams */
          gfx10_copy_gds_query(cmd_buffer, 
RADV_SHADER_QUERY_PIPELINE_STAT_OFFSET, va);

Reply via email to