Module: Mesa
Branch: main
Commit: 3f3fa5ee0c30a35e0f9b7bbcae287151f05e57a2
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=3f3fa5ee0c30a35e0f9b7bbcae287151f05e57a2

Author: Rob Clark <robdcl...@chromium.org>
Date:   Fri Nov 10 10:49:45 2023 -0800

freedreno/a6xx: Rework wave input size

Rework to match tu.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7464
Signed-off-by: Rob Clark <robdcl...@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26149>

---

 src/freedreno/ci/freedreno-a660-fails.txt         |  1 -
 src/gallium/drivers/freedreno/a6xx/fd6_program.cc | 61 +++++++++++------------
 2 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/src/freedreno/ci/freedreno-a660-fails.txt b/src/freedreno/ci/freedreno-a660-fails.txt
index ef106528a08..7dca9f1aada 100644
--- a/src/freedreno/ci/freedreno-a660-fails.txt
+++ b/src/freedreno/ci/freedreno-a660-fails.txt
@@ -3,7 +3,6 @@ KHR-GL46.gpu_shader_fp64.fp64.max_uniform_components,Fail
 KHR-GL46.multi_bind.dispatch_bind_image_textures,Fail
 KHR-GL46.shader_image_load_store.basic-allTargets-store,Fail
 KHR-GL46.shader_subroutine.control_flow_and_returned_subroutine_values_used_as_subroutine_input,Fail
-KHR-GL46.tessellation_shader.single.max_patch_vertices,Fail
 
 # Fails when TU_DEBUG=forcebin is set
 gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc
index 53aee8b7a7e..4ee1852b757 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc
@@ -1101,45 +1101,42 @@ setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b)
    }
 
    if (b->hs) {
-      if (b->ctx->screen->info->a6xx.tess_use_shared) {
-         unsigned hs_input_size = 6 + (3 * (b->vs->output_size - 1));
-         unsigned wave_input_size =
-               MIN2(64, DIV_ROUND_UP(hs_input_size * 4,
-                                     b->hs->tess.tcs_vertices_out));
+      uint32_t patch_control_points = b->key->patch_vertices;
 
-         OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1);
-         OUT_RING(ring, hs_input_size);
+      uint32_t patch_local_mem_size_16b =
+         patch_control_points * b->vs->output_size / 4;
 
-         OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
-         OUT_RING(ring, wave_input_size);
-      } else {
-         uint32_t hs_input_size =
-               b->hs->tess.tcs_vertices_out * b->vs->output_size / 4;
+      /* Total attribute slots in HS incoming patch. */
+      OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1);
+      OUT_RING(ring, patch_local_mem_size_16b);
 
-         /* Total attribute slots in HS incoming patch. */
-         OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1);
-         OUT_RING(ring, hs_input_size);
+      const uint32_t wavesize = 64;
+      const uint32_t vs_hs_local_mem_size = 16384;
+
+      uint32_t max_patches_per_wave;
+      if (b->ctx->screen->info->a6xx.tess_use_shared) {
+         /* HS invocations for a patch are always within the same wave,
+         * making barriers less expensive. VS can't have barriers so we
+         * don't care about VS invocations being in the same wave.
+         */
+         max_patches_per_wave = wavesize / b->hs->tess.tcs_vertices_out;
+      } else {
+      /* VS is also in the same wave */
+         max_patches_per_wave =
+            wavesize / MAX2(patch_control_points,
+                            b->hs->tess.tcs_vertices_out);
+      }
 
-         const uint32_t wavesize = 64;
-         const uint32_t max_wave_input_size = 64;
-         const uint32_t patch_control_points = b->hs->tess.tcs_vertices_out;
 
-         /* note: if HS is really just the VS extended, then this
-          * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out)
-          * however that doesn't match the blob, and fails some dEQP tests.
-          */
-         uint32_t prims_per_wave = wavesize / b->hs->tess.tcs_vertices_out;
-         uint32_t max_prims_per_wave = max_wave_input_size * wavesize /
-               (b->vs->output_size * patch_control_points);
-         prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave);
+      uint32_t patches_per_wave =
+         MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
+              max_patches_per_wave);
 
-         uint32_t total_size =
-               b->vs->output_size * patch_control_points * prims_per_wave;
-         uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize);
+      uint32_t wave_input_size = DIV_ROUND_UP(
+         patches_per_wave * patch_local_mem_size_16b * 16, 256);
 
-         OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
-         OUT_RING(ring, wave_input_size);
-      }
+      OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
+      OUT_RING(ring, wave_input_size);
 
       enum a6xx_tess_output output;
       if (b->ds->tess.point_mode)

Reply via email to