Module: Mesa Branch: main Commit: ab37109d23195ae8594591a7d4050dfaa088f54f URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=ab37109d23195ae8594591a7d4050dfaa088f54f
Author: Rob Clark <[email protected]> Date: Sat Aug 21 10:14:16 2021 -0700 freedreno/a6xx: Updates for tess_use_shared The formula for calculating these two values seems to depend on tess_use_shared, i.e. a6xx_gen3 and a6xx_gen4 match. The existing calculation matches a6xx_gen1 and a6xx_gen2. The new formula is based on traces varying the number of output (from VS) varyings from (1..31)*vec4 and vertices from (1..31) and coming up with something that matches the blob. Once hs_input_size*4 divided by tcs_vertices_out goes above 64, this deviates a bit from the blob, but AFAICT it is safe to pick a larger value. Signed-off-by: Rob Clark <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12497> --- src/gallium/drivers/freedreno/a6xx/fd6_program.c | 58 +++++++++++++++--------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c index 5ad33eaf387..20fe9699dc2 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -639,29 +639,45 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1); OUT_RING(ring, hs_info->tess.tcs_vertices_out); - /* Total attribute slots in HS incoming patch. 
*/ - OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); - OUT_RING(ring, hs_info->tess.tcs_vertices_out * vs->output_size / 4); - - const uint32_t wavesize = 64; - const uint32_t max_wave_input_size = 64; - const uint32_t patch_control_points = hs_info->tess.tcs_vertices_out; + if (ctx->screen->info->a6xx.tess_use_shared) { + unsigned hs_input_size = 6 + (3 * (vs->output_size - 1)); + unsigned wave_input_size = + MIN2(64, DIV_ROUND_UP(hs_input_size * 4, + hs_info->tess.tcs_vertices_out)); + + OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); + OUT_RING(ring, hs_input_size); + + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); + OUT_RING(ring, wave_input_size); + } else { + uint32_t hs_input_size = + hs_info->tess.tcs_vertices_out * vs->output_size / 4; + + /* Total attribute slots in HS incoming patch. */ + OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); + OUT_RING(ring, hs_input_size); + + const uint32_t wavesize = 64; + const uint32_t max_wave_input_size = 64; + const uint32_t patch_control_points = hs_info->tess.tcs_vertices_out; + + /* note: if HS is really just the VS extended, then this + * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) + * however that doesn't match the blob, and fails some dEQP tests. + */ + uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out; + uint32_t max_prims_per_wave = max_wave_input_size * wavesize / + (vs->output_size * patch_control_points); + prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); - /* note: if HS is really just the VS extended, then this - * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) - * however that doesn't match the blob, and fails some dEQP tests. 
- */ - uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out; - uint32_t max_prims_per_wave = max_wave_input_size * wavesize / - (vs->output_size * patch_control_points); - prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); + uint32_t total_size = + vs->output_size * patch_control_points * prims_per_wave; + uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); - uint32_t total_size = - vs->output_size * patch_control_points * prims_per_wave; - uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); - - OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - OUT_RING(ring, wave_input_size); + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); + OUT_RING(ring, wave_input_size); + } shader_info *ds_info = &ds->shader->nir->info; OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1);
