Module: Mesa
Branch: main
Commit: 2a1c711052a5983c67bad56433cd30034b2bd5f8
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=2a1c711052a5983c67bad56433cd30034b2bd5f8
Author: Marek Olšák <[email protected]> Date: Mon May 2 22:43:38 2022 -0400 ac/llvm: skip s_barrier if tess patches don't cross a wave boundary If tess patches are wholly in one wave, "s_waitcnt lgkm(0)" is sufficient. Reviewed-by: Pierre-Eric Pelloux-Prayer <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16304> --- src/amd/llvm/ac_nir_to_llvm.c | 5 +++++ src/gallium/drivers/radeonsi/si_shader.c | 8 ++++++++ src/gallium/drivers/radeonsi/si_shader.h | 1 + src/gallium/drivers/radeonsi/si_shader_llvm.c | 12 +++++++++++- src/gallium/drivers/radeonsi/si_shader_llvm_tess.c | 10 ++++++---- 5 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 71529579554..efdad1449b6 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -3928,6 +3928,11 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins break; } case nir_intrinsic_control_barrier: + /* If output patches are wholly in one wave, we don't need a barrier. */ + if (ctx->stage == MESA_SHADER_TESS_CTRL && + ctx->ac.wave_size % ctx->info->tess.tcs_vertices_out == 0) + break; + ac_build_s_barrier(&ctx->ac, ctx->stage); break; case nir_intrinsic_shared_atomic_add: diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 127c1e3228d..1777f73fb46 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1971,6 +1971,14 @@ void si_get_tcs_epilog_key(struct si_shader *shader, union si_shader_part_key *k memset(key, 0, sizeof(*key)); key->tcs_epilog.wave32 = shader->wave_size == 32; key->tcs_epilog.states = shader->key.ge.part.tcs.epilog; + + /* If output patches are wholly in one wave, we don't need a barrier. + * The fixed-func TCS doesn't set tcs_vertices_out, but it won't use a barrier + * anyway because tess levels are always defined in all invocations there. 
+ */ + key->tcs_epilog.noop_s_barrier = + shader->selector->info.base.tess.tcs_vertices_out && + shader->wave_size % shader->selector->info.base.tess.tcs_vertices_out == 0; } /** diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 36d9fc2075d..24fdd4dc046 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -601,6 +601,7 @@ union si_shader_part_key { struct { struct si_tcs_epilog_bits states; unsigned wave32 : 1; + unsigned noop_s_barrier : 1; } tcs_epilog; struct { struct si_ps_prolog_bits states; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index ebf31ddbdd4..3b4837780ca 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -1017,7 +1017,17 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad shader->selector->info.base.inputs_read & ~shader->selector->info.tcs_vgpr_only_inputs) { ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); - ac_build_s_barrier(&ctx->ac, ctx->stage); + + /* If both input and output patches are wholly in one wave, we don't need a barrier. + * That's true when both VS and TCS have the same number of patch vertices and + * the wave size is a multiple of the number of patch vertices. + * + * The fixed-func TCS doesn't set tcs_vertices_out. + */ + if (!shader->key.ge.opt.same_patch_vertices || + (sel->info.base.tess.tcs_vertices_out && + ctx->ac.wave_size % sel->info.base.tess.tcs_vertices_out != 0)) + ac_build_s_barrier(&ctx->ac, ctx->stage); } } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index a401b6bbc4a..7b907b09dd0 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -671,8 +671,8 @@ static void si_copy_tcs_inputs(struct si_shader_context *ctx) } } -static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef rel_patch_id, - LLVMValueRef invocation_id, +static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader_part_key *key, + LLVMValueRef rel_patch_id, LLVMValueRef invocation_id, LLVMValueRef tcs_out_current_patch_data_offset, LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2]) { @@ -685,7 +685,9 @@ static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef re /* Add a barrier before loading tess factors from LDS. */ if (!shader->key.ge.part.tcs.epilog.invoc0_tess_factors_are_def) { ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); - ac_build_s_barrier(&ctx->ac, ctx->stage); + + if (!key->tcs_epilog.noop_s_barrier) + ac_build_s_barrier(&ctx->ac, ctx->stage); } /* Do this only for invocation 0, because the tess levels are per-patch, @@ -1075,7 +1077,7 @@ void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_par for (unsigned i = 0; i < 6; i++) invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]); - si_write_tess_factors(ctx, ac_get_arg(&ctx->ac, rel_patch_id), + si_write_tess_factors(ctx, key, ac_get_arg(&ctx->ac, rel_patch_id), ac_get_arg(&ctx->ac, invocation_id), ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset), invoc0_tess_factors, invoc0_tess_factors + 4);
