Module: Mesa
Branch: master
Commit: 61fe66a2e433c5565153ca800e81c36a17c7cba1
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=61fe66a2e433c5565153ca800e81c36a17c7cba1
Author: Marek Olšák <[email protected]>
Date: Sat Nov 14 17:24:11 2020 -0500

radeonsi: pass VS->TCS IO via VGPRs if VS and TCS have the same thread count

It can only be done if a TCS input is accessed without indirect indexing and
with gl_InvocationID as the vertex index, and the number of VS and TCS threads
is the same.

This eliminates LDS stores and loads for VS->TCS IO, reducing shader lifetime
and LDS traffic.

Acked-by: Pierre-Eric Pelloux-Prayer <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7623>
---
 src/gallium/drivers/radeonsi/si_shader.c | 15 ++++++++++++
 src/gallium/drivers/radeonsi/si_shader.h | 1 +
 src/gallium/drivers/radeonsi/si_shader_internal.h | 1 +
 src/gallium/drivers/radeonsi/si_shader_llvm_tess.c | 28 ++++++++++++++++++++--
 src/gallium/drivers/radeonsi/si_state_shaders.c | 4 ++++
 5 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index ec748c92ce9..46b15f74fb5 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -479,7 +479,21 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) returns[num_returns++] = ctx->ac.i32; /* SGPRs */ for (i = 0; i < 2; i++) returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + + /* VS outputs passed via VGPRs to TCS. */ + if (shader->key.opt.same_patch_vertices) { + unsigned num_outputs = util_last_bit64(shader->selector->outputs_written); + for (i = 0; i < num_outputs * 4; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } } else { + /* TCS inputs are passed via VGPRs from VS. */ + if (shader->key.opt.same_patch_vertices) { + unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->outputs_written); + for (i = 0; i < num_inputs * 4; i++) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); + } + /* TCS return values are inputs to the TCS epilog. 
* * param_tcs_offchip_offset, param_tcs_factor_offset, @@ -1765,6 +1779,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com parts[3] = ctx.main_fn; /* VS as LS main part */ + ctx.next_shader_sel = ctx.shader->selector; nir = get_nir_shader(ls, NULL, &free_nir); struct si_shader shader_ls = {}; shader_ls.selector = ls; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index c9aa439b740..b080bfc835d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -461,6 +461,7 @@ struct si_shader_selector { uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ uint64_t inputs_read; /* "get_unique_index" bits */ + uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */ /* bitmasks of used descriptor slots */ uint64_t active_const_and_shader_buffers; diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 38c5ab94dab..4fd2f2168a2 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -46,6 +46,7 @@ struct si_shader_output_values { struct si_shader_context { struct ac_llvm_context ac; struct si_shader *shader; + struct si_shader_selector *next_shader_sel; struct si_screen *screen; gl_shader_stage stage; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 470a24a353c..abf91715d4e 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -395,6 +395,21 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMType semantic = info->output_semantic[driver_location]; } + /* Load the TCS input from a VGPR if possible. 
*/ + if (ctx->shader->key.opt.same_patch_vertices && + load_input && vertex_index_is_invoc_id && !param_index) { + unsigned func_param = ctx->args.tcs_rel_ids.arg_index + 1 + + si_shader_io_get_unique_index(semantic, false) * 4; + LLVMValueRef value[4]; + + for (unsigned i = component; i < component + num_components; i++) { + value[i] = LLVMGetParam(ctx->main_fn, func_param + i); + value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], type, ""); + } + + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); + } + bool is_patch = vertex_index == NULL; assert((semantic >= VARYING_SLOT_PATCH0 || semantic == VARYING_SLOT_TESS_LEVEL_INNER || @@ -944,6 +959,7 @@ void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, L LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id); LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, ""); + unsigned ret_offset = 8 + GFX9_TCS_NUM_USER_SGPR + 2; /* Write outputs to LDS. The next shader (TCS aka HS) will read * its inputs from it. 
*/ @@ -976,8 +992,16 @@ void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, L if (!(info->output_usagemask[i] & (1 << chan))) continue; - lshs_lds_store(ctx, chan, dw_addr, - LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); + LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + + if (!shader->key.opt.same_patch_vertices || + !(ctx->next_shader_sel->tcs_vgpr_only_inputs & (1ull << semantic))) + lshs_lds_store(ctx, chan, dw_addr, value); + + if (shader->key.opt.same_patch_vertices) { + ctx->return_value = LLVMBuildInsertValue(ctx->ac.builder, ctx->return_value, + value, ret_offset + param * 4 + chan, ""); + } } } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index cd2168534a9..729f46599be 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2756,6 +2756,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx, assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0); + sel->tcs_vgpr_only_inputs = ~sel->info.base.tess.tcs_cross_invocation_inputs_read & + ~sel->info.base.inputs_read_indirectly & + sel->info.base.inputs_read; + /* Only for TES: */ if (sel->info.stage == MESA_SHADER_TESS_EVAL) { if (sel->info.base.tess.point_mode) _______________________________________________ mesa-commit mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-commit
