On Tue, Sep 5, 2017 at 9:56 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote: > From: Nicolai Hähnle <nicolai.haeh...@amd.com> > > When the HS wave is empty, the hardware writes the LS VGPRs starting at > v0 instead of v2. Workaround by shifting them back into place when > necessary. For simplicity, this is always done in the LS prolog. > > According to the hardware team, this will be fixed in future chips, > so take that into account already. > > Note that this is not a bug fix, as the bug was already worked > around by commit 166823bfd26 ("radeonsi/gfx9: add a temporary workaround > for a tessellation driver bug"). This change merely replaces the > workaround by one that should be better. > > v2: add workaround code to shader only when necessary > --- > src/gallium/drivers/radeonsi/si_pipe.h | 1 + > src/gallium/drivers/radeonsi/si_shader.c | 71 > ++++++++++++++++++------- > src/gallium/drivers/radeonsi/si_shader.h | 1 + > src/gallium/drivers/radeonsi/si_state_draw.c | 27 ++++++++-- > src/gallium/drivers/radeonsi/si_state_shaders.c | 8 +++ > 5 files changed, 84 insertions(+), 24 deletions(-) > > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h > b/src/gallium/drivers/radeonsi/si_pipe.h > index 3ae06584427..9832fd19ff6 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.h > +++ b/src/gallium/drivers/radeonsi/si_pipe.h > @@ -382,20 +382,21 @@ struct si_context { > bool db_flush_stencil_inplace:1; > bool db_depth_clear:1; > bool db_depth_disable_expclear:1; > bool db_stencil_clear:1; > bool db_stencil_disable_expclear:1; > bool occlusion_queries_disabled:1; > bool generate_mipmap_for_depth:1; > > /* Emitted draw state. */ > bool gs_tri_strip_adj_fix:1; > + bool ls_vgpr_fix:1; > int last_index_size; > int last_base_vertex; > int last_start_instance; > int last_drawid; > int last_sh_base_reg; > int last_primitive_restart_en; > int last_restart_index; > int last_gs_out_prim; > int last_prim; > int last_multi_vgt_param; > diff --git a/src/gallium/drivers/radeonsi/si_shader.c > b/src/gallium/drivers/radeonsi/si_shader.c > index 2cddfe97aa5..bae9b8384dd 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.c > +++ b/src/gallium/drivers/radeonsi/si_shader.c > @@ -5536,20 +5536,22 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, > } > > static void si_dump_shader_key_vs(const struct si_shader_key *key, > const struct si_vs_prolog_bits *prolog, > const char *prefix, FILE *f) > { > fprintf(f, " %s.instance_divisor_is_one = %u\n", > prefix, prolog->instance_divisor_is_one); > fprintf(f, " %s.instance_divisor_is_fetched = %u\n", > prefix, prolog->instance_divisor_is_fetched); > + fprintf(f, " %s.ls_vgpr_fix = %u\n", > + prefix, prolog->ls_vgpr_fix); > > fprintf(f, " mono.vs.fix_fetch = {"); > for (int i = 0; i < SI_MAX_ATTRIBS; i++) > fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]); > fprintf(f, "}\n"); > } > > static void si_dump_shader_key(unsigned processor, const struct si_shader > *shader, > FILE *f) > { > @@ -5728,20 +5730,28 @@ static void si_init_exec_from_input(struct > si_shader_context *ctx, > { > LLVMValueRef args[] = { > LLVMGetParam(ctx->main_fn, param), > LLVMConstInt(ctx->i32, bitoffset, 0), > }; > lp_build_intrinsic(ctx->gallivm.builder, > "llvm.amdgcn.init.exec.from.input", > ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT); > } > > +static bool si_vs_needs_prolog(const struct si_shader_selector *sel, > + const struct si_vs_prolog_bits *key) > +{ > + /* VGPR initialization fixup for Vega10 and Raven is always done in > the > + * VS prolog. */ > + return sel->vs_needs_prolog || key->ls_vgpr_fix; > +} > + > static bool si_compile_tgsi_main(struct si_shader_context *ctx, > bool is_monolithic) > { > struct si_shader *shader = ctx->shader; > struct si_shader_selector *sel = shader->selector; > struct lp_build_tgsi_context *bld_base = &ctx->bld_base; > > // TODO clean all this up! > switch (ctx->type) { > case PIPE_SHADER_VERTEX: > @@ -5804,21 +5814,21 @@ static bool si_compile_tgsi_main(struct > si_shader_context *ctx, > * > * For monolithic merged shaders, the first shader is wrapped in an > * if-block together with its prolog in si_build_wrapper_function. > */ > if (ctx->screen->b.chip_class >= GFX9) { > if (!is_monolithic && > sel->info.num_instructions > 1 && /* not empty shader */ > (shader->key.as_es || shader->key.as_ls) && > (ctx->type == PIPE_SHADER_TESS_EVAL || > (ctx->type == PIPE_SHADER_VERTEX && > - !sel->vs_needs_prolog))) { > + !si_vs_needs_prolog(sel, > &shader->key.part.vs.prolog)))) { > si_init_exec_from_input(ctx, > ctx->param_merged_wave_info, > 0); > } else if (ctx->type == PIPE_SHADER_TESS_CTRL || > ctx->type == PIPE_SHADER_GEOMETRY) { > if (!is_monolithic) > si_init_exec_full_mask(ctx); > > /* The barrier must execute for all shaders in a > * threadgroup. > */ > @@ -6478,33 +6488,35 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, > si_build_vs_prolog_function(&ctx, &prolog_key); > parts[0] = ctx.main_fn; > } > > si_build_wrapper_function(&ctx, parts + !need_prolog, > 1 + need_prolog, need_prolog, 0); > } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { > if (sscreen->b.chip_class >= GFX9) { > struct si_shader_selector *ls = > shader->key.part.tcs.ls; > LLVMValueRef parts[4]; > + bool vs_needs_prolog = > + si_vs_needs_prolog(ls, > &shader->key.part.tcs.ls_prolog); > > /* TCS main part */ > parts[2] = ctx.main_fn; > > /* TCS epilog */ > union si_shader_part_key tcs_epilog_key; > memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); > tcs_epilog_key.tcs_epilog.states = > shader->key.part.tcs.epilog; > si_build_tcs_epilog_function(&ctx, &tcs_epilog_key); > parts[3] = ctx.main_fn; > > /* VS prolog */ > - if (ls->vs_needs_prolog) { > + if (vs_needs_prolog) { > union si_shader_part_key vs_prolog_key; > si_get_vs_prolog_key(&ls->info, > > shader->info.num_input_sgprs, > > &shader->key.part.tcs.ls_prolog, > shader, &vs_prolog_key); > vs_prolog_key.vs_prolog.is_monolithic = true; > si_build_vs_prolog_function(&ctx, > &vs_prolog_key); > parts[0] = ctx.main_fn; > } > > @@ -6521,23 +6533,23 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, > return -1; > } > shader->info.uses_instanceid |= > ls->info.uses_instanceid; > parts[1] = ctx.main_fn; > > /* Reset the shader context. */ > ctx.shader = shader; > ctx.type = PIPE_SHADER_TESS_CTRL; > > si_build_wrapper_function(&ctx, > - parts + > !ls->vs_needs_prolog, > - 4 - !ls->vs_needs_prolog, 0, > - ls->vs_needs_prolog ? 2 : > 1); > + parts + !vs_needs_prolog, > + 4 - !vs_needs_prolog, 0, > + vs_needs_prolog ? 2 : 1); > } else { > LLVMValueRef parts[2]; > union si_shader_part_key epilog_key; > > parts[0] = ctx.main_fn; > > memset(&epilog_key, 0, sizeof(epilog_key)); > epilog_key.tcs_epilog.states = > shader->key.part.tcs.epilog; > si_build_tcs_epilog_function(&ctx, &epilog_key); > parts[1] = ctx.main_fn; > @@ -6860,73 +6872,95 @@ static LLVMValueRef si_prolog_get_rw_buffers(struct > si_shader_context *ctx) > * (InstanceID / 2 + StartInstance) > */ > static void si_build_vs_prolog_function(struct si_shader_context *ctx, > union si_shader_part_key *key) > { > struct gallivm_state *gallivm = &ctx->gallivm; > struct si_function_info fninfo; > LLVMTypeRef *returns; > LLVMValueRef ret, func; > int num_returns, i; > - unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs + > - key->vs_prolog.num_merged_next_stage_vgprs; > + unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; > unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs > + 4; > + LLVMValueRef input_vgprs[9]; > unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + > num_input_vgprs; > unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs > ? 8 : 0; > > si_init_function_info(&fninfo); > > /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ > returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) > * > sizeof(LLVMTypeRef)); > num_returns = 0; > > /* Declare input and output SGPRs. */ > for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { > add_arg(&fninfo, ARG_SGPR, ctx->i32); > returns[num_returns++] = ctx->i32; > } > > /* Preloaded VGPRs (outputs must be floats) */ > for (i = 0; i < num_input_vgprs; i++) { > - add_arg(&fninfo, ARG_VGPR, ctx->i32); > + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]); > returns[num_returns++] = ctx->f32; > } > > - fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id; > - fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = > &ctx->abi.instance_id; > - > /* Vertex load indices. */ > for (i = 0; i <= key->vs_prolog.last_input; i++) > returns[num_returns++] = ctx->f32; > > /* Create the function. */ > si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, > 0); > func = ctx->main_fn; > > - if (key->vs_prolog.num_merged_next_stage_vgprs && > - !key->vs_prolog.is_monolithic) > - si_init_exec_from_input(ctx, 3, 0); > + if (key->vs_prolog.num_merged_next_stage_vgprs) { > + if (!key->vs_prolog.is_monolithic) > + si_init_exec_from_input(ctx, 3, 0); > + > + if (key->vs_prolog.as_ls && > + (ctx->screen->b.family == CHIP_VEGA10 || > + ctx->screen->b.family == CHIP_RAVEN)) { > + /* If there are no HS threads, SPI loads the LS VGPRs > + * starting at VGPR 0. Shift them back to where they > + * belong. > + */ > + LLVMValueRef has_hs_threads = > + LLVMBuildICmp(gallivm->builder, LLVMIntNE, > + unpack_param(ctx, 3, 8, 8), > + ctx->i32_0, ""); > + > + for (i = 4; i > 0; --i) { > + input_vgprs[i + 1] = > + LLVMBuildSelect(gallivm->builder, > has_hs_threads, > + input_vgprs[i + 1], > + input_vgprs[i - 1], > ""); > + } > + } > + } > + > + ctx->abi.vertex_id = input_vgprs[first_vs_vgpr]; > + ctx->abi.instance_id = input_vgprs[first_vs_vgpr + > (key->vs_prolog.as_ls ? 2 : 1)]; > > /* Copy inputs to outputs. This should be no-op, as the registers > match, > * but it will prevent the compiler from overwriting them > unintentionally. > */ > ret = ctx->return_value; > for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { > LLVMValueRef p = LLVMGetParam(func, i); > ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); > } > - for (; i < fninfo.num_params; i++) { > - LLVMValueRef p = LLVMGetParam(func, i); > + for (i = 0; i < num_input_vgprs; i++) { > + LLVMValueRef p = input_vgprs[i]; > p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, ""); > - ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); > + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, > + key->vs_prolog.num_input_sgprs + > i, ""); > } > > /* Compute vertex load indices from instance divisors. */ > LLVMValueRef instance_divisor_constbuf = NULL; > > if (key->vs_prolog.states.instance_divisor_is_fetched) { > LLVMValueRef list = si_prolog_get_rw_buffers(ctx); > LLVMValueRef buf_index = > LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, > 0); > instance_divisor_constbuf = > @@ -6973,22 +7007,21 @@ static void si_build_vs_prolog_function(struct > si_shader_context *ctx, > > static bool si_get_vs_prolog(struct si_screen *sscreen, > LLVMTargetMachineRef tm, > struct si_shader *shader, > struct pipe_debug_callback *debug, > struct si_shader *main_part, > const struct si_vs_prolog_bits *key) > { > struct si_shader_selector *vs = main_part->selector; > > - /* The prolog is a no-op if there are no inputs. */ > - if (!vs->vs_needs_prolog) > + if (!si_vs_needs_prolog(vs, key)) > return true; > > /* Get the prolog. */ > union si_shader_part_key prolog_key; > si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, > key, shader, &prolog_key); > > shader->prolog = > si_get_shader_part(sscreen, &sscreen->vs_prologs, > PIPE_SHADER_VERTEX, true, &prolog_key, tm, > diff --git a/src/gallium/drivers/radeonsi/si_shader.h > b/src/gallium/drivers/radeonsi/si_shader.h > index 0c0fa10f40f..ee6b0c167f9 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.h > +++ b/src/gallium/drivers/radeonsi/si_shader.h > @@ -391,20 +391,21 @@ struct si_shader_selector { > /* Common VS bits between the shader key and the prolog key. */ > struct si_vs_prolog_bits { > /* - If neither "is_one" nor "is_fetched" has a bit set, the instance > * divisor is 0. > * - If "is_one" has a bit set, the instance divisor is 1. > * - If "is_fetched" has a bit set, the instance divisor will be > loaded > * from the constant buffer. > */ > uint16_t instance_divisor_is_one; /* bitmask of inputs */ > uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ > + unsigned ls_vgpr_fix:1; > }; > > /* Common TCS bits between the shader key and the epilog key. */ > struct si_tcs_epilog_bits { > unsigned prim_mode:3; > unsigned tes_reads_tess_factors:1; > }; > > struct si_gs_prolog_bits { > unsigned tri_strip_adj_fix:1; > diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c > b/src/gallium/drivers/radeonsi/si_state_draw.c > index 1092da403c7..20fd128e533 100644 > --- a/src/gallium/drivers/radeonsi/si_state_draw.c > +++ b/src/gallium/drivers/radeonsi/si_state_draw.c > @@ -195,25 +195,21 @@ static void si_emit_derived_tess_state(struct > si_context *sctx, > /* Make sure the output data fits in the offchip buffer */ > *num_patches = MIN2(*num_patches, > (sctx->screen->tess_offchip_block_dw_size * 4) / > output_patch_size); > > /* Not necessary for correctness, but improves performance. The > * specific value is taken from the proprietary driver. > */ > *num_patches = MIN2(*num_patches, 40); > > - if (sctx->b.chip_class == SI || > - /* TODO: fix GFX9 where a threadgroup contains more than 1 wave > and > - * LS vertices per patch > HS vertices per patch. Piglit: > 16in-1out */ > - (sctx->b.chip_class == GFX9 && > - num_tcs_input_cp > num_tcs_output_cp)) { > + if (sctx->b.chip_class == SI) { > /* SI bug workaround, related to power management. Limit LS-HS > * threadgroups to only one wave. > */ > unsigned one_wave = 64 / MAX2(num_tcs_input_cp, > num_tcs_output_cp); > *num_patches = MIN2(*num_patches, one_wave); > } > > /* The VGT HS block increments the patch ID unconditionally > * within a single threadgroup. This results in incorrect > * patch IDs when instanced draws are used. > @@ -1266,20 +1262,41 @@ void si_draw_vbo(struct pipe_context *ctx, const > struct pipe_draw_info *info) > else if (sctx->tes_shader.cso) > rast_prim = > sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; > else > rast_prim = info->mode; > > if (rast_prim != sctx->current_rast_prim) { > sctx->current_rast_prim = rast_prim; > sctx->do_update_shaders = true; > } > > + if (sctx->tes_shader.cso && > + (sctx->b.family == CHIP_VEGA10 || sctx->b.family == CHIP_RAVEN)) { > + /* Determine whether the LS VGPR fix should be applied. > + * > + * It is only required when num input CPs > num output CPs, > + * which cannot happen with the fixed function TCS. We should > + * also update this bit when switching from TCS to fixed > + * function TCS. > + */ > + struct si_shader_selector *tcs = sctx->tcs_shader.cso; > + bool ls_vgpr_fix = > + tcs && > + info->vertices_per_patch > > + tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; > + > + if (ls_vgpr_fix != sctx->ls_vgpr_fix) { > + sctx->ls_vgpr_fix = ls_vgpr_fix; > + sctx->do_update_shaders = true; > + } > + } > + > if (sctx->gs_shader.cso) { > /* Determine whether the GS triangle strip adjacency fix > should > * be applied. Rotate every other triangle if > * - triangle strips with adjacency are fed to the GS and > * - primitive restart is disabled (the rotation doesn't help > * when the restart occurs after an odd number of > triangles). > */ > bool gs_tri_strip_adj_fix = > !sctx->tes_shader.cso && > info->mode == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && > diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c > b/src/gallium/drivers/radeonsi/si_state_shaders.c > index 7898c9196c6..acfccb96168 100644 > --- a/src/gallium/drivers/radeonsi/si_state_shaders.c > +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c > @@ -1276,20 +1276,28 @@ static inline void si_shader_selector_key(struct > pipe_context *ctx, > > if (sctx->ps_shader.cso && > sctx->ps_shader.cso->info.uses_primid) > key->mono.u.vs_export_prim_id = 1; > } > break; > case PIPE_SHADER_TESS_CTRL: > if (sctx->b.chip_class >= GFX9) { > si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, > key, > &key->part.tcs.ls_prolog); > key->part.tcs.ls = sctx->vs_shader.cso; > + > + /* When the LS VGPR fix is needed, monolithic shaders > + * can save: > + * - redundant EXEC initialization
The redundant EXEC initialization only seems to happen when (!sel->vs_needs_prolog && ls_vgpr_fix). If sel->vs_needs_prolog == true, the main part doesn't initialize EXEC. I think the comment should clarify that because "redundant EXEC initialization" seems more confusing than explanatory to me. Other than that: Reviewed-by: Marek Olšák <marek.ol...@amd.com> Marek _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev