From: Marek Olšák <marek.ol...@amd.com>

The effect of the last 13 commits on user SGPR counts:
64-bit pointers: TCS: 14 -> 12 Merged VS-TCS: 24 -> 20 Merged VS-GS: 18 -> 16 Merged TES-GS: 18 -> 14 32-bit pointers: TCS: 10 -> 8 Merged VS-TCS: 16 -> 12 Merged VS-GS: 11 -> 9 Merged TES-GS: 11 -> 6 --- src/gallium/drivers/radeonsi/si_descriptors.c | 2 +- src/gallium/drivers/radeonsi/si_shader.c | 30 +++++++++++++++++++------ src/gallium/drivers/radeonsi/si_shader.h | 9 +++++++- src/gallium/drivers/radeonsi/si_state_shaders.c | 5 +++-- 4 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 7fdac23..a4dae44 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -2190,21 +2190,21 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx, struct radeon_winsys_cs *cs = sctx->b.gfx.cs; /* Find the location of the VB descriptor pointer. */ /* TODO: In the future, the pointer will be packed in unused * bits of the first 2 VB descriptors. 
*/ unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR; if (sctx->b.chip_class >= GFX9) { if (sctx->tes_shader.cso) sh_dw_offset = GFX9_TCS_NUM_USER_SGPR; else if (sctx->gs_shader.cso) - sh_dw_offset = GFX9_GS_NUM_USER_SGPR; + sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR; } unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4; si_emit_shader_pointer_head(cs, sh_offset, 1); si_emit_shader_pointer_body(sctx->screen, cs, sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset); sctx->vertex_buffer_pointer_dirty = false; } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index dfbb1f2..7b37765 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3413,21 +3413,26 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) 8 + SI_SGPR_RW_BUFFERS); ret = si_insert_input_ptr(ctx, ret, ctx->param_bindless_samplers_and_images, 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); #if !HAVE_32BIT_POINTERS ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 1, 8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES); #endif - unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR; + unsigned vgpr; + if (ctx->type == PIPE_SHADER_VERTEX) + vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; + else + vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; + for (unsigned i = 0; i < 5; i++) { unsigned param = ctx->param_gs_vtx01_offset + i; ret = si_insert_input_ret_float(ctx, ret, param, vgpr++); } ctx->return_value = ret; } static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) @@ -4772,26 +4777,27 @@ static void create_function(struct si_shader_context *ctx) add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */ add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ declare_global_desc_pointers(ctx, &fninfo); declare_per_stage_desc_pointers(ctx, &fninfo, (ctx->type == PIPE_SHADER_VERTEX || ctx->type == 
PIPE_SHADER_TESS_EVAL)); if (ctx->type == PIPE_SHADER_VERTEX) { declare_vs_specific_input_sgprs(ctx, &fninfo); } else { - /* TESS_EVAL (and also GEOMETRY): - * Declare as many input SGPRs as the VS has. */ ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32); - add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ - ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ + if (!HAVE_32BIT_POINTERS) { + /* Declare as many input SGPRs as the VS has. */ + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ + ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ + } } if (!HAVE_32BIT_POINTERS) { declare_samplers_and_images(ctx, &fninfo, ctx->type == PIPE_SHADER_GEOMETRY); } if (ctx->type == PIPE_SHADER_VERTEX) { ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32)); } @@ -4805,22 +4811,29 @@ static void create_function(struct si_shader_context *ctx) if (ctx->type == PIPE_SHADER_VERTEX) { declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { declare_tes_input_vgprs(ctx, &fninfo); } if (ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL) { + unsigned num_user_sgprs; + + if (ctx->type == PIPE_SHADER_VERTEX) + num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR; + else + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + /* ES return values are inputs to GS. 
*/ - for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++) + for (i = 0; i < 8 + num_user_sgprs; i++) returns[num_returns++] = ctx->i32; /* SGPRs */ for (i = 0; i < 5; i++) returns[num_returns++] = ctx->f32; /* VGPRs */ } break; case PIPE_SHADER_TESS_EVAL: declare_global_desc_pointers(ctx, &fninfo); declare_per_stage_desc_pointers(ctx, &fninfo, true); ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); @@ -6325,21 +6338,24 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx, { unsigned num_sgprs, num_vgprs; struct si_function_info fninfo; LLVMBuilderRef builder = ctx->ac.builder; LLVMTypeRef returns[48]; LLVMValueRef func, ret; si_init_function_info(&fninfo); if (ctx->screen->info.chip_class >= GFX9) { - num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR; + if (key->gs_prolog.states.gfx9_prev_is_vs) + num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR; + else + num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR; num_vgprs = 5; /* ES inputs are not needed by GS */ } else { num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; num_vgprs = 8; } for (unsigned i = 0; i < num_sgprs; ++i) { add_arg(&fninfo, ARG_SGPR, ctx->i32); returns[i] = ctx->i32; } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 471f2e9..f589789 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -205,21 +205,27 @@ enum { GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR, GFX9_SGPR_TCS_OUT_OFFSETS, GFX9_SGPR_TCS_OUT_LAYOUT, #if !HAVE_32BIT_POINTERS GFX9_SGPR_align_for_vb_pointer, #endif GFX9_TCS_NUM_USER_SGPR, /* GS limits */ GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, - GFX9_GS_NUM_USER_SGPR = GFX9_MERGED_NUM_USER_SGPR, +#if HAVE_32BIT_POINTERS + GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, + GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR, +#else + GFX9_VSGS_NUM_USER_SGPR = GFX9_MERGED_NUM_USER_SGPR, + GFX9_TESGS_NUM_USER_SGPR = GFX9_MERGED_NUM_USER_SGPR, +#endif SI_GSCOPY_NUM_USER_SGPR = 
SI_SGPR_RW_BUFFERS + (HAVE_32BIT_POINTERS ? 1 : 2), /* PS only */ SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, SI_PS_NUM_USER_SGPR, }; /* LLVM function parameter indices */ enum { SI_NUM_RESOURCE_PARAMS = 4, @@ -418,20 +424,21 @@ struct si_vs_prolog_bits { /* Common TCS bits between the shader key and the epilog key. */ struct si_tcs_epilog_bits { unsigned prim_mode:3; unsigned invoc0_tess_factors_are_def:1; unsigned tes_reads_tess_factors:1; }; struct si_gs_prolog_bits { unsigned tri_strip_adj_fix:1; + unsigned gfx9_prev_is_vs:1; }; /* Common PS bits between the shader key and the prolog key. */ struct si_ps_prolog_bits { unsigned color_two_side:1; unsigned flatshade_colors:1; unsigned poly_stipple:1; unsigned force_persp_sample_interp:1; unsigned force_linear_sample_interp:1; unsigned force_persp_center_interp:1; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 4b1ff94..7cea13e 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -759,23 +759,23 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ else if (sel->info.uses_primid) gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. 
*/ else if (input_prim >= PIPE_PRIM_TRIANGLES) gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ else gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ unsigned num_user_sgprs; if (es_type == PIPE_SHADER_VERTEX) - num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_GS_NUM_USER_SGPR); + num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR); else - num_user_sgprs = GFX9_GS_NUM_USER_SGPR; + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info); si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, va >> 40); si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) | S_00B228_DX10_CLAMP(1) | @@ -1291,20 +1291,21 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, } break; case PIPE_SHADER_GEOMETRY: if (sctx->b.chip_class >= GFX9) { if (sctx->tes_shader.cso) { key->part.gs.es = sctx->tes_shader.cso; } else { si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.gs.vs_prolog); key->part.gs.es = sctx->vs_shader.cso; + key->part.gs.prolog.gfx9_prev_is_vs = 1; } /* Merged ES-GS can have unbalanced wave usage. * * ES threads are per-vertex, while GS threads are * per-primitive. So without any amplification, there * are fewer GS threads than ES threads, which can result * in empty (no-op) GS waves. With too much amplification, * there are more GS threads than ES threads, which * can result in empty (no-op) ES waves. -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev