>> +static void gfx9_get_gs_info(struct si_shader_selector *es, >> + struct si_shader_selector *gs, >> + struct gfx9_gs_info *out) >> +{ >> + unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1); >> + unsigned input_prim = >> gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; >> + bool uses_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && >> + input_prim <= >> PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; >> + >> + /* All these are in dwords: */ >> + /* We can't allow using the whole LDS, because GS waves compete >> with >> + * other shader stages for LDS space. */ > > > Is this a strict requirement to prevent hangs? If so, couldn't the hang > occur in other ways as well? > > If it's just for performance, please note that in the comment.
No. I think it's due to the default LDS reservation logic, which is that half of the LDS can be used by PS and CS but not by other stages. The number was copied from our Vulkan driver without much thought. > > >> + const unsigned max_lds_size = 8 * 1024; >> + const unsigned esgs_itemsize = es->esgs_itemsize / 4; >> + unsigned esgs_lds_size; >> + >> + /* All these are per subgroup: */ >> + const unsigned max_out_prims = 32 * 1024; >> + const unsigned max_es_verts = 255; > > > I assume the idea here is 4 waves to a CU, so why not 256? The hardware > register goes up to 2047 even. No idea. The number was copied from Vulkan. > > >> + const unsigned ideal_gs_prims = 64; >> + unsigned max_gs_prims, gs_prims; >> + unsigned min_es_verts, es_verts, worst_case_es_verts; >> + >> + assert(gs_num_invocations <= 32); /* GL maximum */ >> + >> + if (uses_adjacency || gs_num_invocations > 1) >> + max_gs_prims = 127 / gs_num_invocations; >> + else >> + max_gs_prims = 255; > > > Same question as for max_es_verts here. Same answer. > > Also, why the different base number? For adjacency, I could imagine it's > because you have basically double the number of vertices per primitive, so > you fewer GS invocations. But why the same reduction of the base number when > gs_num_invocations > 1? No idea. Same answer. I was wondering about it as well, but I'm not gonna try my luck here. > > >> + >> + /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * >> gs_invocations. >> + * Make sure we don't go over the maximum value. >> + */ >> + max_gs_prims = MIN2(max_gs_prims, >> + max_out_prims / >> + (gs->gs_max_out_vertices * >> gs_num_invocations)); >> + assert(max_gs_prims > 0); >> + >> + /* If the primitive has adjacency, halve the number of vertices >> + * that will be reused in multiple primitives. >> + */ >> + min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : >> 1); > > > I don't understand this. In the worst case, you have e.g. 
a single triangle > with adjacency which needs 6 ES vertices, and this is already reflected in > gs_input_verts_per_prim. > > I see another reference below about vertex re-use, but I don't see how that > applies to LINES_ADJACENCY and TRIANGLES_ADJACENCY. Also copied from Vulkan. You can ask them. > > > >> + >> + gs_prims = MIN2(ideal_gs_prims, max_gs_prims); >> + worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); >> + >> + /* Compute ESGS LDS size based on the worst case number of ES >> vertices >> + * needed to create the target number of GS prims per subgroup. >> + */ >> + esgs_lds_size = esgs_itemsize * worst_case_es_verts; >> + >> + /* If total LDS usage is too big, refactor partitions based on >> ratio >> + * of ESGS item sizes. >> + */ >> + if (esgs_lds_size > max_lds_size) { >> + /* Our target GS Prims Per Subgroup was too large. >> Calculate >> + * the maximum number of GS Prims Per Subgroup that will >> fit >> + * into LDS, capped by the maximum that the hardware can >> support. >> + */ >> + gs_prims = MIN2((max_lds_size / (esgs_itemsize * >> min_es_verts)), >> + max_gs_prims); >> + assert(gs_prims > 0); >> + worst_case_es_verts = MIN2(min_es_verts * gs_prims, >> + max_es_verts); >> + >> + esgs_lds_size = esgs_itemsize * worst_case_es_verts; >> + assert(esgs_lds_size <= max_lds_size); >> + } >> + >> + /* Now calculate remaining ESGS information. */ >> + if (esgs_lds_size) >> + es_verts = MIN2(esgs_lds_size / esgs_itemsize, >> max_es_verts); >> + else >> + es_verts = max_es_verts; >> + >> + /* Vertices for adjacency primitives are not always reused, so >> restore >> + * it for ES_VERTS_PER_SUBGRP. >> + */ >> + min_es_verts = gs->gs_input_verts_per_prim; >> + >> + /* For normal primitives, the VGT only checks if they are past the >> ES > > > What are "normal" primitives? Without adjacency I guess. The entire comment was copied from Vulkan. 
> > > >> + * verts per subgroup after allocating a full GS primitive and if >> they >> + * are, kick off a new subgroup. But if those additional ES verts >> are >> + * unique (e.g. not reused) we need to make sure there is enough >> LDS >> + * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP. >> + */ >> + es_verts -= min_es_verts - 1; >> + >> + out->es_verts_per_subgroup = es_verts; >> + out->gs_prims_per_subgroup = gs_prims; >> + out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; >> + out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * >> + gs->gs_max_out_vertices; >> + out->lds_size = align(esgs_lds_size, 128) / 128; >> + >> + assert(out->max_prims_per_subgroup <= max_out_prims); >> +} >> + >> +static void si_shader_gs(struct si_screen *sscreen, struct si_shader >> *shader) >> { >> struct si_shader_selector *sel = shader->selector; >> const ubyte *num_components = >> sel->info.num_stream_output_components; >> unsigned gs_num_invocations = sel->gs_num_invocations; >> struct si_pm4_state *pm4; >> uint64_t va; >> unsigned max_stream = sel->max_gs_stream; >> unsigned offset; >> >> pm4 = si_get_shader_pm4_state(shader); >> @@ -614,44 +722,99 @@ static void si_shader_gs(struct si_shader *shader) >> if (max_stream >= 2) >> offset += num_components[2] * sel->gs_max_out_vertices; >> si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset); >> if (max_stream >= 3) >> offset += num_components[3] * sel->gs_max_out_vertices; >> si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset); >> >> /* The GSVS_RING_ITEMSIZE register takes 15 bits */ >> assert(offset < (1 << 15)); >> >> - si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, >> shader->selector->gs_max_out_vertices); >> + si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, >> sel->gs_max_out_vertices); >> >> si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, >> num_components[0]); >> si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >> >= 1) ? 
num_components[1] : 0); >> si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >> >= 2) ? num_components[2] : 0); >> si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >> >= 3) ? num_components[3] : 0); >> >> si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, >> S_028B90_CNT(MIN2(gs_num_invocations, 127)) | >> S_028B90_ENABLE(gs_num_invocations > 0)); >> >> va = shader->bo->gpu_address; >> si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, >> RADEON_PRIO_SHADER_BINARY); >> - si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); >> - si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40); >> - >> - si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, >> - S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) >> | >> - S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) >> | >> - S_00B228_DX10_CLAMP(1) | >> - S_00B228_FLOAT_MODE(shader->config.float_mode)); >> - si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, >> - S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) | >> - >> S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); >> + >> + if (sscreen->b.chip_class >= GFX9) { >> + unsigned input_prim = >> sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; >> + unsigned es_type = shader->key.part.gs.es->type; >> + unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt; >> + struct gfx9_gs_info gs_info; >> + >> + if (es_type == PIPE_SHADER_VERTEX) >> + es_vgpr_comp_cnt = shader->info.uses_instanceid ? >> 3 : 0; >> + else if (es_type == PIPE_SHADER_TESS_EVAL) >> + es_vgpr_comp_cnt = 3; /* all components are needed >> for TES */ >> + else >> + unreachable("invalid shader selector type"); >> + >> + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored >> and >> + * VGPR[0:4] are always loaded. >> + */ >> + if (sel->info.uses_invocationid) >> + gs_vgpr_comp_cnt = 3; /* VGPR3 contains >> InvocationID. */ >> + else if (sel->info.uses_primid) >> + gs_vgpr_comp_cnt = 2; /* VGPR2 contains >> PrimitiveID. 
*/ >> + else if (input_prim >= PIPE_PRIM_TRIANGLES) >> + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, >> 3 */ >> + else >> + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, >> 1 */ >> + >> + gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info); >> + >> + si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> >> 8); >> + si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, va >> >> 40); >> + >> + si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, >> + S_00B228_VGPRS((shader->config.num_vgprs - >> 1) / 4) | >> + S_00B228_SGPRS((shader->config.num_sgprs - >> 1) / 8) | >> + S_00B228_DX10_CLAMP(1) | >> + >> S_00B228_FLOAT_MODE(shader->config.float_mode) | >> + >> S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt)); >> + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, >> + S_00B22C_USER_SGPR(GFX9_GS_NUM_USER_SGPR) | >> + >> S_00B22C_USER_SGPR_MSB(GFX9_GS_NUM_USER_SGPR >> 5) | >> + S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) >> | >> + S_00B22C_OC_LDS_EN(es_type == >> PIPE_SHADER_TESS_EVAL) | >> + S_00B22C_LDS_SIZE(gs_info.lds_size) | >> + >> S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); >> + >> + si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL, >> + >> S_028A44_ES_VERTS_PER_SUBGRP(gs_info.es_verts_per_subgroup) | >> + >> S_028A44_GS_PRIMS_PER_SUBGRP(gs_info.gs_prims_per_subgroup) | >> + >> S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_info.gs_inst_prims_in_subgroup)); >> + si_pm4_set_reg(pm4, >> R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, >> + >> S_028A94_MAX_PRIMS_PER_SUBGROUP(gs_info.max_prims_per_subgroup)); >> + si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, >> + shader->key.part.gs.es->esgs_itemsize / 4); >> + } else { >> + si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> >> 8); >> + si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> >> 40); >> + >> + si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, >> + S_00B228_VGPRS((shader->config.num_vgprs - >> 1) / 4) | >> + S_00B228_SGPRS((shader->config.num_sgprs - >> 1) / 8) | >> + 
S_00B228_DX10_CLAMP(1) | >> + >> S_00B228_FLOAT_MODE(shader->config.float_mode)); >> + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, >> + S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) | >> + >> S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); >> + } >> } >> >> /** >> * Compute the state for \p shader, which will run as a vertex shader on >> the >> * hardware. >> * >> * If \p gs is non-NULL, it points to the geometry shader for which this >> shader >> * is the copy shader. >> */ >> static void si_shader_vs(struct si_screen *sscreen, struct si_shader >> *shader, >> @@ -961,21 +1124,21 @@ static void si_shader_init_pm4_state(struct >> si_screen *sscreen, >> case PIPE_SHADER_TESS_CTRL: >> si_shader_hs(sscreen, shader); >> break; >> case PIPE_SHADER_TESS_EVAL: >> if (shader->key.as_es) >> si_shader_es(sscreen, shader); >> else >> si_shader_vs(sscreen, shader, NULL); >> break; >> case PIPE_SHADER_GEOMETRY: >> - si_shader_gs(shader); >> + si_shader_gs(sscreen, shader); >> break; >> case PIPE_SHADER_FRAGMENT: >> si_shader_ps(shader); >> break; >> default: >> assert(0); >> } >> } >> >> static unsigned si_get_alpha_test_func(struct si_context *sctx) >> @@ -1100,20 +1263,29 @@ static inline void si_shader_selector_key(struct >> pipe_context *ctx, >> if (sctx->gs_shader.cso) >> key->as_es = 1; >> else { >> si_shader_selector_key_hw_vs(sctx, sel, key); >> >> if (sctx->ps_shader.cso && >> sctx->ps_shader.cso->info.uses_primid) >> key->part.tes.epilog.export_prim_id = 1; >> } >> break; >> case PIPE_SHADER_GEOMETRY: >> + if (sctx->b.chip_class >= GFX9) { >> + if (sctx->tes_shader.cso) { >> + key->part.gs.es = sctx->tes_shader.cso; >> + } else { >> + si_shader_selector_key_vs(sctx, >> sctx->vs_shader.cso, >> + key, >> &key->part.gs.vs_prolog); >> + key->part.gs.es = sctx->vs_shader.cso; >> + } >> + } >> key->part.gs.prolog.tri_strip_adj_fix = >> sctx->gs_tri_strip_adj_fix; >> break; >> case PIPE_SHADER_FRAGMENT: { >> struct si_state_rasterizer *rs = >> 
sctx->queued.named.rasterizer; >> struct si_state_blend *blend = sctx->queued.named.blend; >> >> if >> (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && >> sel->info.colors_written == 0x1) >> key->part.ps.epilog.last_cbuf = >> MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; >> >> @@ -1721,20 +1893,26 @@ static void *si_create_shader_selector(struct >> pipe_context *ctx, >> break; >> case TGSI_SEMANTIC_CLIPVERTEX: /* ignore these */ >> case TGSI_SEMANTIC_EDGEFLAG: >> break; >> default: >> sel->outputs_written2 |= >> 1u << >> si_shader_io_get_unique_index2(name, index); >> } >> } >> sel->esgs_itemsize = util_last_bit64(sel->outputs_written) >> * 16; >> + >> + /* For the ESGS ring in LDS, add 1 dword to reduce LDS >> bank >> + * conflicts, i.e. each vertex will start at a different >> bank. >> + */ >> + if (sctx->b.chip_class >= GFX9) >> + sel->esgs_itemsize += 4; > > > Could this not be achieved by some form of rounding instead? What do you mean? Marek _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev