Reviewed-by: Marek Olšák <marek.ol...@amd.com>

Marek
On Thu, Jul 30, 2015 at 2:06 AM, Dave Airlie <airl...@gmail.com> wrote: > From: Dave Airlie <airl...@redhat.com> > > This is the final piece for ARB_gpu_shader5, > > The code is based on the r600 code from Glenn Kennard, > and myself. > > While developing this, I'm not 100% sure of all the calculations > made in the GS registers, this is why the max_stream is worked > out there and used to limit the changes in registers. Otherwise > my initial attempts either regressed GS texelFetch tests > or primitive-id-restart. The current code has no regressions > in piglit. > > This commit doesn't enable ARB_gpu_shader5, since that just > bumps the glsl level to 4.00, so I'll just do a separate patch > for 4.10. > > v1.1: fix bug introduced in rebase. > v2: Address Marek's review comments, > remove my llvm stream code for simpler C, > move gsvs_ring and gs_next_vertex to arrays. > > Signed-off-by: Dave Airlie <airl...@redhat.com> > --- > src/gallium/drivers/radeonsi/si_descriptors.c | 4 +- > src/gallium/drivers/radeonsi/si_pipe.c | 2 +- > src/gallium/drivers/radeonsi/si_shader.c | 74 +++++++++++++++++++----- > src/gallium/drivers/radeonsi/si_state.c | 4 -- > src/gallium/drivers/radeonsi/si_state.h | 7 ++- > src/gallium/drivers/radeonsi/si_state_shaders.c | 75 > +++++++++++++++++++------ > 6 files changed, 127 insertions(+), 39 deletions(-) > > diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c > b/src/gallium/drivers/radeonsi/si_descriptors.c > index 2e2a35b..14bb6e1 100644 > --- a/src/gallium/drivers/radeonsi/si_descriptors.c > +++ b/src/gallium/drivers/radeonsi/si_descriptors.c > @@ -724,7 +724,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint > shader, uint slot, > struct pipe_resource *buffer, > unsigned stride, unsigned num_records, > bool add_tid, bool swizzle, > - unsigned element_size, unsigned index_stride) > + unsigned element_size, unsigned index_stride, > uint64_t offset) > { > struct si_context *sctx = (struct si_context *)ctx; > struct 
si_buffer_resources *buffers = &sctx->rw_buffers[shader]; > @@ -741,7 +741,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint > shader, uint slot, > if (buffer) { > uint64_t va; > > - va = r600_resource(buffer)->gpu_address; > + va = r600_resource(buffer)->gpu_address + offset; > > switch (element_size) { > default: > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c > b/src/gallium/drivers/radeonsi/si_pipe.c > index 808b9bc..a120282 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.c > +++ b/src/gallium/drivers/radeonsi/si_pipe.c > @@ -316,7 +316,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum > pipe_cap param) > case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: > return 4095; > case PIPE_CAP_MAX_VERTEX_STREAMS: > - return 1; > + return 4; > > case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: > return 2048; > diff --git a/src/gallium/drivers/radeonsi/si_shader.c > b/src/gallium/drivers/radeonsi/si_shader.c > index fa31f73..d8bab87 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.c > +++ b/src/gallium/drivers/radeonsi/si_shader.c > @@ -31,6 +31,7 @@ > #include "gallivm/lp_bld_intr.h" > #include "gallivm/lp_bld_logic.h" > #include "gallivm/lp_bld_arit.h" > +#include "gallivm/lp_bld_bitarit.h" > #include "gallivm/lp_bld_flow.h" > #include "radeon/r600_cs.h" > #include "radeon/radeon_llvm.h" > @@ -87,8 +88,8 @@ struct si_shader_context > LLVMValueRef samplers[SI_NUM_SAMPLER_STATES]; > LLVMValueRef so_buffers[4]; > LLVMValueRef esgs_ring; > - LLVMValueRef gsvs_ring; > - LLVMValueRef gs_next_vertex; > + LLVMValueRef gsvs_ring[4]; > + LLVMValueRef gs_next_vertex[4]; > }; > > static struct si_shader_context * si_shader_context( > @@ -1576,6 +1577,9 @@ static void si_llvm_emit_streamout(struct > si_shader_context *shader, > LLVMValueRef can_emit = > LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); > > + LLVMValueRef stream_id = > + unpack_param(shader, shader->param_streamout_config, 24, 2); > + > /* Emit the streamout code conditionally. 
This actually avoids > * out-of-bounds buffer access. The hw tells us via the SGPR > * (so_vtx_count) which threads are allowed to emit streamout data. */ > @@ -1615,7 +1619,9 @@ static void si_llvm_emit_streamout(struct > si_shader_context *shader, > unsigned reg = so->output[i].register_index; > unsigned start = so->output[i].start_component; > unsigned num_comps = so->output[i].num_components; > + unsigned stream = so->output[i].stream; > LLVMValueRef out[4]; > + struct lp_build_if_state if_ctx_stream; > > assert(num_comps && num_comps <= 4); > if (!num_comps || num_comps > 4) > @@ -1649,11 +1655,18 @@ static void si_llvm_emit_streamout(struct > si_shader_context *shader, > break; > } > > + LLVMValueRef can_emit_stream = > + LLVMBuildICmp(builder, LLVMIntEQ, > + stream_id, > + lp_build_const_int32(gallivm, > stream), ""); > + > + lp_build_if(&if_ctx_stream, gallivm, can_emit_stream); > build_tbuffer_store_dwords(shader, > shader->so_buffers[buf_idx], > vdata, num_comps, > so_write_offset[buf_idx], > LLVMConstInt(i32, 0, 0), > > so->output[i].dst_offset*4); > + lp_build_endif(&if_ctx_stream); > } > } > lp_build_endif(&if_ctx); > @@ -3188,6 +3201,19 @@ static void build_interp_intrinsic(const struct > lp_build_tgsi_action *action, > } > } > > +static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, > + struct lp_build_emit_data *emit_data) > +{ > + LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates; > + struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; > + unsigned stream; > + > + assert(src0.File == TGSI_FILE_IMMEDIATE); > + > + stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & > 0x3; > + return stream; > +} > + > /* Emit one vertex from the geometry shader */ > static void si_llvm_emit_vertex( > const struct lp_build_tgsi_action *action, > @@ -3207,9 +3233,14 @@ static void si_llvm_emit_vertex( > LLVMValueRef args[2]; > unsigned chan; > int i; > + unsigned stream; > + > + stream = 
si_llvm_get_stream(bld_base, emit_data); > > /* Write vertex attribute values to GSVS ring */ > - gs_next_vertex = LLVMBuildLoad(gallivm->builder, > si_shader_ctx->gs_next_vertex, ""); > + gs_next_vertex = LLVMBuildLoad(gallivm->builder, > + si_shader_ctx->gs_next_vertex[stream], > + ""); > > /* If this thread has already emitted the declared maximum number of > * vertices, kill it: excessive vertex emissions are not supposed to > @@ -3222,6 +3253,7 @@ static void si_llvm_emit_vertex( > kill = lp_build_select(&bld_base->base, can_emit, > lp_build_const_float(gallivm, 1.0f), > lp_build_const_float(gallivm, -1.0f)); > + > build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", > LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0); > > @@ -3241,7 +3273,7 @@ static void si_llvm_emit_vertex( > out_val = LLVMBuildBitCast(gallivm->builder, out_val, > i32, ""); > > build_tbuffer_store(si_shader_ctx, > - si_shader_ctx->gsvs_ring, > + si_shader_ctx->gsvs_ring[stream], > out_val, 1, > voffset, soffset, 0, > V_008F0C_BUF_DATA_FORMAT_32, > @@ -3251,10 +3283,11 @@ static void si_llvm_emit_vertex( > } > gs_next_vertex = lp_build_add(uint, gs_next_vertex, > lp_build_const_int32(gallivm, 1)); > - LLVMBuildStore(gallivm->builder, gs_next_vertex, > si_shader_ctx->gs_next_vertex); > + > + LLVMBuildStore(gallivm->builder, gs_next_vertex, > si_shader_ctx->gs_next_vertex[stream]); > > /* Signal vertex emission */ > - args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | > SENDMSG_GS); > + args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | > SENDMSG_GS | (stream << 8)); > args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, > SI_PARAM_GS_WAVE_ID); > build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", > LLVMVoidTypeInContext(gallivm->context), args, 2, > @@ -3270,9 +3303,11 @@ static void si_llvm_emit_primitive( > struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); > struct gallivm_state *gallivm = bld_base->base.gallivm; > LLVMValueRef args[2]; > 
+ unsigned stream; > > /* Signal primitive cut */ > - args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | > SENDMSG_GS); > + stream = si_llvm_get_stream(bld_base, emit_data); > + args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | > SENDMSG_GS | (stream << 8)); > args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, > SI_PARAM_GS_WAVE_ID); > build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", > LLVMVoidTypeInContext(gallivm->context), args, 2, > @@ -3651,13 +3686,21 @@ static void preload_ring_buffers(struct > si_shader_context *si_shader_ctx) > build_indexed_load_const(si_shader_ctx, buf_ptr, > offset); > } > > - if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY || > - si_shader_ctx->shader->is_gs_copy_shader) { > + if (si_shader_ctx->shader->is_gs_copy_shader) { > LLVMValueRef offset = lp_build_const_int32(gallivm, > SI_RING_GSVS); > > - si_shader_ctx->gsvs_ring = > + si_shader_ctx->gsvs_ring[0] = > build_indexed_load_const(si_shader_ctx, buf_ptr, > offset); > } > + if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) { > + int i; > + for (i = 0; i < 4; i++) { > + LLVMValueRef offset = lp_build_const_int32(gallivm, > SI_RING_GSVS + i); > + > + si_shader_ctx->gsvs_ring[i] = > + build_indexed_load_const(si_shader_ctx, > buf_ptr, offset); > + } > + } > } > > void si_shader_binary_read_config(const struct si_screen *sscreen, > @@ -3838,7 +3881,7 @@ static int si_generate_gs_copy_shader(struct si_screen > *sscreen, > preload_streamout_buffers(si_shader_ctx); > preload_ring_buffers(si_shader_ctx); > > - args[0] = si_shader_ctx->gsvs_ring; > + args[0] = si_shader_ctx->gsvs_ring[0]; > args[1] = lp_build_mul_imm(uint, > > LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, > > si_shader_ctx->param_vertex_id), > @@ -4076,9 +4119,12 @@ int si_shader_create(struct si_screen *sscreen, > LLVMTargetMachineRef tm, > preload_ring_buffers(&si_shader_ctx); > > if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) { > - si_shader_ctx.gs_next_vertex = > - 
lp_build_alloca(bld_base->base.gallivm, > - bld_base->uint_bld.elem_type, ""); > + int i; > + for (i = 0; i < 4; i++) { > + si_shader_ctx.gs_next_vertex[i] = > + lp_build_alloca(bld_base->base.gallivm, > + bld_base->uint_bld.elem_type, > ""); > + } > } > > if (!lp_build_tgsi_llvm(bld_base, tokens)) { > diff --git a/src/gallium/drivers/radeonsi/si_state.c > b/src/gallium/drivers/radeonsi/si_state.c > index ab5c3ca..86e1624 100644 > --- a/src/gallium/drivers/radeonsi/si_state.c > +++ b/src/gallium/drivers/radeonsi/si_state.c > @@ -3138,10 +3138,6 @@ static void si_init_config(struct si_context *sctx) > si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0); > si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); > > - si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, 0); > - si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, 0); > - si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, 0); > - > si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); > si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0); > si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); > diff --git a/src/gallium/drivers/radeonsi/si_state.h > b/src/gallium/drivers/radeonsi/si_state.h > index 2522053..e4d859a 100644 > --- a/src/gallium/drivers/radeonsi/si_state.h > +++ b/src/gallium/drivers/radeonsi/si_state.h > @@ -148,7 +148,10 @@ struct si_shader_data { > #define SI_RING_TESS_FACTOR 0 /* for HS (TCS) */ > #define SI_RING_ESGS 0 /* for ES, GS */ > #define SI_RING_GSVS 1 /* for GS, VS */ > -#define SI_NUM_RING_BUFFERS 2 > +#define SI_RING_GSVS_1 2 /* 1, 2, 3 for GS */ > +#define SI_RING_GSVS_2 3 > +#define SI_RING_GSVS_3 4 > +#define SI_NUM_RING_BUFFERS 5 > #define SI_SO_BUF_OFFSET SI_NUM_RING_BUFFERS > #define SI_NUM_RW_BUFFERS (SI_SO_BUF_OFFSET + 4) > > @@ -249,7 +252,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint > shader, uint slot, > struct pipe_resource *buffer, > unsigned stride, unsigned num_records, > bool add_tid, bool swizzle, > - unsigned element_size, unsigned 
index_stride); > + unsigned element_size, unsigned index_stride, > uint64_t offset); > void si_init_all_descriptors(struct si_context *sctx); > void si_release_all_descriptors(struct si_context *sctx); > void si_all_descriptors_begin_new_cs(struct si_context *sctx); > diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c > b/src/gallium/drivers/radeonsi/si_state_shaders.c > index 18bddfd..1a6854e 100644 > --- a/src/gallium/drivers/radeonsi/si_state_shaders.c > +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c > @@ -206,16 +206,32 @@ static void si_shader_es(struct si_shader *shader) > si_set_tesseval_regs(shader, pm4); > } > > +static unsigned si_gs_get_max_stream(struct si_shader *shader) > +{ > + struct pipe_stream_output_info *so = &shader->selector->so; > + unsigned max_stream = 0, i; > + > + if (so->num_outputs == 0) > + return 0; > + > + for (i = 0; i < so->num_outputs; i++) { > + if (so->output[i].stream > max_stream) > + max_stream = so->output[i].stream; > + } > + return max_stream; > +} > + > static void si_shader_gs(struct si_shader *shader) > { > - unsigned gs_vert_itemsize = shader->selector->info.num_outputs * (16 > >> 2); > + unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16; > unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices; > - unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out; > + unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2; > unsigned gs_num_invocations = shader->selector->gs_num_invocations; > unsigned cut_mode; > struct si_pm4_state *pm4; > unsigned num_sgprs, num_user_sgprs; > uint64_t va; > + unsigned max_stream = si_gs_get_max_stream(shader); > > /* The GSVS_RING_ITEMSIZE register takes 15 bits */ > assert(gsvs_itemsize < (1 << 15)); > @@ -243,16 +259,19 @@ static void si_shader_gs(struct si_shader *shader) > S_028A40_GS_WRITE_OPTIMIZE(1)); > > si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize); > - si_pm4_set_reg(pm4, 
R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize); > - si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize); > + si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * > ((max_stream >= 2) ? 2 : 1)); > + si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * > ((max_stream >= 3) ? 3 : 1)); > > si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, > util_bitcount64(shader->selector->inputs_read) * (16 > >> 2)); > - si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize); > + si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * > (max_stream + 1)); > > si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out); > > - si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize); > + si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize > >> 2); > + si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= > 1) ? gs_vert_itemsize >> 2 : 0); > + si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= > 2) ? gs_vert_itemsize >> 2 : 0); > + si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= > 3) ? 
gs_vert_itemsize >> 2 : 0); > > si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, > S_028B90_CNT(MIN2(gs_num_invocations, 127)) | > @@ -1001,15 +1020,42 @@ static void si_init_gs_rings(struct si_context *sctx) > > si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS, > sctx->esgs_ring, 0, esgs_ring_size, > - true, true, 4, 64); > + true, true, 4, 64, 0); > si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS, > sctx->esgs_ring, 0, esgs_ring_size, > - false, false, 0, 0); > + false, false, 0, 0, 0); > si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS, > sctx->gsvs_ring, 0, gsvs_ring_size, > - false, false, 0, 0); > + false, false, 0, 0, 0); > } > > +static void si_update_gs_rings(struct si_context *sctx) > +{ > + unsigned gs_vert_itemsize = sctx->gs_shader->info.num_outputs * 16; > + unsigned gs_max_vert_out = sctx->gs_shader->gs_max_out_vertices; > + unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out; > + uint64_t offset; > + > + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS, > + sctx->gsvs_ring, gsvs_itemsize, > + 64, true, true, 4, 16, 0); > + > + offset = gsvs_itemsize * 64; > + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_1, > + sctx->gsvs_ring, gsvs_itemsize, > + 64, true, true, 4, 16, offset); > + > + offset = (gsvs_itemsize * 2) * 64; > + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_2, > + sctx->gsvs_ring, gsvs_itemsize, > + 64, true, true, 4, 16, offset); > + > + offset = (gsvs_itemsize * 3) * 64; > + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_3, > + sctx->gsvs_ring, gsvs_itemsize, > + 64, true, true, 4, 16, offset); > + > +} > /** > * @returns 1 if \p sel has been updated to use a new scratch buffer and 0 > * otherwise. 
> @@ -1171,7 +1217,7 @@ static void si_init_tess_factor_ring(struct si_context > *sctx) > > si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL, > SI_RING_TESS_FACTOR, sctx->tf_ring, 0, > - sctx->tf_ring->width0, false, false, 0, 0); > + sctx->tf_ring->width0, false, false, 0, 0, 0); > > sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; > } > @@ -1252,7 +1298,7 @@ static void si_update_so(struct si_context *sctx, > struct si_shader_selector *sha > int i; > > for (i = 0; i < so->num_outputs; i++) > - enabled_stream_buffers_mask |= (1 << > so->output[i].output_buffer); > + enabled_stream_buffers_mask |= (1 << > so->output[i].output_buffer) << (so->output[i].stream * 4); > sctx->b.streamout.enabled_stream_buffers_mask = > enabled_stream_buffers_mask; > sctx->b.streamout.stride_in_dw = shader->so.stride; > } > @@ -1311,15 +1357,12 @@ void si_update_shaders(struct si_context *sctx) > > if (!sctx->gs_rings) > si_init_gs_rings(sctx); > + > if (sctx->emitted.named.gs_rings != sctx->gs_rings) > sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; > si_pm4_bind_state(sctx, gs_rings, sctx->gs_rings); > > - si_set_ring_buffer(ctx, PIPE_SHADER_GEOMETRY, SI_RING_GSVS, > - sctx->gsvs_ring, > - sctx->gs_shader->gs_max_out_vertices * > - sctx->gs_shader->info.num_outputs * 16, > - 64, true, true, 4, 16); > + si_update_gs_rings(sctx); > } else { > si_pm4_bind_state(sctx, gs_rings, NULL); > si_pm4_bind_state(sctx, gs, NULL); > -- > 2.4.3 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev