From: Marek Olšák <marek.ol...@amd.com> TODO: ADD_TID doesn't work. Needs more investigation. --- src/amd/common/ac_llvm_build.c | 58 ++++++++++++++++++++++++++- src/amd/common/ac_llvm_build.h | 4 +- src/amd/common/ac_nir_to_llvm.c | 4 +- src/gallium/drivers/radeonsi/si_descriptors.c | 12 +++++- src/gallium/drivers/radeonsi/si_shader.c | 35 ++++++++++------ 5 files changed, 95 insertions(+), 18 deletions(-)
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 08fedc7..9435b18 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -544,22 +544,78 @@ ac_build_indexed_load_const(struct ac_llvm_context *ctx, */ void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset, unsigned inst_offset, bool glc, - bool slc) + bool slc, + bool writeonly_memory, + bool has_add_tid) { + /* TODO: Fix stores with ADD_TID and remove the "has_add_tid" flag. */ + if (HAVE_LLVM >= 0x0309 && !has_add_tid) { + /* Split 3-channel stores, because LLVM doesn't support 3-channel + * intrinsics. */ + if (num_channels == 3) { + LLVMValueRef v[3], v01; + + for (int i = 0; i < 3; i++) { + v[i] = LLVMBuildExtractElement(ctx->builder, vdata, + LLVMConstInt(ctx->i32, i, 0), ""); + } + v01 = ac_build_gather_values(ctx, v, 2); + + ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, + soffset, inst_offset, glc, slc, + writeonly_memory, has_add_tid); + ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, + soffset, inst_offset + 8, + glc, slc, + writeonly_memory, has_add_tid); + return; + } + + unsigned func = CLAMP(num_channels, 1, 3) - 1; + static const char *types[] = {"f32", "v2f32", "v4f32"}; + char name[256]; + LLVMValueRef offset = soffset; + + if (inst_offset) + offset = LLVMBuildAdd(ctx->builder, offset, + LLVMConstInt(ctx->i32, inst_offset, 0), ""); + if (voffset) + offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); + + LLVMValueRef args[] = { + bitcast_to_float(ctx, vdata), + LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""), + LLVMConstInt(ctx->i32, 0, 0), + offset, + LLVMConstInt(ctx->i1, glc, 0), + LLVMConstInt(ctx->i1, slc, 0), + }; + + snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s", + types[func]); + + ac_emit_llvm_intrinsic(ctx, name, ctx->voidt, + args, ARRAY_SIZE(args), + writeonly_memory ? 
+ AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY : + AC_FUNC_ATTR_WRITEONLY); + return; + } + static unsigned dfmt[] = { V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32 }; assert(num_channels >= 1 && num_channels <= 4); LLVMValueRef args[] = { rsrc, diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index 78df441..aa99e92 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -123,21 +123,23 @@ ac_build_indexed_load_const(struct ac_llvm_context *ctx, void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset, unsigned inst_offset, bool glc, - bool slc); + bool slc, + bool writeonly_memory, + bool has_add_tid); LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels, LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, unsigned inst_offset, unsigned glc, unsigned slc, diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 9a91e1a..bf330c2 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3134,21 +3134,21 @@ visit_emit_vertex(struct nir_to_llvm_context *ctx, out_ptr[j], ""); LLVMValueRef voffset = LLVMConstInt(ctx->i32, (slot * 4 + j + start) * ctx->gs_max_out_vertices, false); voffset = LLVMBuildAdd(ctx->builder, voffset, gs_next_vertex, ""); voffset = LLVMBuildMul(ctx->builder, voffset, LLVMConstInt(ctx->i32, 4, false), ""); out_val = LLVMBuildBitCast(ctx->builder, out_val, ctx->i32, ""); ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring, out_val, 1, voffset, ctx->gs2vs_offset, 0, - 1, 1); + 1, 1, true, true); } idx += slot_inc; } gs_next_vertex = LLVMBuildAdd(ctx->builder, gs_next_vertex, ctx->i32one, ""); LLVMBuildStore(ctx->builder, gs_next_vertex, ctx->gs_next_vertex); ac_emit_sendmsg(&ctx->ac, 
AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (0 << 8), ctx->gs_wave_id); } @@ -4634,21 +4634,21 @@ handle_es_outputs_post(struct nir_to_llvm_context *ctx) for (j = 0; j < length; j++) { LLVMValueRef out_val = LLVMBuildLoad(ctx->builder, out_ptr[j], ""); out_val = LLVMBuildBitCast(ctx->builder, out_val, ctx->i32, ""); ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL, ctx->es2gs_offset, (4 * param_index + j + start) * 4, - 1, 1); + 1, 1, true, true); } } ctx->shader_info->vs.esgs_itemsize = (max_output_written + 1) * 16; } static void si_export_mrt_color(struct nir_to_llvm_context *ctx, LLVMValueRef *color, unsigned param, bool is_last) { LLVMValueRef args[9]; diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index a41b243..cb2c14b 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1258,20 +1258,21 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint slot, struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; /* The stride field in the resource descriptor has 14 bits */ assert(stride < (1 << 14)); assert(slot < descs->num_elements); pipe_resource_reference(&buffers->buffers[slot], NULL); if (buffer) { uint64_t va; + unsigned data_format; va = r600_resource(buffer)->gpu_address + offset; switch (element_size) { default: assert(!"Unsupported ring buffer element size"); case 0: case 2: element_size = 0; break; @@ -1300,33 +1301,42 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint slot, index_stride = 2; break; case 64: index_stride = 3; break; } if (sctx->b.chip_class >= VI && stride) num_records *= stride; + /* When ADD_TID is set, DATA_FORMAT extends STRIDE on VI. + * It applies to MUBUF, but not MTBUF. We use MUBUF stores + * on LLVM >= 3.9, and MTBUF stores on older LLVM. 
+ */ + if (0&&sctx->b.chip_class >= VI && add_tid && HAVE_LLVM >= 0x0309) + data_format = 0; /* stride[14:17] */ + else + data_format = V_008F0C_BUF_DATA_FORMAT_32; + /* Set the descriptor. */ uint32_t *desc = descs->list + slot*4; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(swizzle); desc[2] = num_records; desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_DATA_FORMAT(data_format) | S_008F0C_ELEMENT_SIZE(element_size) | S_008F0C_INDEX_STRIDE(index_stride) | S_008F0C_ADD_TID_ENABLE(add_tid); pipe_resource_reference(&buffers->buffers[slot], buffer); radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)buffer, buffers->shader_usage, buffers->priority); buffers->enabled_mask |= 1u << slot; } else { diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 16579af..6ccb407 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1042,29 +1042,29 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, /* Skip LDS stores if there is no LDS read of this output. 
*/ if (!skip_lds_store) lds_store(bld_base, chan_index, dw_addr, value); value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, ""); values[chan_index] = value; if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) { ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, buf_addr, base, - 4 * chan_index, 1, 0); + 4 * chan_index, 1, 0, true, false); } } if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) { LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm, values, 4); ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr, - base, 0, 1, 0); + base, 0, 1, 0, true, false); } } static LLVMValueRef fetch_input_gs( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *reg, enum tgsi_opcode_type type, unsigned swizzle) { struct lp_build_context *base = &bld_base->base; @@ -2080,21 +2080,21 @@ static void emit_streamout_output(struct si_shader_context *ctx, vdata = LLVMBuildInsertElement(builder, vdata, out[j], LLVMConstInt(ctx->i32, j, 0), ""); } break; } ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps, so_write_offsets[buf_idx], LLVMConstInt(ctx->i32, 0, 0), - stream_out->dst_offset * 4, 1, 1); + stream_out->dst_offset * 4, 1, 1, true, false); } /** * Write streamout data to buffers for vertex stream @p stream (different * vertex streams can occur for GS copy shaders). 
*/ static void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs, unsigned noutput, unsigned stream) { @@ -2405,21 +2405,21 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), invocation_id, lp_build_const_int32(gallivm, i)); LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0, lds_ptr); ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, - buffer_offset, 0, 1, 0); + buffer_offset, 0, 1, 0, true, false); } } static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, LLVMValueRef rel_patch_id, LLVMValueRef invocation_id, LLVMValueRef tcs_out_current_patch_data_offset) { struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; @@ -2520,32 +2520,32 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, lp_build_const_int32(gallivm, 4 * stride), ""); lp_build_if(&inner_if_ctx, gallivm, LLVMBuildICmp(gallivm->builder, LLVMIntEQ, rel_patch_id, bld_base->uint_bld.zero, "")); /* Store the dynamic HS control word. */ ac_build_buffer_store_dword(&ctx->ac, buffer, lp_build_const_int32(gallivm, 0x80000000), 1, lp_build_const_int32(gallivm, 0), tf_base, - 0, 1, 0); + 0, 1, 0, true, false); lp_build_endif(&inner_if_ctx); /* Store the tessellation factors. */ ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, - 4, 1, 0); + 4, 1, 0, true, false); if (vec1) ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, - 20, 1, 0); + 20, 1, 0, true, false); /* Store the tess factors into the offchip buffer if TES reads them. 
*/ if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; LLVMValueRef tf_inner_offset; unsigned param_outer, param_inner; buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers, LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0)); base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); @@ -2553,32 +2553,32 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, param_outer = si_shader_io_get_unique_index( TGSI_SEMANTIC_TESSOUTER, 0); tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, LLVMConstInt(ctx->i32, param_outer, 0)); outer_vec = lp_build_gather_values(gallivm, outer, util_next_power_of_two(outer_comps)); ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, outer_comps, tf_outer_offset, - base, 0, 1, 0); + base, 0, 1, 0, true, false); if (inner_comps) { param_inner = si_shader_io_get_unique_index( TGSI_SEMANTIC_TESSINNER, 0); tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, LLVMConstInt(ctx->i32, param_inner, 0)); inner_vec = inner_comps == 1 ? inner[0] : lp_build_gather_values(gallivm, inner, inner_comps); ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, inner_comps, tf_inner_offset, - base, 0, 1, 0); + base, 0, 1, 0, true, false); } } lp_build_endif(&if_ctx); } /* This only writes the tessellation factor levels. 
*/ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); @@ -2688,21 +2688,21 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base) info->output_semantic_index[i]); for (chan = 0; chan < 4; chan++) { LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""); out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, ""); ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL, soffset, (4 * param_index + chan) * 4, - 1, 1); + 1, 1, true, true); } } } static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); ac_emit_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID)); @@ -5056,21 +5056,21 @@ static void si_llvm_emit_vertex( voffset = lp_build_add(uint, voffset, gs_next_vertex); voffset = lp_build_mul_imm(uint, voffset, 4); out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, ""); ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset, 0, - 1, 1); + 1, 1, true, true); } } gs_next_vertex = lp_build_add(uint, gs_next_vertex, lp_build_const_int32(gallivm, 1)); LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]); /* Signal vertex emission */ ac_emit_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), @@ -5568,34 +5568,43 @@ static void preload_ring_buffers(struct si_shader_context *ctx) * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL * t16v0c0 .. * Override the buffer descriptor accordingly. 
*/ LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2); uint64_t stream_offset = 0; for (unsigned stream = 0; stream < 4; ++stream) { unsigned num_components; unsigned stride; - unsigned num_records; + unsigned num_records, data_format; LLVMValueRef ring, tmp; num_components = sel->info.num_stream_output_components[stream]; if (!num_components) continue; stride = 4 * num_components * sel->gs_max_out_vertices; /* Limit on the stride field for <= CIK. */ assert(stride < (1 << 14)); num_records = 64; + /* When ADD_TID is set, DATA_FORMAT extends STRIDE on VI. + * It applies to MUBUF, but not MTBUF. We use MUBUF stores + * on LLVM >= 3.9, and MTBUF stores on older LLVM. + */ + if (0&&ctx->screen->b.chip_class >= VI && HAVE_LLVM >= 0x0309) + data_format = 0; /* stride[14:17] */ + else + data_format = V_008F0C_BUF_DATA_FORMAT_32; + ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); tmp = LLVMBuildExtractElement(builder, ring, uint->zero, ""); tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->i64, stream_offset, 0), ""); stream_offset += stride * 64; ring = LLVMBuildInsertElement(builder, ring, tmp, uint->zero, ""); ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); tmp = LLVMBuildExtractElement(builder, ring, uint->one, ""); @@ -5607,21 +5616,21 @@ static void preload_ring_buffers(struct si_shader_context *ctx) ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->i32, num_records, 0), LLVMConstInt(ctx->i32, 2, 0), ""); ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->i32, S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_DATA_FORMAT(data_format) | S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */ S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ S_008F0C_ADD_TID_ENABLE(1), 0), 
LLVMConstInt(ctx->i32, 3, 0), ""); ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, ""); ctx->gsvs_ring[stream] = ring; } } -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev