Rebased ref, commits from common ancestor: commit 313940b03cf7c857143b9e3ec0ab969ce4472c83 Author: Ilia Mirkin <imir...@alum.mit.edu> Date: Tue Jul 28 02:37:51 2015 -0400
nvc0/ir: trim out barrier sync for non-compute shaders It seems like they're never necessary, and actively cause harm. This fixes some of the barrier-related piglits. Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 710f53d..c632e30 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -559,6 +559,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb) } else if (i->isNop()) { bb->remove(i); + } else + if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC && + prog->getType() != Program::TYPE_COMPUTE) { + // It seems like barriers are never required for tessellation since + // the warp size is 32, and there are always at most 32 tcs threads. + bb->remove(i); } else { // TODO: Move this to before register allocation for operations that // need the $c register ! commit ab63610a3603ae1e40a36d238b5938621bb9e8cc Author: Ilia Mirkin <imir...@alum.mit.edu> Date: Tue Jul 28 02:00:20 2015 -0400 nvc0/ir: fix barrier emission immediate arguments require a flag to be set for each one Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index 3ed815b..f607f3b 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -1451,6 +1451,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i) ImmediateValue *imm = i->getSrc(0)->asImm(); assert(imm); code[0] |= imm->reg.data.u32 << 20; + code[1] |= 0x8000; } // thread count @@ -1461,6 +1462,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i) assert(imm); code[0] |= imm->reg.data.u32 << 26; code[1] |= imm->reg.data.u32 >> 6; + code[1] |= 0x4000; } if (i->srcExists(2) && (i->predSrc != 2)) { commit 7850774f2118ae87c7e6a4f6c17751e405edfb34 Author: Eric Anholt <e...@anholt.net> Date: Wed Jul 22 12:14:40 2015 -0700 vc4: Add support for ARB_draw_elements_base_vertex. Gallium exposes it unconditionally, so do our best to support it. It fails on the negative index cases, but those seem unlikely to be used in the wild. diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index fc3c232..1c7f3b1 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -201,7 +201,9 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) struct pipe_vertex_buffer *vb = &vertexbuf->vb[elem->vertex_buffer_index]; struct vc4_resource *rsc = vc4_resource(vb->buffer); - uint32_t offset = vb->buffer_offset + elem->src_offset; + uint32_t offset = (vb->buffer_offset + + elem->src_offset + + vb->stride * info->index_bias); uint32_t vb_size = rsc->bo->size - offset; uint32_t elem_size = util_format_get_blocksize(elem->src_format); commit 98a4b111fbb9e3ae45e907ddd4d2407e5ab669ec Author: Rob Clark <robcl...@freedesktop.org> Date: Sat Jul 25 12:53:23 2015 -0400 freedreno/ir3: add transform-feedback support Signed-off-by: Rob Clark <robcl...@freedesktop.org> diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index b28d315..97e4161 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -227,9 +227,20 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) /* Stream output. */ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + if (is_a3xx(screen) || is_a4xx(screen)) + return PIPE_MAX_SO_BUFFERS; + return 0; case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + if (is_a3xx(screen) || is_a4xx(screen)) + return 1; + return 0; case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + if (is_a3xx(screen) || is_a4xx(screen)) + return 16; /* should only be shader out limit? */ + return 0; case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + if (is_a3xx(screen) || is_a4xx(screen)) + return 16; /* should only be shader out limit? */ return 0; /* Geometry shader output, unsupported. */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index a4b2785..53faf16 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -263,6 +263,7 @@ compile_init(struct ir3_compiler *compiler, * 4 * vec4 - UBO addresses * if (vertex shader) { * 1 * vec4 - driver params (IR3_DP_*) + * 1 * vec4 - stream-out addresses * } * * TODO this could be made more dynamic, to at least skip sections @@ -275,6 +276,8 @@ compile_init(struct ir3_compiler *compiler, if (so->type == SHADER_VERTEX) { /* one (vec4) slot for driver params (see ir3_driver_param): */ so->first_immediate++; + /* one (vec4) slot for stream-output base addresses: */ + so->first_immediate++; } return ctx; @@ -1971,6 +1974,115 @@ emit_cf_list(struct ir3_compile *ctx, struct exec_list *list) } } +/* emit stream-out code. At this point, the current block is the original + * (nir) end block, and nir ensures that all flow control paths terminate + * into the end block. We re-purpose the original end block to generate + * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional + * block holding stream-out write instructions, followed by the new end + * block: + * + * blockOrigEnd { + * p0.x = (vtxcnt < maxvtxcnt) + * // succs: blockStreamOut, blockNewEnd + * } + * blockStreamOut { + * ... stream-out instructions ... + * // succs: blockNewEnd + * } + * blockNewEnd { + * } + */ +static void +emit_stream_out(struct ir3_compile *ctx) +{ + struct ir3_shader_variant *v = ctx->so; + struct ir3 *ir = ctx->ir; + struct pipe_stream_output_info *strmout = + &ctx->so->shader->stream_output; + struct ir3_block *orig_end_block, *stream_out_block, *new_end_block; + struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond; + struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS]; + + /* create vtxcnt input in input block at top of shader, + * so that it is seen as live over the entire duration + * of the shader: + */ + vtxcnt = create_input(ctx->in_block, 0); + add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt); + + maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX); + + /* at this point, we are at the original 'end' block, + * re-purpose this block to stream-out condition, then + * append stream-out block and new-end block + */ + orig_end_block = ctx->block; + + stream_out_block = ir3_block_create(ir); + list_addtail(&stream_out_block->node, &ir->block_list); + + new_end_block = ir3_block_create(ir); + list_addtail(&new_end_block->node, &ir->block_list); + + orig_end_block->successors[0] = stream_out_block; + orig_end_block->successors[1] = new_end_block; + stream_out_block->successors[0] = new_end_block; + + /* setup 'if (vtxcnt < maxvtxcnt)' condition: */ + cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0); + cond->regs[0]->num = regid(REG_P0, 0); + cond->cat2.condition = IR3_COND_LT; + + /* condition goes on previous block to the conditional, + * since it is used to pick which of the two successor + * paths to take: + */ + orig_end_block->condition = cond; + + /* switch to stream_out_block to generate the stream-out + * instructions: + */ + ctx->block = stream_out_block; + + /* Calculate base addresses based on vtxcnt. Instructions + * generated for bases not used in following loop will be + * stripped out in the backend. + */ + for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + unsigned stride = strmout->stride[i]; + struct ir3_instruction *base, *off; + + base = create_uniform(ctx, regid(v->first_driver_param + 5, i)); + + /* 24-bit should be enough: */ + off = ir3_MUL_U(ctx->block, vtxcnt, 0, + create_immed(ctx->block, stride * 4), 0); + + bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0); + } + + /* Generate the per-output store instructions: */ + for (unsigned i = 0; i < strmout->num_outputs; i++) { + for (unsigned j = 0; j < strmout->output[i].num_components; j++) { + unsigned c = j + strmout->output[i].start_component; + struct ir3_instruction *base, *out, *stg; + + base = bases[strmout->output[i].output_buffer]; + out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)]; + + stg = ir3_STG(ctx->block, base, 0, out, 0, + create_immed(ctx->block, 1), 0); + stg->cat6.type = TYPE_U32; + stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4; + + array_insert(ctx->ir->keeps, stg); + } + } + + /* and finally switch to the new_end_block: */ + ctx->block = new_end_block; +} + static void emit_function(struct ir3_compile *ctx, nir_function_impl *impl) { @@ -1981,6 +2093,24 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl) * into which we emit the 'end' instruction. */ compile_assert(ctx, list_empty(&ctx->block->instr_list)); + + /* If stream-out (aka transform-feedback) enabled, emit the + * stream-out instructions, followed by a new empty block (into + * which the 'end' instruction lands). + * + * NOTE: it is done in this order, rather than inserting before + * we emit end_block, because NIR guarantees that all blocks + * flow into end_block, and that end_block has no successors. + * So by re-purposing end_block as the first block of stream- + * out, we guarantee that all exit paths flow into the stream- + * out instructions. + */ + if ((ctx->so->shader->stream_output.num_outputs > 0) && + !ctx->so->key.binning_pass) { + debug_assert(ctx->so->type == SHADER_VERTEX); + emit_stream_out(ctx); + } + ir3_END(ctx->block); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index 166eb00..312174c 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -466,10 +466,10 @@ static void emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) { - if (v->constlen > v->first_driver_param) { + uint32_t offset = v->first_driver_param; /* UBOs after user consts */ + if (v->constlen > offset) { struct fd_context *ctx = fd_context(v->shader->pctx); - uint32_t offset = v->first_driver_param; /* UBOs after user consts */ - uint32_t params = MIN2(4, v->constlen - v->first_driver_param) * 4; + uint32_t params = MIN2(4, v->constlen - offset) * 4; uint32_t offsets[params]; struct fd_bo *bos[params]; @@ -515,6 +515,83 @@ emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) } } +/* emit stream-out buffers: */ +static void +emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +{ + uint32_t offset = v->first_driver_param + 5; /* streamout addresses after driver-params*/ + if (v->constlen > offset) { + struct fd_context *ctx = fd_context(v->shader->pctx); + struct fd_streamout_stateobj *so = &ctx->streamout; + struct pipe_stream_output_info *info = &v->shader->stream_output; + uint32_t params = 4; + uint32_t offsets[params]; + struct fd_bo *bos[params]; + + for (uint32_t i = 0; i < params; i++) { + struct pipe_stream_output_target *target = so->targets[i]; + + if (target) { + offsets[i] = (so->offsets[i] * info->stride[i] * 4) + + target->buffer_offset; + bos[i] = fd_resource(target->buffer)->bo; + } else { + offsets[i] = 0; + bos[i] = NULL; + } + } + + fd_wfi(ctx, ring); + ctx->emit_const_bo(ring, v->type, true, offset * 4, params, bos, offsets); + } +} + +static uint32_t +max_tf_vtx(struct ir3_shader_variant *v) +{ + struct fd_context *ctx = fd_context(v->shader->pctx); + struct fd_streamout_stateobj *so = &ctx->streamout; + struct pipe_stream_output_info *info = &v->shader->stream_output; + uint32_t maxvtxcnt = 0x7fffffff; + + if (v->key.binning_pass) + return 0; + if (v->shader->stream_output.num_outputs == 0) + return 0; + if (so->num_targets == 0) + return 0; + + /* offset to write to is: + * + * total_vtxcnt = vtxcnt + offsets[i] + * offset = total_vtxcnt * stride[i] + * + * offset = vtxcnt * stride[i] ; calculated in shader + * + offsets[i] * stride[i] ; calculated at emit_tfbos() + * + * assuming for each vtx, each target buffer will have data written + * up to 'offset + stride[i]', that leaves maxvtxcnt as: + * + * buffer_size = (maxvtxcnt * stride[i]) + stride[i] + * maxvtxcnt = (buffer_size - stride[i]) / stride[i] + * + * but shader is actually doing a less-than (rather than less-than- + * equal) check, so we can drop the -stride[i]. + * + * TODO is assumption about `offset + stride[i]` legit? + */ + for (unsigned i = 0; i < so->num_targets; i++) { + struct pipe_stream_output_target *target = so->targets[i]; + unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */ + if (target) { + uint32_t max = target->buffer_size / stride; + maxvtxcnt = MIN2(maxvtxcnt, max); + } + } + + return maxvtxcnt; +} + void ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, const struct pipe_draw_info *info, uint32_t dirty) @@ -548,12 +625,19 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, uint32_t offset = v->first_driver_param + 4; /* driver params after UBOs */ if (v->constlen >= offset) { uint32_t vertex_params[4] = { - [IR3_DP_VTXID_BASE] = info->indexed ? info->index_bias : info->start, + [IR3_DP_VTXID_BASE] = info->indexed ? + info->index_bias : info->start, + [IR3_DP_VTXCNT_MAX] = max_tf_vtx(v), }; fd_wfi(ctx, ring); ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0, ARRAY_SIZE(vertex_params), vertex_params, NULL); + + /* if needed, emit stream-out buffer addresses: */ + if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) { + emit_tfbos(v, ring); + } } } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index 4cb2520..c0fd44d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -37,6 +37,7 @@ /* driver param indices: */ enum ir3_driver_param { IR3_DP_VTXID_BASE = 0, + IR3_DP_VTXCNT_MAX = 1, }; /* internal semantic used for passing vtxcnt to vertex shader to commit 96d4db683f90f02e72d34ece544de7eedfa873ee Author: Rob Clark <robcl...@freedesktop.org> Date: Sat Jul 25 13:51:16 2015 -0400 freedreno/ir3: track "keeps" in ir Previously we had a fixed array to track kills, since they don't generate an SSA value, and then cheated by stuffing them in the outputs array before sending things through depth/sched/etc. But store instructions will need similar treatment. So convert this over to a more general array of instructions that must be kept and fix up the places that were previously relying on kills being in the output array. Signed-off-by: Rob Clark <robcl...@freedesktop.org> diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index e68170d..12f2ebe 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -369,6 +369,12 @@ struct ir3 { unsigned predicates_count, predicates_sz; struct ir3_instruction **predicates; + /* Track instructions which do not write a register but other- + * wise must not be discarded (such as kill, stg, etc) + */ + unsigned keeps_count, keeps_sz; + struct ir3_instruction **keeps; + /* List of blocks: */ struct list_head block_list; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index e013abe..a4b2785 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -117,10 +117,6 @@ struct ir3_compile { /* for looking up which system value is which */ unsigned sysval_semantics[8]; - /* list of kill instructions: */ - struct ir3_instruction *kill[16]; - unsigned int kill_count; - /* set if we encounter something we can't handle yet, so we * can bail cleanly and fallback to TGSI compiler f/e */ @@ -1481,7 +1477,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) kill = ir3_KILL(b, cond, 0); array_insert(ctx->ir->predicates, kill); - ctx->kill[ctx->kill_count++] = kill; + array_insert(ctx->ir->keeps, kill); ctx->so->has_kill = true; break; @@ -2165,13 +2161,9 @@ emit_instructions(struct ir3_compile *ctx) ninputs = exec_list_length(&ctx->s->inputs) * 4; noutputs = exec_list_length(&ctx->s->outputs) * 4; - /* we need to allocate big enough outputs array so that - * we can stuff the kill's at the end. Likewise for vtx - * shaders, we need to leave room for sysvals: + /* or vtx shaders, we need to leave room for sysvals: */ - if (ctx->so->type == SHADER_FRAGMENT) { - noutputs += ARRAY_SIZE(ctx->kill); - } else if (ctx->so->type == SHADER_VERTEX) { + if (ctx->so->type == SHADER_VERTEX) { ninputs += 8; } @@ -2182,9 +2174,7 @@ emit_instructions(struct ir3_compile *ctx) ctx->in_block = ctx->block; list_addtail(&ctx->block->node, &ctx->ir->block_list); - if (ctx->so->type == SHADER_FRAGMENT) { - ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill); - } else if (ctx->so->type == SHADER_VERTEX) { + if (ctx->so->type == SHADER_VERTEX) { ctx->ir->ninputs -= 8; } @@ -2380,15 +2370,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, } } - /* at this point, we want the kill's in the outputs array too, - * so that they get scheduled (since they have no dst).. we've - * already ensured that the array is big enough in push_block(): - */ - if (so->type == SHADER_FRAGMENT) { - for (i = 0; i < ctx->kill_count; i++) - ir->outputs[ir->noutputs++] = ctx->kill[i]; - } - if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("BEFORE CP:\n"); ir3_print(ir); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index f4c825b..be4e4e8 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -408,6 +408,10 @@ ir3_cp(struct ir3 *ir) } } + for (unsigned i = 0; i < ir->keeps_count; i++) { + ir->keeps[i] = instr_cp(ir->keeps[i], NULL); + } + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { if (block->condition) block->condition = instr_cp(block->condition, NULL); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index 0f346b2..97df0c2 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -156,6 +156,9 @@ ir3_depth(struct ir3 *ir) if (ir->outputs[i]) ir3_instr_depth(ir->outputs[i]); + for (i = 0; i < ir->keeps_count; i++) + ir3_instr_depth(ir->keeps[i]); + /* We also need to account for if-condition: */ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { if (block->condition) commit 020301baccc77e5753ead1e890c0cf24a9675517 Author: Rob Clark <robcl...@freedesktop.org> Date: Sat Jul 25 13:48:07 2015 -0400 freedreno/ir3: add support for store instructions For store instructions, the "dst" register is a read register, not a written register. (Ie. it is the address to store to.) Lets not confuse register allocation, scheduling, etc, with these details. Instead just leave a dummy instr->regs[0], and take "dst" from instr->regs[1] and srcs following. Signed-off-by: Rob Clark <robcl...@freedesktop.org> diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index 6d19a29..b24825c 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -499,12 +499,28 @@ static int emit_cat5(struct ir3_instruction *instr, void *ptr, static int emit_cat6(struct ir3_instruction *instr, void *ptr, struct ir3_info *info) { - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src1 = instr->regs[1]; - struct ir3_register *src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL; + struct ir3_register *dst, *src1, *src2; instr_cat6_t *cat6 = ptr; - iassert(instr->regs_count >= 2); + /* the "dst" for a store instruction is (from the perspective + * of data flow in the shader, ie. register use/def, etc) in + * fact a register that is read by the instruction, rather + * than written: + */ + if (is_store(instr)) { + iassert(instr->regs_count >= 3); + + dst = instr->regs[1]; + src1 = instr->regs[2]; + src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL; + } else { + iassert(instr->regs_count >= 2); + + dst = instr->regs[0]; + src1 = instr->regs[1]; + src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL; + } + /* TODO we need a more comprehensive list about which instructions * can be encoded which way. Or possibly use IR3_INSTR_0 flag to diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index c3b61a0..e68170d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -554,6 +554,26 @@ is_store(struct ir3_instruction *instr) return false; } +static inline bool is_load(struct ir3_instruction *instr) +{ + if (is_mem(instr)) { + switch (instr->opc) { + case OPC_LDG: + case OPC_LDL: + case OPC_LDP: + case OPC_L2G: + case OPC_LDLW: + case OPC_LDC_4: + case OPC_LDLV: + /* probably some others too.. */ + return true; + default: + break; + } + } + return false; +} + static inline bool is_input(struct ir3_instruction *instr) { /* in some cases, ldlv is used to fetch varying without @@ -1043,6 +1063,7 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, /* cat6 instructions: */ INSTR2(6, LDLV) INSTR2(6, LDG) +INSTR3(6, STG) /* ************************************************************************* */ /* split this out or find some helper to use.. like main/bitset.h.. */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c index f4a4223..e94293f 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c @@ -182,14 +182,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) */ ctx->has_samp = true; regmask_set(&needs_sy, n->regs[0]); - } else if (is_mem(n)) { + } else if (is_load(n)) { regmask_set(&needs_sy, n->regs[0]); } /* both tex/sfu appear to not always immediately consume * their src register(s): */ - if (is_tex(n) || is_sfu(n) || is_mem(n)) { + if (is_tex(n) || is_sfu(n) || is_load(n)) { foreach_src(reg, n) { if (reg_gpr(reg)) regmask_set(&needs_ss_war, reg); commit a240748de52f2e469e91b60d29ae872828a594d7 Author: Rob Clark <robcl...@freedesktop.org> Date: Sat Jul 25 12:48:18 2015 -0400 freedreno/ir3: cleanup driver-param stuff Add 'enum ir3_driver_param' to track driver-param slots, and a create_driver_param() helper to avoid having the knowledge about where driver params are placed in const regs spread throughout the code as we add additional driver-params. Signed-off-by: Rob Clark <robcl...@freedesktop.org> diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index bdbaf89..e013abe 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -261,13 +261,26 @@ compile_init(struct ir3_compiler *compiler, so->first_driver_param = so->first_immediate = ctx->s->num_uniforms; - /* one (vec4) slot for vertex id base: */ - if (so->type == SHADER_VERTEX) - so->first_immediate++; + /* Layout of constant registers: + * + * num_uniform * vec4 - user consts + * 4 * vec4 - UBO addresses + * if (vertex shader) { + * 1 * vec4 - driver params (IR3_DP_*) + * } + * + * TODO this could be made more dynamic, to at least skip sections + * that we don't need.. + */ /* reserve 4 (vec4) slots for ubo base addresses: */ so->first_immediate += 4; + if (so->type == SHADER_VERTEX) { + /* one (vec4) slot for driver params (see ir3_driver_param): */ + so->first_immediate++; + } + return ctx; } @@ -811,6 +824,14 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp) } } +static struct ir3_instruction * +create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp) +{ + /* first four vec4 sysval's reserved for UBOs: */ + unsigned r = regid(ctx->so->first_driver_param + 4, dp); + return create_uniform(ctx, r); +} + /* helper for instructions that produce multiple consecutive scalar * outputs which need to have a split/fanout meta instruction inserted */ @@ -1415,9 +1436,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) break; case nir_intrinsic_load_base_vertex: if (!ctx->basevertex) { - /* first four vec4 sysval's reserved for UBOs: */ - unsigned r = regid(ctx->so->first_driver_param + 4, 0); - ctx->basevertex = create_uniform(ctx, r); + ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE); add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX, ctx->basevertex); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index 75425e9..166eb00 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -548,10 +548,7 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, uint32_t offset = v->first_driver_param + 4; /* driver params after UBOs */ if (v->constlen >= offset) { uint32_t vertex_params[4] = { - info->indexed ? info->index_bias : info->start, - 0, - 0, - 0 + [IR3_DP_VTXID_BASE] = info->indexed ? info->index_bias : info->start, }; fd_wfi(ctx, ring); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index f0af447..4cb2520 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -34,6 +34,11 @@ #include "ir3.h" #include "disasm.h" +/* driver param indices: */ +enum ir3_driver_param { + IR3_DP_VTXID_BASE = 0, +}; + /* internal semantic used for passing vtxcnt to vertex shader to * implement transform feedback: */ commit be8a8ebe578267ab24e343c3c1347936a221468e Author: Rob Clark <robcl...@freedesktop.org> Date: Sat Jul 25 10:56:39 2015 -0400 freedreno: add transform-feedback state Signed-off-by: Rob Clark <robcl...@freedesktop.org> diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index bc5267a..cc585af 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -82,6 +82,20 @@ struct fd_vertex_stateobj { unsigned num_elements; }; +struct fd_streamout_stateobj { + struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; + unsigned num_targets; + /* Track offset from vtxcnt for streamout data. This counter + * is just incremented by # of vertices on each draw until + * reset or new streamout buffer bound. + * + * When we eventually have GS, the CPU won't actually know the + * number of vertices per draw, so I think we'll have to do + * something more clever. + */ + unsigned offsets[PIPE_MAX_SO_BUFFERS]; +}; + /* group together the vertex and vertexbuf state.. for ease of passing * around, and because various internal operations (gmem<->mem, etc) * need their own vertex state: @@ -319,6 +333,7 @@ struct fd_context { FD_DIRTY_VTXBUF = (1 << 15), FD_DIRTY_INDEXBUF = (1 << 16), FD_DIRTY_SCISSOR = (1 << 17), + FD_DIRTY_STREAMOUT = (1 << 18), } dirty; struct pipe_blend_state *blend; @@ -339,6 +354,7 @@ struct fd_context { struct pipe_viewport_state viewport; struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES]; struct pipe_index_buffer indexbuf; + struct fd_streamout_stateobj streamout; /* GMEM/tile handling fxns: */ void (*emit_tile_init)(struct fd_context *ctx); diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index ae75b3e..f886540 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -62,7 +62,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) struct fd_context *ctx = fd_context(pctx); struct pipe_framebuffer_state *pfb = &ctx->framebuffer; struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); - unsigned i, buffers = 0; + unsigned i, prims, buffers = 0; /* if we supported transform feedback, we'd have to disable this: */ if (((scissor->maxx - scissor->minx) * @@ -144,11 +144,17 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (ctx->fragtex.textures[i]) resource_used(ctx, ctx->fragtex.textures[i]->texture, true); + /* Mark streamout buffers as being read.. actually they are written.. */ + for (i = 0; i < ctx->streamout.num_targets; i++) + if (ctx->streamout.targets[i]) + resource_used(ctx, ctx->streamout.targets[i]->buffer, false); + ctx->num_draws++; + prims = u_reduced_prims_for_vertices(info->mode, info->count); + ctx->stats.draw_calls++; - ctx->stats.prims_emitted += - u_reduced_prims_for_vertices(info->mode, info->count); + ctx->stats.prims_emitted += prims; /* any buffers that haven't been cleared yet, we need to restore: */ ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared); @@ -162,6 +168,9 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_DRAW); ctx->draw_vbo(ctx, info); + for (i = 0; i < ctx->streamout.num_targets; i++) + ctx->streamout.offsets[i] += prims; + /* if an app (or, well, piglit test) does many thousands of draws * without flush (or anything which implicitly flushes, like * changing render targets), we can exceed the ringbuffer size. diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index de3cb64..d649925 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -647,6 +647,8 @@ fd_blitter_pipe_begin(struct fd_context *ctx) util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb); util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx); util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vp); + util_blitter_save_so_targets(ctx->blitter, ctx->streamout.num_targets, + ctx->streamout.targets); util_blitter_save_rasterizer(ctx->blitter, ctx->rasterizer); util_blitter_save_viewport(ctx->blitter, &ctx->viewport); util_blitter_save_scissor(ctx->blitter, &ctx->scissor); diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c index 77aa4f2..7bf8bdb 100644 --- a/src/gallium/drivers/freedreno/freedreno_state.c +++ b/src/gallium/drivers/freedreno/freedreno_state.c @@ -300,6 +300,67 @@ fd_vertex_state_bind(struct pipe_context *pctx, void *hwcso) ctx->dirty |= FD_DIRTY_VTXSTATE; } +static struct pipe_stream_output_target * +fd_create_stream_output_target(struct pipe_context *pctx, + struct pipe_resource *prsc, unsigned buffer_offset, + unsigned buffer_size) +{ + struct pipe_stream_output_target *target; + + target = CALLOC_STRUCT(pipe_stream_output_target); + if (!target) + return NULL; + + pipe_reference_init(&target->reference, 1); + pipe_resource_reference(&target->buffer, prsc); + + target->context = pctx; + target->buffer_offset = buffer_offset; + target->buffer_size = buffer_size; + + return target; +} + +static void +fd_stream_output_target_destroy(struct pipe_context *pctx, + struct pipe_stream_output_target *target) +{ + pipe_resource_reference(&target->buffer, NULL); + FREE(target); +} + +static void +fd_set_stream_output_targets(struct pipe_context *pctx, + unsigned num_targets, struct pipe_stream_output_target **targets, + const unsigned *offsets) +{ + struct fd_context *ctx = fd_context(pctx); + struct fd_streamout_stateobj *so = &ctx->streamout; + unsigned i; + + debug_assert(num_targets <= ARRAY_SIZE(so->targets)); + + for (i = 0; i < num_targets; i++) { + boolean changed = targets[i] != so->targets[i]; + boolean append = (offsets[i] == (unsigned)-1); + + if (!changed && append) + continue; + + so->offsets[i] = 0; + + pipe_so_target_reference(&so->targets[i], targets[i]); + } + + for (; i < so->num_targets; i++) { + pipe_so_target_reference(&so->targets[i], NULL); + } + + so->num_targets = num_targets; + + ctx->dirty |= FD_DIRTY_STREAMOUT; +} + void fd_state_init(struct pipe_context *pctx) { @@ -328,4 +389,8 @@ fd_state_init(struct pipe_context *pctx) pctx->create_vertex_elements_state = fd_vertex_state_create; pctx->delete_vertex_elements_state = fd_vertex_state_delete; pctx->bind_vertex_elements_state = fd_vertex_state_bind; + + pctx->create_stream_output_target = fd_create_stream_output_target; + pctx->stream_output_target_destroy = fd_stream_output_target_destroy; + pctx->set_stream_output_targets = fd_set_stream_output_targets; } commit bda1354aac9d32e236048af4d353d5530f644c34 Author: Rob Clark <robcl...@freedesktop.org> Date: Sun Jul 26 13:30:26 2015 -0400 freedreno: add resource tracking support for written buffers With stream-out (transform-feedback) we have the case where resources are *written* by the gpu, which needs basically the same tracking to figure out when rendering must be flushed. Signed-off-by: Rob Clark <robcl...@freedesktop.org> diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 127fb5f..02613dc 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -130,8 +130,9 @@ fd_context_render(struct pipe_context *pctx) /* go through all the used resources and clear their reading flag */ LIST_FOR_EACH_ENTRY_SAFE(rsc, rsc_tmp, &ctx->used_resources, list) { - assert(rsc->reading); + assert(rsc->reading || rsc->writing); rsc->reading = false; + rsc->writing = false; list_delinit(&rsc->list); } diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index c9e317c..ae75b3e 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -40,7 +40,7 @@ #include "freedreno_util.h" static void -resource_reading(struct fd_context *ctx, struct pipe_resource *prsc) +resource_used(struct fd_context *ctx, struct pipe_resource *prsc, boolean reading) { struct fd_resource *rsc; @@ -48,7 +48,10 @@ resource_reading(struct fd_context *ctx, struct pipe_resource *prsc) return; rsc = fd_resource(prsc); - rsc->reading = true; + if (reading) + rsc->reading = true; -- To UNSUBSCRIBE, email to debian-x-requ...@lists.debian.org with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org Archive: https://lists.debian.org/e1zk3lq-0000br...@moszumanska.debian.org