Rebased ref, commits from common ancestor:
commit 313940b03cf7c857143b9e3ec0ab969ce4472c83
Author: Ilia Mirkin <imir...@alum.mit.edu>
Date:   Tue Jul 28 02:37:51 2015 -0400

    nvc0/ir: trim out barrier sync for non-compute shaders
    
    It seems like they're never necessary, and actively cause harm. This
    fixes some of the barrier-related piglits.
    
    Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu>

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 710f53d..c632e30 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -559,6 +559,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
       } else
       if (i->isNop()) {
          bb->remove(i);
+      } else
+      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
+          prog->getType() != Program::TYPE_COMPUTE) {
+         // It seems like barriers are never required for tessellation since
+         // the warp size is 32, and there are always at most 32 tcs threads.
+         bb->remove(i);
       } else {
          // TODO: Move this to before register allocation for operations that
          // need the $c register !

commit ab63610a3603ae1e40a36d238b5938621bb9e8cc
Author: Ilia Mirkin <imir...@alum.mit.edu>
Date:   Tue Jul 28 02:00:20 2015 -0400

    nvc0/ir: fix barrier emission
    
    immediate arguments require a flag to be set for each one
    
    Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu>

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 3ed815b..f607f3b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1451,6 +1451,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i)
       ImmediateValue *imm = i->getSrc(0)->asImm();
       assert(imm);
       code[0] |= imm->reg.data.u32 << 20;
+      code[1] |= 0x8000;
    }
 
    // thread count
@@ -1461,6 +1462,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i)
       assert(imm);
       code[0] |= imm->reg.data.u32 << 26;
       code[1] |= imm->reg.data.u32 >> 6;
+      code[1] |= 0x4000;
    }
 
    if (i->srcExists(2) && (i->predSrc != 2)) {

commit 7850774f2118ae87c7e6a4f6c17751e405edfb34
Author: Eric Anholt <e...@anholt.net>
Date:   Wed Jul 22 12:14:40 2015 -0700

    vc4: Add support for ARB_draw_elements_base_vertex.
    
    Gallium exposes it unconditionally, so do our best to support it.  It
    fails on the negative index cases, but those seem unlikely to be used in
    the wild.

diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index fc3c232..1c7f3b1 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -201,7 +201,9 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 struct pipe_vertex_buffer *vb =
                         &vertexbuf->vb[elem->vertex_buffer_index];
                 struct vc4_resource *rsc = vc4_resource(vb->buffer);
-                uint32_t offset = vb->buffer_offset + elem->src_offset;
+                uint32_t offset = (vb->buffer_offset +
+                                   elem->src_offset +
+                                   vb->stride * info->index_bias);
                 uint32_t vb_size = rsc->bo->size - offset;
                 uint32_t elem_size =
                         util_format_get_blocksize(elem->src_format);

commit 98a4b111fbb9e3ae45e907ddd4d2407e5ab669ec
Author: Rob Clark <robcl...@freedesktop.org>
Date:   Sat Jul 25 12:53:23 2015 -0400

    freedreno/ir3: add transform-feedback support
    
    Signed-off-by: Rob Clark <robcl...@freedesktop.org>

diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index b28d315..97e4161 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -227,9 +227,20 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 
        /* Stream output. */
        case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+               if (is_a3xx(screen) || is_a4xx(screen))
+                       return PIPE_MAX_SO_BUFFERS;
+               return 0;
        case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+               if (is_a3xx(screen) || is_a4xx(screen))
+                       return 1;
+               return 0;
        case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+               if (is_a3xx(screen) || is_a4xx(screen))
+                       return 16;    /* should only be shader out limit? */
+               return 0;
        case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+               if (is_a3xx(screen) || is_a4xx(screen))
+                       return 16;    /* should only be shader out limit? */
                return 0;
 
        /* Geometry shader output, unsupported. */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index a4b2785..53faf16 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -263,6 +263,7 @@ compile_init(struct ir3_compiler *compiler,
         *    4 * vec4            -  UBO addresses
         *    if (vertex shader) {
         *        1 * vec4        -  driver params (IR3_DP_*)
+        *        1 * vec4        -  stream-out addresses
         *    }
         *
         * TODO this could be made more dynamic, to at least skip sections
@@ -275,6 +276,8 @@ compile_init(struct ir3_compiler *compiler,
        if (so->type == SHADER_VERTEX) {
                /* one (vec4) slot for driver params (see ir3_driver_param): */
                so->first_immediate++;
+               /* one (vec4) slot for stream-output base addresses: */
+               so->first_immediate++;
        }
 
        return ctx;
@@ -1971,6 +1974,115 @@ emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
        }
 }
 
+/* emit stream-out code.  At this point, the current block is the original
+ * (nir) end block, and nir ensures that all flow control paths terminate
+ * into the end block.  We re-purpose the original end block to generate
+ * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
+ * block holding stream-out write instructions, followed by the new end
+ * block:
+ *
+ *   blockOrigEnd {
+ *      p0.x = (vtxcnt < maxvtxcnt)
+ *      // succs: blockStreamOut, blockNewEnd
+ *   }
+ *   blockStreamOut {
+ *      ... stream-out instructions ...
+ *      // succs: blockNewEnd
+ *   }
+ *   blockNewEnd {
+ *   }
+ */
+static void
+emit_stream_out(struct ir3_compile *ctx)
+{
+       struct ir3_shader_variant *v = ctx->so;
+       struct ir3 *ir = ctx->ir;
+       struct pipe_stream_output_info *strmout =
+                       &ctx->so->shader->stream_output;
+       struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
+       struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
+       struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS];
+
+       /* create vtxcnt input in input block at top of shader,
+        * so that it is seen as live over the entire duration
+        * of the shader:
+        */
+       vtxcnt = create_input(ctx->in_block, 0);
+       add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt);
+
+       maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
+
+       /* at this point, we are at the original 'end' block,
+        * re-purpose this block to stream-out condition, then
+        * append stream-out block and new-end block
+        */
+       orig_end_block = ctx->block;
+
+       stream_out_block = ir3_block_create(ir);
+       list_addtail(&stream_out_block->node, &ir->block_list);
+
+       new_end_block = ir3_block_create(ir);
+       list_addtail(&new_end_block->node, &ir->block_list);
+
+       orig_end_block->successors[0] = stream_out_block;
+       orig_end_block->successors[1] = new_end_block;
+       stream_out_block->successors[0] = new_end_block;
+
+       /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
+       cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
+       cond->regs[0]->num = regid(REG_P0, 0);
+       cond->cat2.condition = IR3_COND_LT;
+
+       /* condition goes on previous block to the conditional,
+        * since it is used to pick which of the two successor
+        * paths to take:
+        */
+       orig_end_block->condition = cond;
+
+       /* switch to stream_out_block to generate the stream-out
+        * instructions:
+        */
+       ctx->block = stream_out_block;
+
+       /* Calculate base addresses based on vtxcnt.  Instructions
+        * generated for bases not used in following loop will be
+        * stripped out in the backend.
+        */
+       for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+               unsigned stride = strmout->stride[i];
+               struct ir3_instruction *base, *off;
+
+               base = create_uniform(ctx, regid(v->first_driver_param + 5, i));
+
+               /* 24-bit should be enough: */
+               off = ir3_MUL_U(ctx->block, vtxcnt, 0,
+                               create_immed(ctx->block, stride * 4), 0);
+
+               bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
+       }
+
+       /* Generate the per-output store instructions: */
+       for (unsigned i = 0; i < strmout->num_outputs; i++) {
+               for (unsigned j = 0; j < strmout->output[i].num_components; 
j++) {
+                       unsigned c = j + strmout->output[i].start_component;
+                       struct ir3_instruction *base, *out, *stg;
+
+                       base = bases[strmout->output[i].output_buffer];
+                       out = 
ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
+
+                       stg = ir3_STG(ctx->block, base, 0, out, 0,
+                                       create_immed(ctx->block, 1), 0);
+                       stg->cat6.type = TYPE_U32;
+                       stg->cat6.dst_offset = (strmout->output[i].dst_offset + 
j) * 4;
+
+                       array_insert(ctx->ir->keeps, stg);
+               }
+       }
+
+       /* and finally switch to the new_end_block: */
+       ctx->block = new_end_block;
+}
+
 static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
@@ -1981,6 +2093,24 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
         * into which we emit the 'end' instruction.
         */
        compile_assert(ctx, list_empty(&ctx->block->instr_list));
+
+       /* If stream-out (aka transform-feedback) enabled, emit the
+        * stream-out instructions, followed by a new empty block (into
+        * which the 'end' instruction lands).
+        *
+        * NOTE: it is done in this order, rather than inserting before
+        * we emit end_block, because NIR guarantees that all blocks
+        * flow into end_block, and that end_block has no successors.
+        * So by re-purposing end_block as the first block of stream-
+        * out, we guarantee that all exit paths flow into the stream-
+        * out instructions.
+        */
+       if ((ctx->so->shader->stream_output.num_outputs > 0) &&
+                       !ctx->so->key.binning_pass) {
+               debug_assert(ctx->so->type == SHADER_VERTEX);
+               emit_stream_out(ctx);
+       }
+
        ir3_END(ctx->block);
 }
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 166eb00..312174c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -466,10 +466,10 @@ static void
 emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
                struct fd_constbuf_stateobj *constbuf)
 {
-       if (v->constlen > v->first_driver_param) {
+       uint32_t offset = v->first_driver_param;  /* UBOs after user consts */
+       if (v->constlen > offset) {
                struct fd_context *ctx = fd_context(v->shader->pctx);
-               uint32_t offset = v->first_driver_param;  /* UBOs after user 
consts */
-               uint32_t params = MIN2(4, v->constlen - v->first_driver_param) 
* 4;
+               uint32_t params = MIN2(4, v->constlen - offset) * 4;
                uint32_t offsets[params];
                struct fd_bo *bos[params];
 
@@ -515,6 +515,83 @@ emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
        }
 }
 
+/* emit stream-out buffers: */
+static void
+emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+       uint32_t offset = v->first_driver_param + 5;  /* streamout addresses 
after driver-params*/
+       if (v->constlen > offset) {
+               struct fd_context *ctx = fd_context(v->shader->pctx);
+               struct fd_streamout_stateobj *so = &ctx->streamout;
+               struct pipe_stream_output_info *info = 
&v->shader->stream_output;
+               uint32_t params = 4;
+               uint32_t offsets[params];
+               struct fd_bo *bos[params];
+
+               for (uint32_t i = 0; i < params; i++) {
+                       struct pipe_stream_output_target *target = 
so->targets[i];
+
+                       if (target) {
+                               offsets[i] = (so->offsets[i] * info->stride[i] 
* 4) +
+                                               target->buffer_offset;
+                               bos[i] = fd_resource(target->buffer)->bo;
+                       } else {
+                               offsets[i] = 0;
+                               bos[i] = NULL;
+                       }
+               }
+
+               fd_wfi(ctx, ring);
+               ctx->emit_const_bo(ring, v->type, true, offset * 4, params, 
bos, offsets);
+       }
+}
+
+static uint32_t
+max_tf_vtx(struct ir3_shader_variant *v)
+{
+       struct fd_context *ctx = fd_context(v->shader->pctx);
+       struct fd_streamout_stateobj *so = &ctx->streamout;
+       struct pipe_stream_output_info *info = &v->shader->stream_output;
+       uint32_t maxvtxcnt = 0x7fffffff;
+
+       if (v->key.binning_pass)
+               return 0;
+       if (v->shader->stream_output.num_outputs == 0)
+               return 0;
+       if (so->num_targets == 0)
+               return 0;
+
+       /* offset to write to is:
+        *
+        *   total_vtxcnt = vtxcnt + offsets[i]
+        *   offset = total_vtxcnt * stride[i]
+        *
+        *   offset =   vtxcnt * stride[i]       ; calculated in shader
+        *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
+        *
+        * assuming for each vtx, each target buffer will have data written
+        * up to 'offset + stride[i]', that leaves maxvtxcnt as:
+        *
+        *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
+        *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
+        *
+        * but shader is actually doing a less-than (rather than less-than-
+        * equal) check, so we can drop the -stride[i].
+        *
+        * TODO is assumption about `offset + stride[i]` legit?
+        */
+       for (unsigned i = 0; i < so->num_targets; i++) {
+               struct pipe_stream_output_target *target = so->targets[i];
+               unsigned stride = info->stride[i] * 4;   /* dwords->bytes */
+               if (target && stride) {  /* stride 0: buffer unused, avoid /0 */
+                       uint32_t max = target->buffer_size / stride;
+                       maxvtxcnt = MIN2(maxvtxcnt, max);
+               }
+       }
+
+       return maxvtxcnt;
+}
+
 void
 ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
                const struct pipe_draw_info *info, uint32_t dirty)
@@ -548,12 +625,19 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
                uint32_t offset = v->first_driver_param + 4;  /* driver params 
after UBOs */
                if (v->constlen >= offset) {
                        uint32_t vertex_params[4] = {
-                               [IR3_DP_VTXID_BASE] = info->indexed ? 
info->index_bias : info->start,
+                               [IR3_DP_VTXID_BASE] = info->indexed ?
+                                               info->index_bias : info->start,
+                               [IR3_DP_VTXCNT_MAX] = max_tf_vtx(v),
                        };
 
                        fd_wfi(ctx, ring);
                        ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
                                        ARRAY_SIZE(vertex_params), 
vertex_params, NULL);
+
+                       /* if needed, emit stream-out buffer addresses: */
+                       if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) {
+                               emit_tfbos(v, ring);
+                       }
                }
        }
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 4cb2520..c0fd44d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -37,6 +37,7 @@
 /* driver param indices: */
 enum ir3_driver_param {
        IR3_DP_VTXID_BASE = 0,
+       IR3_DP_VTXCNT_MAX = 1,
 };
 
 /* internal semantic used for passing vtxcnt to vertex shader to

commit 96d4db683f90f02e72d34ece544de7eedfa873ee
Author: Rob Clark <robcl...@freedesktop.org>
Date:   Sat Jul 25 13:51:16 2015 -0400

    freedreno/ir3: track "keeps" in ir
    
    Previously we had a fixed array to track kills, since they don't
    generate an SSA value, and then cheated by stuffing them in the
    outputs array before sending things through depth/sched/etc.  But
    store instructions will need similar treatment.  So convert this
    over to a more general array of instructions that must be kept
    and fix up the places that were previously relying on kills being
    in the output array.
    
    Signed-off-by: Rob Clark <robcl...@freedesktop.org>

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index e68170d..12f2ebe 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -369,6 +369,12 @@ struct ir3 {
        unsigned predicates_count, predicates_sz;
        struct ir3_instruction **predicates;
 
+       /* Track instructions which do not write a register but other-
+        * wise must not be discarded (such as kill, stg, etc)
+        */
+       unsigned keeps_count, keeps_sz;
+       struct ir3_instruction **keeps;
+
        /* List of blocks: */
        struct list_head block_list;
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index e013abe..a4b2785 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -117,10 +117,6 @@ struct ir3_compile {
        /* for looking up which system value is which */
        unsigned sysval_semantics[8];
 
-       /* list of kill instructions: */
-       struct ir3_instruction *kill[16];
-       unsigned int kill_count;
-
        /* set if we encounter something we can't handle yet, so we
         * can bail cleanly and fallback to TGSI compiler f/e
         */
@@ -1481,7 +1477,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
                kill = ir3_KILL(b, cond, 0);
                array_insert(ctx->ir->predicates, kill);
 
-               ctx->kill[ctx->kill_count++] = kill;
+               array_insert(ctx->ir->keeps, kill);
                ctx->so->has_kill = true;
 
                break;
@@ -2165,13 +2161,9 @@ emit_instructions(struct ir3_compile *ctx)
        ninputs  = exec_list_length(&ctx->s->inputs) * 4;
        noutputs = exec_list_length(&ctx->s->outputs) * 4;
 
-       /* we need to allocate big enough outputs array so that
-        * we can stuff the kill's at the end.  Likewise for vtx
-        * shaders, we need to leave room for sysvals:
+       /* for vtx shaders, we need to leave room for sysvals:
         */
-       if (ctx->so->type == SHADER_FRAGMENT) {
-               noutputs += ARRAY_SIZE(ctx->kill);
-       } else if (ctx->so->type == SHADER_VERTEX) {
+       if (ctx->so->type == SHADER_VERTEX) {
                ninputs += 8;
        }
 
@@ -2182,9 +2174,7 @@ emit_instructions(struct ir3_compile *ctx)
        ctx->in_block = ctx->block;
        list_addtail(&ctx->block->node, &ctx->ir->block_list);
 
-       if (ctx->so->type == SHADER_FRAGMENT) {
-               ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill);
-       } else if (ctx->so->type == SHADER_VERTEX) {
+       if (ctx->so->type == SHADER_VERTEX) {
                ctx->ir->ninputs -= 8;
        }
 
@@ -2380,15 +2370,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
                }
        }
 
-       /* at this point, we want the kill's in the outputs array too,
-        * so that they get scheduled (since they have no dst).. we've
-        * already ensured that the array is big enough in push_block():
-        */
-       if (so->type == SHADER_FRAGMENT) {
-               for (i = 0; i < ctx->kill_count; i++)
-                       ir->outputs[ir->noutputs++] = ctx->kill[i];
-       }
-
        if (fd_mesa_debug & FD_DBG_OPTMSGS) {
                printf("BEFORE CP:\n");
                ir3_print(ir);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index f4c825b..be4e4e8 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -408,6 +408,10 @@ ir3_cp(struct ir3 *ir)
                }
        }
 
+       for (unsigned i = 0; i < ir->keeps_count; i++) {
+               ir->keeps[i] = instr_cp(ir->keeps[i], NULL);
+       }
+
        list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
                if (block->condition)
                        block->condition = instr_cp(block->condition, NULL);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index 0f346b2..97df0c2 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -156,6 +156,9 @@ ir3_depth(struct ir3 *ir)
                if (ir->outputs[i])
                        ir3_instr_depth(ir->outputs[i]);
 
+       for (i = 0; i < ir->keeps_count; i++)
+               ir3_instr_depth(ir->keeps[i]);
+
        /* We also need to account for if-condition: */
        list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
                if (block->condition)

commit 020301baccc77e5753ead1e890c0cf24a9675517
Author: Rob Clark <robcl...@freedesktop.org>
Date:   Sat Jul 25 13:48:07 2015 -0400

    freedreno/ir3: add support for store instructions
    
    For store instructions, the "dst" register is a read register, not a
    written register.  (I.e. it is the address to store to.)  Let's not
    confuse register allocation, scheduling, etc, with these details.
    Instead just leave a dummy instr->regs[0], and take "dst" from
    instr->regs[1] and srcs following.
    
    Signed-off-by: Rob Clark <robcl...@freedesktop.org>

diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 6d19a29..b24825c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -499,12 +499,28 @@ static int emit_cat5(struct ir3_instruction *instr, void *ptr,
 static int emit_cat6(struct ir3_instruction *instr, void *ptr,
                struct ir3_info *info)
 {
-       struct ir3_register *dst  = instr->regs[0];
-       struct ir3_register *src1 = instr->regs[1];
-       struct ir3_register *src2 = (instr->regs_count >= 3) ? instr->regs[2] : 
NULL;
+       struct ir3_register *dst, *src1, *src2;
        instr_cat6_t *cat6 = ptr;
 
-       iassert(instr->regs_count >= 2);
+       /* the "dst" for a store instruction is (from the perspective
+        * of data flow in the shader, ie. register use/def, etc) in
+        * fact a register that is read by the instruction, rather
+        * than written:
+        */
+       if (is_store(instr)) {
+               iassert(instr->regs_count >= 3);
+
+               dst  = instr->regs[1];
+               src1 = instr->regs[2];
+               src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL;
+       } else {
+               iassert(instr->regs_count >= 2);
+
+               dst  = instr->regs[0];
+               src1 = instr->regs[1];
+               src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
+       }
+
 
        /* TODO we need a more comprehensive list about which instructions
         * can be encoded which way.  Or possibly use IR3_INSTR_0 flag to
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index c3b61a0..e68170d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -554,6 +554,26 @@ is_store(struct ir3_instruction *instr)
        return false;
 }
 
+static inline bool is_load(struct ir3_instruction *instr)
+{
+       if (is_mem(instr)) {
+               switch (instr->opc) {
+               case OPC_LDG:
+               case OPC_LDL:
+               case OPC_LDP:
+               case OPC_L2G:
+               case OPC_LDLW:
+               case OPC_LDC_4:
+               case OPC_LDLV:
+               /* probably some others too.. */
+                       return true;
+               default:
+                       break;
+               }
+       }
+       return false;
+}
+
 static inline bool is_input(struct ir3_instruction *instr)
 {
        /* in some cases, ldlv is used to fetch varying without
@@ -1043,6 +1063,7 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
 /* cat6 instructions: */
 INSTR2(6, LDLV)
 INSTR2(6, LDG)
+INSTR3(6, STG)
 
 /* ************************************************************************* */
 /* split this out or find some helper to use.. like main/bitset.h.. */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index f4a4223..e94293f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -182,14 +182,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
                         */
                        ctx->has_samp = true;
                        regmask_set(&needs_sy, n->regs[0]);
-               } else if (is_mem(n)) {
+               } else if (is_load(n)) {
                        regmask_set(&needs_sy, n->regs[0]);
                }
 
                /* both tex/sfu appear to not always immediately consume
                 * their src register(s):
                 */
-               if (is_tex(n) || is_sfu(n) || is_mem(n)) {
+               if (is_tex(n) || is_sfu(n) || is_load(n)) {
                        foreach_src(reg, n) {
                                if (reg_gpr(reg))
                                        regmask_set(&needs_ss_war, reg);

commit a240748de52f2e469e91b60d29ae872828a594d7
Author: Rob Clark <robcl...@freedesktop.org>
Date:   Sat Jul 25 12:48:18 2015 -0400

    freedreno/ir3: cleanup driver-param stuff
    
    Add 'enum ir3_driver_param' to track driver-param slots, and a
    create_driver_param() helper to avoid having the knowledge about
    where driver params are placed in const regs spread throughout
    the code as we add additional driver-params.
    
    Signed-off-by: Rob Clark <robcl...@freedesktop.org>

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index bdbaf89..e013abe 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -261,13 +261,26 @@ compile_init(struct ir3_compiler *compiler,
 
        so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
 
-       /* one (vec4) slot for vertex id base: */
-       if (so->type == SHADER_VERTEX)
-               so->first_immediate++;
+       /* Layout of constant registers:
+        *
+        *    num_uniform * vec4  -  user consts
+        *    4 * vec4            -  UBO addresses
+        *    if (vertex shader) {
+        *        1 * vec4        -  driver params (IR3_DP_*)
+        *    }
+        *
+        * TODO this could be made more dynamic, to at least skip sections
+        * that we don't need..
+        */
 
        /* reserve 4 (vec4) slots for ubo base addresses: */
        so->first_immediate += 4;
 
+       if (so->type == SHADER_VERTEX) {
+               /* one (vec4) slot for driver params (see ir3_driver_param): */
+               so->first_immediate++;
+       }
+
        return ctx;
 }
 
@@ -811,6 +824,14 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
        }
 }
 
+static struct ir3_instruction *
+create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp)
+{
+       /* first four vec4 sysval's reserved for UBOs: */
+       unsigned r = regid(ctx->so->first_driver_param + 4, dp);
+       return create_uniform(ctx, r);
+}
+
 /* helper for instructions that produce multiple consecutive scalar
  * outputs which need to have a split/fanout meta instruction inserted
  */
@@ -1415,9 +1436,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
                break;
        case nir_intrinsic_load_base_vertex:
                if (!ctx->basevertex) {
-                       /* first four vec4 sysval's reserved for UBOs: */
-                       unsigned r = regid(ctx->so->first_driver_param + 4, 0);
-                       ctx->basevertex = create_uniform(ctx, r);
+                       ctx->basevertex = create_driver_param(ctx, 
IR3_DP_VTXID_BASE);
                        add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX,
                                        ctx->basevertex);
                }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 75425e9..166eb00 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -548,10 +548,7 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
                uint32_t offset = v->first_driver_param + 4;  /* driver params 
after UBOs */
                if (v->constlen >= offset) {
                        uint32_t vertex_params[4] = {
-                               info->indexed ? info->index_bias : info->start,
-                               0,
-                               0,
-                               0
+                               [IR3_DP_VTXID_BASE] = info->indexed ? 
info->index_bias : info->start,
                        };
 
                        fd_wfi(ctx, ring);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index f0af447..4cb2520 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -34,6 +34,11 @@
 #include "ir3.h"
 #include "disasm.h"
 
+/* driver param indices: */
+enum ir3_driver_param {
+       IR3_DP_VTXID_BASE = 0,
+};
+
 /* internal semantic used for passing vtxcnt to vertex shader to
  * implement transform feedback:
  */

commit be8a8ebe578267ab24e343c3c1347936a221468e
Author: Rob Clark <robcl...@freedesktop.org>
Date:   Sat Jul 25 10:56:39 2015 -0400

    freedreno: add transform-feedback state
    
    Signed-off-by: Rob Clark <robcl...@freedesktop.org>

diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index bc5267a..cc585af 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -82,6 +82,20 @@ struct fd_vertex_stateobj {
        unsigned num_elements;
 };
 
+struct fd_streamout_stateobj {
+       struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
+       unsigned num_targets;
+       /* Track offset from vtxcnt for streamout data.  This counter
+        * is just incremented by # of vertices on each draw until
+        * reset or new streamout buffer bound.
+        *
+        * When we eventually have GS, the CPU won't actually know the
+        * number of vertices per draw, so I think we'll have to do
+        * something more clever.
+        */
+       unsigned offsets[PIPE_MAX_SO_BUFFERS];
+};
+
 /* group together the vertex and vertexbuf state.. for ease of passing
  * around, and because various internal operations (gmem<->mem, etc)
  * need their own vertex state:
@@ -319,6 +333,7 @@ struct fd_context {
                FD_DIRTY_VTXBUF      = (1 << 15),
                FD_DIRTY_INDEXBUF    = (1 << 16),
                FD_DIRTY_SCISSOR     = (1 << 17),
+               FD_DIRTY_STREAMOUT   = (1 << 18),
        } dirty;
 
        struct pipe_blend_state *blend;
@@ -339,6 +354,7 @@ struct fd_context {
        struct pipe_viewport_state viewport;
        struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
        struct pipe_index_buffer indexbuf;
+       struct fd_streamout_stateobj streamout;
 
        /* GMEM/tile handling fxns: */
        void (*emit_tile_init)(struct fd_context *ctx);
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index ae75b3e..f886540 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -62,7 +62,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
        struct fd_context *ctx = fd_context(pctx);
        struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
        struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);
-       unsigned i, buffers = 0;
+       unsigned i, prims, buffers = 0;
 
        /* if we supported transform feedback, we'd have to disable this: */
        if (((scissor->maxx - scissor->minx) *
@@ -144,11 +144,17 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                if (ctx->fragtex.textures[i])
                        resource_used(ctx, ctx->fragtex.textures[i]->texture, true);
 
+       /* Mark streamout buffers as being read.. actually they are written.. */
+       for (i = 0; i < ctx->streamout.num_targets; i++)
+               if (ctx->streamout.targets[i])
+                       resource_used(ctx, ctx->streamout.targets[i]->buffer, false);
+
        ctx->num_draws++;
 
+       prims = u_reduced_prims_for_vertices(info->mode, info->count);
+
        ctx->stats.draw_calls++;
-       ctx->stats.prims_emitted +=
-               u_reduced_prims_for_vertices(info->mode, info->count);
+       ctx->stats.prims_emitted += prims;
 
        /* any buffers that haven't been cleared yet, we need to restore: */
        ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared);
@@ -162,6 +168,9 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
        fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_DRAW);
        ctx->draw_vbo(ctx, info);
 
+       for (i = 0; i < ctx->streamout.num_targets; i++)
+               ctx->streamout.offsets[i] += prims;
+
        /* if an app (or, well, piglit test) does many thousands of draws
         * without flush (or anything which implicitly flushes, like
         * changing render targets), we can exceed the ringbuffer size.
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index de3cb64..d649925 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -647,6 +647,8 @@ fd_blitter_pipe_begin(struct fd_context *ctx)
        util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb);
        util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx);
        util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vp);
+       util_blitter_save_so_targets(ctx->blitter, ctx->streamout.num_targets,
+                       ctx->streamout.targets);
        util_blitter_save_rasterizer(ctx->blitter, ctx->rasterizer);
        util_blitter_save_viewport(ctx->blitter, &ctx->viewport);
        util_blitter_save_scissor(ctx->blitter, &ctx->scissor);
diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c
index 77aa4f2..7bf8bdb 100644
--- a/src/gallium/drivers/freedreno/freedreno_state.c
+++ b/src/gallium/drivers/freedreno/freedreno_state.c
@@ -300,6 +300,67 @@ fd_vertex_state_bind(struct pipe_context *pctx, void *hwcso)
        ctx->dirty |= FD_DIRTY_VTXSTATE;
 }
 
+static struct pipe_stream_output_target *
+fd_create_stream_output_target(struct pipe_context *pctx,
+               struct pipe_resource *prsc, unsigned buffer_offset,
+               unsigned buffer_size)
+{
+       struct pipe_stream_output_target *target;
+
+       target = CALLOC_STRUCT(pipe_stream_output_target);
+       if (!target)
+               return NULL;
+
+       pipe_reference_init(&target->reference, 1);
+       pipe_resource_reference(&target->buffer, prsc);
+
+       target->context = pctx;
+       target->buffer_offset = buffer_offset;
+       target->buffer_size = buffer_size;
+
+       return target;
+}
+
+static void
+fd_stream_output_target_destroy(struct pipe_context *pctx,
+               struct pipe_stream_output_target *target)
+{
+       pipe_resource_reference(&target->buffer, NULL);
+       FREE(target);
+}
+
+static void
+fd_set_stream_output_targets(struct pipe_context *pctx,
+               unsigned num_targets, struct pipe_stream_output_target **targets,
+               const unsigned *offsets)
+{
+       struct fd_context *ctx = fd_context(pctx);
+       struct fd_streamout_stateobj *so = &ctx->streamout;
+       unsigned i;
+
+       debug_assert(num_targets <= ARRAY_SIZE(so->targets));
+
+       for (i = 0; i < num_targets; i++) {
+               boolean changed = targets[i] != so->targets[i];
+               boolean append = (offsets[i] == (unsigned)-1);
+
+               if (!changed && append)
+                       continue;
+
+               so->offsets[i] = 0;
+
+               pipe_so_target_reference(&so->targets[i], targets[i]);
+       }
+
+       for (; i < so->num_targets; i++) {
+               pipe_so_target_reference(&so->targets[i], NULL);
+       }
+
+       so->num_targets = num_targets;
+
+       ctx->dirty |= FD_DIRTY_STREAMOUT;
+}
+
 void
 fd_state_init(struct pipe_context *pctx)
 {
@@ -328,4 +389,8 @@ fd_state_init(struct pipe_context *pctx)
        pctx->create_vertex_elements_state = fd_vertex_state_create;
        pctx->delete_vertex_elements_state = fd_vertex_state_delete;
        pctx->bind_vertex_elements_state = fd_vertex_state_bind;
+
+       pctx->create_stream_output_target = fd_create_stream_output_target;
+       pctx->stream_output_target_destroy = fd_stream_output_target_destroy;
+       pctx->set_stream_output_targets = fd_set_stream_output_targets;
 }

commit bda1354aac9d32e236048af4d353d5530f644c34
Author: Rob Clark <robcl...@freedesktop.org>
Date:   Sun Jul 26 13:30:26 2015 -0400

    freedreno: add resource tracking support for written buffers
    
    With stream-out (transform-feedback) we have the case where resources
    are *written* by the gpu, which needs basically the same tracking to
    figure out when rendering must be flushed.
    
    Signed-off-by: Rob Clark <robcl...@freedesktop.org>

diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 127fb5f..02613dc 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -130,8 +130,9 @@ fd_context_render(struct pipe_context *pctx)
 
        /* go through all the used resources and clear their reading flag */
        LIST_FOR_EACH_ENTRY_SAFE(rsc, rsc_tmp, &ctx->used_resources, list) {
-               assert(rsc->reading);
+               assert(rsc->reading || rsc->writing);
                rsc->reading = false;
+               rsc->writing = false;
                list_delinit(&rsc->list);
        }
 
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index c9e317c..ae75b3e 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -40,7 +40,7 @@
 #include "freedreno_util.h"
 
 static void
-resource_reading(struct fd_context *ctx, struct pipe_resource *prsc)
+resource_used(struct fd_context *ctx, struct pipe_resource *prsc, boolean reading)
 {
        struct fd_resource *rsc;
 
@@ -48,7 +48,10 @@ resource_reading(struct fd_context *ctx, struct pipe_resource *prsc)
                return;
 
        rsc = fd_resource(prsc);
-       rsc->reading = true;
+       if (reading)
+               rsc->reading = true;


-- 
To UNSUBSCRIBE, email to debian-x-requ...@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org
Archive: https://lists.debian.org/e1zk3lq-0000br...@moszumanska.debian.org

Reply via email to