There are 3 changes:

1) stride is specified for each buffer, not just one, so that drivers don't
   have to derive it from the outputs

2) new per-output property dst_offset, which specifies the offset
   into the buffer in dwords where the output should be stored,
   so that drivers don't have to compute the offsets manually;
   this will also be useful for gl_SkipComponents
   from ARB_transform_feedback3

3) register_mask is removed and replaced by start_component and
   num_components; a register_mask with non-consecutive 1s doesn't
   make much sense (some hardware cannot pack components)

Christoph Bumiller: fixed nvc0.
---
 src/gallium/auxiliary/draw/draw_pt_so_emit.c |    4 ++-
 src/gallium/auxiliary/util/u_blitter.c       |    4 +-
 src/gallium/auxiliary/util/u_dump_state.c    |    6 ++-
 src/gallium/drivers/llvmpipe/lp_state_so.c   |    2 +-
 src/gallium/drivers/nvc0/nvc0_program.c      |   11 ++----
 src/gallium/drivers/r600/r600.h              |    4 +-
 src/gallium/drivers/r600/r600_hw_context.c   |    8 ++--
 src/gallium/drivers/r600/r600_pipe.h         |    1 -
 src/gallium/drivers/r600/r600_shader.c       |   46 +++++++-------------------
 src/gallium/drivers/r600/r600_state_common.c |    2 +-
 src/gallium/drivers/softpipe/sp_state_so.c   |    2 +-
 src/gallium/drivers/trace/tr_dump_state.c    |    6 ++-
 src/gallium/include/pipe/p_state.h           |   13 ++++---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp   |   18 ++++------
 14 files changed, 53 insertions(+), 74 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
index 2dc9e29..7dc6937 100644
--- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
@@ -144,7 +144,9 @@ static void so_emit_prim(struct pt_so_emit *so,
          (const char *)input_ptr + (indices[i] * input_vertex_stride));
       for (slot = 0; slot < state->num_outputs; ++slot) {
          unsigned idx = state->output[slot].register_index;
-         unsigned writemask = state->output[slot].register_mask;
+         unsigned writemask =
+               ((1 << state->output[slot].num_components) - 1) <<
+               state->output[slot].start_component;
          unsigned written_compos = 0;
          unsigned compo;
 
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 80fdfe0..94012c5 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -266,8 +266,8 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
 
       memset(&so, 0, sizeof(so));
       so.num_outputs = 1;
-      so.output[0].register_mask = TGSI_WRITEMASK_XYZW;
-      so.stride = 4;
+      so.output[0].num_components = 4;
+      so.stride[0] = 4;
 
       ctx->vs_pos_only =
          util_make_vertex_passthrough_shader_with_so(pipe, 1, semantic_names,
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index e44c619..4b5a042 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -441,13 +441,15 @@ util_dump_shader_state(FILE *stream, const struct pipe_shader_state *state)
    util_dump_member_begin(stream, "stream_output");
    util_dump_struct_begin(stream, "pipe_stream_output_info");
    util_dump_member(stream, uint, &state->stream_output, num_outputs);
-   util_dump_member(stream, uint, &state->stream_output, stride);
+   util_dump_array(stream, uint, state->stream_output.stride,
+                   Elements(state->stream_output.stride));
    util_dump_array_begin(stream);
    for(i = 0; i < state->stream_output.num_outputs; ++i) {
       util_dump_elem_begin(stream);
       util_dump_struct_begin(stream, ""); /* anonymous */
       util_dump_member(stream, uint, &state->stream_output.output[i], 
register_index);
-      util_dump_member(stream, uint, &state->stream_output.output[i], 
register_mask);
+      util_dump_member(stream, uint, &state->stream_output.output[i], 
start_component);
+      util_dump_member(stream, uint, &state->stream_output.output[i], 
num_components);
       util_dump_member(stream, uint, &state->stream_output.output[i], 
output_buffer);
       util_dump_struct_end(stream);
       util_dump_elem_end(stream);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_so.c b/src/gallium/drivers/llvmpipe/lp_state_so.c
index 108f3aa..ed2272d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_so.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_so.c
@@ -42,7 +42,7 @@ llvmpipe_create_stream_output_state(struct pipe_context *pipe,
 
    if (so) {
       so->base.num_outputs = templ->num_outputs;
-      so->base.stride = templ->stride;
+      memcpy(so->base.stride, templ->stride, sizeof(templ->stride));
       memcpy(so->base.output, templ->output,
              templ->num_outputs * sizeof(templ->output[0]));
    }
diff --git a/src/gallium/drivers/nvc0/nvc0_program.c b/src/gallium/drivers/nvc0/nvc0_program.c
index 605bca5..499d20e 100644
--- a/src/gallium/drivers/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nvc0/nvc0_program.c
@@ -496,20 +496,17 @@ nvc0_program_create_tfb_state(const struct nv50_ir_prog_info *info,
       tfb->varying_count[b] = 0;
 
       for (i = 0; i < pso->num_outputs; ++i) {
+         unsigned startc = pso->output[i].start_component;
          if (pso->output[i].output_buffer != b)
             continue;
-         for (c = 0; c < 4; ++c) {
-            if (!(pso->output[i].register_mask & (1 << c)))
-               continue;
+         for (c = 0; c < pso->output[i].num_components; ++c) {
             tfb->varying_count[b]++;
             tfb->varying_index[n++] =
-               info->out[pso->output[i].register_index].slot[c];
+               info->out[pso->output[i].register_index].slot[startc + c];
          }
       }
-      tfb->stride[b] = tfb->varying_count[b] * 4;
+      tfb->stride[b] = pso->stride[b] * 4;
    }
-   if (pso->stride)
-      tfb->stride[0] = pso->stride;
 
    return tfb;
 }
diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
index 4bfb5a9..baf09c1 100644
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -196,7 +196,7 @@ struct r600_so_target {
 
        /* The buffer where BUFFER_FILLED_SIZE is stored. */
        struct r600_resource    *filled_size;
-       unsigned                stride;
+       unsigned                stride_in_dw;
        unsigned                so_index;
 };
 
@@ -248,7 +248,7 @@ struct r600_context {
        struct r600_so_target           *so_targets[PIPE_MAX_SO_BUFFERS];
        boolean                         streamout_start;
        unsigned                        streamout_append_bitmask;
-       unsigned                        *vs_shader_so_strides;
+       unsigned                        *vs_so_stride_in_dw;
 };
 
 struct r600_draw {
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 1dba966..09bc5a7 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -2012,7 +2012,7 @@ static void r600_set_streamout_enable(struct r600_context *ctx, unsigned buffer_
 void r600_context_streamout_begin(struct r600_context *ctx)
 {
        struct r600_so_target **t = ctx->so_targets;
-       unsigned *strides = ctx->vs_shader_so_strides;
+       unsigned *stride_in_dw = ctx->vs_so_stride_in_dw;
        unsigned buffer_en, i, update_flags = 0;
 
        buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
@@ -2043,7 +2043,7 @@ void r600_context_streamout_begin(struct r600_context *ctx)
 
        for (i = 0; i < ctx->num_so_targets; i++) {
                if (t[i]) {
-                       t[i]->stride = strides[i];
+                       t[i]->stride_in_dw = stride_in_dw[i];
                        t[i]->so_index = i;
 
                        update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i);
@@ -2053,7 +2053,7 @@ void r600_context_streamout_begin(struct r600_context *ctx)
                                                        16*i - 
R600_CONTEXT_REG_OFFSET) >> 2;
                        ctx->pm4[ctx->pm4_cdwords++] = (t[i]->b.buffer_offset +
                                                        t[i]->b.buffer_size) >> 
2; /* BUFFER_SIZE (in DW) */
-                       ctx->pm4[ctx->pm4_cdwords++] = strides[i] >> 2;         
   /* VTX_STRIDE (in DW) */
+                       ctx->pm4[ctx->pm4_cdwords++] = stride_in_dw[i];         
   /* VTX_STRIDE (in DW) */
                        ctx->pm4[ctx->pm4_cdwords++] = 0;                       
   /* BUFFER_BASE */
 
                        ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
@@ -2165,7 +2165,7 @@ void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_tar
 
        ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
        ctx->pm4[ctx->pm4_cdwords++] = 
(R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - R600_CONTEXT_REG_OFFSET) >> 2;
-       ctx->pm4[ctx->pm4_cdwords++] = t->stride >> 2;
+       ctx->pm4[ctx->pm4_cdwords++] = t->stride_in_dw;
 
        ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_COPY_DW, 4, 0);
        ctx->pm4[ctx->pm4_cdwords++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 447b9dc..91eb0e8 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -147,7 +147,6 @@ struct r600_pipe_shader {
        struct tgsi_token               *tokens;
        unsigned        sprite_coord_enable;
        struct pipe_stream_output_info  so;
-       unsigned                        so_strides[4];
 };
 
 struct r600_pipe_sampler_state {
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index ad4aded..9dbbbd8 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -124,12 +124,14 @@ int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *s
                        unsigned i;
                        fprintf(stderr, "STREAMOUT\n");
                        for (i = 0; i < shader->so.num_outputs; i++) {
+                               unsigned mask = ((1 << 
shader->so.output[i].num_components) - 1) <<
+                                               
shader->so.output[i].start_component;
                                fprintf(stderr, "  %i: MEM_STREAM0_BUF%i 
OUT[%i].%s%s%s%s\n", i,
                                        shader->so.output[i].output_buffer, 
shader->so.output[i].register_index,
-                                       shader->so.output[i].register_mask & 1 
? "x" : "_",
-                                       (shader->so.output[i].register_mask >> 
1) & 1 ? "y" : "_",
-                                       (shader->so.output[i].register_mask >> 
2) & 1 ? "z" : "_",
-                                       (shader->so.output[i].register_mask >> 
3) & 1 ? "w" : "_");
+                                       mask & 1 ? "x" : "_",
+                                       (mask >> 1) & 1 ? "y" : "_",
+                                       (mask >> 2) & 1 ? "z" : "_",
+                                       (mask >> 3) & 1 ? "w" : "_");
                        }
                }
        }
@@ -863,11 +865,8 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 
        /* Add stream outputs. */
        if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
-               unsigned buffer_offset[PIPE_MAX_SO_BUFFERS] = {0};
-
                for (i = 0; i < so.num_outputs; i++) {
                        struct r600_bytecode_output output;
-                       unsigned comps;
 
                        if (so.output[i].output_buffer >= 4) {
                                R600_ERR("exceeded the max number of stream 
output buffers, got: %d\n",
@@ -875,36 +874,21 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
                                r = -EINVAL;
                                goto out_err;
                        }
-
-                       switch (so.output[i].register_mask) {
-                       case TGSI_WRITEMASK_XYZW:
-                               comps = 4;
-                               break;
-                       case TGSI_WRITEMASK_XYZ:
-                               comps = 3;
-                               break;
-                       case TGSI_WRITEMASK_XY:
-                               comps = 2;
-                               break;
-                       case TGSI_WRITEMASK_X:
-                               comps = 1;
-                               break;
-                       default:
-                               R600_ERR("streamout: invalid register_mask, 
got: %x\n",
-                                        so.output[i].register_mask);
-                               r = -EINVAL;
-                               goto out_err;
+                       if (so.output[i].start_component) {
+                          R600_ERR("stream_output - start_component cannot be 
non-zero\n");
+                          r = -EINVAL;
+                          goto out_err;
                        }
 
                        memset(&output, 0, sizeof(struct r600_bytecode_output));
                        output.gpr = 
shader->output[so.output[i].register_index].gpr;
                        output.elem_size = 0;
-                       output.array_base = 
buffer_offset[so.output[i].output_buffer];
+                       output.array_base = so.output[i].dst_offset;
                        output.type = 
V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
                        output.burst_count = 1;
                        output.barrier = 1;
                        output.array_size = 0;
-                       output.comp_mask = so.output[i].register_mask;
+                       output.comp_mask = (1 << so.output[i].num_components) - 
1;
                        if (ctx.bc->chip_class >= EVERGREEN) {
                                switch (so.output[i].output_buffer) {
                                case 0:
@@ -939,12 +923,6 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
                        r = r600_bytecode_add_output(ctx.bc, &output);
                        if (r)
                                goto out_err;
-
-                       buffer_offset[so.output[i].output_buffer] += comps;
-               }
-
-               for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
-                       pipeshader->so_strides[i] = buffer_offset[i] * 4;
                }
        }
 
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 054ab90..d96857e 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -645,7 +645,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
                }
        }
 
-       rctx->ctx.vs_shader_so_strides = rctx->vs_shader->so_strides;
+       rctx->ctx.vs_so_stride_in_dw = rctx->vs_shader->so.stride;
 
        mask = (1ULL << ((unsigned)rctx->framebuffer.nr_cbufs * 4)) - 1;
 
diff --git a/src/gallium/drivers/softpipe/sp_state_so.c b/src/gallium/drivers/softpipe/sp_state_so.c
index 31ef384..f4b5cdf 100644
--- a/src/gallium/drivers/softpipe/sp_state_so.c
+++ b/src/gallium/drivers/softpipe/sp_state_so.c
@@ -43,7 +43,7 @@ softpipe_create_stream_output_state(struct pipe_context *pipe,
 
    if (so) {
       so->base.num_outputs = templ->num_outputs;
-      so->base.stride = templ->stride;
+      memcpy(so->base.stride, templ->stride, sizeof(templ->stride));
       memcpy(so->base.output, templ->output,
              templ->num_outputs * sizeof(templ->output[0]));
    }
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index 038a80e..7ce477e 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -271,14 +271,16 @@ void trace_dump_shader_state(const struct pipe_shader_state *state)
    trace_dump_member_begin("stream_output");
    trace_dump_struct_begin("pipe_stream_output_info");
    trace_dump_member(uint, &state->stream_output, num_outputs);
-   trace_dump_member(uint, &state->stream_output, stride);
+   trace_dump_array(uint, state->stream_output.stride, PIPE_MAX_SO_BUFFERS);
    trace_dump_array_begin();
    for(i = 0; i < state->stream_output.num_outputs; ++i) {
       trace_dump_elem_begin();
       trace_dump_struct_begin(""); /* anonymous */
       trace_dump_member(uint, &state->stream_output.output[i], register_index);
-      trace_dump_member(uint, &state->stream_output.output[i], register_mask);
+      trace_dump_member(uint, &state->stream_output.output[i], 
start_component);
+      trace_dump_member(uint, &state->stream_output.output[i], num_components);
       trace_dump_member(uint, &state->stream_output.output[i], output_buffer);
+      trace_dump_member(uint, &state->stream_output.output[i], dst_offset);
       trace_dump_struct_end();
       trace_dump_elem_end();
    }
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index f943ca5..15cc001 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -176,16 +176,19 @@ struct pipe_clip_state
 struct pipe_stream_output_info
 {
    unsigned num_outputs;
-   /** stride for an entire vertex, only used if all output_buffers are 0 */
-   unsigned stride;
+   /** stride for an entire vertex for each buffer in dwords */
+   unsigned stride[PIPE_MAX_SO_BUFFERS];
+
    /**
     * Array of stream outputs, in the order they are to be written in.
     * Selected components are tightly packed into the output buffer.
     */
    struct {
-      unsigned register_index:8; /**< 0 to PIPE_MAX_SHADER_OUTPUTS */
-      unsigned register_mask:4;  /**< TGSI_WRITEMASK_x */
-      unsigned output_buffer:4;  /**< 0 to PIPE_MAX_SO_BUFFERS */
+      unsigned register_index:8;  /**< 0 to PIPE_MAX_SHADER_OUTPUTS */
+      unsigned start_component:2; /**< 0 to 3 */
+      unsigned num_components:3;  /**< 1 to 4 */
+      unsigned output_buffer:3;   /**< 0 to PIPE_MAX_SO_BUFFERS */
+      unsigned dst_offset:16;     /**< offset into the buffer in dwords */
    } output[PIPE_MAX_SHADER_OUTPUTS];
 };
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 3b8e2fe..1613bf8 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5091,25 +5091,21 @@ st_translate_stream_output_info(struct glsl_to_tgsi_visitor *glsl_to_tgsi,
                                 const GLuint outputMapping[],
                                 struct pipe_stream_output_info *so)
 {
-   static unsigned comps_to_mask[] = {
-      0,
-      TGSI_WRITEMASK_X,
-      TGSI_WRITEMASK_XY,
-      TGSI_WRITEMASK_XYZ,
-      TGSI_WRITEMASK_XYZW
-   };
    unsigned i;
    struct gl_transform_feedback_info *info =
       &glsl_to_tgsi->shader_program->LinkedTransformFeedback;
 
    for (i = 0; i < info->NumOutputs; i++) {
-      assert(info->Outputs[i].NumComponents < Elements(comps_to_mask));
       so->output[i].register_index =
          outputMapping[info->Outputs[i].OutputRegister];
-      so->output[i].register_mask =
-         comps_to_mask[info->Outputs[i].NumComponents]
-         << info->Outputs[i].ComponentOffset;
+      so->output[i].start_component = info->Outputs[i].ComponentOffset;
+      so->output[i].num_components = info->Outputs[i].NumComponents;
       so->output[i].output_buffer = info->Outputs[i].OutputBuffer;
+      so->output[i].dst_offset = info->Outputs[i].DstOffset;
+   }
+
+   for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+      so->stride[i] = info->BufferStride[i];
    }
    so->num_outputs = info->NumOutputs;
 }
-- 
1.7.5.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to