[Mesa-dev] [PATCH] radeonsi: add GS multiple streams support (v2)

2015-07-29 Thread Dave Airlie
From: Dave Airlie 

This is the final piece for ARB_gpu_shader5.

The code is based on the r600 code from Glenn Kennard
and myself.

While developing this, I wasn't 100% sure of all the calculations
made in the GS registers; this is why max_stream is worked
out there and used to limit the changes in registers. Without that
limit, my initial attempts regressed either the GS texelFetch tests
or primitive-id-restart. The current code has no regressions
in piglit.

This commit doesn't enable ARB_gpu_shader5, since that just
bumps the GLSL level to 4.00; I'll do a separate patch
for 4.10.

v1.1: fix bug introduced in rebase.
v2: address Marek's review comments,
    remove my LLVM stream code in favor of simpler C,
    move gsvs_ring and gs_next_vertex to arrays.

Signed-off-by: Dave Airlie 
---
 src/gallium/drivers/radeonsi/si_descriptors.c   |  4 +-
 src/gallium/drivers/radeonsi/si_pipe.c  |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c| 74 +++-
 src/gallium/drivers/radeonsi/si_state.c |  4 --
 src/gallium/drivers/radeonsi/si_state.h |  7 ++-
 src/gallium/drivers/radeonsi/si_state_shaders.c | 75 +++--
 6 files changed, 127 insertions(+), 39 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 2e2a35b..14bb6e1 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -724,7 +724,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint 
shader, uint slot,
struct pipe_resource *buffer,
unsigned stride, unsigned num_records,
bool add_tid, bool swizzle,
-   unsigned element_size, unsigned index_stride)
+   unsigned element_size, unsigned index_stride, uint64_t 
offset)
 {
struct si_context *sctx = (struct si_context *)ctx;
struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
@@ -741,7 +741,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint 
shader, uint slot,
if (buffer) {
uint64_t va;
 
-   va = r600_resource(buffer)->gpu_address;
+   va = r600_resource(buffer)->gpu_address + offset;
 
switch (element_size) {
default:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 808b9bc..a120282 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -316,7 +316,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
return 4095;
case PIPE_CAP_MAX_VERTEX_STREAMS:
-   return 1;
+   return 4;
 
case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
return 2048;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index fa31f73..d8bab87 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -31,6 +31,7 @@
 #include "gallivm/lp_bld_intr.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_bitarit.h"
 #include "gallivm/lp_bld_flow.h"
 #include "radeon/r600_cs.h"
 #include "radeon/radeon_llvm.h"
@@ -87,8 +88,8 @@ struct si_shader_context
LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
LLVMValueRef so_buffers[4];
LLVMValueRef esgs_ring;
-   LLVMValueRef gsvs_ring;
-   LLVMValueRef gs_next_vertex;
+   LLVMValueRef gsvs_ring[4];
+   LLVMValueRef gs_next_vertex[4];
 };
 
 static struct si_shader_context * si_shader_context(
@@ -1576,6 +1577,9 @@ static void si_llvm_emit_streamout(struct 
si_shader_context *shader,
LLVMValueRef can_emit =
LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
 
+   LLVMValueRef stream_id =
+   unpack_param(shader, shader->param_streamout_config, 24, 2);
+
/* Emit the streamout code conditionally. This actually avoids
 * out-of-bounds buffer access. The hw tells us via the SGPR
 * (so_vtx_count) which threads are allowed to emit streamout data. */
@@ -1615,7 +1619,9 @@ static void si_llvm_emit_streamout(struct 
si_shader_context *shader,
unsigned reg = so->output[i].register_index;
unsigned start = so->output[i].start_component;
unsigned num_comps = so->output[i].num_components;
+   unsigned stream = so->output[i].stream;
LLVMValueRef out[4];
+   struct lp_build_if_state if_ctx_stream;
 
assert(num_comps && num_comps <= 4);
if (!num_comps || num_comps > 4)
@@ -1649,11 +1655,18 @@ static void si_llvm_emit_streamout(struct 
si_shader_context *s

Re: [Mesa-dev] [PATCH] radeonsi: add GS multiple streams support (v2)

2015-07-30 Thread Marek Olšák
Reviewed-by: Marek Olšák 

Marek

On Thu, Jul 30, 2015 at 2:06 AM, Dave Airlie  wrote:
> From: Dave Airlie 
>
> This is the final piece for ARB_gpu_shader5,
>
> The code is based on the r600 code from Glenn Kennard,
> and myself.
>
> While developing this, I'm not 100% sure of all the calculations
> made in the GS registers, this is why the max_stream is worked
> out there and used to limit the changes in registers. Otherwise
> my initial attempts either regressed GS texelFetch tests
> or primitive-id-restart. The current code has no regressions
> in piglit.
>
> This commit doesn't enable ARB_gpu_shader5, since that just
> bumps the glsl level to 4.00, so I'll just do a separate patch
> for 4.10.
>
> v1.1: fix bug introduced in rebase.
> v2: Address Marek's review comments,
> remove my llvm stream code for simpler C,
> move gsvs_ring and gs_next_vertex to arrays.
>
> Signed-off-by: Dave Airlie 
> ---
>  src/gallium/drivers/radeonsi/si_descriptors.c   |  4 +-
>  src/gallium/drivers/radeonsi/si_pipe.c  |  2 +-
>  src/gallium/drivers/radeonsi/si_shader.c| 74 +++-
>  src/gallium/drivers/radeonsi/si_state.c |  4 --
>  src/gallium/drivers/radeonsi/si_state.h |  7 ++-
>  src/gallium/drivers/radeonsi/si_state_shaders.c | 75 
> +++--
>  6 files changed, 127 insertions(+), 39 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
> b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 2e2a35b..14bb6e1 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -724,7 +724,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint 
> shader, uint slot,
> struct pipe_resource *buffer,
> unsigned stride, unsigned num_records,
> bool add_tid, bool swizzle,
> -   unsigned element_size, unsigned index_stride)
> +   unsigned element_size, unsigned index_stride, 
> uint64_t offset)
>  {
> struct si_context *sctx = (struct si_context *)ctx;
> struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
> @@ -741,7 +741,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint 
> shader, uint slot,
> if (buffer) {
> uint64_t va;
>
> -   va = r600_resource(buffer)->gpu_address;
> +   va = r600_resource(buffer)->gpu_address + offset;
>
> switch (element_size) {
> default:
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
> b/src/gallium/drivers/radeonsi/si_pipe.c
> index 808b9bc..a120282 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -316,7 +316,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum 
> pipe_cap param)
> case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
> return 4095;
> case PIPE_CAP_MAX_VERTEX_STREAMS:
> -   return 1;
> +   return 4;
>
> case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
> return 2048;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
> b/src/gallium/drivers/radeonsi/si_shader.c
> index fa31f73..d8bab87 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -31,6 +31,7 @@
>  #include "gallivm/lp_bld_intr.h"
>  #include "gallivm/lp_bld_logic.h"
>  #include "gallivm/lp_bld_arit.h"
> +#include "gallivm/lp_bld_bitarit.h"
>  #include "gallivm/lp_bld_flow.h"
>  #include "radeon/r600_cs.h"
>  #include "radeon/radeon_llvm.h"
> @@ -87,8 +88,8 @@ struct si_shader_context
> LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
> LLVMValueRef so_buffers[4];
> LLVMValueRef esgs_ring;
> -   LLVMValueRef gsvs_ring;
> -   LLVMValueRef gs_next_vertex;
> +   LLVMValueRef gsvs_ring[4];
> +   LLVMValueRef gs_next_vertex[4];
>  };
>
>  static struct si_shader_context * si_shader_context(
> @@ -1576,6 +1577,9 @@ static void si_llvm_emit_streamout(struct 
> si_shader_context *shader,
> LLVMValueRef can_emit =
> LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
>
> +   LLVMValueRef stream_id =
> +   unpack_param(shader, shader->param_streamout_config, 24, 2);
> +
> /* Emit the streamout code conditionally. This actually avoids
>  * out-of-bounds buffer access. The hw tells us via the SGPR
>  * (so_vtx_count) which threads are allowed to emit streamout data. */
> @@ -1615,7 +1619,9 @@ static void si_llvm_emit_streamout(struct 
> si_shader_context *shader,
> unsigned reg = so->output[i].register_index;
> unsigned start = so->output[i].start_component;
> unsigned num_comps = so->output[i].num_components;
> +   unsigned stream = so->output[i].stream;
>