Re: [Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload

2015-10-08 Thread Pohjolainen, Topi
On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen wrote:
> The initial motivation for this patch was to avoid calling
> brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the
> compiler. This commit ends up refactoring things a bit more so as to
> split out the logic to build the local id payload to brw_fs.cpp. This
> moves the payload building closer to the compiler code that uses the
> payload layout and makes it avaiable to other users of the compiler.
> 
> Signed-off-by: Kristian Høgsberg Kristensen 
> ---
>  src/mesa/drivers/dri/i965/brw_context.h   |  1 +
>  src/mesa/drivers/dri/i965/brw_cs.h|  5 +-
>  src/mesa/drivers/dri/i965/brw_fs.cpp  | 68 --
>  src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 
> ---
>  4 files changed, 76 insertions(+), 78 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
> b/src/mesa/drivers/dri/i965/brw_context.h
> index 0a29a69..1869f28 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -484,6 +484,7 @@ struct brw_cs_prog_data {
> unsigned simd_size;
> bool uses_barrier;
> bool uses_num_work_groups;
> +   unsigned local_invocation_id_regs;
>  
> struct {
>/** @{
> diff --git a/src/mesa/drivers/dri/i965/brw_cs.h 
> b/src/mesa/drivers/dri/i965/brw_cs.h
> index 0c0ed2b..c07eb6c 100644
> --- a/src/mesa/drivers/dri/i965/brw_cs.h
> +++ b/src/mesa/drivers/dri/i965/brw_cs.h
> @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw,
>  struct gl_shader_program *prog,
>  unsigned *final_assembly_size);
>  
> -unsigned
> -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width);
> +void
> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
> + void *buffer, uint32_t threads, uint32_t 
> stride);
>  
>  #ifdef __cplusplus
>  }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 1dee888..b4125aa 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload()
> payload.num_regs = 2;
>  }
>  
> +/**
> + * We are building the local ID push constant data using the simplest 
> possible
> + * method. We simply push the local IDs directly as they should appear in the
> + * registers for the uvec3 gl_LocalInvocationID variable.
> + *
> + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
> + * registers worth of push constant space.
> + *
> + * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
> + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup 
> need
> + * to coordinated.
> + *
> + * FINISHME: There are a few easy optimizations to consider.
> + *
> + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
> + *no need for using push constant space for that dimension.
> + *
> + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
> + *easily use 16-bit words rather than 32-bit dwords in the push constant
> + *data.
> + *
> + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
> + *conveying the data, and thereby reduce push constant usage.
> + *
> + */
>  void
>  fs_visitor::setup_cs_payload()
>  {
> assert(devinfo->gen >= 7);
> +   brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
>  
> payload.num_regs = 1;
>  
> if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
> -  const unsigned local_id_dwords =
> - brw_cs_prog_local_id_payload_dwords(dispatch_width);
> -  assert((local_id_dwords & 0x7) == 0);
> -  const unsigned local_id_regs = local_id_dwords / 8;
> +  prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
>payload.local_invocation_id_reg = payload.num_regs;
> -  payload.num_regs += local_id_regs;
> +  payload.num_regs += prog_data->local_invocation_id_regs;
> }
>  }
>  
> @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw,
> return g.get_assembly(final_assembly_size);
>  }
>  
> +void
> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
> + void *buffer, uint32_t threads, uint32_t stride)
> +{
> +   if (prog_data->local_invocation_id_regs == 0)
> +  return;
> +
> +   /* stride should be an integer number of registers, that is, a multiple of
> +* 32 bytes. */

Comment closing on its own line and you could start the sentence with a
capital 'S' as well.

> +   assert(stride % 32 == 0);
> +
> +   unsigned x = 0, y = 0, z = 0;
> +   for (unsigned t = 0; t < threads; t++) {
> +  uint32_t *param = (uint32_t *) buffer + stride * t / 4;
> +
> +  for (unsigned i = 0; i < prog_data->simd_size; i++) {
> + param[0 * prog_data->simd_size + i] = x;
> + 

Re: [Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload

2015-10-08 Thread Pohjolainen, Topi
On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen wrote:
> The initial motivation for this patch was to avoid calling
> brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the
> compiler. This commit ends up refactoring things a bit more so as to
> split out the logic to build the local id payload to brw_fs.cpp. This
> moves the payload building closer to the compiler code that uses the
> payload layout and makes it avaiable to other users of the compiler.
  ^

  available
> 
> Signed-off-by: Kristian Høgsberg Kristensen 
> ---
>  src/mesa/drivers/dri/i965/brw_context.h   |  1 +
>  src/mesa/drivers/dri/i965/brw_cs.h|  5 +-
>  src/mesa/drivers/dri/i965/brw_fs.cpp  | 68 --
>  src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 
> ---
>  4 files changed, 76 insertions(+), 78 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
> b/src/mesa/drivers/dri/i965/brw_context.h
> index 0a29a69..1869f28 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -484,6 +484,7 @@ struct brw_cs_prog_data {
> unsigned simd_size;
> bool uses_barrier;
> bool uses_num_work_groups;
> +   unsigned local_invocation_id_regs;
>  
> struct {
>/** @{
> diff --git a/src/mesa/drivers/dri/i965/brw_cs.h 
> b/src/mesa/drivers/dri/i965/brw_cs.h
> index 0c0ed2b..c07eb6c 100644
> --- a/src/mesa/drivers/dri/i965/brw_cs.h
> +++ b/src/mesa/drivers/dri/i965/brw_cs.h
> @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw,
>  struct gl_shader_program *prog,
>  unsigned *final_assembly_size);
>  
> -unsigned
> -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width);
> +void
> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
> + void *buffer, uint32_t threads, uint32_t 
> stride);
>  
>  #ifdef __cplusplus
>  }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 1dee888..b4125aa 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload()
> payload.num_regs = 2;
>  }
>  
> +/**
> + * We are building the local ID push constant data using the simplest 
> possible
> + * method. We simply push the local IDs directly as they should appear in the
> + * registers for the uvec3 gl_LocalInvocationID variable.
> + *
> + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
> + * registers worth of push constant space.
> + *
> + * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
> + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup 
> need
> + * to coordinated.
> + *
> + * FINISHME: There are a few easy optimizations to consider.
> + *
> + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
> + *no need for using push constant space for that dimension.
> + *
> + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
> + *easily use 16-bit words rather than 32-bit dwords in the push constant
> + *data.
> + *
> + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
> + *conveying the data, and thereby reduce push constant usage.
> + *
> + */
>  void
>  fs_visitor::setup_cs_payload()
>  {
> assert(devinfo->gen >= 7);
> +   brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
>  
> payload.num_regs = 1;
>  
> if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
> -  const unsigned local_id_dwords =
> - brw_cs_prog_local_id_payload_dwords(dispatch_width);
> -  assert((local_id_dwords & 0x7) == 0);
> -  const unsigned local_id_regs = local_id_dwords / 8;
> +  prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;

Subtle change in functionality here. Before this was fixed to 3 without
taking the dispatch width into account. Might be worth to note in the commit
message.

>payload.local_invocation_id_reg = payload.num_regs;
> -  payload.num_regs += local_id_regs;
> +  payload.num_regs += prog_data->local_invocation_id_regs;
> }
>  }
>  
> @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw,
> return g.get_assembly(final_assembly_size);
>  }
>  
> +void
> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
> + void *buffer, uint32_t threads, uint32_t stride)
> +{
> +   if (prog_data->local_invocation_id_regs == 0)
> +  return;
> +
> +   /* stride should be an integer number of registers, that is, a multiple of
> +* 32 bytes. */
> +   assert(stride % 32 == 0);
> +
> +   unsigned x = 0, y = 0, z = 0;
> +   for (unsigned t = 0; t < threads; t++) {
> +  uint32_t *param = (uint32_t *) buffer + 

Re: [Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload

2015-10-08 Thread Kristian Høgsberg
On Thu, Oct 8, 2015 at 1:34 AM, Pohjolainen, Topi
 wrote:
> On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen wrote:
>> The initial motivation for this patch was to avoid calling
>> brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the
>> compiler. This commit ends up refactoring things a bit more so as to
>> split out the logic to build the local id payload to brw_fs.cpp. This
>> moves the payload building closer to the compiler code that uses the
>> payload layout and makes it avaiable to other users of the compiler.
>   ^
>
>   available

Thanks, fixed.

>>
>> Signed-off-by: Kristian Høgsberg Kristensen 
>> ---
>>  src/mesa/drivers/dri/i965/brw_context.h   |  1 +
>>  src/mesa/drivers/dri/i965/brw_cs.h|  5 +-
>>  src/mesa/drivers/dri/i965/brw_fs.cpp  | 68 --
>>  src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 
>> ---
>>  4 files changed, 76 insertions(+), 78 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
>> b/src/mesa/drivers/dri/i965/brw_context.h
>> index 0a29a69..1869f28 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>> @@ -484,6 +484,7 @@ struct brw_cs_prog_data {
>> unsigned simd_size;
>> bool uses_barrier;
>> bool uses_num_work_groups;
>> +   unsigned local_invocation_id_regs;
>>
>> struct {
>>/** @{
>> diff --git a/src/mesa/drivers/dri/i965/brw_cs.h 
>> b/src/mesa/drivers/dri/i965/brw_cs.h
>> index 0c0ed2b..c07eb6c 100644
>> --- a/src/mesa/drivers/dri/i965/brw_cs.h
>> +++ b/src/mesa/drivers/dri/i965/brw_cs.h
>> @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw,
>>  struct gl_shader_program *prog,
>>  unsigned *final_assembly_size);
>>
>> -unsigned
>> -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width);
>> +void
>> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
>> + void *buffer, uint32_t threads, uint32_t 
>> stride);
>>
>>  #ifdef __cplusplus
>>  }
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> index 1dee888..b4125aa 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload()
>> payload.num_regs = 2;
>>  }
>>
>> +/**
>> + * We are building the local ID push constant data using the simplest 
>> possible
>> + * method. We simply push the local IDs directly as they should appear in 
>> the
>> + * registers for the uvec3 gl_LocalInvocationID variable.
>> + *
>> + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
>> + * registers worth of push constant space.
>> + *
>> + * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
>> + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup 
>> need
>> + * to coordinated.
>> + *
>> + * FINISHME: There are a few easy optimizations to consider.
>> + *
>> + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
>> + *no need for using push constant space for that dimension.
>> + *
>> + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
>> + *easily use 16-bit words rather than 32-bit dwords in the push constant
>> + *data.
>> + *
>> + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
>> + *conveying the data, and thereby reduce push constant usage.
>> + *
>> + */
>>  void
>>  fs_visitor::setup_cs_payload()
>>  {
>> assert(devinfo->gen >= 7);
>> +   brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
>>
>> payload.num_regs = 1;
>>
>> if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
>> -  const unsigned local_id_dwords =
>> - brw_cs_prog_local_id_payload_dwords(dispatch_width);
>> -  assert((local_id_dwords & 0x7) == 0);
>> -  const unsigned local_id_regs = local_id_dwords / 8;
>> +  prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
>
> Subtle change in functionality here. Before this was fixed to 3 without
> taking the dispatch width into account. Might be worth to note in the commit
> message.

brw_cs_prog_local_id_payload_dwords() used to return 3 *
dispatch_width, which the code above then divided by 8. Did I miss
sommething?

Kristian

>>payload.local_invocation_id_reg = payload.num_regs;
>> -  payload.num_regs += local_id_regs;
>> +  payload.num_regs += prog_data->local_invocation_id_regs;
>> }
>>  }
>>
>> @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw,
>> return g.get_assembly(final_assembly_size);
>>  }
>>
>> +void
>> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
>> + void *buffer, uint32_t threads, 

Re: [Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload

2015-10-08 Thread Kristian Høgsberg
On Thu, Oct 8, 2015 at 1:53 AM, Pohjolainen, Topi
 wrote:
> On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen wrote:
>> The initial motivation for this patch was to avoid calling
>> brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the
>> compiler. This commit ends up refactoring things a bit more so as to
>> split out the logic to build the local id payload to brw_fs.cpp. This
>> moves the payload building closer to the compiler code that uses the
>> payload layout and makes it avaiable to other users of the compiler.
>>
>> Signed-off-by: Kristian Høgsberg Kristensen 
>> ---
>>  src/mesa/drivers/dri/i965/brw_context.h   |  1 +
>>  src/mesa/drivers/dri/i965/brw_cs.h|  5 +-
>>  src/mesa/drivers/dri/i965/brw_fs.cpp  | 68 --
>>  src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 
>> ---
>>  4 files changed, 76 insertions(+), 78 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
>> b/src/mesa/drivers/dri/i965/brw_context.h
>> index 0a29a69..1869f28 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>> @@ -484,6 +484,7 @@ struct brw_cs_prog_data {
>> unsigned simd_size;
>> bool uses_barrier;
>> bool uses_num_work_groups;
>> +   unsigned local_invocation_id_regs;
>>
>> struct {
>>/** @{
>> diff --git a/src/mesa/drivers/dri/i965/brw_cs.h 
>> b/src/mesa/drivers/dri/i965/brw_cs.h
>> index 0c0ed2b..c07eb6c 100644
>> --- a/src/mesa/drivers/dri/i965/brw_cs.h
>> +++ b/src/mesa/drivers/dri/i965/brw_cs.h
>> @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw,
>>  struct gl_shader_program *prog,
>>  unsigned *final_assembly_size);
>>
>> -unsigned
>> -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width);
>> +void
>> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
>> + void *buffer, uint32_t threads, uint32_t 
>> stride);
>>
>>  #ifdef __cplusplus
>>  }
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> index 1dee888..b4125aa 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload()
>> payload.num_regs = 2;
>>  }
>>
>> +/**
>> + * We are building the local ID push constant data using the simplest 
>> possible
>> + * method. We simply push the local IDs directly as they should appear in 
>> the
>> + * registers for the uvec3 gl_LocalInvocationID variable.
>> + *
>> + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
>> + * registers worth of push constant space.
>> + *
>> + * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
>> + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup 
>> need
>> + * to coordinated.
>> + *
>> + * FINISHME: There are a few easy optimizations to consider.
>> + *
>> + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
>> + *no need for using push constant space for that dimension.
>> + *
>> + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
>> + *easily use 16-bit words rather than 32-bit dwords in the push constant
>> + *data.
>> + *
>> + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
>> + *conveying the data, and thereby reduce push constant usage.
>> + *
>> + */
>>  void
>>  fs_visitor::setup_cs_payload()
>>  {
>> assert(devinfo->gen >= 7);
>> +   brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
>>
>> payload.num_regs = 1;
>>
>> if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
>> -  const unsigned local_id_dwords =
>> - brw_cs_prog_local_id_payload_dwords(dispatch_width);
>> -  assert((local_id_dwords & 0x7) == 0);
>> -  const unsigned local_id_regs = local_id_dwords / 8;
>> +  prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
>>payload.local_invocation_id_reg = payload.num_regs;
>> -  payload.num_regs += local_id_regs;
>> +  payload.num_regs += prog_data->local_invocation_id_regs;
>> }
>>  }
>>
>> @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw,
>> return g.get_assembly(final_assembly_size);
>>  }
>>
>> +void
>> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
>> + void *buffer, uint32_t threads, uint32_t 
>> stride)
>> +{
>> +   if (prog_data->local_invocation_id_regs == 0)
>> +  return;
>> +
>> +   /* stride should be an integer number of registers, that is, a multiple 
>> of
>> +* 32 bytes. */
>
> Comment closing on its own line and you could start the sentence with a
> capital 'S' as well.

I fixed the comment closing, but 'stride' refers to the argument named
'stride' and I wanted to keep it the 

Re: [Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload

2015-10-08 Thread Pohjolainen, Topi
On Thu, Oct 08, 2015 at 10:53:59AM -0700, Kristian H?gsberg wrote:
> On Thu, Oct 8, 2015 at 1:53 AM, Pohjolainen, Topi
>  wrote:
> > On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen 
> > wrote:
> >> The initial motivation for this patch was to avoid calling
> >> brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the
> >> compiler. This commit ends up refactoring things a bit more so as to
> >> split out the logic to build the local id payload to brw_fs.cpp. This
> >> moves the payload building closer to the compiler code that uses the
> >> payload layout and makes it avaiable to other users of the compiler.
> >>
> >> Signed-off-by: Kristian Høgsberg Kristensen 
> >> ---
> >>  src/mesa/drivers/dri/i965/brw_context.h   |  1 +
> >>  src/mesa/drivers/dri/i965/brw_cs.h|  5 +-
> >>  src/mesa/drivers/dri/i965/brw_fs.cpp  | 68 --
> >>  src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 
> >> ---
> >>  4 files changed, 76 insertions(+), 78 deletions(-)
> >>
> >> diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
> >> b/src/mesa/drivers/dri/i965/brw_context.h
> >> index 0a29a69..1869f28 100644
> >> --- a/src/mesa/drivers/dri/i965/brw_context.h
> >> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> >> @@ -484,6 +484,7 @@ struct brw_cs_prog_data {
> >> unsigned simd_size;
> >> bool uses_barrier;
> >> bool uses_num_work_groups;
> >> +   unsigned local_invocation_id_regs;
> >>
> >> struct {
> >>/** @{
> >> diff --git a/src/mesa/drivers/dri/i965/brw_cs.h 
> >> b/src/mesa/drivers/dri/i965/brw_cs.h
> >> index 0c0ed2b..c07eb6c 100644
> >> --- a/src/mesa/drivers/dri/i965/brw_cs.h
> >> +++ b/src/mesa/drivers/dri/i965/brw_cs.h
> >> @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw,
> >>  struct gl_shader_program *prog,
> >>  unsigned *final_assembly_size);
> >>
> >> -unsigned
> >> -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width);
> >> +void
> >> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
> >> + void *buffer, uint32_t threads, uint32_t 
> >> stride);
> >>
> >>  #ifdef __cplusplus
> >>  }
> >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> >> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> >> index 1dee888..b4125aa 100644
> >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> >> @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload()
> >> payload.num_regs = 2;
> >>  }
> >>
> >> +/**
> >> + * We are building the local ID push constant data using the simplest 
> >> possible
> >> + * method. We simply push the local IDs directly as they should appear in 
> >> the
> >> + * registers for the uvec3 gl_LocalInvocationID variable.
> >> + *
> >> + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
> >> + * registers worth of push constant space.
> >> + *
> >> + * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
> >> + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup 
> >> need
> >> + * to coordinated.
> >> + *
> >> + * FINISHME: There are a few easy optimizations to consider.
> >> + *
> >> + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there 
> >> is
> >> + *no need for using push constant space for that dimension.
> >> + *
> >> + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we 
> >> can
> >> + *easily use 16-bit words rather than 32-bit dwords in the push 
> >> constant
> >> + *data.
> >> + *
> >> + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
> >> + *conveying the data, and thereby reduce push constant usage.
> >> + *
> >> + */
> >>  void
> >>  fs_visitor::setup_cs_payload()
> >>  {
> >> assert(devinfo->gen >= 7);
> >> +   brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
> >>
> >> payload.num_regs = 1;
> >>
> >> if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
> >> -  const unsigned local_id_dwords =
> >> - brw_cs_prog_local_id_payload_dwords(dispatch_width);
> >> -  assert((local_id_dwords & 0x7) == 0);
> >> -  const unsigned local_id_regs = local_id_dwords / 8;
> >> +  prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
> >>payload.local_invocation_id_reg = payload.num_regs;
> >> -  payload.num_regs += local_id_regs;
> >> +  payload.num_regs += prog_data->local_invocation_id_regs;
> >> }
> >>  }
> >>
> >> @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw,
> >> return g.get_assembly(final_assembly_size);
> >>  }
> >>
> >> +void
> >> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
> >> + void *buffer, uint32_t threads, uint32_t 
> >> stride)
> >> +{
> >> +   if (prog_data->local_invocation_id_regs == 0)
> >> +  

[Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload

2015-10-07 Thread Kristian Høgsberg Kristensen
The initial motivation for this patch was to avoid calling
brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the
compiler. This commit ends up refactoring things a bit more so as to
split out the logic to build the local id payload to brw_fs.cpp. This
moves the payload building closer to the compiler code that uses the
payload layout and makes it avaiable to other users of the compiler.

Signed-off-by: Kristian Høgsberg Kristensen 
---
 src/mesa/drivers/dri/i965/brw_context.h   |  1 +
 src/mesa/drivers/dri/i965/brw_cs.h|  5 +-
 src/mesa/drivers/dri/i965/brw_fs.cpp  | 68 --
 src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 ---
 4 files changed, 76 insertions(+), 78 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 0a29a69..1869f28 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -484,6 +484,7 @@ struct brw_cs_prog_data {
unsigned simd_size;
bool uses_barrier;
bool uses_num_work_groups;
+   unsigned local_invocation_id_regs;
 
struct {
   /** @{
diff --git a/src/mesa/drivers/dri/i965/brw_cs.h 
b/src/mesa/drivers/dri/i965/brw_cs.h
index 0c0ed2b..c07eb6c 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.h
+++ b/src/mesa/drivers/dri/i965/brw_cs.h
@@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw,
 struct gl_shader_program *prog,
 unsigned *final_assembly_size);
 
-unsigned
-brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width);
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
+ void *buffer, uint32_t threads, uint32_t stride);
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 1dee888..b4125aa 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload()
payload.num_regs = 2;
 }
 
+/**
+ * We are building the local ID push constant data using the simplest possible
+ * method. We simply push the local IDs directly as they should appear in the
+ * registers for the uvec3 gl_LocalInvocationID variable.
+ *
+ * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
+ * registers worth of push constant space.
+ *
+ * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
+ * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
+ * to coordinated.
+ *
+ * FINISHME: There are a few easy optimizations to consider.
+ *
+ * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
+ *no need for using push constant space for that dimension.
+ *
+ * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
+ *easily use 16-bit words rather than 32-bit dwords in the push constant
+ *data.
+ *
+ * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
+ *conveying the data, and thereby reduce push constant usage.
+ *
+ */
 void
 fs_visitor::setup_cs_payload()
 {
assert(devinfo->gen >= 7);
+   brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
 
payload.num_regs = 1;
 
if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
-  const unsigned local_id_dwords =
- brw_cs_prog_local_id_payload_dwords(dispatch_width);
-  assert((local_id_dwords & 0x7) == 0);
-  const unsigned local_id_regs = local_id_dwords / 8;
+  prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
   payload.local_invocation_id_reg = payload.num_regs;
-  payload.num_regs += local_id_regs;
+  payload.num_regs += prog_data->local_invocation_id_regs;
}
 }
 
@@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw,
return g.get_assembly(final_assembly_size);
 }
 
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
+ void *buffer, uint32_t threads, uint32_t stride)
+{
+   if (prog_data->local_invocation_id_regs == 0)
+  return;
+
+   /* stride should be an integer number of registers, that is, a multiple of
+* 32 bytes. */
+   assert(stride % 32 == 0);
+
+   unsigned x = 0, y = 0, z = 0;
+   for (unsigned t = 0; t < threads; t++) {
+  uint32_t *param = (uint32_t *) buffer + stride * t / 4;
+
+  for (unsigned i = 0; i < prog_data->simd_size; i++) {
+ param[0 * prog_data->simd_size + i] = x;
+ param[1 * prog_data->simd_size + i] = y;
+ param[2 * prog_data->simd_size + i] = z;
+
+ x++;
+ if (x == prog_data->local_size[0]) {
+x = 0;
+y++;
+if (y == prog_data->local_size[1]) {
+   y = 0;
+   z++;
+   if (z == prog_data->local_size[2])
+  z = 0;
+}
+ }
+  }
+   }
+}
+
 fs_reg *