Re: [Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload
On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen wrote: > The initial motivation for this patch was to avoid calling > brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the > compiler. This commit ends up refactoring things a bit more so as to > split out the logic to build the local id payload to brw_fs.cpp. This > moves the payload building closer to the compiler code that uses the > payload layout and makes it avaiable to other users of the compiler. > > Signed-off-by: Kristian Høgsberg Kristensen> --- > src/mesa/drivers/dri/i965/brw_context.h | 1 + > src/mesa/drivers/dri/i965/brw_cs.h| 5 +- > src/mesa/drivers/dri/i965/brw_fs.cpp | 68 -- > src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 > --- > 4 files changed, 76 insertions(+), 78 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > index 0a29a69..1869f28 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.h > +++ b/src/mesa/drivers/dri/i965/brw_context.h > @@ -484,6 +484,7 @@ struct brw_cs_prog_data { > unsigned simd_size; > bool uses_barrier; > bool uses_num_work_groups; > + unsigned local_invocation_id_regs; > > struct { >/** @{ > diff --git a/src/mesa/drivers/dri/i965/brw_cs.h > b/src/mesa/drivers/dri/i965/brw_cs.h > index 0c0ed2b..c07eb6c 100644 > --- a/src/mesa/drivers/dri/i965/brw_cs.h > +++ b/src/mesa/drivers/dri/i965/brw_cs.h > @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw, > struct gl_shader_program *prog, > unsigned *final_assembly_size); > > -unsigned > -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width); > +void > +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, > + void *buffer, uint32_t threads, uint32_t > stride); > > #ifdef __cplusplus > } > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index 1dee888..b4125aa 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload() > payload.num_regs = 2; > } > > +/** > + * We are building the local ID push constant data using the simplest > possible > + * method. We simply push the local IDs directly as they should appear in the > + * registers for the uvec3 gl_LocalInvocationID variable. > + * > + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6 > + * registers worth of push constant space. > + * > + * Note: Any updates to brw_cs_prog_local_id_payload_dwords, > + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup > need > + * to coordinated. > + * > + * FINISHME: There are a few easy optimizations to consider. > + * > + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is > + *no need for using push constant space for that dimension. > + * > + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can > + *easily use 16-bit words rather than 32-bit dwords in the push constant > + *data. > + * > + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for > + *conveying the data, and thereby reduce push constant usage. > + * > + */ > void > fs_visitor::setup_cs_payload() > { > assert(devinfo->gen >= 7); > + brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data; > > payload.num_regs = 1; > > if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) { > - const unsigned local_id_dwords = > - brw_cs_prog_local_id_payload_dwords(dispatch_width); > - assert((local_id_dwords & 0x7) == 0); > - const unsigned local_id_regs = local_id_dwords / 8; > + prog_data->local_invocation_id_regs = dispatch_width * 3 / 8; >payload.local_invocation_id_reg = payload.num_regs; > - payload.num_regs += local_id_regs; > + payload.num_regs += prog_data->local_invocation_id_regs; > } > } > > @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw, > return g.get_assembly(final_assembly_size); > } > > +void > +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data, > + void *buffer, uint32_t threads, uint32_t stride) > +{ > + if (prog_data->local_invocation_id_regs == 0) > + return; > + > + /* stride should be an integer number of registers, that is, a multiple of > +* 32 bytes. */ Comment closing on its own line and you could start the sentence with a capital 'S' as well. > + assert(stride % 32 == 0); > + > + unsigned x = 0, y = 0, z = 0; > + for (unsigned t = 0; t < threads; t++) { > + uint32_t *param = (uint32_t *) buffer + stride * t / 4; > + > + for (unsigned i = 0; i < prog_data->simd_size; i++) { > + param[0 * prog_data->simd_size + i] = x; > +
Re: [Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload
On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen wrote: > The initial motivation for this patch was to avoid calling > brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the > compiler. This commit ends up refactoring things a bit more so as to > split out the logic to build the local id payload to brw_fs.cpp. This > moves the payload building closer to the compiler code that uses the > payload layout and makes it avaiable to other users of the compiler. ^ available > > Signed-off-by: Kristian Høgsberg Kristensen> --- > src/mesa/drivers/dri/i965/brw_context.h | 1 + > src/mesa/drivers/dri/i965/brw_cs.h| 5 +- > src/mesa/drivers/dri/i965/brw_fs.cpp | 68 -- > src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 > --- > 4 files changed, 76 insertions(+), 78 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > index 0a29a69..1869f28 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.h > +++ b/src/mesa/drivers/dri/i965/brw_context.h > @@ -484,6 +484,7 @@ struct brw_cs_prog_data { > unsigned simd_size; > bool uses_barrier; > bool uses_num_work_groups; > + unsigned local_invocation_id_regs; > > struct { >/** @{ > diff --git a/src/mesa/drivers/dri/i965/brw_cs.h > b/src/mesa/drivers/dri/i965/brw_cs.h > index 0c0ed2b..c07eb6c 100644 > --- a/src/mesa/drivers/dri/i965/brw_cs.h > +++ b/src/mesa/drivers/dri/i965/brw_cs.h > @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw, > struct gl_shader_program *prog, > unsigned *final_assembly_size); > > -unsigned > -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width); > +void > +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, > + void *buffer, uint32_t threads, uint32_t > stride); > > #ifdef __cplusplus > } > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index 1dee888..b4125aa 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload() > payload.num_regs = 2; > } > > +/** > + * We are building the local ID push constant data using the simplest > possible > + * method. We simply push the local IDs directly as they should appear in the > + * registers for the uvec3 gl_LocalInvocationID variable. > + * > + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6 > + * registers worth of push constant space. > + * > + * Note: Any updates to brw_cs_prog_local_id_payload_dwords, > + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup > need > + * to coordinated. > + * > + * FINISHME: There are a few easy optimizations to consider. > + * > + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is > + *no need for using push constant space for that dimension. > + * > + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can > + *easily use 16-bit words rather than 32-bit dwords in the push constant > + *data. > + * > + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for > + *conveying the data, and thereby reduce push constant usage. > + * > + */ > void > fs_visitor::setup_cs_payload() > { > assert(devinfo->gen >= 7); > + brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data; > > payload.num_regs = 1; > > if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) { > - const unsigned local_id_dwords = > - brw_cs_prog_local_id_payload_dwords(dispatch_width); > - assert((local_id_dwords & 0x7) == 0); > - const unsigned local_id_regs = local_id_dwords / 8; > + prog_data->local_invocation_id_regs = dispatch_width * 3 / 8; Subtle change in functionality here. Before this was fixed to 3 without taking the dispatch width into account. Might be worth to note in the commit message. >payload.local_invocation_id_reg = payload.num_regs; > - payload.num_regs += local_id_regs; > + payload.num_regs += prog_data->local_invocation_id_regs; > } > } > > @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw, > return g.get_assembly(final_assembly_size); > } > > +void > +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data, > + void *buffer, uint32_t threads, uint32_t stride) > +{ > + if (prog_data->local_invocation_id_regs == 0) > + return; > + > + /* stride should be an integer number of registers, that is, a multiple of > +* 32 bytes. */ > + assert(stride % 32 == 0); > + > + unsigned x = 0, y = 0, z = 0; > + for (unsigned t = 0; t < threads; t++) { > + uint32_t *param = (uint32_t *) buffer +
Re: [Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload
On Thu, Oct 8, 2015 at 1:34 AM, Pohjolainen, Topiwrote: > On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen wrote: >> The initial motivation for this patch was to avoid calling >> brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the >> compiler. This commit ends up refactoring things a bit more so as to >> split out the logic to build the local id payload to brw_fs.cpp. This >> moves the payload building closer to the compiler code that uses the >> payload layout and makes it avaiable to other users of the compiler. > ^ > > available Thanks, fixed. >> >> Signed-off-by: Kristian Høgsberg Kristensen >> --- >> src/mesa/drivers/dri/i965/brw_context.h | 1 + >> src/mesa/drivers/dri/i965/brw_cs.h| 5 +- >> src/mesa/drivers/dri/i965/brw_fs.cpp | 68 -- >> src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 >> --- >> 4 files changed, 76 insertions(+), 78 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_context.h >> b/src/mesa/drivers/dri/i965/brw_context.h >> index 0a29a69..1869f28 100644 >> --- a/src/mesa/drivers/dri/i965/brw_context.h >> +++ b/src/mesa/drivers/dri/i965/brw_context.h >> @@ -484,6 +484,7 @@ struct brw_cs_prog_data { >> unsigned simd_size; >> bool uses_barrier; >> bool uses_num_work_groups; >> + unsigned local_invocation_id_regs; >> >> struct { >>/** @{ >> diff --git a/src/mesa/drivers/dri/i965/brw_cs.h >> b/src/mesa/drivers/dri/i965/brw_cs.h >> index 0c0ed2b..c07eb6c 100644 >> --- a/src/mesa/drivers/dri/i965/brw_cs.h >> +++ b/src/mesa/drivers/dri/i965/brw_cs.h >> @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw, >> struct gl_shader_program *prog, >> unsigned *final_assembly_size); >> >> -unsigned >> -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width); >> +void >> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, >> + void *buffer, uint32_t threads, uint32_t >> stride); >> >> #ifdef __cplusplus >> } >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp >> b/src/mesa/drivers/dri/i965/brw_fs.cpp >> index 1dee888..b4125aa 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp >> @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload() >> payload.num_regs = 2; >> } >> >> +/** >> + * We are building the local ID push constant data using the simplest >> possible >> + * method. We simply push the local IDs directly as they should appear in >> the >> + * registers for the uvec3 gl_LocalInvocationID variable. >> + * >> + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6 >> + * registers worth of push constant space. >> + * >> + * Note: Any updates to brw_cs_prog_local_id_payload_dwords, >> + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup >> need >> + * to coordinated. >> + * >> + * FINISHME: There are a few easy optimizations to consider. >> + * >> + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is >> + *no need for using push constant space for that dimension. >> + * >> + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can >> + *easily use 16-bit words rather than 32-bit dwords in the push constant >> + *data. >> + * >> + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for >> + *conveying the data, and thereby reduce push constant usage. >> + * >> + */ >> void >> fs_visitor::setup_cs_payload() >> { >> assert(devinfo->gen >= 7); >> + brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data; >> >> payload.num_regs = 1; >> >> if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) { >> - const unsigned local_id_dwords = >> - brw_cs_prog_local_id_payload_dwords(dispatch_width); >> - assert((local_id_dwords & 0x7) == 0); >> - const unsigned local_id_regs = local_id_dwords / 8; >> + prog_data->local_invocation_id_regs = dispatch_width * 3 / 8; > > Subtle change in functionality here. Before this was fixed to 3 without > taking the dispatch width into account. Might be worth to note in the commit > message. brw_cs_prog_local_id_payload_dwords() used to return 3 * dispatch_width, which the code above then divided by 8. Did I miss sommething? Kristian >>payload.local_invocation_id_reg = payload.num_regs; >> - payload.num_regs += local_id_regs; >> + payload.num_regs += prog_data->local_invocation_id_regs; >> } >> } >> >> @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw, >> return g.get_assembly(final_assembly_size); >> } >> >> +void >> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data, >> + void *buffer, uint32_t threads,
Re: [Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload
On Thu, Oct 8, 2015 at 1:53 AM, Pohjolainen, Topiwrote: > On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen wrote: >> The initial motivation for this patch was to avoid calling >> brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the >> compiler. This commit ends up refactoring things a bit more so as to >> split out the logic to build the local id payload to brw_fs.cpp. This >> moves the payload building closer to the compiler code that uses the >> payload layout and makes it avaiable to other users of the compiler. >> >> Signed-off-by: Kristian Høgsberg Kristensen >> --- >> src/mesa/drivers/dri/i965/brw_context.h | 1 + >> src/mesa/drivers/dri/i965/brw_cs.h| 5 +- >> src/mesa/drivers/dri/i965/brw_fs.cpp | 68 -- >> src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 >> --- >> 4 files changed, 76 insertions(+), 78 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_context.h >> b/src/mesa/drivers/dri/i965/brw_context.h >> index 0a29a69..1869f28 100644 >> --- a/src/mesa/drivers/dri/i965/brw_context.h >> +++ b/src/mesa/drivers/dri/i965/brw_context.h >> @@ -484,6 +484,7 @@ struct brw_cs_prog_data { >> unsigned simd_size; >> bool uses_barrier; >> bool uses_num_work_groups; >> + unsigned local_invocation_id_regs; >> >> struct { >>/** @{ >> diff --git a/src/mesa/drivers/dri/i965/brw_cs.h >> b/src/mesa/drivers/dri/i965/brw_cs.h >> index 0c0ed2b..c07eb6c 100644 >> --- a/src/mesa/drivers/dri/i965/brw_cs.h >> +++ b/src/mesa/drivers/dri/i965/brw_cs.h >> @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw, >> struct gl_shader_program *prog, >> unsigned *final_assembly_size); >> >> -unsigned >> -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width); >> +void >> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, >> + void *buffer, uint32_t threads, uint32_t >> stride); >> >> #ifdef __cplusplus >> } >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp >> b/src/mesa/drivers/dri/i965/brw_fs.cpp >> index 1dee888..b4125aa 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp >> @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload() >> payload.num_regs = 2; >> } >> >> +/** >> + * We are building the local ID push constant data using the simplest >> possible >> + * method. We simply push the local IDs directly as they should appear in >> the >> + * registers for the uvec3 gl_LocalInvocationID variable. >> + * >> + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6 >> + * registers worth of push constant space. >> + * >> + * Note: Any updates to brw_cs_prog_local_id_payload_dwords, >> + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup >> need >> + * to coordinated. >> + * >> + * FINISHME: There are a few easy optimizations to consider. >> + * >> + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is >> + *no need for using push constant space for that dimension. >> + * >> + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can >> + *easily use 16-bit words rather than 32-bit dwords in the push constant >> + *data. >> + * >> + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for >> + *conveying the data, and thereby reduce push constant usage. >> + * >> + */ >> void >> fs_visitor::setup_cs_payload() >> { >> assert(devinfo->gen >= 7); >> + brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data; >> >> payload.num_regs = 1; >> >> if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) { >> - const unsigned local_id_dwords = >> - brw_cs_prog_local_id_payload_dwords(dispatch_width); >> - assert((local_id_dwords & 0x7) == 0); >> - const unsigned local_id_regs = local_id_dwords / 8; >> + prog_data->local_invocation_id_regs = dispatch_width * 3 / 8; >>payload.local_invocation_id_reg = payload.num_regs; >> - payload.num_regs += local_id_regs; >> + payload.num_regs += prog_data->local_invocation_id_regs; >> } >> } >> >> @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw, >> return g.get_assembly(final_assembly_size); >> } >> >> +void >> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data, >> + void *buffer, uint32_t threads, uint32_t >> stride) >> +{ >> + if (prog_data->local_invocation_id_regs == 0) >> + return; >> + >> + /* stride should be an integer number of registers, that is, a multiple >> of >> +* 32 bytes. */ > > Comment closing on its own line and you could start the sentence with a > capital 'S' as well. I fixed the comment closing, but 'stride' refers to the argument named 'stride' and I wanted to keep it the
Re: [Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload
On Thu, Oct 08, 2015 at 10:53:59AM -0700, Kristian H?gsberg wrote: > On Thu, Oct 8, 2015 at 1:53 AM, Pohjolainen, Topi >wrote: > > On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen > > wrote: > >> The initial motivation for this patch was to avoid calling > >> brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the > >> compiler. This commit ends up refactoring things a bit more so as to > >> split out the logic to build the local id payload to brw_fs.cpp. This > >> moves the payload building closer to the compiler code that uses the > >> payload layout and makes it avaiable to other users of the compiler. > >> > >> Signed-off-by: Kristian Høgsberg Kristensen > >> --- > >> src/mesa/drivers/dri/i965/brw_context.h | 1 + > >> src/mesa/drivers/dri/i965/brw_cs.h| 5 +- > >> src/mesa/drivers/dri/i965/brw_fs.cpp | 68 -- > >> src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 > >> --- > >> 4 files changed, 76 insertions(+), 78 deletions(-) > >> > >> diff --git a/src/mesa/drivers/dri/i965/brw_context.h > >> b/src/mesa/drivers/dri/i965/brw_context.h > >> index 0a29a69..1869f28 100644 > >> --- a/src/mesa/drivers/dri/i965/brw_context.h > >> +++ b/src/mesa/drivers/dri/i965/brw_context.h > >> @@ -484,6 +484,7 @@ struct brw_cs_prog_data { > >> unsigned simd_size; > >> bool uses_barrier; > >> bool uses_num_work_groups; > >> + unsigned local_invocation_id_regs; > >> > >> struct { > >>/** @{ > >> diff --git a/src/mesa/drivers/dri/i965/brw_cs.h > >> b/src/mesa/drivers/dri/i965/brw_cs.h > >> index 0c0ed2b..c07eb6c 100644 > >> --- a/src/mesa/drivers/dri/i965/brw_cs.h > >> +++ b/src/mesa/drivers/dri/i965/brw_cs.h > >> @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw, > >> struct gl_shader_program *prog, > >> unsigned *final_assembly_size); > >> > >> -unsigned > >> -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width); > >> +void > >> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, > >> + void *buffer, uint32_t threads, uint32_t > >> stride); > >> > >> #ifdef __cplusplus > >> } > >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > >> b/src/mesa/drivers/dri/i965/brw_fs.cpp > >> index 1dee888..b4125aa 100644 > >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > >> @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload() > >> payload.num_regs = 2; > >> } > >> > >> +/** > >> + * We are building the local ID push constant data using the simplest > >> possible > >> + * method. We simply push the local IDs directly as they should appear in > >> the > >> + * registers for the uvec3 gl_LocalInvocationID variable. > >> + * > >> + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6 > >> + * registers worth of push constant space. > >> + * > >> + * Note: Any updates to brw_cs_prog_local_id_payload_dwords, > >> + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup > >> need > >> + * to coordinated. > >> + * > >> + * FINISHME: There are a few easy optimizations to consider. > >> + * > >> + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there > >> is > >> + *no need for using push constant space for that dimension. > >> + * > >> + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we > >> can > >> + *easily use 16-bit words rather than 32-bit dwords in the push > >> constant > >> + *data. > >> + * > >> + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for > >> + *conveying the data, and thereby reduce push constant usage. > >> + * > >> + */ > >> void > >> fs_visitor::setup_cs_payload() > >> { > >> assert(devinfo->gen >= 7); > >> + brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data; > >> > >> payload.num_regs = 1; > >> > >> if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) { > >> - const unsigned local_id_dwords = > >> - brw_cs_prog_local_id_payload_dwords(dispatch_width); > >> - assert((local_id_dwords & 0x7) == 0); > >> - const unsigned local_id_regs = local_id_dwords / 8; > >> + prog_data->local_invocation_id_regs = dispatch_width * 3 / 8; > >>payload.local_invocation_id_reg = payload.num_regs; > >> - payload.num_regs += local_id_regs; > >> + payload.num_regs += prog_data->local_invocation_id_regs; > >> } > >> } > >> > >> @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw, > >> return g.get_assembly(final_assembly_size); > >> } > >> > >> +void > >> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data, > >> + void *buffer, uint32_t threads, uint32_t > >> stride) > >> +{ > >> + if (prog_data->local_invocation_id_regs == 0) > >> +
[Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload
The initial motivation for this patch was to avoid calling brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the compiler. This commit ends up refactoring things a bit more so as to split out the logic to build the local id payload to brw_fs.cpp. This moves the payload building closer to the compiler code that uses the payload layout and makes it avaiable to other users of the compiler. Signed-off-by: Kristian Høgsberg Kristensen--- src/mesa/drivers/dri/i965/brw_context.h | 1 + src/mesa/drivers/dri/i965/brw_cs.h| 5 +- src/mesa/drivers/dri/i965/brw_fs.cpp | 68 -- src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 --- 4 files changed, 76 insertions(+), 78 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 0a29a69..1869f28 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -484,6 +484,7 @@ struct brw_cs_prog_data { unsigned simd_size; bool uses_barrier; bool uses_num_work_groups; + unsigned local_invocation_id_regs; struct { /** @{ diff --git a/src/mesa/drivers/dri/i965/brw_cs.h b/src/mesa/drivers/dri/i965/brw_cs.h index 0c0ed2b..c07eb6c 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.h +++ b/src/mesa/drivers/dri/i965/brw_cs.h @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw, struct gl_shader_program *prog, unsigned *final_assembly_size); -unsigned -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width); +void +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, + void *buffer, uint32_t threads, uint32_t stride); #ifdef __cplusplus } diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 1dee888..b4125aa 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload() payload.num_regs = 2; } +/** + * We are building the local ID push constant data using the simplest possible + * method. We simply push the local IDs directly as they should appear in the + * registers for the uvec3 gl_LocalInvocationID variable. + * + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6 + * registers worth of push constant space. + * + * Note: Any updates to brw_cs_prog_local_id_payload_dwords, + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need + * to coordinated. + * + * FINISHME: There are a few easy optimizations to consider. + * + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is + *no need for using push constant space for that dimension. + * + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can + *easily use 16-bit words rather than 32-bit dwords in the push constant + *data. + * + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for + *conveying the data, and thereby reduce push constant usage. + * + */ void fs_visitor::setup_cs_payload() { assert(devinfo->gen >= 7); + brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data; payload.num_regs = 1; if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) { - const unsigned local_id_dwords = - brw_cs_prog_local_id_payload_dwords(dispatch_width); - assert((local_id_dwords & 0x7) == 0); - const unsigned local_id_regs = local_id_dwords / 8; + prog_data->local_invocation_id_regs = dispatch_width * 3 / 8; payload.local_invocation_id_reg = payload.num_regs; - payload.num_regs += local_id_regs; + payload.num_regs += prog_data->local_invocation_id_regs; } } @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw, return g.get_assembly(final_assembly_size); } +void +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data, + void *buffer, uint32_t threads, uint32_t stride) +{ + if (prog_data->local_invocation_id_regs == 0) + return; + + /* stride should be an integer number of registers, that is, a multiple of +* 32 bytes. */ + assert(stride % 32 == 0); + + unsigned x = 0, y = 0, z = 0; + for (unsigned t = 0; t < threads; t++) { + uint32_t *param = (uint32_t *) buffer + stride * t / 4; + + for (unsigned i = 0; i < prog_data->simd_size; i++) { + param[0 * prog_data->simd_size + i] = x; + param[1 * prog_data->simd_size + i] = y; + param[2 * prog_data->simd_size + i] = z; + + x++; + if (x == prog_data->local_size[0]) { +x = 0; +y++; +if (y == prog_data->local_size[1]) { + y = 0; + z++; + if (z == prog_data->local_size[2]) + z = 0; +} + } + } + } +} + fs_reg *