Re: [Mesa-dev] [PATCH v3 19/43] i965/fs: Support push constants of 16-bit types
On Thu, Oct 12, 2017 at 11:38 AM, Jose Maria Casanova Crespo < jmcasan...@igalia.com> wrote: > We enable the use of 16-bit values in push constants > modifying the assign_constant_locations function to work > with 16-bit types. > > The API to access buffers in Vulkan use multiples of 4-byte for > offsets and sizes. Current accountability of uniforms based on 4-byte > slots will work for 16-bit values if they are allowed to use 32-bit > slots. For that, we replace the division by 4 by a DIV_ROUND_UP, so > 2-byte elements will use 1 slot instead of 0. > I'm fairly sure this doesn't actually work correctly. That said, I'm also fairly sure the current code is broken for 64 bits. In particular, let's suppose we have something like this: layout(push_constant) { struct { int i; int pad1; float f; double d; int pad2; } arr[2]; } main() { out_color = vec4(arr[arr[0].i].f, float(arr[arr[0].i].d), arr[arr[1].i].f, float(arr[arr[1].i].d)); } I'm pretty sure the current code will explode because it will see a single contiguous chunk that's neither 32 nor 64-bit. If that particular shader doesn't break it, I'm sure some permutation of it will. Things only get worse if we throw in 16-bit. Ultimately, I think the solution is to throw away our current scheme of trying to separate things out by bit size and move to a scheme where we work with everything in bytes in assign_constant_locations and trust in the contiguous[] array to determine where to split things up. We would probably want to continue only re-arranging things 4 bytes at a time. That said, I don't think this patch makes anything any worse... > We aligns the 16-bit locations after assigning the 32-bit > ones. > --- > src/intel/compiler/brw_fs.cpp | 30 +++--- > 1 file changed, 23 insertions(+), 7 deletions(-) > > diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp > index a1d49a63be..8da16145dc 100644 > --- a/src/intel/compiler/brw_fs.cpp > +++ b/src/intel/compiler/brw_fs.cpp > @@ -1909,8 +1909,9 @@ set_push_pull_constant_loc(unsigned uniform, int > *chunk_start, > if (!contiguous) { >/* If bitsize doesn't match the target one, skip it */ >if (*max_chunk_bitsize != target_bitsize) { > - /* FIXME: right now we only support 32 and 64-bit accesses */ > - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); > + assert(*max_chunk_bitsize == 4 || > +*max_chunk_bitsize == 8 || > +*max_chunk_bitsize == 2); > *max_chunk_bitsize = 0; > *chunk_start = -1; > return; > @@ -1987,8 +1988,9 @@ fs_visitor::assign_constant_locations() > int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; > > if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { > -assert(inst->src[2].ud % 4 == 0); > -unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; > +assert(type_sz(inst->src[i].type) == 2 ? > + (inst->src[2].ud % 2 == 0) : (inst->src[2].ud % 4 == > 0)); > +unsigned last = constant_nr + DIV_ROUND_UP(inst->src[2].ud, > 4) - 1; > assert(last < uniforms); > > for (unsigned j = constant_nr; j < last; j++) { > @@ -2000,8 +2002,8 @@ fs_visitor::assign_constant_locations() > bitsize_access[last] = MAX2(bitsize_access[last], > type_sz(inst->src[i].type)); > } else { > if (constant_nr >= 0 && constant_nr < (int) uniforms) { > - int regs_read = inst->components_read(i) * > - type_sz(inst->src[i].type) / 4; > + int regs_read = DIV_ROUND_UP(inst->components_read(i) * > +type_sz(inst->src[i].type), > 4); > for (int j = 0; j < regs_read; j++) { >is_live[constant_nr + j] = true; >bitsize_access[constant_nr + j] = > @@ -2062,7 +2064,7 @@ fs_visitor::assign_constant_locations() > > } > > - /* Then push the rest of uniforms */ > + /* Then push the 32-bit uniforms */ > const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F); > for (unsigned u = 0; u < uniforms; u++) { >if (!is_live[u]) > @@ -2081,6 +2083,20 @@ fs_visitor::assign_constant_locations() > stage_prog_data); > } > > + const unsigned uniform_16_bit_size = type_sz(BRW_REGISTER_TYPE_HF); > + for (unsigned u = 0; u < uniforms; u++) { > + if (!is_live[u]) > + continue; > + > + set_push_pull_constant_loc(u, _start, _chunk_bitsize, > + contiguous[u], bitsize_access[u], > + uniform_16_bit_size, > + push_constant_loc, pull_constant_loc, > + _push_constants, _pull_constants, > +
Re: [Mesa-dev] [PATCH v3 19/43] i965/fs: Support push constants of 16-bit types
On Mon, Oct 30, 2017 at 05:10:53PM +0100, Chema Casanova wrote: > El 30/10/17 a las 07:44, Pohjolainen, Topi escribió: > > On Sun, Oct 29, 2017 at 11:17:11PM +0100, Chema Casanova wrote: > >> On 29/10/17 19:55, Pohjolainen, Topi wrote: > >>> On Thu, Oct 12, 2017 at 08:38:08PM +0200, Jose Maria Casanova Crespo > >>> wrote: > We enable the use of 16-bit values in push constants > modifying the assign_constant_locations function to work > with 16-bit types. > > The API to access buffers in Vulkan use multiples of 4-byte for > offsets and sizes. Current accountability of uniforms based on 4-byte > slots will work for 16-bit values if they are allowed to use 32-bit > slots. For that, we replace the division by 4 by a DIV_ROUND_UP, so > 2-byte elements will use 1 slot instead of 0. > > We aligns the 16-bit locations after assigning the 32-bit > ones. > --- > src/intel/compiler/brw_fs.cpp | 30 +++--- > 1 file changed, 23 insertions(+), 7 deletions(-) > > diff --git a/src/intel/compiler/brw_fs.cpp > b/src/intel/compiler/brw_fs.cpp > index a1d49a63be..8da16145dc 100644 > --- a/src/intel/compiler/brw_fs.cpp > +++ b/src/intel/compiler/brw_fs.cpp > @@ -1909,8 +1909,9 @@ set_push_pull_constant_loc(unsigned uniform, int > *chunk_start, > if (!contiguous) { > /* If bitsize doesn't match the target one, skip it */ > if (*max_chunk_bitsize != target_bitsize) { > - /* FIXME: right now we only support 32 and 64-bit accesses */ > - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); > + assert(*max_chunk_bitsize == 4 || > +*max_chunk_bitsize == 8 || > +*max_chunk_bitsize == 2); > *max_chunk_bitsize = 0; > *chunk_start = -1; > return; > @@ -1987,8 +1988,9 @@ fs_visitor::assign_constant_locations() > int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; > >>> Did you test this with, for example, vec4? > >> CTS has 16bit scalar, vec2 (uint,sint), vec4 (float) and matrix tests > >> for push constants for compute and graphics pipelines. For vec4 you can > >> try: > >> > >> dEQP-VK.spirv_assembly.instruction.compute.16bit_storage.push_constant_16_to_32.vector_float > >> > >> For push constant tests in general there are 42 tests, but vec3 aren't > >> tested: > >> > >> dEQP-VK.*16bit_storage.*push_constant. > >> > >> > >>> I've been toying with a glsl > >>> lowering pass changing mediump floats into float16. I was curious to know > >>> how > >>> much is needed as you have addressed most of the things from NIR onwards. > >>> Here I'm seeing offsets 0,2,4,6 which result into 0,0,1,1 when divided by > >>> four. Don't we need something of this sort in addition? > >> If i remember correctly, tests were testing to use push constants with > >> 64 16bit values, to use the minimum spec maximum available as > >> max_push_constants_size that is 128 bytes. So at the end the generated > >> intrinsic was: > >> > >> vec4 16 ssa_4 = intrinsic load_uniform (ssa_3) () (0, 128) /* base=0 */ > >> /* range=128 */ > >> > >> As the calculus here is to calculate the number of location used, and > >> taking into account that the Vulkan API restrictions for push constants > >> that says that push constant ranges that say that offset must be > >> multiple of 4 and size must be multiple of 4, maintain the use of > >> 4-bytes slots was ok for supporting the feature. Our code changes just > >> take the accountability in the number of 32-bits location needed, mainly > >> changing the divisions by 4 using DIV_ROUND_UP( , 4) to calculate sizes. > > I'm probably misunderstanding something. Let me ask a few clarifying > > questions. > > > > I'm reading that the incoming 16-bit values are given in 32-bit slots, and > > for > > the same reason we place them in the push/pull buffers in 32-bits slots. In > > other words a vec4 would take 16-bytes and each component would 32-bits > > apart? > > Probably I explained quite bad. A f16vec4 would use 8-bytes, and each > component > is going to be 16-bits apart. The 32-bit multiple offset only applies to > the first > element. > > > If that is the case, then don't we need to adjust the register offsets > > somewhere the way I did in the fragment below? Otherwise the offsets will > > point to locations in the register that are simply 16-bits apart? > > Yes components are 16-bits apart. Because of that we don't need anything > especial to adjust the offsets. At least we didn't needed for existing > tests. Ah, okay. I misunderstood, thanks for explaining. That sounds good. I need the hack below for now. I'll come back to it once I have all other things needed for the benchmarks addressed somehow. > > > > >>> commit 1a6d2bf3302f6e4305e383da0f27712dc5c20a67 > >>> Author: Topi Pohjolainen
Re: [Mesa-dev] [PATCH v3 19/43] i965/fs: Support push constants of 16-bit types
El 30/10/17 a las 07:44, Pohjolainen, Topi escribió: > On Sun, Oct 29, 2017 at 11:17:11PM +0100, Chema Casanova wrote: >> On 29/10/17 19:55, Pohjolainen, Topi wrote: >>> On Thu, Oct 12, 2017 at 08:38:08PM +0200, Jose Maria Casanova Crespo wrote: We enable the use of 16-bit values in push constants modifying the assign_constant_locations function to work with 16-bit types. The API to access buffers in Vulkan use multiples of 4-byte for offsets and sizes. Current accountability of uniforms based on 4-byte slots will work for 16-bit values if they are allowed to use 32-bit slots. For that, we replace the division by 4 by a DIV_ROUND_UP, so 2-byte elements will use 1 slot instead of 0. We aligns the 16-bit locations after assigning the 32-bit ones. --- src/intel/compiler/brw_fs.cpp | 30 +++--- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index a1d49a63be..8da16145dc 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -1909,8 +1909,9 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, if (!contiguous) { /* If bitsize doesn't match the target one, skip it */ if (*max_chunk_bitsize != target_bitsize) { - /* FIXME: right now we only support 32 and 64-bit accesses */ - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); + assert(*max_chunk_bitsize == 4 || +*max_chunk_bitsize == 8 || +*max_chunk_bitsize == 2); *max_chunk_bitsize = 0; *chunk_start = -1; return; @@ -1987,8 +1988,9 @@ fs_visitor::assign_constant_locations() int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; >>> Did you test this with, for example, vec4? >> CTS has 16bit scalar, vec2 (uint,sint), vec4 (float) and matrix tests >> for push constants for compute and graphics pipelines. For vec4 you can try: >> >> dEQP-VK.spirv_assembly.instruction.compute.16bit_storage.push_constant_16_to_32.vector_float >> >> For push constant tests in general there are 42 tests, but vec3 aren't >> tested: >> >> dEQP-VK.*16bit_storage.*push_constant. >> >> >>> I've been toying with a glsl >>> lowering pass changing mediump floats into float16. I was curious to know >>> how >>> much is needed as you have addressed most of the things from NIR onwards. >>> Here I'm seeing offsets 0,2,4,6 which result into 0,0,1,1 when divided by >>> four. Don't we need something of this sort in addition? >> If i remember correctly, tests were testing to use push constants with >> 64 16bit values, to use the minimum spec maximum available as >> max_push_constants_size that is 128 bytes. So at the end the generated >> intrinsic was: >> >> vec4 16 ssa_4 = intrinsic load_uniform (ssa_3) () (0, 128) /* base=0 */ >> /* range=128 */ >> >> As the calculus here is to calculate the number of location used, and >> taking into account that the Vulkan API restrictions for push constants >> that says that push constant ranges that say that offset must be >> multiple of 4 and size must be multiple of 4, maintain the use of >> 4-bytes slots was ok for supporting the feature. Our code changes just >> take the accountability in the number of 32-bits location needed, mainly >> changing the divisions by 4 using DIV_ROUND_UP( , 4) to calculate sizes. > I'm probably misunderstanding something. Let me ask a few clarifying > questions. > > I'm reading that the incoming 16-bit values are given in 32-bit slots, and for > the same reason we place them in the push/pull buffers in 32-bits slots. In > other words a vec4 would take 16-bytes and each component would 32-bits apart? Probably I explained quite bad. A f16vec4 would use 8-bytes, and each component is going to be 16-bits apart. The 32-bit multiple offset only applies to the first element. > If that is the case, then don't we need to adjust the register offsets > somewhere the way I did in the fragment below? Otherwise the offsets will > point to locations in the register that are simply 16-bits apart? Yes components are 16-bits apart. Because of that we don't need anything especial to adjust the offsets. At least we didn't needed for existing tests. > >>> commit 1a6d2bf3302f6e4305e383da0f27712dc5c20a67 >>> Author: Topi Pohjolainen>>> Date: Sun Oct 29 20:28:03 2017 +0200 >>> >>> fix alignment of 16-bit uniforms on 32-bit slots >>> >>> diff --git a/src/intel/compiler/brw_fs_nir.cpp >>> b/src/intel/compiler/brw_fs_nir.cpp >>> index 2f5443958a..586eb9d9ff 100644 >>> --- a/src/intel/compiler/brw_fs_nir.cpp >>> +++ b/src/intel/compiler/brw_fs_nir.cpp >>> @@ -4007,7 +4007,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder >>> , nir_intrinsic_instr *instr >>>
Re: [Mesa-dev] [PATCH v3 19/43] i965/fs: Support push constants of 16-bit types
On Sun, Oct 29, 2017 at 11:17:11PM +0100, Chema Casanova wrote: > On 29/10/17 19:55, Pohjolainen, Topi wrote: > > On Thu, Oct 12, 2017 at 08:38:08PM +0200, Jose Maria Casanova Crespo wrote: > >> We enable the use of 16-bit values in push constants > >> modifying the assign_constant_locations function to work > >> with 16-bit types. > >> > >> The API to access buffers in Vulkan use multiples of 4-byte for > >> offsets and sizes. Current accountability of uniforms based on 4-byte > >> slots will work for 16-bit values if they are allowed to use 32-bit > >> slots. For that, we replace the division by 4 by a DIV_ROUND_UP, so > >> 2-byte elements will use 1 slot instead of 0. > >> > >> We aligns the 16-bit locations after assigning the 32-bit > >> ones. > >> --- > >> src/intel/compiler/brw_fs.cpp | 30 +++--- > >> 1 file changed, 23 insertions(+), 7 deletions(-) > >> > >> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp > >> index a1d49a63be..8da16145dc 100644 > >> --- a/src/intel/compiler/brw_fs.cpp > >> +++ b/src/intel/compiler/brw_fs.cpp > >> @@ -1909,8 +1909,9 @@ set_push_pull_constant_loc(unsigned uniform, int > >> *chunk_start, > >> if (!contiguous) { > >>/* If bitsize doesn't match the target one, skip it */ > >>if (*max_chunk_bitsize != target_bitsize) { > >> - /* FIXME: right now we only support 32 and 64-bit accesses */ > >> - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); > >> + assert(*max_chunk_bitsize == 4 || > >> +*max_chunk_bitsize == 8 || > >> +*max_chunk_bitsize == 2); > >> *max_chunk_bitsize = 0; > >> *chunk_start = -1; > >> return; > >> @@ -1987,8 +1988,9 @@ fs_visitor::assign_constant_locations() > >> int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; > > > > Did you test this with, for example, vec4? > > CTS has 16bit scalar, vec2 (uint,sint), vec4 (float) and matrix tests > for push constants for compute and graphics pipelines. For vec4 you can try: > > dEQP-VK.spirv_assembly.instruction.compute.16bit_storage.push_constant_16_to_32.vector_float > > For push constant tests in general there are 42 tests, but vec3 aren't > tested: > > dEQP-VK.*16bit_storage.*push_constant. > > > > I've been toying with a glsl > > lowering pass changing mediump floats into float16. I was curious to know > > how > > much is needed as you have addressed most of the things from NIR onwards. > > Here I'm seeing offsets 0,2,4,6 which result into 0,0,1,1 when divided by > > four. Don't we need something of this sort in addition? > > If i remember correctly, tests were testing to use push constants with > 64 16bit values, to use the minimum spec maximum available as > max_push_constants_size that is 128 bytes. So at the end the generated > intrinsic was: > > vec4 16 ssa_4 = intrinsic load_uniform (ssa_3) () (0, 128) /* base=0 */ > /* range=128 */ > > As the calculus here is to calculate the number of location used, and > taking into account that the Vulkan API restrictions for push constants > that says that push constant ranges that say that offset must be > multiple of 4 and size must be multiple of 4, maintain the use of > 4-bytes slots was ok for supporting the feature. Our code changes just > take the accountability in the number of 32-bits location needed, mainly > changing the divisions by 4 using DIV_ROUND_UP( , 4) to calculate sizes. I'm probably misunderstanding something. Let me ask a few clarifying questions. I'm reading that the incoming 16-bit values are given in 32-bit slots, and for the same reason we place them in the push/pull buffers in 32-bits slots. In other words a vec4 would take 16-bytes and each component would 32-bits apart? If that is the case, then don't we need to adjust the register offsets somewhere the way I did in the fragment below? Otherwise the offsets will point to locations in the register that are simply 16-bits apart? > > > commit 1a6d2bf3302f6e4305e383da0f27712dc5c20a67 > > Author: Topi Pohjolainen> > Date: Sun Oct 29 20:28:03 2017 +0200 > > > > fix alignment of 16-bit uniforms on 32-bit slots > > > > diff --git a/src/intel/compiler/brw_fs_nir.cpp > > b/src/intel/compiler/brw_fs_nir.cpp > > index 2f5443958a..586eb9d9ff 100644 > > --- a/src/intel/compiler/brw_fs_nir.cpp > > +++ b/src/intel/compiler/brw_fs_nir.cpp > > @@ -4007,7 +4007,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder > > , nir_intrinsic_instr *instr > > src.offset = const_offset->u32[0]; > > > > for (unsigned j = 0; j < instr->num_components; j++) { > > -bld.MOV(offset(dest, bld, j), offset(src, bld, j)); > > +const unsigned src_offset = > > + src.type == BRW_REGISTER_TYPE_HF ? 2 * j : j; > > + > > +bld.MOV(offset(dest, bld, j), offset(src, bld, src_offset)); > > > > > > > > Then about the
Re: [Mesa-dev] [PATCH v3 19/43] i965/fs: Support push constants of 16-bit types
On 29/10/17 19:55, Pohjolainen, Topi wrote: > On Thu, Oct 12, 2017 at 08:38:08PM +0200, Jose Maria Casanova Crespo wrote: >> We enable the use of 16-bit values in push constants >> modifying the assign_constant_locations function to work >> with 16-bit types. >> >> The API to access buffers in Vulkan use multiples of 4-byte for >> offsets and sizes. Current accountability of uniforms based on 4-byte >> slots will work for 16-bit values if they are allowed to use 32-bit >> slots. For that, we replace the division by 4 by a DIV_ROUND_UP, so >> 2-byte elements will use 1 slot instead of 0. >> >> We aligns the 16-bit locations after assigning the 32-bit >> ones. >> --- >> src/intel/compiler/brw_fs.cpp | 30 +++--- >> 1 file changed, 23 insertions(+), 7 deletions(-) >> >> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp >> index a1d49a63be..8da16145dc 100644 >> --- a/src/intel/compiler/brw_fs.cpp >> +++ b/src/intel/compiler/brw_fs.cpp >> @@ -1909,8 +1909,9 @@ set_push_pull_constant_loc(unsigned uniform, int >> *chunk_start, >> if (!contiguous) { >>/* If bitsize doesn't match the target one, skip it */ >>if (*max_chunk_bitsize != target_bitsize) { >> - /* FIXME: right now we only support 32 and 64-bit accesses */ >> - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); >> + assert(*max_chunk_bitsize == 4 || >> +*max_chunk_bitsize == 8 || >> +*max_chunk_bitsize == 2); >> *max_chunk_bitsize = 0; >> *chunk_start = -1; >> return; >> @@ -1987,8 +1988,9 @@ fs_visitor::assign_constant_locations() >> int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; > > Did you test this with, for example, vec4? CTS has 16bit scalar, vec2 (uint,sint), vec4 (float) and matrix tests for push constants for compute and graphics pipelines. For vec4 you can try: dEQP-VK.spirv_assembly.instruction.compute.16bit_storage.push_constant_16_to_32.vector_float For push constant tests in general there are 42 tests, but vec3 aren't tested: dEQP-VK.*16bit_storage.*push_constant. > I've been toying with a glsl > lowering pass changing mediump floats into float16. I was curious to know how > much is needed as you have addressed most of the things from NIR onwards. > Here I'm seeing offsets 0,2,4,6 which result into 0,0,1,1 when divided by > four. Don't we need something of this sort in addition? If i remember correctly, tests were testing to use push constants with 64 16bit values, to use the minimum spec maximum available as max_push_constants_size that is 128 bytes. So at the end the generated intrinsic was: vec4 16 ssa_4 = intrinsic load_uniform (ssa_3) () (0, 128) /* base=0 */ /* range=128 */ As the calculus here is to calculate the number of location used, and taking into account that the Vulkan API restrictions for push constants that says that push constant ranges that say that offset must be multiple of 4 and size must be multiple of 4, maintain the use of 4-bytes slots was ok for supporting the feature. Our code changes just take the accountability in the number of 32-bits location needed, mainly changing the divisions by 4 using DIV_ROUND_UP( , 4) to calculate sizes. > commit 1a6d2bf3302f6e4305e383da0f27712dc5c20a67 > Author: Topi Pohjolainen> Date: Sun Oct 29 20:28:03 2017 +0200 > > fix alignment of 16-bit uniforms on 32-bit slots > > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > index 2f5443958a..586eb9d9ff 100644 > --- a/src/intel/compiler/brw_fs_nir.cpp > +++ b/src/intel/compiler/brw_fs_nir.cpp > @@ -4007,7 +4007,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , > nir_intrinsic_instr *instr > src.offset = const_offset->u32[0]; > > for (unsigned j = 0; j < instr->num_components; j++) { > -bld.MOV(offset(dest, bld, j), offset(src, bld, j)); > +const unsigned src_offset = > + src.type == BRW_REGISTER_TYPE_HF ? 2 * j : j; > + > +bld.MOV(offset(dest, bld, j), offset(src, bld, src_offset)); > > > > Then about the change of using 32-bit slots. This is now unconditional and > would require revisiting if we wanted to pack 16-bits tighter and possibly > increase the amount of uniforms that can be pushed. Similarly to Vulkan, in > GL the core stores uniforms as floats and I think we should keep it that way. > I added support in the i965 backend to keep track of the types of the > uniforms and to convert 32-bit presentation to 16-bits on the fly in > gen6_constant_state.c::brw_param_value(). I don't like it that much but I had > to start from somewhere. > My thinking is that we'd want to decouple the storage of the values and the > packing used in the compiler backend. Ideally keeping the mesa gl core and the > api working with full 32-bit floats but using tight 16-bit slots in the > push/pull
Re: [Mesa-dev] [PATCH v3 19/43] i965/fs: Support push constants of 16-bit types
On Thu, Oct 12, 2017 at 08:38:08PM +0200, Jose Maria Casanova Crespo wrote: > We enable the use of 16-bit values in push constants > modifying the assign_constant_locations function to work > with 16-bit types. > > The API to access buffers in Vulkan use multiples of 4-byte for > offsets and sizes. Current accountability of uniforms based on 4-byte > slots will work for 16-bit values if they are allowed to use 32-bit > slots. For that, we replace the division by 4 by a DIV_ROUND_UP, so > 2-byte elements will use 1 slot instead of 0. > > We aligns the 16-bit locations after assigning the 32-bit > ones. > --- > src/intel/compiler/brw_fs.cpp | 30 +++--- > 1 file changed, 23 insertions(+), 7 deletions(-) > > diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp > index a1d49a63be..8da16145dc 100644 > --- a/src/intel/compiler/brw_fs.cpp > +++ b/src/intel/compiler/brw_fs.cpp > @@ -1909,8 +1909,9 @@ set_push_pull_constant_loc(unsigned uniform, int > *chunk_start, > if (!contiguous) { >/* If bitsize doesn't match the target one, skip it */ >if (*max_chunk_bitsize != target_bitsize) { > - /* FIXME: right now we only support 32 and 64-bit accesses */ > - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); > + assert(*max_chunk_bitsize == 4 || > +*max_chunk_bitsize == 8 || > +*max_chunk_bitsize == 2); > *max_chunk_bitsize = 0; > *chunk_start = -1; > return; > @@ -1987,8 +1988,9 @@ fs_visitor::assign_constant_locations() > int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; Did you test this with, for example, vec4? I've been toying with a glsl lowering pass changing mediump floats into float16. I was curious to know how much is needed as you have addressed most of the things from NIR onwards. Here I'm seeing offsets 0,2,4,6 which result into 0,0,1,1 when divided by four. Don't we need something of this sort in addition? commit 1a6d2bf3302f6e4305e383da0f27712dc5c20a67 Author: Topi PohjolainenDate: Sun Oct 29 20:28:03 2017 +0200 fix alignment of 16-bit uniforms on 32-bit slots diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 2f5443958a..586eb9d9ff 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4007,7 +4007,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , nir_intrinsic_instr *instr src.offset = const_offset->u32[0]; for (unsigned j = 0; j < instr->num_components; j++) { -bld.MOV(offset(dest, bld, j), offset(src, bld, j)); +const unsigned src_offset = + src.type == BRW_REGISTER_TYPE_HF ? 2 * j : j; + +bld.MOV(offset(dest, bld, j), offset(src, bld, src_offset)); Then about the change of using 32-bit slots. This is now unconditional and would require revisiting if we wanted to pack 16-bits tighter and possibly increase the amount of uniforms that can be pushed. Similarly to Vulkan, in GL the core stores uniforms as floats and I think we should keep it that way. I added support in the i965 backend to keep track of the types of the uniforms and to convert 32-bit presentation to 16-bits on the fly in gen6_constant_state.c::brw_param_value(). I don't like it that much but I had to start from somewhere. My thinking is that we'd want to decouple the storage of the values and the packing used in the compiler backend. Ideally keeping the mesa gl core and the api working with full 32-bit floats but using tight 16-bit slots in the push/pull constant buffers. This requires quite a bit more changes as we have structured param[]/pull_param[] to work with 32-bit slots. My current work can be found in: git://people.freedesktop.org/~tpohjola/mesa 16_bit_gles > > if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { > -assert(inst->src[2].ud % 4 == 0); > -unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; > +assert(type_sz(inst->src[i].type) == 2 ? > + (inst->src[2].ud % 2 == 0) : (inst->src[2].ud % 4 == 0)); > +unsigned last = constant_nr + DIV_ROUND_UP(inst->src[2].ud, 4) - > 1; > assert(last < uniforms); > > for (unsigned j = constant_nr; j < last; j++) { > @@ -2000,8 +2002,8 @@ fs_visitor::assign_constant_locations() > bitsize_access[last] = MAX2(bitsize_access[last], > type_sz(inst->src[i].type)); > } else { > if (constant_nr >= 0 && constant_nr < (int) uniforms) { > - int regs_read = inst->components_read(i) * > - type_sz(inst->src[i].type) / 4; > + int regs_read = DIV_ROUND_UP(inst->components_read(i) * > +type_sz(inst->src[i].type), 4); > for (int j = 0; j < regs_read; j++) { >
Re: [Mesa-dev] [PATCH v3 19/43] i965/fs: Support push constants of 16-bit types
On 14/10/17 10:02, Pohjolainen, Topi wrote: > On Thu, Oct 12, 2017 at 08:38:08PM +0200, Jose Maria Casanova Crespo wrote: >> We enable the use of 16-bit values in push constants >> modifying the assign_constant_locations function to work >> with 16-bit types. >> >> The API to access buffers in Vulkan use multiples of 4-byte for >> offsets and sizes. Current accountability of uniforms based on 4-byte >> slots will work for 16-bit values if they are allowed to use 32-bit >> slots. For that, we replace the division by 4 by a DIV_ROUND_UP, so >> 2-byte elements will use 1 slot instead of 0. >> >> We aligns the 16-bit locations after assigning the 32-bit > > s/aligns/align/ > Also fixed. Thanks. Chema ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH v3 19/43] i965/fs: Support push constants of 16-bit types
On Thu, Oct 12, 2017 at 08:38:08PM +0200, Jose Maria Casanova Crespo wrote: > We enable the use of 16-bit values in push constants > modifying the assign_constant_locations function to work > with 16-bit types. > > The API to access buffers in Vulkan use multiples of 4-byte for > offsets and sizes. Current accountability of uniforms based on 4-byte > slots will work for 16-bit values if they are allowed to use 32-bit > slots. For that, we replace the division by 4 by a DIV_ROUND_UP, so > 2-byte elements will use 1 slot instead of 0. > > We aligns the 16-bit locations after assigning the 32-bit s/aligns/align/ ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 19/43] i965/fs: Support push constants of 16-bit types
We enable the use of 16-bit values in push constants modifying the assign_constant_locations function to work with 16-bit types. The API to access buffers in Vulkan use multiples of 4-byte for offsets and sizes. Current accountability of uniforms based on 4-byte slots will work for 16-bit values if they are allowed to use 32-bit slots. For that, we replace the division by 4 by a DIV_ROUND_UP, so 2-byte elements will use 1 slot instead of 0. We aligns the 16-bit locations after assigning the 32-bit ones. --- src/intel/compiler/brw_fs.cpp | 30 +++--- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index a1d49a63be..8da16145dc 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -1909,8 +1909,9 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, if (!contiguous) { /* If bitsize doesn't match the target one, skip it */ if (*max_chunk_bitsize != target_bitsize) { - /* FIXME: right now we only support 32 and 64-bit accesses */ - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); + assert(*max_chunk_bitsize == 4 || +*max_chunk_bitsize == 8 || +*max_chunk_bitsize == 2); *max_chunk_bitsize = 0; *chunk_start = -1; return; @@ -1987,8 +1988,9 @@ fs_visitor::assign_constant_locations() int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { -assert(inst->src[2].ud % 4 == 0); -unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; +assert(type_sz(inst->src[i].type) == 2 ? + (inst->src[2].ud % 2 == 0) : (inst->src[2].ud % 4 == 0)); +unsigned last = constant_nr + DIV_ROUND_UP(inst->src[2].ud, 4) - 1; assert(last < uniforms); for (unsigned j = constant_nr; j < last; j++) { @@ -2000,8 +2002,8 @@ fs_visitor::assign_constant_locations() bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type)); } else { if (constant_nr >= 0 && constant_nr < (int) uniforms) { - int regs_read = inst->components_read(i) * - type_sz(inst->src[i].type) / 4; + int regs_read = DIV_ROUND_UP(inst->components_read(i) * +type_sz(inst->src[i].type), 4); for (int j = 0; j < regs_read; j++) { is_live[constant_nr + j] = true; bitsize_access[constant_nr + j] = @@ -2062,7 +2064,7 @@ fs_visitor::assign_constant_locations() } - /* Then push the rest of uniforms */ + /* Then push the 32-bit uniforms */ const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F); for (unsigned u = 0; u < uniforms; u++) { if (!is_live[u]) @@ -2081,6 +2083,20 @@ fs_visitor::assign_constant_locations() stage_prog_data); } + const unsigned uniform_16_bit_size = type_sz(BRW_REGISTER_TYPE_HF); + for (unsigned u = 0; u < uniforms; u++) { + if (!is_live[u]) + continue; + + set_push_pull_constant_loc(u, _start, _chunk_bitsize, + contiguous[u], bitsize_access[u], + uniform_16_bit_size, + push_constant_loc, pull_constant_loc, + _push_constants, _pull_constants, + max_push_components, max_chunk_size, + stage_prog_data); + } + /* Add the CS local thread ID uniform at the end of the push constants */ if (thread_local_id_index >= 0) push_constant_loc[thread_local_id_index] = num_push_constants++; -- 2.13.6 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev