Module: Mesa
Branch: main
Commit: 702eabaaae3ae559ba495488148139d506c1edcb
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=702eabaaae3ae559ba495488148139d506c1edcb
Author: Francisco Jerez <[email protected]> Date: Fri Dec 1 16:23:11 2023 -0800 intel/fs/xe2+: Update for new layout of vertex setup data in PS payload. The interpolation deltas of PS inputs now show up as a 12B vec3 (A0, A1-A0, A2-A0) in the ATTR file, instead of the previously used 16B format with an unused component. Reviewed-by: Jordan Justen <[email protected]> Reviewed-by: Caio Oliveira <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26606> --- src/intel/compiler/brw_fs.cpp | 52 ++++++++++++++++++++++++++++----------- src/intel/compiler/brw_fs_nir.cpp | 25 ++++++++++++++++--- 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index dedcfd56952..24e1e5662f0 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -1742,10 +1742,10 @@ fs_visitor::assign_urb_setup() for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file == ATTR) { /* ATTR fs_reg::nr in the FS is in units of logical scalar - * inputs each of which consumes half of a GRF register on - * current platforms. In single polygon mode this leads - * to the following layout of the vertex setup plane - * parameters in the ATTR register file: + * inputs each of which consumes 16B on Gfx4-Gfx12. In + * single polygon mode this leads to the following layout + * of the vertex setup plane parameters in the ATTR + * register file: * * fs_reg::nr Input Comp0 Comp1 Comp2 Comp3 * 0 Attr0.x a1-a0 a2-a0 N/A a0 @@ -1782,27 +1782,49 @@ fs_visitor::assign_urb_setup() * The latter layout corresponds to a param_width equal to * dispatch_width, while the former (scalar parameter) * layout has a param_width of 1. + * + * Gfx20+ represent plane parameters in a format similar + * to the above, except the parameters are packed in 12B + * and ordered like "a0, a1-a0, a2-a0" instead of the + * above vec4 representation with a missing component. 
*/ const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1); - assert(inst->src[i].offset / param_width < REG_SIZE / 2); - assert(max_polygons > 0); /* Size of a single scalar component of a plane parameter * in bytes. */ const unsigned chan_sz = 4; + struct brw_reg reg; + assert(max_polygons > 0); /* Translate the offset within the param_width-wide - * representation described above into an offset into grf, - * which contains plane parameters for the first polygon - * handled by the thread. + * representation described above into an offset and a + * grf, which contains the plane parameters for the first + * polygon processed by the thread. */ - const unsigned grf = urb_start + inst->src[i].nr / 2 * max_polygons; - const unsigned delta = (inst->src[i].nr % 2) * (REG_SIZE / 2) + - inst->src[i].offset / (param_width * chan_sz) * chan_sz + - inst->src[i].offset % chan_sz; - struct brw_reg reg = - byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), delta); + if (devinfo->ver >= 20) { + /* Gfx20+ is able to pack 5 logical input components + * per 64B register. + */ + const unsigned grf = urb_start + inst->src[i].nr / 5 * 2; + assert(inst->src[i].offset / param_width < 12); + const unsigned delta = inst->src[i].nr % 5 * 12 + + inst->src[i].offset / (param_width * chan_sz) * chan_sz + + inst->src[i].offset % chan_sz; + reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), + delta); + } else { + /* Earlier platforms pack 2 logical input components + * per 32B register. 
+ */ + const unsigned grf = urb_start + inst->src[i].nr / 2 * max_polygons; + assert(inst->src[i].offset / param_width < REG_SIZE / 2); + const unsigned delta = (inst->src[i].nr % 2) * (REG_SIZE / 2) + + inst->src[i].offset / (param_width * chan_sz) * chan_sz + + inst->src[i].offset % chan_sz; + reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), + delta); + } if (max_polygons > 1) { assert(devinfo->ver == 12); diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 70ee4ecae82..088542dc717 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4237,9 +4237,14 @@ fs_nir_emit_fs_intrinsic(nir_to_brw_state &ntb, retype(s.per_primitive_reg(bld, base, comp + i), dest.type)); } } else { + /* Gfx20+ packs the plane parameters of a single logical + * input in a vec3 format instead of the previously used vec4 + * format. + */ + const unsigned k = devinfo->ver >= 20 ? 0 : 3; for (unsigned int i = 0; i < num_components; i++) { bld.MOV(offset(dest, bld, i), - retype(s.interp_reg(bld, base, comp + i, 3), dest.type)); + retype(s.interp_reg(bld, base, comp + i, k), dest.type)); } } break; @@ -4251,9 +4256,21 @@ fs_nir_emit_fs_intrinsic(nir_to_brw_state &ntb, const unsigned base = nir_intrinsic_base(instr); const unsigned comp = nir_intrinsic_component(instr); dest.type = BRW_REGISTER_TYPE_F; - bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3)); - bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1)); - bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0)); + + /* Gfx20+ packs the plane parameters of a single logical + * input in a vec3 format instead of the previously used vec4 + * format. 
+ */ + if (devinfo->ver >= 20) { + bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 0)); + bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 2)); + bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 1)); + } else { + bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3)); + bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1)); + bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0)); + } + break; }
