On Tue, Jun 5, 2018 at 10:38 AM, Philipp Zabel <p.za...@pengutronix.de> wrote:
> Since all threads share a global temporary vec4 register file, it is
> important to reduce temporary register use of shaders.
> Using source swizzles and destination write mask of ALU operations we
> can layer smaller virtual registers on top of the physical base
> registers that overlap with their base register and partially with each
> other:
>
> +----+---------+-------------+---------+
> |VEC4|  VEC3   |    VEC2     | SCALAR  |
> +----+---------+-------------+---------+
> |  X | X X X   | X X X       | X       |
> |  Y | Y Y   Y | Y     Y Y   |   Y     |
> |  Z | Z   Z Z |   Z   Z   Z |     Z   |
> |  W |   W W W |     W   W W |       W |
> +----+---------+-------------+---------+
>
> There are four possible virtual vec3 registers that leave the remaining
> component usable as a scalar virtual register, six possible vec2
> registers, and four possible scalar registers that only use a single
> component.
>
> This patch adds an interference graph for virtual registers to the
> register allocator, using information about SSA interference and virtual
> register overlap. If possible, SSAs with smaller num_components are
> allocated from the unused components of already partially used temporary
> registers.
>
> Signed-off-by: Philipp Zabel <p.za...@pengutronix.de>
> Signed-off-by: Michael Tretter <m.tret...@pengutronix.de>
> ---
One quick note: constructing the register classes can be expensive, so you probably want to do this only once and then reuse the result for each shader. BR, -R > src/gallium/drivers/etnaviv/etnaviv_nir.c | 282 ++++++++++++++++++++-- > 1 file changed, 259 insertions(+), 23 deletions(-) > > diff --git a/src/gallium/drivers/etnaviv/etnaviv_nir.c > b/src/gallium/drivers/etnaviv/etnaviv_nir.c > index b73d4be31bc6..752e87248e31 100644 > --- a/src/gallium/drivers/etnaviv/etnaviv_nir.c > +++ b/src/gallium/drivers/etnaviv/etnaviv_nir.c > @@ -375,11 +375,111 @@ etna_instr_replaceable_ssa_dest(nir_instr *instr) > return NULL; > } > > -/* Return the NIR global register corresponding to a given temporary > register, > - * creating it if necessary. > +/* Swizzles and write masks can be used to layer virtual non-interfering > + * registers on top of the real VEC4 registers. For example, the virtual > + * VEC3_XYZ register and the virtual SCALAR_W register that use the same > + * physical VEC4 base register do not interfere. 
> + */ > +enum { > + ETNA_REG_CLASS_VEC4, > + ETNA_REG_CLASS_VIRT_VEC3, > + ETNA_REG_CLASS_VIRT_VEC2, > + ETNA_REG_CLASS_VIRT_SCALAR, > + ETNA_NUM_REG_CLASSES, > +} etna_reg_class; > + > +enum { > + ETNA_REG_TYPE_VEC4, > + ETNA_REG_TYPE_VIRT_VEC3_XYZ, > + ETNA_REG_TYPE_VIRT_VEC3_XYW, > + ETNA_REG_TYPE_VIRT_VEC3_XZW, > + ETNA_REG_TYPE_VIRT_VEC3_YZW, > + ETNA_REG_TYPE_VIRT_VEC2_XY, > + ETNA_REG_TYPE_VIRT_VEC2_XZ, > + ETNA_REG_TYPE_VIRT_VEC2_XW, > + ETNA_REG_TYPE_VIRT_VEC2_YZ, > + ETNA_REG_TYPE_VIRT_VEC2_YW, > + ETNA_REG_TYPE_VIRT_VEC2_ZW, > + ETNA_REG_TYPE_VIRT_SCALAR_X, > + ETNA_REG_TYPE_VIRT_SCALAR_Y, > + ETNA_REG_TYPE_VIRT_SCALAR_Z, > + ETNA_REG_TYPE_VIRT_SCALAR_W, > + ETNA_NUM_REG_TYPES, > +} etna_reg_type; > + > +static const uint8_t > +etna_reg_writemask[ETNA_NUM_REG_TYPES] = { > + [ETNA_REG_TYPE_VEC4] = 0xf, > + [ETNA_REG_TYPE_VIRT_SCALAR_X] = 0x1, > + [ETNA_REG_TYPE_VIRT_SCALAR_Y] = 0x2, > + [ETNA_REG_TYPE_VIRT_VEC2_XY] = 0x3, > + [ETNA_REG_TYPE_VIRT_SCALAR_Z] = 0x4, > + [ETNA_REG_TYPE_VIRT_VEC2_XZ] = 0x5, > + [ETNA_REG_TYPE_VIRT_VEC2_YZ] = 0x6, > + [ETNA_REG_TYPE_VIRT_VEC3_XYZ] = 0x7, > + [ETNA_REG_TYPE_VIRT_SCALAR_W] = 0x8, > + [ETNA_REG_TYPE_VIRT_VEC2_XW] = 0x9, > + [ETNA_REG_TYPE_VIRT_VEC2_YW] = 0xa, > + [ETNA_REG_TYPE_VIRT_VEC3_XYW] = 0xb, > + [ETNA_REG_TYPE_VIRT_VEC2_ZW] = 0xc, > + [ETNA_REG_TYPE_VIRT_VEC3_XZW] = 0xd, > + [ETNA_REG_TYPE_VIRT_VEC3_YZW] = 0xe, > +}; > + > +static inline int etna_reg_get_type(int virt_reg) > +{ > + return virt_reg % ETNA_NUM_REG_TYPES; > +} > + > +static inline int etna_reg_get_base(int virt_reg) > +{ > + return virt_reg / ETNA_NUM_REG_TYPES; > +} > + > +static inline int etna_reg_get_class(int virt_reg) > +{ > + switch (etna_reg_get_type(virt_reg)) { > + case ETNA_REG_TYPE_VEC4: > + return ETNA_REG_CLASS_VEC4; > + case ETNA_REG_TYPE_VIRT_VEC3_XYZ: > + case ETNA_REG_TYPE_VIRT_VEC3_XYW: > + case ETNA_REG_TYPE_VIRT_VEC3_XZW: > + case ETNA_REG_TYPE_VIRT_VEC3_YZW: > + return ETNA_REG_CLASS_VIRT_VEC3; > + case 
ETNA_REG_TYPE_VIRT_VEC2_XY: > + case ETNA_REG_TYPE_VIRT_VEC2_XZ: > + case ETNA_REG_TYPE_VIRT_VEC2_XW: > + case ETNA_REG_TYPE_VIRT_VEC2_YZ: > + case ETNA_REG_TYPE_VIRT_VEC2_YW: > + case ETNA_REG_TYPE_VIRT_VEC2_ZW: > + return ETNA_REG_CLASS_VIRT_VEC2; > + case ETNA_REG_TYPE_VIRT_SCALAR_X: > + case ETNA_REG_TYPE_VIRT_SCALAR_Y: > + case ETNA_REG_TYPE_VIRT_SCALAR_Z: > + case ETNA_REG_TYPE_VIRT_SCALAR_W: > + return ETNA_REG_CLASS_VIRT_SCALAR; > + } > + > + assert(false); > +} > + > +/* Q values for the full set. Each virtual register interferes > + * with exactly one base register. And possibly with other virtual > + * registers on top of the same base register. > + */ > +static const unsigned int > +q_val[ETNA_NUM_REG_CLASSES][ETNA_NUM_REG_CLASSES] = { > + { 0, 4, 6, 4 }, > + { 1, 3, 6, 3 }, > + { 1, 4, 4, 2 }, > + { 1, 3, 3, 0 }, > +}; > + > +/* Return a NIR global register corresponding to a given temporary register. > + * The register is created if necessary. > */ > static nir_register * > -etna_ensure_temporary(nir_shader *shader, int index) > +etna_ensure_register(nir_shader *shader, int index) > { > nir_foreach_register(reg, &shader->registers) { > if (reg->index == index) > @@ -387,13 +487,9 @@ etna_ensure_temporary(nir_shader *shader, int index) > } > > nir_register *reg = nir_global_reg_create(shader); > + shader->reg_alloc = MAX2(shader->reg_alloc - 1, index + 1); > reg->num_components = 4; > - reg->num_array_elems = 0; > - reg->bit_size = 32; > reg->index = index; > - if (shader->reg_alloc < index + 1) > - shader->reg_alloc = index + 1; > - reg->name = NULL; > > return reg; > } > @@ -405,15 +501,19 @@ etna_ensure_temporary(nir_shader *shader, int index) > * and all store intrinsics to be moved to the end of the function already, > so > * that interference between input, output, and temporary values is described > * correctly. > + * All SSAs that qualify will be replaced with the assigned registers. 
> + * Destination SSAs to constant/uniform load and output store intrinsics > + * as well as undefined assignments are kept and will be removed later. > */ > static void > etna_assign_registers(nir_shader *shader) > { > - struct ra_regs *regs = ra_alloc_reg_set(NULL, 64, false); > - int class = ra_alloc_reg_class(regs); > + struct ra_regs *regs = ra_alloc_reg_set(NULL, ETNA_MAX_TEMPS * > + ETNA_NUM_REG_TYPES, false); > + int class[ETNA_NUM_REG_CLASSES]; > unsigned int **q_values; > - unsigned int *input_reg; > - unsigned int *output_reg; > + unsigned int *input_reg = NULL; > + unsigned int *output_reg = NULL; > > /* Input/output registers only have to be assigned manually to the > beginning > * of the temporary register range in the fragment shader. Otherwise the > @@ -427,13 +527,29 @@ etna_assign_registers(nir_shader *shader) > } > > /* A single register file with 64 registers is available to each running > - * shader, with no conflicts between them. > + * shader, with no conflicts between them. We add virtual registers on > + * top of that. 
> */ > - for (int r = 0; r < 64; r++) > - ra_class_add_reg(regs, class, r); > - q_values = ralloc_array(regs, unsigned *, 1); > - q_values[0] = rzalloc_array(q_values, unsigned, 1); > - q_values[0][0] = 0; > + for (int c = 0; c < ETNA_NUM_REG_CLASSES; c++) > + class[c] = ra_alloc_reg_class(regs); > + for (int r = 0; r < ETNA_NUM_REG_TYPES * ETNA_MAX_TEMPS; r++) > + ra_class_add_reg(regs, class[etna_reg_get_class(r)], r); > + q_values = ralloc_array(regs, unsigned *, ETNA_NUM_REG_CLASSES); > + for (int i = 0; i < ETNA_NUM_REG_CLASSES; i++) { > + q_values[i] = rzalloc_array(q_values, unsigned, ETNA_NUM_REG_CLASSES); > + for (int j = 0; j < ETNA_NUM_REG_CLASSES; j++) > + q_values[i][j] = q_val[i][j]; > + } > + for (int r = 0; r < ETNA_MAX_TEMPS; r++) { > + for (int i = 0; i < ETNA_NUM_REG_TYPES; i++) { > + for (int j = 0; j < i; j++) { > + if (etna_reg_writemask[i] & etna_reg_writemask[j]) { > + ra_add_reg_conflict(regs, ETNA_NUM_REG_TYPES * r + i, > + ETNA_NUM_REG_TYPES * r + j); > + } > + } > + } > + } > ra_set_finalize(regs, q_values); > > nir_foreach_function(function, shader) { > @@ -462,7 +578,54 @@ etna_assign_registers(nir_shader *shader) > } > } > > - struct ra_graph *g = ra_alloc_interference_graph(regs, count); > + int num_nodes = count; > + > + /* Add space for one dummy node, to grab the position register */ > + if (shader->info.stage == MESA_SHADER_FRAGMENT) > + num_nodes++; > + > + struct ra_graph *g = ra_alloc_interference_graph(regs, num_nodes); > + > + /* Assign nodes to the appropriate register class */ > + for (i = 0; i < count; i++) { > + bool can_use_virt = list_empty(&ssa_defs[i]->if_uses); > + can_use_virt &= ssa_defs[i]->parent_instr->type == > nir_instr_type_alu; > + if (can_use_virt) { > + nir_foreach_use(use_src, ssa_defs[i]) { > + if (use_src->parent_instr->type != nir_instr_type_alu) { > + can_use_virt = false; > + break; > + } > + /* These instructions are scalar and only read src.x */ > + nir_alu_instr *alu = 
nir_instr_as_alu(use_src->parent_instr); > + if (alu->op == nir_op_fexp2 || > + alu->op == nir_op_flog2) { > + can_use_virt = false; > + break; > + } > + } > + } > + > + /* Only choose virtual registers if all uses can be swizzled */ > + if (can_use_virt && ssa_defs[i]->num_components == 1) > + ra_set_node_class(g, i, ETNA_REG_CLASS_VIRT_SCALAR); > + else if (can_use_virt && ssa_defs[i]->num_components == 2) > + ra_set_node_class(g, i, ETNA_REG_CLASS_VIRT_VEC2); > + else if (can_use_virt && ssa_defs[i]->num_components == 3) > + ra_set_node_class(g, i, ETNA_REG_CLASS_VIRT_VEC3); > + else > + ra_set_node_class(g, i, ETNA_REG_CLASS_VEC4); > + } > + > + /* Prevent writes to the position register (temporary register 0) by > + * assigning it to a dummy node that interferes with all other nodes. > + */ > + if (shader->info.stage == MESA_SHADER_FRAGMENT) { > + ra_set_node_class(g, num_nodes - 1, ETNA_REG_CLASS_VEC4); > + ra_set_node_reg(g, num_nodes - 1, 0); > + for (int i = 0; i < count; i++) > + ra_add_node_interference(g, i, num_nodes - 1); > + } > > /* Collect SSA interference information and force input loads to > * the correct registers in the fragment shader. 
> @@ -491,7 +654,7 @@ etna_assign_registers(nir_shader *shader) > > assert(offset == 0); > > - ra_set_node_reg(g, i, base); > + ra_set_node_reg(g, i, base * ETNA_NUM_REG_TYPES); > } > } > > @@ -517,7 +680,7 @@ etna_assign_registers(nir_shader *shader) > /* Find the replaceable SSA used as source */ > for (i = 0; i < count; i++) { > if (ssa_defs[i] == intr->src[0].ssa) > - ra_set_node_reg(g, i, base); > + ra_set_node_reg(g, i, base * ETNA_NUM_REG_TYPES); > } > } > } > @@ -530,19 +693,92 @@ etna_assign_registers(nir_shader *shader) > /* Replace SSA assignments with allocated registers */ > for (i = 0; i < count; i++) { > int r = ra_get_node_reg(g, i); > - nir_register *reg = etna_ensure_temporary(shader, r); > + nir_register *reg = etna_ensure_register(shader, > etna_reg_get_base(r)); > nir_ssa_def *ssa = ssa_defs[i]; > > - nir_ssa_def_rewrite_uses(ssa, nir_src_for_reg(reg)); > + /* Rewrite uses */ > + if (etna_reg_get_type(r) == ETNA_REG_TYPE_VEC4) { > + nir_ssa_def_rewrite_uses(ssa, nir_src_for_reg(reg)); > + } else { > + nir_src new_src = nir_src_for_reg(reg); > + nir_foreach_use_safe(use_src, ssa) { > + static const unsigned reswizzle[ETNA_NUM_REG_TYPES][4] = { > + { 0, 1, 2, 3 }, /* XYZW */ > + { 0, 1, 2, 2 }, /* XYZ */ > + { 0, 1, 3, 3 }, /* XYW */ > + { 0, 2, 3, 3 }, /* XZW */ > + { 1, 2, 3, 3 }, /* YZW */ > + { 0, 1, 1, 1 }, /* XY */ > + { 0, 2, 2, 2 }, /* XZ */ > + { 0, 3, 3, 3 }, /* XW */ > + { 1, 2, 2, 2 }, /* YZ */ > + { 1, 3, 3, 3 }, /* YW */ > + { 2, 3, 3, 3 }, /* ZW */ > + { 0, 0, 0, 0 }, /* X */ > + { 1, 1, 1 ,1 }, /* Y */ > + { 2, 2, 2, 2 }, /* Z */ > + { 3, 3, 3, 3 }, /* W */ > + }; > + nir_instr_rewrite_src(use_src->parent_instr, use_src, > new_src); > + nir_alu_src *alu_src = container_of(use_src, alu_src, src); > + int t = etna_reg_get_type(r); > + alu_src->swizzle[0] = reswizzle[t][alu_src->swizzle[0]]; > + alu_src->swizzle[1] = reswizzle[t][alu_src->swizzle[1]]; > + alu_src->swizzle[2] = reswizzle[t][alu_src->swizzle[2]]; > + 
alu_src->swizzle[3] = reswizzle[t][alu_src->swizzle[3]]; > + } > + } > + > assert(list_empty(&ssa->uses) && list_empty(&ssa->if_uses)); > > nir_instr *instr = ssa->parent_instr; > > + /* Rewrite destination */ > if (instr->type == nir_instr_type_alu) { > nir_alu_instr *alu = nir_instr_as_alu(instr); > > nir_instr_rewrite_dest(&alu->instr, &alu->dest.dest, > nir_dest_for_reg(reg)); > + int t = etna_reg_get_type(r); > + alu->dest.write_mask = etna_reg_writemask[t]; > + /* The dot product instructions broadcast their result to all > + * destination components. There is no need to reswizzle their > + * sources here. > + */ > + if (alu->op != nir_op_fdot2 && > + alu->op != nir_op_fdot3 && > + alu->op != nir_op_fdot4) { > + unsigned num_srcs = nir_op_infos[alu->op].num_inputs; > + for (unsigned i = 0; i < num_srcs; i++) { > + static const unsigned reswizzle[ETNA_NUM_REG_TYPES][4] = { > + { 0, 1, 2, 3 }, /* XYZW */ > + { 0, 1, 2, 2 }, /* XYZ */ > + { 0, 1, 1, 2 }, /* XYW */ > + { 0, 0, 1, 2 }, /* XZW */ > + { 0, 0, 1, 2 }, /* YZW */ > + { 0, 1, 1, 1 }, /* XY */ > + { 0, 0, 1, 1 }, /* XZ */ > + { 0, 0, 0, 1 }, /* XW */ > + { 0, 0, 1, 1 }, /* YZ */ > + { 0, 0, 0, 1 }, /* YW */ > + { 0, 0, 0, 1 }, /* ZW */ > + { 0, 0, 0, 0 }, /* X */ > + { 0, 0, 0, 0 }, /* Y */ > + { 0, 0, 0, 0 }, /* Z */ > + { 0, 0, 0, 0 }, /* W */ > + }; > + nir_alu_src *alu_src = &alu->src[i]; > + uint8_t swizzle[4]; > + swizzle[0] = alu_src->swizzle[0]; > + swizzle[1] = alu_src->swizzle[1]; > + swizzle[2] = alu_src->swizzle[2]; > + swizzle[3] = alu_src->swizzle[3]; > + alu_src->swizzle[0] = swizzle[reswizzle[t][0]]; > + alu_src->swizzle[1] = swizzle[reswizzle[t][1]]; > + alu_src->swizzle[2] = swizzle[reswizzle[t][2]]; > + alu_src->swizzle[3] = swizzle[reswizzle[t][3]]; > + } > + } > } else if (instr->type == nir_instr_type_tex) { > nir_tex_instr *tex = nir_instr_as_tex(instr); > > -- > 2.17.1 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > 
https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev