On Tue, Jul 19, 2016 at 5:02 PM, Kenneth Graunke <kenn...@whitecape.org> wrote:
> On Tuesday, July 19, 2016 1:39:23 PM PDT Jason Ekstrand wrote: > > On Mon, Jul 18, 2016 at 1:26 PM, Kenneth Graunke <kenn...@whitecape.org> > > wrote: > > > > > This eliminates the need to walk the list of input variables, recurse > > > into their types (via logic largely redundant with nir_lower_io), and > > > interpolate all possible inputs up front. The backend no longer has > > > to care about variables at all, which eliminates complications from > > > trying to pack multiple variables into the same location. Instead, > > > each intrinsic specifies exactly what's needed. > > > > > > This should unblock Timothy's work on GL_ARB_enhanced_layouts. > > > > > > Each load_interpolated_input intrinsic corresponds to PLN instructions, > > > while load_barycentric_at_* intrinsics correspond to pixel interpolator > > > messages. The pixel/centroid/sample barycentric intrinsics simply > refer > > > to payload fields (delta_xy[]), and don't actually generate any code. > > > > > > Because we use a single intrinsic for both centroid-qualified variables > > > and interpolateAtCentroid(), they become indistinguishable. We stop > > > sending pixel interpolator messages for those, and instead use the > > > payload provided data, which should be considerably faster. > > > > > > On Broadwell: > > > > > > total instructions in shared programs: 9067751 -> 9067570 (-0.00%) > > > instructions in affected programs: 145902 -> 145721 (-0.12%) > > > helped: 422 > > > HURT: 209 > > > > > > total spills in shared programs: 2849 -> 2899 (1.76%) > > > spills in affected programs: 760 -> 810 (6.58%) > > > helped: 0 > > > HURT: 10 > > > > > > total fills in shared programs: 3910 -> 3950 (1.02%) > > > fills in affected programs: 617 -> 657 (6.48%) > > > helped: 0 > > > HURT: 10 > > > > > > LOST: 3 > > > GAINED: 3 > > > > > > The differences mostly appear to be slight changes in MOVs. > > > > > > Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> > > > --- > > > src/mesa/drivers/dri/i965/brw_fs.cpp | 175 ++++--------- > > > src/mesa/drivers/dri/i965/brw_fs.h | 9 +- > > > src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 410 > > > ++++++++++++++++--------------- > > > src/mesa/drivers/dri/i965/brw_nir.c | 16 +- > > > 4 files changed, 269 insertions(+), 341 deletions(-) > > > > > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > > > b/src/mesa/drivers/dri/i965/brw_fs.cpp > > > index 94127bc..06007fe 100644 > > > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > > > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > > > @@ -1067,21 +1067,27 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg > > > wpos) > > > bld.MOV(wpos, this->wpos_w); > > > } > > > > > > -static enum brw_barycentric_mode > > > -barycentric_mode(enum glsl_interp_mode mode, > > > - bool is_centroid, bool is_sample) > > > +enum brw_barycentric_mode > > > +brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op) > > > { > > > - unsigned bary; > > > - > > > /* Barycentric modes don't make sense for flat inputs. */ > > > assert(mode != INTERP_MODE_FLAT); > > > > > > - if (is_sample) { > > > - bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE; > > > - } else if (is_centroid) { > > > - bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID; > > > - } else { > > > + unsigned bary; > > > + switch (op) { > > > + case nir_intrinsic_load_barycentric_pixel: > > > + case nir_intrinsic_load_barycentric_at_offset: > > > bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL; > > > + break; > > > + case nir_intrinsic_load_barycentric_centroid: > > > + bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID; > > > + break; > > > + case nir_intrinsic_load_barycentric_sample: > > > + case nir_intrinsic_load_barycentric_at_sample: > > > + bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE; > > > + break; > > > + default: > > > + assert(!"invalid intrinsic"); > > > } > > > > > > if (mode == INTERP_MODE_NOPERSPECTIVE) > > > @@ -1101,107 +1107,6 @@ centroid_to_pixel(enum brw_barycentric_mode > bary) > > > return (enum brw_barycentric_mode) ((unsigned) bary - 1); > > > } > > > > > > -void > > > -fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name, > > > - const glsl_type *type, > > > - glsl_interp_mode > > > interpolation_mode, > > > - int *location, bool > mod_centroid, > > > - bool mod_sample) > > > -{ > > > - assert(stage == MESA_SHADER_FRAGMENT); > > > - brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; > > > - > > > - if (type->is_array() || type->is_matrix()) { > > > - const glsl_type *elem_type = glsl_get_array_element(type); > > > - const unsigned length = glsl_get_length(type); > > > - > > > - for (unsigned i = 0; i < length; i++) { > > > - emit_general_interpolation(attr, name, elem_type, > > > interpolation_mode, > > > - location, mod_centroid, > mod_sample); > > > - } > > > - } else if (type->is_record()) { > > > - for (unsigned i = 0; i < type->length; i++) { > > > - const glsl_type *field_type = type->fields.structure[i].type; > > > - emit_general_interpolation(attr, name, field_type, > > > interpolation_mode, > > > - location, mod_centroid, > mod_sample); > > > - } > > > - } else { > > > - assert(type->is_scalar() || type->is_vector()); > > > - > > > - if (prog_data->urb_setup[*location] == -1) { > > > - /* If there's no incoming setup data for this slot, don't > > > - * emit interpolation for it. > > > - */ > > > - *attr = offset(*attr, bld, type->vector_elements); > > > - (*location)++; > > > - return; > > > - } > > > - > > > - attr->type = brw_type_for_base_type(type->get_scalar_type()); > > > - > > > - if (interpolation_mode == INTERP_MODE_FLAT) { > > > - /* Constant interpolation (flat shading) case. The SF has > > > - * handed us defined values in only the constant offset > > > - * field of the setup reg. > > > - */ > > > - unsigned vector_elements = type->vector_elements; > > > - > > > - /* Data starts at suboffet 3 in 32-bit units (12 bytes), so > it > > > is not > > > - * 64-bit aligned and the current implementation fails to > read > > > the > > > - * data properly. Instead, when there is a double input > varying, > > > - * read it as vector of floats with twice the number of > > > components. > > > - */ > > > - if (attr->type == BRW_REGISTER_TYPE_DF) { > > > - vector_elements *= 2; > > > - attr->type = BRW_REGISTER_TYPE_F; > > > - } > > > - for (unsigned int i = 0; i < vector_elements; i++) { > > > - struct brw_reg interp = interp_reg(*location, i); > > > - interp = suboffset(interp, 3); > > > - interp.type = attr->type; > > > - bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp)); > > > - *attr = offset(*attr, bld, 1); > > > - } > > > - } else { > > > - /* Smooth/noperspective interpolation case. */ > > > - enum brw_barycentric_mode bary = > > > - barycentric_mode(interpolation_mode, mod_centroid, > > > mod_sample); > > > - > > > - for (unsigned int i = 0; i < type->vector_elements; i++) { > > > - fs_reg interp(interp_reg(*location, i)); > > > - if (devinfo->needs_unlit_centroid_workaround && > mod_centroid) > > > { > > > - /* Get the pixel/sample mask into f0 so that we know > > > - * which pixels are lit. Then, for each channel that > is > > > - * unlit, replace the centroid data with non-centroid > > > - * data. > > > - */ > > > - bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); > > > - > > > - fs_inst *inst; > > > - inst = bld.emit(FS_OPCODE_LINTERP, *attr, > > > - delta_xy[centroid_to_pixel(bary)], > interp); > > > - inst->predicate = BRW_PREDICATE_NORMAL; > > > - inst->predicate_inverse = true; > > > - inst->no_dd_clear = true; > > > - > > > - inst = bld.emit(FS_OPCODE_LINTERP, *attr, > > > - delta_xy[bary], interp); > > > - inst->predicate = BRW_PREDICATE_NORMAL; > > > - inst->predicate_inverse = false; > > > - inst->no_dd_check = true; > > > - } else { > > > - bld.emit(FS_OPCODE_LINTERP, *attr, delta_xy[bary], > interp); > > > - } > > > - if (devinfo->gen < 6 && interpolation_mode == > > > INTERP_MODE_SMOOTH) { > > > - bld.MUL(*attr, *attr, this->pixel_w); > > > - } > > > - *attr = offset(*attr, bld, 1); > > > - } > > > - } > > > - (*location)++; > > > - } > > > -} > > > - > > > fs_reg * > > > fs_visitor::emit_frontfacing_interpolation() > > > { > > > @@ -6327,6 +6232,10 @@ fs_visitor::run_cs() > > > /** > > > * Return a bitfield where bit n is set if barycentric interpolation > mode > > > n > > > * (see enum brw_barycentric_mode) is needed by the fragment shader. > > > + * > > > + * We examine the load_barycentric intrinsics rather than looking at > input > > > + * variables so that we catch interpolateAtCentroid() messages too, > which > > > + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set > up. > > > */ > > > static unsigned > > > brw_compute_barycentric_interp_modes(const struct brw_device_info > > > *devinfo, > > > @@ -6334,29 +6243,37 @@ brw_compute_barycentric_interp_modes(const > struct > > > brw_device_info *devinfo, > > > { > > > unsigned barycentric_interp_modes = 0; > > > > > > - nir_foreach_variable(var, &shader->inputs) { > > > - /* Ignore WPOS; it doesn't require interpolation. */ > > > - if (var->data.location == VARYING_SLOT_POS) > > > + nir_foreach_function(f, shader) { > > > + if (!f->impl) > > > continue; > > > > > > - /* Flat inputs don't need barycentric modes. */ > > > - if (var->data.interpolation == INTERP_MODE_FLAT) > > > - continue; > > > + nir_foreach_block(block, f->impl) { > > > + nir_foreach_instr(instr, block) { > > > + if (instr->type != nir_instr_type_intrinsic) > > > + continue; > > > > > > - /* Determine the set (or sets) of barycentric coordinates > needed to > > > - * interpolate this variable. Note that when > > > - * brw->needs_unlit_centroid_workaround is set, centroid > > > interpolation > > > - * uses PIXEL interpolation for unlit pixels and CENTROID > > > interpolation > > > - * for lit pixels, so we need both sets of barycentric > coordinates. > > > - */ > > > - enum brw_barycentric_mode bary_mode = > > > - barycentric_mode((glsl_interp_mode) var->data.interpolation, > > > - var->data.centroid, var->data.sample); > > > + nir_intrinsic_instr *intrin = > nir_instr_as_intrinsic(instr); > > > + if (intrin->intrinsic != > > > nir_intrinsic_load_interpolated_input) > > > + continue; > > > > > > > Any particular reason why you're looking at the source of the load rather > > than just looking for the load_barycentric intrinsic directly? > > > > > > > + > > > + /* Ignore WPOS; it doesn't require interpolation. */ > > > + if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS) > > > + continue; > > > > > > > Ugh... > > This is why. gl_FragCoord still generates a load_interpolated_input > with a load_barycentric_pixel, and I want to skip that. (It might be > the only user of pixel coordinates, and if so, we don't want them.) > > You need the load_interpolated_input intrinsic to know which input it is. > > It's pretty clear that gl_FragCoord doesn't behave anything like a > normal input, so it ought to be a system value. With that cleaned up, > we could simplify this. > > That ended up being more work than I expected, so I was hoping to do > it as a follow-on series, to unblock Tim sooner rather than later. > I'm ok with the "unblock Tim ASAP plan". I would like to see gl_FragCoord as a system value eventually though. It just seems so much cleaner. > > > > > > > - barycentric_interp_modes |= 1 << bary_mode; > > > + intrin = > > > nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr); > > > + enum glsl_interp_mode interp = (enum glsl_interp_mode) > > > + nir_intrinsic_interp_mode(intrin); > > > + nir_intrinsic_op bary_op = intrin->intrinsic; > > > + enum brw_barycentric_mode bary = > > > + brw_barycentric_mode(interp, bary_op); > > > > > > - if (var->data.centroid && > devinfo->needs_unlit_centroid_workaround) > > > - barycentric_interp_modes |= 1 << > centroid_to_pixel(bary_mode); > > > + barycentric_interp_modes |= 1 << bary; > > > + > > > + if (devinfo->needs_unlit_centroid_workaround && > > > + bary_op == nir_intrinsic_load_barycentric_centroid) > > > + barycentric_interp_modes |= 1 << > centroid_to_pixel(bary); > > > + } > > > + } > > > } > > > > > > return barycentric_interp_modes; > > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h > > > b/src/mesa/drivers/dri/i965/brw_fs.h > > > index 7998f51..574475f 100644 > > > --- a/src/mesa/drivers/dri/i965/brw_fs.h > > > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > > > @@ -174,11 +174,6 @@ public: > > > fs_reg *emit_samplepos_setup(); > > > fs_reg *emit_sampleid_setup(); > > > fs_reg *emit_samplemaskin_setup(); > > > - void emit_general_interpolation(fs_reg *attr, const char *name, > > > - const glsl_type *type, > > > - glsl_interp_mode > interpolation_mode, > > > - int *location, bool mod_centroid, > > > - bool mod_sample); > > > fs_reg *emit_vs_system_value(int location); > > > void emit_interpolation_setup_gen4(); > > > void emit_interpolation_setup_gen6(); > > > @@ -195,7 +190,6 @@ public: > > > bool opt_zero_samples(); > > > > > > void emit_nir_code(); > > > - void nir_setup_inputs(); > > > void nir_setup_single_output_varying(fs_reg *reg, const glsl_type > > > *type, > > > unsigned *location); > > > void nir_setup_outputs(); > > > @@ -511,3 +505,6 @@ void shuffle_64bit_data_for_32bit_write(const > > > brw::fs_builder &bld, > > > uint32_t components); > > > fs_reg setup_imm_df(const brw::fs_builder &bld, > > > double v); > > > + > > > +enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode > mode, > > > + nir_intrinsic_op op); > > > diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > > > b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > > > index 6265dc6..610c151 100644 > > > --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > > > +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > > > @@ -36,7 +36,6 @@ fs_visitor::emit_nir_code() > > > /* emit the arrays used for inputs and outputs - load/store > intrinsics > > > will > > > * be converted to reads/writes of these arrays > > > */ > > > - nir_setup_inputs(); > > > nir_setup_outputs(); > > > nir_setup_uniforms(); > > > nir_emit_system_values(); > > > @@ -50,38 +49,6 @@ fs_visitor::emit_nir_code() > > > } > > > > > > void > > > -fs_visitor::nir_setup_inputs() > > > -{ > > > - if (stage != MESA_SHADER_FRAGMENT) > > > - return; > > > - > > > - nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs); > > > - > > > - nir_foreach_variable(var, &nir->inputs) { > > > - fs_reg input = offset(nir_inputs, bld, > var->data.driver_location); > > > - > > > - fs_reg reg; > > > - if (var->data.location == VARYING_SLOT_POS) { > > > - emit_fragcoord_interpolation(input); > > > - } else if (var->data.location == VARYING_SLOT_LAYER) { > > > - struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER, > > > 1), 3); > > > - reg.type = BRW_REGISTER_TYPE_D; > > > - bld.emit(FS_OPCODE_CINTERP, retype(input, > BRW_REGISTER_TYPE_D), > > > reg); > > > - } else if (var->data.location == VARYING_SLOT_VIEWPORT) { > > > - struct brw_reg reg = > suboffset(interp_reg(VARYING_SLOT_VIEWPORT, > > > 2), 3); > > > - reg.type = BRW_REGISTER_TYPE_D; > > > - bld.emit(FS_OPCODE_CINTERP, retype(input, > BRW_REGISTER_TYPE_D), > > > reg); > > > - } else { > > > - int location = var->data.location; > > > - emit_general_interpolation(&input, var->name, var->type, > > > - (glsl_interp_mode) > > > var->data.interpolation, > > > - &location, var->data.centroid, > > > - var->data.sample); > > > - } > > > - } > > > -} > > > - > > > -void > > > fs_visitor::nir_setup_single_output_varying(fs_reg *reg, > > > const glsl_type *type, > > > unsigned *location) > > > @@ -3063,7 +3030,6 @@ fs_visitor::nir_emit_fs_intrinsic(const > fs_builder > > > &bld, > > > nir_intrinsic_instr *instr) > > > { > > > assert(stage == MESA_SHADER_FRAGMENT); > > > - const struct brw_wm_prog_key *wm_key = (const struct > brw_wm_prog_key > > > *) key; > > > > > > fs_reg dest; > > > if (nir_intrinsic_infos[instr->intrinsic].has_dest) > > > @@ -3120,189 +3086,245 @@ fs_visitor::nir_emit_fs_intrinsic(const > > > fs_builder &bld, > > > break; > > > } > > > > > > - case nir_intrinsic_interp_var_at_centroid: > > > - case nir_intrinsic_interp_var_at_sample: > > > - case nir_intrinsic_interp_var_at_offset: { > > > - /* Handle ARB_gpu_shader5 interpolation intrinsics > > > - * > > > - * It's worth a quick word of explanation as to why we handle > the > > > full > > > - * variable-based interpolation intrinsic rather than a lowered > > > version > > > - * with like we do for other inputs. We have to do that because > > > the way > > > - * we set up inputs doesn't allow us to use the already setup > > > inputs for > > > - * interpolation. At the beginning of the shader, we go through > > > all of > > > - * the input variables and do the initial interpolation and put > it > > > in > > > - * the nir_inputs array based on its location as determined in > > > - * nir_lower_io. If the input isn't used, dead code cleans up > and > > > - * everything works fine. However, when we get to the > > > ARB_gpu_shader5 > > > - * interpolation intrinsics, we need to reinterpolate the input > > > - * differently. If we used an intrinsic that just had an index > it > > > would > > > - * only give us the offset into the nir_inputs array. However, > > > this is > > > - * useless because that value is post-interpolation and we need > > > - * pre-interpolation. In order to get the actual location of > the > > > bits > > > - * we get from the vertex fetching hardware, we need the > variable. > > > - */ > > > - fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); > > > - const glsl_interp_mode interpolation = > > > - (glsl_interp_mode) > instr->variables[0]->var->data.interpolation; > > > + case nir_intrinsic_load_input: { > > > + /* load_input is only used for flat inputs */ > > > + unsigned base = nir_intrinsic_base(instr); > > > + unsigned component = nir_intrinsic_component(instr); > > > + unsigned num_components = instr->num_components; > > > + enum brw_reg_type type = dest.type; > > > > > > - switch (instr->intrinsic) { > > > - case nir_intrinsic_interp_var_at_centroid: > > > - emit_pixel_interpolater_send(bld, > > > - > FS_OPCODE_INTERPOLATE_AT_CENTROID, > > > - dst_xy, > > > - fs_reg(), /* src */ > > > - brw_imm_ud(0u), > > > - interpolation); > > > - break; > > > + /* Special case fields in the VUE header */ > > > + if (base == VARYING_SLOT_LAYER) > > > + component = 1; > > > + else if (base == VARYING_SLOT_VIEWPORT) > > > + component = 2; > > > > > > - case nir_intrinsic_interp_var_at_sample: { > > > - if (!wm_key->multisample_fbo) { > > > - /* From the ARB_gpu_shader5 specification: > > > - * "If multisample buffers are not available, the input > > > varying > > > - * will be evaluated at the center of the pixel." > > > - */ > > > - emit_pixel_interpolater_send(bld, > > > - > > > FS_OPCODE_INTERPOLATE_AT_CENTROID, > > > - dst_xy, > > > - fs_reg(), /* src */ > > > - brw_imm_ud(0u), > > > - interpolation); > > > - break; > > > - } > > > + if (nir_dest_bit_size(instr->dest) == 64) { > > > + /* const_index is in 32-bit type size units that could not be > > > aligned > > > + * with DF. We need to read the double vector as if it was a > > > float > > > + * vector of twice the number of components to fetch the > right > > > data. > > > + */ > > > + type = BRW_REGISTER_TYPE_F; > > > + num_components *= 2; > > > + } > > > > > > - nir_const_value *const_sample = > > > nir_src_as_const_value(instr->src[0]); > > > + for (unsigned int i = 0; i < num_components; i++) { > > > + struct brw_reg interp = interp_reg(base, component + i); > > > + interp = suboffset(interp, 3); > > > + bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, > i), > > > + retype(fs_reg(interp), type)); > > > + } > > > > > > - if (const_sample) { > > > - unsigned msg_data = const_sample->i32[0] << 4; > > > + if (nir_dest_bit_size(instr->dest) == 64) { > > > + shuffle_32bit_load_result_to_64bit_data(bld, > > > + dest, > > > + retype(dest, type), > > > + > instr->num_components); > > > + } > > > + break; > > > + } > > > + > > > + case nir_intrinsic_load_barycentric_pixel: > > > + case nir_intrinsic_load_barycentric_centroid: > > > + case nir_intrinsic_load_barycentric_sample: > > > + /* Do nothing - load_interpolated_input handling will handle it > > > later. */ > > > + break; > > > > > > > I'm very courious why you made this choice. It seems odd to me. > > I originally tried emitting MOVs here. The data layout is a bit funky: > > R3: X coordinates for Slots 0-7 > R4: Y coordinates for Slots 0-7 > R5: X coordinates for Slots 8-15 > R5: Y coordinates for Slots 8-15 > > For SIMD8, you could do two separate MOVs (move X, then Y). You might be > able to do a compressed mov(16)...but would need to whack the channel > enables into 1Q for both halves (if that's possible) or use NoMask > (which I wanted to avoid). > > For SIMD16...you can't move both X's with a single MOV. A <16,8,1> > region would cover it, but sources can't span more than two adjacent > registers. So that doesn't work. It might take four MOVs. Maybe > only two if I could use the above trick. > > Generating MOVs also means that we're relying on the optimizer to > remove them...which it didn't seem to be. It ended up being much > easier to just point LINTERP at delta_xy[] directly. > Right... I forgot about the odd interleaving. That seems like a reasonable reason for doing it this way. > > > > + case nir_intrinsic_load_barycentric_at_sample: { > > > + const glsl_interp_mode interpolation = > > > + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); > > > + > > > + nir_const_value *const_sample = > > > nir_src_as_const_value(instr->src[0]); > > > + > > > + if (const_sample) { > > > + unsigned msg_data = const_sample->i32[0] << 4; > > > + > > > + emit_pixel_interpolater_send(bld, > > > + FS_OPCODE_INTERPOLATE_AT_SAMPLE, > > > + dest, > > > + fs_reg(), /* src */ > > > + brw_imm_ud(msg_data), > > > + interpolation); > > > + } else { > > > + const fs_reg sample_src = retype(get_nir_src(instr->src[0]), > > > + BRW_REGISTER_TYPE_UD); > > > + > > > + if (nir_src_is_dynamically_uniform(instr->src[0])) { > > > + const fs_reg sample_id = bld.emit_uniformize(sample_src); > > > + const fs_reg msg_data = vgrf(glsl_type::uint_type); > > > + bld.exec_all().group(1, 0) > > > + .SHL(msg_data, sample_id, brw_imm_ud(4u)); > > > emit_pixel_interpolater_send(bld, > > > > FS_OPCODE_INTERPOLATE_AT_SAMPLE, > > > - dst_xy, > > > + dest, > > > fs_reg(), /* src */ > > > - brw_imm_ud(msg_data), > > > + msg_data, > > > interpolation); > > > } else { > > > - const fs_reg sample_src = > retype(get_nir_src(instr->src[0]), > > > - BRW_REGISTER_TYPE_UD); > > > - > > > - if (nir_src_is_dynamically_uniform(instr->src[0])) { > > > - const fs_reg sample_id = > bld.emit_uniformize(sample_src); > > > - const fs_reg msg_data = vgrf(glsl_type::uint_type); > > > - bld.exec_all().group(1, 0) > > > - .SHL(msg_data, sample_id, brw_imm_ud(4u)); > > > + /* Make a loop that sends a message to the pixel > interpolater > > > + * for the sample number in each live channel. If there > are > > > + * multiple channels with the same sample number then > these > > > + * will be handled simultaneously with a single > interation of > > > + * the loop. > > > + */ > > > + bld.emit(BRW_OPCODE_DO); > > > + > > > + /* Get the next live sample number into sample_id_reg */ > > > + const fs_reg sample_id = bld.emit_uniformize(sample_src); > > > + > > > + /* Set the flag register so that we can perform the send > > > + * message on all channels that have the same sample > number > > > + */ > > > + bld.CMP(bld.null_reg_ud(), > > > + sample_src, sample_id, > > > + BRW_CONDITIONAL_EQ); > > > + const fs_reg msg_data = vgrf(glsl_type::uint_type); > > > + bld.exec_all().group(1, 0) > > > + .SHL(msg_data, sample_id, brw_imm_ud(4u)); > > > + fs_inst *inst = > > > emit_pixel_interpolater_send(bld, > > > > > > FS_OPCODE_INTERPOLATE_AT_SAMPLE, > > > - dst_xy, > > > + dest, > > > fs_reg(), /* src */ > > > msg_data, > > > interpolation); > > > - } else { > > > - /* Make a loop that sends a message to the pixel > > > interpolater > > > - * for the sample number in each live channel. If > there are > > > - * multiple channels with the same sample number then > these > > > - * will be handled simultaneously with a single > interation > > > of > > > - * the loop. > > > - */ > > > - bld.emit(BRW_OPCODE_DO); > > > - > > > - /* Get the next live sample number into sample_id_reg > */ > > > - const fs_reg sample_id = > bld.emit_uniformize(sample_src); > > > + set_predicate(BRW_PREDICATE_NORMAL, inst); > > > > > > - /* Set the flag register so that we can perform the > send > > > - * message on all channels that have the same sample > number > > > - */ > > > - bld.CMP(bld.null_reg_ud(), > > > - sample_src, sample_id, > > > - BRW_CONDITIONAL_EQ); > > > - const fs_reg msg_data = vgrf(glsl_type::uint_type); > > > - bld.exec_all().group(1, 0) > > > - .SHL(msg_data, sample_id, brw_imm_ud(4u)); > > > - fs_inst *inst = > > > - emit_pixel_interpolater_send(bld, > > > - > > > FS_OPCODE_INTERPOLATE_AT_SAMPLE, > > > - dst_xy, > > > - fs_reg(), /* src */ > > > - msg_data, > > > - interpolation); > > > - set_predicate(BRW_PREDICATE_NORMAL, inst); > > > - > > > - /* Continue the loop if there are any live channels > left */ > > > - set_predicate_inv(BRW_PREDICATE_NORMAL, > > > - true, /* inverse */ > > > - bld.emit(BRW_OPCODE_WHILE)); > > > - } > > > + /* Continue the loop if there are any live channels left > */ > > > + set_predicate_inv(BRW_PREDICATE_NORMAL, > > > + true, /* inverse */ > > > + bld.emit(BRW_OPCODE_WHILE)); > > > } > > > - > > > - break; > > > } > > > + break; > > > + } > > > > > > - case nir_intrinsic_interp_var_at_offset: { > > > - nir_const_value *const_offset = > > > nir_src_as_const_value(instr->src[0]); > > > + case nir_intrinsic_load_barycentric_at_offset: { > > > + const glsl_interp_mode interpolation = > > > + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); > > > > > > - if (const_offset) { > > > - unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), > 7) & > > > 0xf; > > > - unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), > 7) & > > > 0xf; > > > + nir_const_value *const_offset = > > > nir_src_as_const_value(instr->src[0]); > > > > > > - emit_pixel_interpolater_send(bld, > > > - > > > FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, > > > - dst_xy, > > > - fs_reg(), /* src */ > > > - brw_imm_ud(off_x | (off_y << > 4)), > > > - interpolation); > > > - } else { > > > - fs_reg src = vgrf(glsl_type::ivec2_type); > > > - fs_reg offset_src = retype(get_nir_src(instr->src[0]), > > > - BRW_REGISTER_TYPE_F); > > > - for (int i = 0; i < 2; i++) { > > > - fs_reg temp = vgrf(glsl_type::float_type); > > > - bld.MUL(temp, offset(offset_src, bld, i), > > > brw_imm_f(16.0f)); > > > - fs_reg itemp = vgrf(glsl_type::int_type); > > > - /* float to int */ > > > - bld.MOV(itemp, temp); > > > - > > > - /* Clamp the upper end of the range to +7/16. > > > - * ARB_gpu_shader5 requires that we support a maximum > > > offset > > > - * of +0.5, which isn't representable in a S0.4 value > -- if > > > - * we didn't clamp it, we'd end up with -8/16, which > is the > > > - * opposite of what the shader author wanted. > > > - * > > > - * This is legal due to ARB_gpu_shader5's quantization > > > - * rules: > > > - * > > > - * "Not all values of <offset> may be supported; x and > y > > > - * offsets may be rounded to fixed-point values with > the > > > - * number of fraction bits given by the > > > - * implementation-dependent constant > > > - * FRAGMENT_INTERPOLATION_OFFSET_BITS" > > > - */ > > > - set_condmod(BRW_CONDITIONAL_L, > > > - bld.SEL(offset(src, bld, i), itemp, > > > brw_imm_d(7))); > > > - } > > > + if (const_offset) { > > > + unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & > 0xf; > > > + unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & > 0xf; > > > > > > - const enum opcode opcode = > > > FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; > > > - emit_pixel_interpolater_send(bld, > > > - opcode, > > > - dst_xy, > > > - src, > > > - brw_imm_ud(0u), > > > - interpolation); > > > + emit_pixel_interpolater_send(bld, > > > + > > > FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, > > > + dest, > > > + fs_reg(), /* src */ > > > + brw_imm_ud(off_x | (off_y << > 4)), > > > + interpolation); > > > + } else { > > > + fs_reg src = vgrf(glsl_type::ivec2_type); > > > + fs_reg offset_src = retype(get_nir_src(instr->src[0]), > > > + BRW_REGISTER_TYPE_F); > > > + for (int i = 0; i < 2; i++) { > > > + fs_reg temp = vgrf(glsl_type::float_type); > > > + bld.MUL(temp, offset(offset_src, bld, i), > brw_imm_f(16.0f)); > > > + fs_reg itemp = vgrf(glsl_type::int_type); > > > + /* float to int */ > > > + bld.MOV(itemp, temp); > > > + > > > + /* Clamp the upper end of the range to +7/16. > > > + * ARB_gpu_shader5 requires that we support a maximum > offset > > > + * of +0.5, which isn't representable in a S0.4 value -- > if > > > + * we didn't clamp it, we'd end up with -8/16, which is > the > > > + * opposite of what the shader author wanted. > > > + * > > > + * This is legal due to ARB_gpu_shader5's quantization > > > + * rules: > > > + * > > > + * "Not all values of <offset> may be supported; x and y > > > + * offsets may be rounded to fixed-point values with the > > > + * number of fraction bits given by the > > > + * implementation-dependent constant > > > + * FRAGMENT_INTERPOLATION_OFFSET_BITS" > > > + */ > > > + set_condmod(BRW_CONDITIONAL_L, > > > + bld.SEL(offset(src, bld, i), itemp, > > > brw_imm_d(7))); > > > } > > > + > > > + const enum opcode opcode = > > > FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; > > > + emit_pixel_interpolater_send(bld, > > > + opcode, > > > + dest, > > > + src, > > > + brw_imm_ud(0u), > > > + interpolation); > > > + } > > > + break; > > > + } > > > + > > > + case nir_intrinsic_load_interpolated_input: { > > > + if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) { > > > + emit_fragcoord_interpolation(dest); > > > break; > > > } > > > > > > - default: > > > - unreachable("Invalid intrinsic"); > > > + assert(instr->src[0].ssa && > > > + instr->src[0].ssa->parent_instr->type == > > > nir_instr_type_intrinsic); > > > + nir_intrinsic_instr *bary_intrinsic = > > > + nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); > > > + nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; > > > + enum glsl_interp_mode interp_mode = > > > + (enum glsl_interp_mode) > > > nir_intrinsic_interp_mode(bary_intrinsic); > > > + fs_reg dst_xy; > > > + > > > + if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || > > > + bary_intrin == nir_intrinsic_load_barycentric_at_sample) { > > > + /* Use the result of the PI message */ > > > + dst_xy = retype(get_nir_src(instr->src[0]), > BRW_REGISTER_TYPE_F); > > > + } else { > > > + /* Use the delta_xy values computed from the payload */ > > > + enum brw_barycentric_mode bary = > > > + brw_barycentric_mode(interp_mode, bary_intrin); > > > + > > > + dst_xy = this->delta_xy[bary]; > > > } > > > > > > - for (unsigned j = 0; j < instr->num_components; j++) { > > > - fs_reg src = > interp_reg(instr->variables[0]->var->data.location, > > > j); > > > - src.type = dest.type; > > > + for (unsigned int i = 0; i < instr->num_components; i++) { > > > + fs_reg interp = > > > + fs_reg(interp_reg(nir_intrinsic_base(instr), > > > + nir_intrinsic_component(instr) + i)); > > > + interp.type = BRW_REGISTER_TYPE_F; > > > + dest.type = BRW_REGISTER_TYPE_F; > > > > > > - bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src); > > > - dest = offset(dest, bld, 1); > > > + if (devinfo->needs_unlit_centroid_workaround && > > > + bary_intrin == nir_intrinsic_load_barycentric_centroid) { > > > + > > > + /* Get the pixel/sample mask into f0 so that we know which > > > + * pixels are lit. Then, for each channel that is unlit, > > > + * replace the centroid data with non-centroid data. > > > + */ > > > + bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); > > > + > > > + fs_reg dest_i = offset(dest, bld, i); > > > + fs_reg dst_xy_pixel = > > > + delta_xy[brw_barycentric_mode(interp_mode, > > > + nir_intrinsic_load_barycentric_pixel)]; > > > + > > > + fs_inst *inst; > > > + inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy_pixel, > > > interp); > > > + inst->predicate = BRW_PREDICATE_NORMAL; > > > + inst->predicate_inverse = true; > > > + inst->no_dd_clear = true; > > > + > > > + inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy, > interp); > > > + inst->predicate = BRW_PREDICATE_NORMAL; > > > + inst->predicate_inverse = false; > > > + inst->no_dd_check = true; > > > + } else if (devinfo->gen < 6 && interp_mode == > > > INTERP_MODE_SMOOTH) { > > > + fs_reg tmp = vgrf(glsl_type::float_type); > > > + bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); > > > + bld.MUL(offset(dest, bld, i), tmp, this->pixel_w); > > > + } else { > > > + bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, > > > interp); > > > + } > > > > > > > Ugh... I think I see why you're handling the barycentric load here now. > > Eventually, I think we could probably be doing this better but I'm not > > seeing an easy path at the moment. > > Yeah, perhaps. I don't think it's that bad as is, though. > > > > } > > > break; > > > } > > > + > > > default: > > > nir_emit_intrinsic(bld, instr); > > > break; > > > @@ -3869,26 +3891,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder > > > &bld, nir_intrinsic_instr *instr > > > } > > > > > > case nir_intrinsic_load_input: { > > > - fs_reg src; > > > + fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type); > > > unsigned num_components = instr->num_components; > > > enum brw_reg_type type = dest.type; > > > > > > - if (stage == MESA_SHADER_VERTEX) { > > > - src = fs_reg(ATTR, instr->const_index[0], dest.type); > > > - } else { > > > - assert(type_sz(type) >= 4); > > > - if (type == BRW_REGISTER_TYPE_DF) { > > > - /* const_index is in 32-bit type size units that could > not be > > > aligned > > > - * with DF. We need to read the double vector as if it > was a > > > float > > > - * vector of twice the number of components to fetch the > > > right data. > > > - */ > > > - dest = retype(dest, BRW_REGISTER_TYPE_F); > > > - num_components *= 2; > > > - } > > > - src = offset(retype(nir_inputs, dest.type), bld, > > > - instr->const_index[0]); > > > - } > > > > > > > This hunk seems odd... It doesn't look like it really has anything to do > > with FS. Is that else clause really FS-only or does it apply to GS and > > tess stages? > > Yep, this is FS specific. TCS/TES/GS input intrinsics are all handled > by nir_emit_{tcs,tes,gs}_intrinsic. This code in the generic intrinsic > handler is only for VS and FS. And the VS code obviously didn't include > this hunk. > > Sorry this was immensely non-obvious. This raises a good point, though: > after this patch, I can move the remaining code to nir_emit_vs_intrinsic > so it's clear it only applies to the VS. > I'd like to see that patch if you don't mind. > > > > - > > > nir_const_value *const_offset = > > > nir_src_as_const_value(instr->src[0]); > > > assert(const_offset && "Indirect input loads not allowed"); > > > src = offset(src, bld, const_offset->u32[0]); > > > diff --git a/src/mesa/drivers/dri/i965/brw_nir.c > > > b/src/mesa/drivers/dri/i965/brw_nir.c > > > index caf9fe0..d1a823a 100644 > > > --- a/src/mesa/drivers/dri/i965/brw_nir.c > > > +++ b/src/mesa/drivers/dri/i965/brw_nir.c > > > @@ -30,7 +30,8 @@ static bool > > > is_input(nir_intrinsic_instr *intrin) > > > { > > > return intrin->intrinsic == nir_intrinsic_load_input || > > > - intrin->intrinsic == nir_intrinsic_load_per_vertex_input; > > > + intrin->intrinsic == nir_intrinsic_load_per_vertex_input || > > > + intrin->intrinsic == nir_intrinsic_load_interpolated_input; > > > } > > > > > > static bool > > > @@ -282,9 +283,16 @@ brw_nir_lower_tes_inputs(nir_shader *nir, const > > > struct brw_vue_map *vue_map) > > > void > > > brw_nir_lower_fs_inputs(nir_shader *nir) > > > { > > > - nir_assign_var_locations(&nir->inputs, &nir->num_inputs, > > > VARYING_SLOT_VAR0, > > > - type_size_scalar); > > > - nir_lower_io(nir, nir_var_shader_in, type_size_scalar, false); > > > + foreach_list_typed(nir_variable, var, node, &nir->inputs) { > > > + var->data.driver_location = var->data.location; > > > + } > > > + > > > + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, true); > > > + > > > + /* This pass needs actual constants */ > > > + nir_opt_constant_folding(nir); > > > + > > > + add_const_offset_to_base(nir, nir_var_shader_in); > > > } > > > > > > void > > > -- > > > 2.9.0 > > > > > > _______________________________________________ > > > mesa-dev mailing list > > > mesa-dev@lists.freedesktop.org > > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > > > > > > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev