Re: [Mesa-dev] [PATCH 6/7] i965: Rewrite FS input handling to use the new NIR intrinsics.

Jason Ekstrand Tue, 19 Jul 2016 18:40:05 -0700

On Tue, Jul 19, 2016 at 5:02 PM, Kenneth Graunke <kenn...@whitecape.org>
wrote:


> On Tuesday, July 19, 2016 1:39:23 PM PDT Jason Ekstrand wrote:
> > On Mon, Jul 18, 2016 at 1:26 PM, Kenneth Graunke <kenn...@whitecape.org>
> > wrote:
> >
> > > This eliminates the need to walk the list of input variables, recurse
> > > into their types (via logic largely redundant with nir_lower_io), and
> > > interpolate all possible inputs up front.  The backend no longer has
> > > to care about variables at all, which eliminates complications from
> > > trying to pack multiple variables into the same location.  Instead,
> > > each intrinsic specifies exactly what's needed.
> > >
> > > This should unblock Timothy's work on GL_ARB_enhanced_layouts.
> > >
> > > Each load_interpolated_input intrinsic corresponds to PLN instructions,
> > > while load_barycentric_at_* intrinsics correspond to pixel interpolator
> > > messages.  The pixel/centroid/sample barycentric intrinsics simply
> refer
> > > to payload fields (delta_xy[]), and don't actually generate any code.
> > >
> > > Because we use a single intrinsic for both centroid-qualified variables
> > > and interpolateAtCentroid(), they become indistinguishable.  We stop
> > > sending pixel interpolator messages for those, and instead use the
> > > payload provided data, which should be considerably faster.
> > >
> > > On Broadwell:
> > >
> > > total instructions in shared programs: 9067751 -> 9067570 (-0.00%)
> > > instructions in affected programs: 145902 -> 145721 (-0.12%)
> > > helped: 422
> > > HURT: 209
> > >
> > > total spills in shared programs: 2849 -> 2899 (1.76%)
> > > spills in affected programs: 760 -> 810 (6.58%)
> > > helped: 0
> > > HURT: 10
> > >
> > > total fills in shared programs: 3910 -> 3950 (1.02%)
> > > fills in affected programs: 617 -> 657 (6.48%)
> > > helped: 0
> > > HURT: 10
> > >
> > > LOST:   3
> > > GAINED: 3
> > >
> > > The differences mostly appear to be slight changes in MOVs.
> > >
> > > Signed-off-by: Kenneth Graunke <kenn...@whitecape.org>
> > > ---
> > >  src/mesa/drivers/dri/i965/brw_fs.cpp     | 175 ++++---------
> > >  src/mesa/drivers/dri/i965/brw_fs.h       |   9 +-
> > >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 410
> > > ++++++++++++++++---------------
> > >  src/mesa/drivers/dri/i965/brw_nir.c      |  16 +-
> > >  4 files changed, 269 insertions(+), 341 deletions(-)
> > >
> > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > index 94127bc..06007fe 100644
> > > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > @@ -1067,21 +1067,27 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg
> > > wpos)
> > >     bld.MOV(wpos, this->wpos_w);
> > >  }
> > >
> > > -static enum brw_barycentric_mode
> > > -barycentric_mode(enum glsl_interp_mode mode,
> > > -                 bool is_centroid, bool is_sample)
> > > +enum brw_barycentric_mode
> > > +brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
> > >  {
> > > -   unsigned bary;
> > > -
> > >     /* Barycentric modes don't make sense for flat inputs. */
> > >     assert(mode != INTERP_MODE_FLAT);
> > >
> > > -   if (is_sample) {
> > > -      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
> > > -   } else if (is_centroid) {
> > > -      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
> > > -   } else {
> > > +   unsigned bary;
> > > +   switch (op) {
> > > +   case nir_intrinsic_load_barycentric_pixel:
> > > +   case nir_intrinsic_load_barycentric_at_offset:
> > >        bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
> > > +      break;
> > > +   case nir_intrinsic_load_barycentric_centroid:
> > > +      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
> > > +      break;
> > > +   case nir_intrinsic_load_barycentric_sample:
> > > +   case nir_intrinsic_load_barycentric_at_sample:
> > > +      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
> > > +      break;
> > > +   default:
> > > +      assert(!"invalid intrinsic");
> > >     }
> > >
> > >     if (mode == INTERP_MODE_NOPERSPECTIVE)
> > > @@ -1101,107 +1107,6 @@ centroid_to_pixel(enum brw_barycentric_mode
> bary)
> > >     return (enum brw_barycentric_mode) ((unsigned) bary - 1);
> > >  }
> > >
> > > -void
> > > -fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
> > > -                                       const glsl_type *type,
> > > -                                       glsl_interp_mode
> > > interpolation_mode,
> > > -                                       int *location, bool
> mod_centroid,
> > > -                                       bool mod_sample)
> > > -{
> > > -   assert(stage == MESA_SHADER_FRAGMENT);
> > > -   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> > > -
> > > -   if (type->is_array() || type->is_matrix()) {
> > > -      const glsl_type *elem_type = glsl_get_array_element(type);
> > > -      const unsigned length = glsl_get_length(type);
> > > -
> > > -      for (unsigned i = 0; i < length; i++) {
> > > -         emit_general_interpolation(attr, name, elem_type,
> > > interpolation_mode,
> > > -                                    location, mod_centroid,
> mod_sample);
> > > -      }
> > > -   } else if (type->is_record()) {
> > > -      for (unsigned i = 0; i < type->length; i++) {
> > > -         const glsl_type *field_type = type->fields.structure[i].type;
> > > -         emit_general_interpolation(attr, name, field_type,
> > > interpolation_mode,
> > > -                                    location, mod_centroid,
> mod_sample);
> > > -      }
> > > -   } else {
> > > -      assert(type->is_scalar() || type->is_vector());
> > > -
> > > -      if (prog_data->urb_setup[*location] == -1) {
> > > -         /* If there's no incoming setup data for this slot, don't
> > > -          * emit interpolation for it.
> > > -          */
> > > -         *attr = offset(*attr, bld, type->vector_elements);
> > > -         (*location)++;
> > > -         return;
> > > -      }
> > > -
> > > -      attr->type = brw_type_for_base_type(type->get_scalar_type());
> > > -
> > > -      if (interpolation_mode == INTERP_MODE_FLAT) {
> > > -         /* Constant interpolation (flat shading) case. The SF has
> > > -          * handed us defined values in only the constant offset
> > > -          * field of the setup reg.
> > > -          */
> > > -         unsigned vector_elements = type->vector_elements;
> > > -
> > > -         /* Data starts at suboffet 3 in 32-bit units (12 bytes), so
> it
> > > is not
> > > -          * 64-bit aligned and the current implementation fails to
> read
> > > the
> > > -          * data properly. Instead, when there is a double input
> varying,
> > > -          * read it as vector of floats with twice the number of
> > > components.
> > > -          */
> > > -         if (attr->type == BRW_REGISTER_TYPE_DF) {
> > > -            vector_elements *= 2;
> > > -            attr->type = BRW_REGISTER_TYPE_F;
> > > -         }
> > > -         for (unsigned int i = 0; i < vector_elements; i++) {
> > > -            struct brw_reg interp = interp_reg(*location, i);
> > > -            interp = suboffset(interp, 3);
> > > -            interp.type = attr->type;
> > > -            bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));
> > > -            *attr = offset(*attr, bld, 1);
> > > -         }
> > > -      } else {
> > > -         /* Smooth/noperspective interpolation case. */
> > > -         enum brw_barycentric_mode bary =
> > > -            barycentric_mode(interpolation_mode, mod_centroid,
> > > mod_sample);
> > > -
> > > -         for (unsigned int i = 0; i < type->vector_elements; i++) {
> > > -            fs_reg interp(interp_reg(*location, i));
> > > -            if (devinfo->needs_unlit_centroid_workaround &&
> mod_centroid)
> > > {
> > > -               /* Get the pixel/sample mask into f0 so that we know
> > > -                * which pixels are lit.  Then, for each channel that
> is
> > > -                * unlit, replace the centroid data with non-centroid
> > > -                * data.
> > > -                */
> > > -               bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> > > -
> > > -               fs_inst *inst;
> > > -               inst = bld.emit(FS_OPCODE_LINTERP, *attr,
> > > -                               delta_xy[centroid_to_pixel(bary)],
> interp);
> > > -               inst->predicate = BRW_PREDICATE_NORMAL;
> > > -               inst->predicate_inverse = true;
> > > -               inst->no_dd_clear = true;
> > > -
> > > -               inst = bld.emit(FS_OPCODE_LINTERP, *attr,
> > > -                               delta_xy[bary], interp);
> > > -               inst->predicate = BRW_PREDICATE_NORMAL;
> > > -               inst->predicate_inverse = false;
> > > -               inst->no_dd_check = true;
> > > -            } else {
> > > -               bld.emit(FS_OPCODE_LINTERP, *attr, delta_xy[bary],
> interp);
> > > -            }
> > > -            if (devinfo->gen < 6 && interpolation_mode ==
> > > INTERP_MODE_SMOOTH) {
> > > -               bld.MUL(*attr, *attr, this->pixel_w);
> > > -            }
> > > -            *attr = offset(*attr, bld, 1);
> > > -         }
> > > -      }
> > > -      (*location)++;
> > > -   }
> > > -}
> > > -
> > >  fs_reg *
> > >  fs_visitor::emit_frontfacing_interpolation()
> > >  {
> > > @@ -6327,6 +6232,10 @@ fs_visitor::run_cs()
> > >  /**
> > >   * Return a bitfield where bit n is set if barycentric interpolation
> mode
> > > n
> > >   * (see enum brw_barycentric_mode) is needed by the fragment shader.
> > > + *
> > > + * We examine the load_barycentric intrinsics rather than looking at
> input
> > > + * variables so that we catch interpolateAtCentroid() messages too,
> which
> > > + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set
> up.
> > >   */
> > >  static unsigned
> > >  brw_compute_barycentric_interp_modes(const struct brw_device_info
> > > *devinfo,
> > > @@ -6334,29 +6243,37 @@ brw_compute_barycentric_interp_modes(const
> struct
> > > brw_device_info *devinfo,
> > >  {
> > >     unsigned barycentric_interp_modes = 0;
> > >
> > > -   nir_foreach_variable(var, &shader->inputs) {
> > > -      /* Ignore WPOS; it doesn't require interpolation. */
> > > -      if (var->data.location == VARYING_SLOT_POS)
> > > +   nir_foreach_function(f, shader) {
> > > +      if (!f->impl)
> > >           continue;
> > >
> > > -      /* Flat inputs don't need barycentric modes. */
> > > -      if (var->data.interpolation == INTERP_MODE_FLAT)
> > > -         continue;
> > > +      nir_foreach_block(block, f->impl) {
> > > +         nir_foreach_instr(instr, block) {
> > > +            if (instr->type != nir_instr_type_intrinsic)
> > > +               continue;
> > >
> > > -      /* Determine the set (or sets) of barycentric coordinates
> needed to
> > > -       * interpolate this variable.  Note that when
> > > -       * brw->needs_unlit_centroid_workaround is set, centroid
> > > interpolation
> > > -       * uses PIXEL interpolation for unlit pixels and CENTROID
> > > interpolation
> > > -       * for lit pixels, so we need both sets of barycentric
> coordinates.
> > > -       */
> > > -      enum brw_barycentric_mode bary_mode =
> > > -         barycentric_mode((glsl_interp_mode) var->data.interpolation,
> > > -                          var->data.centroid, var->data.sample);
> > > +            nir_intrinsic_instr *intrin =
> nir_instr_as_intrinsic(instr);
> > > +            if (intrin->intrinsic !=
> > > nir_intrinsic_load_interpolated_input)
> > > +               continue;
> > >
> >
> > Any particular reason why you're looking at the source of the load rather
> > than just looking for the load_barycentric intrinsic directly?
> >
> >
> > > +
> > > +            /* Ignore WPOS; it doesn't require interpolation. */
> > > +            if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
> > > +               continue;
> > >
> >
> > Ugh...
>
> This is why.  gl_FragCoord still generates a load_interpolated_input
> with a load_barycentric_pixel, and I want to skip that.  (It might be
> the only user of pixel coordinates, and if so, we don't want them.)
>
> You need the load_interpolated_input intrinsic to know which input it is.
>
> It's pretty clear that gl_FragCoord doesn't behave anything like a
> normal input, so it ought to be a system value.  With that cleaned up,
> we could simplify this.
>
> That ended up being more work than I expected, so I was hoping to do
> it as a follow-on series, to unblock Tim sooner rather than later.
>

I'm ok with the "unblock Tim ASAP plan".  I would like to see gl_FragCoord
as a system value eventually though.  It just seems so much cleaner.


>
> > >
> > > -      barycentric_interp_modes |= 1 << bary_mode;
> > > +            intrin =
> > > nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
> > > +            enum glsl_interp_mode interp = (enum glsl_interp_mode)
> > > +               nir_intrinsic_interp_mode(intrin);
> > > +            nir_intrinsic_op bary_op = intrin->intrinsic;
> > > +            enum brw_barycentric_mode bary =
> > > +               brw_barycentric_mode(interp, bary_op);
> > >
> > > -      if (var->data.centroid &&
> devinfo->needs_unlit_centroid_workaround)
> > > -         barycentric_interp_modes |= 1 <<
> centroid_to_pixel(bary_mode);
> > > +            barycentric_interp_modes |= 1 << bary;
> > > +
> > > +            if (devinfo->needs_unlit_centroid_workaround &&
> > > +                bary_op == nir_intrinsic_load_barycentric_centroid)
> > > +               barycentric_interp_modes |= 1 <<
> centroid_to_pixel(bary);
> > > +         }
> > > +      }
> > >     }
> > >
> > >     return barycentric_interp_modes;
> > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h
> > > b/src/mesa/drivers/dri/i965/brw_fs.h
> > > index 7998f51..574475f 100644
> > > --- a/src/mesa/drivers/dri/i965/brw_fs.h
> > > +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> > > @@ -174,11 +174,6 @@ public:
> > >     fs_reg *emit_samplepos_setup();
> > >     fs_reg *emit_sampleid_setup();
> > >     fs_reg *emit_samplemaskin_setup();
> > > -   void emit_general_interpolation(fs_reg *attr, const char *name,
> > > -                                   const glsl_type *type,
> > > -                                   glsl_interp_mode
> interpolation_mode,
> > > -                                   int *location, bool mod_centroid,
> > > -                                   bool mod_sample);
> > >     fs_reg *emit_vs_system_value(int location);
> > >     void emit_interpolation_setup_gen4();
> > >     void emit_interpolation_setup_gen6();
> > > @@ -195,7 +190,6 @@ public:
> > >     bool opt_zero_samples();
> > >
> > >     void emit_nir_code();
> > > -   void nir_setup_inputs();
> > >     void nir_setup_single_output_varying(fs_reg *reg, const glsl_type
> > > *type,
> > >                                          unsigned *location);
> > >     void nir_setup_outputs();
> > > @@ -511,3 +505,6 @@ void shuffle_64bit_data_for_32bit_write(const
> > > brw::fs_builder &bld,
> > >                                          uint32_t components);
> > >  fs_reg setup_imm_df(const brw::fs_builder &bld,
> > >                      double v);
> > > +
> > > +enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode
> mode,
> > > +                                               nir_intrinsic_op op);
> > > diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > > b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > > index 6265dc6..610c151 100644
> > > --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > > +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > > @@ -36,7 +36,6 @@ fs_visitor::emit_nir_code()
> > >     /* emit the arrays used for inputs and outputs - load/store
> intrinsics
> > > will
> > >      * be converted to reads/writes of these arrays
> > >      */
> > > -   nir_setup_inputs();
> > >     nir_setup_outputs();
> > >     nir_setup_uniforms();
> > >     nir_emit_system_values();
> > > @@ -50,38 +49,6 @@ fs_visitor::emit_nir_code()
> > >  }
> > >
> > >  void
> > > -fs_visitor::nir_setup_inputs()
> > > -{
> > > -   if (stage != MESA_SHADER_FRAGMENT)
> > > -      return;
> > > -
> > > -   nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
> > > -
> > > -   nir_foreach_variable(var, &nir->inputs) {
> > > -      fs_reg input = offset(nir_inputs, bld,
> var->data.driver_location);
> > > -
> > > -      fs_reg reg;
> > > -      if (var->data.location == VARYING_SLOT_POS) {
> > > -         emit_fragcoord_interpolation(input);
> > > -      } else if (var->data.location == VARYING_SLOT_LAYER) {
> > > -         struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER,
> > > 1), 3);
> > > -         reg.type = BRW_REGISTER_TYPE_D;
> > > -         bld.emit(FS_OPCODE_CINTERP, retype(input,
> BRW_REGISTER_TYPE_D),
> > > reg);
> > > -      } else if (var->data.location == VARYING_SLOT_VIEWPORT) {
> > > -         struct brw_reg reg =
> suboffset(interp_reg(VARYING_SLOT_VIEWPORT,
> > > 2), 3);
> > > -         reg.type = BRW_REGISTER_TYPE_D;
> > > -         bld.emit(FS_OPCODE_CINTERP, retype(input,
> BRW_REGISTER_TYPE_D),
> > > reg);
> > > -      } else {
> > > -         int location = var->data.location;
> > > -         emit_general_interpolation(&input, var->name, var->type,
> > > -                                    (glsl_interp_mode)
> > > var->data.interpolation,
> > > -                                    &location, var->data.centroid,
> > > -                                    var->data.sample);
> > > -      }
> > > -   }
> > > -}
> > > -
> > > -void
> > >  fs_visitor::nir_setup_single_output_varying(fs_reg *reg,
> > >                                              const glsl_type *type,
> > >                                              unsigned *location)
> > > @@ -3063,7 +3030,6 @@ fs_visitor::nir_emit_fs_intrinsic(const
> fs_builder
> > > &bld,
> > >                                    nir_intrinsic_instr *instr)
> > >  {
> > >     assert(stage == MESA_SHADER_FRAGMENT);
> > > -   const struct brw_wm_prog_key *wm_key = (const struct
> brw_wm_prog_key
> > > *) key;
> > >
> > >     fs_reg dest;
> > >     if (nir_intrinsic_infos[instr->intrinsic].has_dest)
> > > @@ -3120,189 +3086,245 @@ fs_visitor::nir_emit_fs_intrinsic(const
> > > fs_builder &bld,
> > >        break;
> > >     }
> > >
> > > -   case nir_intrinsic_interp_var_at_centroid:
> > > -   case nir_intrinsic_interp_var_at_sample:
> > > -   case nir_intrinsic_interp_var_at_offset: {
> > > -      /* Handle ARB_gpu_shader5 interpolation intrinsics
> > > -       *
> > > -       * It's worth a quick word of explanation as to why we handle
> the
> > > full
> > > -       * variable-based interpolation intrinsic rather than a lowered
> > > version
> > > -       * with like we do for other inputs.  We have to do that because
> > > the way
> > > -       * we set up inputs doesn't allow us to use the already setup
> > > inputs for
> > > -       * interpolation.  At the beginning of the shader, we go through
> > > all of
> > > -       * the input variables and do the initial interpolation and put
> it
> > > in
> > > -       * the nir_inputs array based on its location as determined in
> > > -       * nir_lower_io.  If the input isn't used, dead code cleans up
> and
> > > -       * everything works fine.  However, when we get to the
> > > ARB_gpu_shader5
> > > -       * interpolation intrinsics, we need to reinterpolate the input
> > > -       * differently.  If we used an intrinsic that just had an index
> it
> > > would
> > > -       * only give us the offset into the nir_inputs array.  However,
> > > this is
> > > -       * useless because that value is post-interpolation and we need
> > > -       * pre-interpolation.  In order to get the actual location of
> the
> > > bits
> > > -       * we get from the vertex fetching hardware, we need the
> variable.
> > > -       */
> > > -      fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
> > > -      const glsl_interp_mode interpolation =
> > > -         (glsl_interp_mode)
> instr->variables[0]->var->data.interpolation;
> > > +   case nir_intrinsic_load_input: {
> > > +      /* load_input is only used for flat inputs */
> > > +      unsigned base = nir_intrinsic_base(instr);
> > > +      unsigned component = nir_intrinsic_component(instr);
> > > +      unsigned num_components = instr->num_components;
> > > +      enum brw_reg_type type = dest.type;
> > >
> > > -      switch (instr->intrinsic) {
> > > -      case nir_intrinsic_interp_var_at_centroid:
> > > -         emit_pixel_interpolater_send(bld,
> > > -
> FS_OPCODE_INTERPOLATE_AT_CENTROID,
> > > -                                      dst_xy,
> > > -                                      fs_reg(), /* src */
> > > -                                      brw_imm_ud(0u),
> > > -                                      interpolation);
> > > -         break;
> > > +      /* Special case fields in the VUE header */
> > > +      if (base == VARYING_SLOT_LAYER)
> > > +         component = 1;
> > > +      else if (base == VARYING_SLOT_VIEWPORT)
> > > +         component = 2;
> > >
> > > -      case nir_intrinsic_interp_var_at_sample: {
> > > -         if (!wm_key->multisample_fbo) {
> > > -            /* From the ARB_gpu_shader5 specification:
> > > -             * "If multisample buffers are not available, the input
> > > varying
> > > -             *  will be evaluated at the center of the pixel."
> > > -             */
> > > -            emit_pixel_interpolater_send(bld,
> > > -
> > >  FS_OPCODE_INTERPOLATE_AT_CENTROID,
> > > -                                         dst_xy,
> > > -                                         fs_reg(), /* src */
> > > -                                         brw_imm_ud(0u),
> > > -                                         interpolation);
> > > -            break;
> > > -         }
> > > +      if (nir_dest_bit_size(instr->dest) == 64) {
> > > +         /* const_index is in 32-bit type size units that could not be
> > > aligned
> > > +          * with DF. We need to read the double vector as if it was a
> > > float
> > > +          * vector of twice the number of components to fetch the
> right
> > > data.
> > > +          */
> > > +         type = BRW_REGISTER_TYPE_F;
> > > +         num_components *= 2;
> > > +      }
> > >
> > > -         nir_const_value *const_sample =
> > > nir_src_as_const_value(instr->src[0]);
> > > +      for (unsigned int i = 0; i < num_components; i++) {
> > > +         struct brw_reg interp = interp_reg(base, component + i);
> > > +         interp = suboffset(interp, 3);
> > > +         bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld,
> i),
> > > +                  retype(fs_reg(interp), type));
> > > +      }
> > >
> > > -         if (const_sample) {
> > > -            unsigned msg_data = const_sample->i32[0] << 4;
> > > +      if (nir_dest_bit_size(instr->dest) == 64) {
> > > +         shuffle_32bit_load_result_to_64bit_data(bld,
> > > +                                                 dest,
> > > +                                                 retype(dest, type),
> > > +
>  instr->num_components);
> > > +      }
> > > +      break;
> > > +   }
> > > +
> > > +   case nir_intrinsic_load_barycentric_pixel:
> > > +   case nir_intrinsic_load_barycentric_centroid:
> > > +   case nir_intrinsic_load_barycentric_sample:
> > > +      /* Do nothing - load_interpolated_input handling will handle it
> > > later. */
> > > +      break;
> > >
> >
> > I'm very courious why you made this choice.  It seems odd to me.
>
> I originally tried emitting MOVs here.  The data layout is a bit funky:
>
>     R3: X coordinates for Slots 0-7
>     R4: Y coordinates for Slots 0-7
>     R5: X coordinates for Slots 8-15
>     R5: Y coordinates for Slots 8-15
>
> For SIMD8, you could do two separate MOVs (move X, then Y).  You might be
> able to do a compressed mov(16)...but would need to whack the channel
> enables into 1Q for both halves (if that's possible) or use NoMask
> (which I wanted to avoid).
>
> For SIMD16...you can't move both X's with a single MOV.  A <16,8,1>
> region would cover it, but sources can't span more than two adjacent
> registers.  So that doesn't work.  It might take four MOVs.  Maybe
> only two if I could use the above trick.
>
> Generating MOVs also means that we're relying on the optimizer to
> remove them...which it didn't seem to be.  It ended up being much
> easier to just point LINTERP at delta_xy[] directly.
>

Right... I forgot about the odd interleaving.  That seems like a reasonable
reason for doing it this way.


>
> > > +   case nir_intrinsic_load_barycentric_at_sample: {
> > > +      const glsl_interp_mode interpolation =
> > > +         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
> > > +
> > > +      nir_const_value *const_sample =
> > > nir_src_as_const_value(instr->src[0]);
> > > +
> > > +      if (const_sample) {
> > > +         unsigned msg_data = const_sample->i32[0] << 4;
> > > +
> > > +         emit_pixel_interpolater_send(bld,
> > > +                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > > +                                      dest,
> > > +                                      fs_reg(), /* src */
> > > +                                      brw_imm_ud(msg_data),
> > > +                                      interpolation);
> > > +      } else {
> > > +         const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
> > > +                                          BRW_REGISTER_TYPE_UD);
> > > +
> > > +         if (nir_src_is_dynamically_uniform(instr->src[0])) {
> > > +            const fs_reg sample_id = bld.emit_uniformize(sample_src);
> > > +            const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > > +            bld.exec_all().group(1, 0)
> > > +               .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > >              emit_pixel_interpolater_send(bld,
> > >
>  FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > > -                                         dst_xy,
> > > +                                         dest,
> > >                                           fs_reg(), /* src */
> > > -                                         brw_imm_ud(msg_data),
> > > +                                         msg_data,
> > >                                           interpolation);
> > >           } else {
> > > -            const fs_reg sample_src =
> retype(get_nir_src(instr->src[0]),
> > > -                                             BRW_REGISTER_TYPE_UD);
> > > -
> > > -            if (nir_src_is_dynamically_uniform(instr->src[0])) {
> > > -               const fs_reg sample_id =
> bld.emit_uniformize(sample_src);
> > > -               const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > > -               bld.exec_all().group(1, 0)
> > > -                  .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > > +            /* Make a loop that sends a message to the pixel
> interpolater
> > > +             * for the sample number in each live channel. If there
> are
> > > +             * multiple channels with the same sample number then
> these
> > > +             * will be handled simultaneously with a single
> interation of
> > > +             * the loop.
> > > +             */
> > > +            bld.emit(BRW_OPCODE_DO);
> > > +
> > > +            /* Get the next live sample number into sample_id_reg */
> > > +            const fs_reg sample_id = bld.emit_uniformize(sample_src);
> > > +
> > > +            /* Set the flag register so that we can perform the send
> > > +             * message on all channels that have the same sample
> number
> > > +             */
> > > +            bld.CMP(bld.null_reg_ud(),
> > > +                    sample_src, sample_id,
> > > +                    BRW_CONDITIONAL_EQ);
> > > +            const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > > +            bld.exec_all().group(1, 0)
> > > +               .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > > +            fs_inst *inst =
> > >                 emit_pixel_interpolater_send(bld,
> > >
> > >  FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > > -                                            dst_xy,
> > > +                                            dest,
> > >                                              fs_reg(), /* src */
> > >                                              msg_data,
> > >                                              interpolation);
> > > -            } else {
> > > -               /* Make a loop that sends a message to the pixel
> > > interpolater
> > > -                * for the sample number in each live channel. If
> there are
> > > -                * multiple channels with the same sample number then
> these
> > > -                * will be handled simultaneously with a single
> interation
> > > of
> > > -                * the loop.
> > > -                */
> > > -               bld.emit(BRW_OPCODE_DO);
> > > -
> > > -               /* Get the next live sample number into sample_id_reg
> */
> > > -               const fs_reg sample_id =
> bld.emit_uniformize(sample_src);
> > > +            set_predicate(BRW_PREDICATE_NORMAL, inst);
> > >
> > > -               /* Set the flag register so that we can perform the
> send
> > > -                * message on all channels that have the same sample
> number
> > > -                */
> > > -               bld.CMP(bld.null_reg_ud(),
> > > -                       sample_src, sample_id,
> > > -                       BRW_CONDITIONAL_EQ);
> > > -               const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > > -               bld.exec_all().group(1, 0)
> > > -                  .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > > -               fs_inst *inst =
> > > -                  emit_pixel_interpolater_send(bld,
> > > -
> > >  FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > > -                                               dst_xy,
> > > -                                               fs_reg(), /* src */
> > > -                                               msg_data,
> > > -                                               interpolation);
> > > -               set_predicate(BRW_PREDICATE_NORMAL, inst);
> > > -
> > > -               /* Continue the loop if there are any live channels
> left */
> > > -               set_predicate_inv(BRW_PREDICATE_NORMAL,
> > > -                                 true, /* inverse */
> > > -                                 bld.emit(BRW_OPCODE_WHILE));
> > > -            }
> > > +            /* Continue the loop if there are any live channels left
> */
> > > +            set_predicate_inv(BRW_PREDICATE_NORMAL,
> > > +                              true, /* inverse */
> > > +                              bld.emit(BRW_OPCODE_WHILE));
> > >           }
> > > -
> > > -         break;
> > >        }
> > > +      break;
> > > +   }
> > >
> > > -      case nir_intrinsic_interp_var_at_offset: {
> > > -         nir_const_value *const_offset =
> > > nir_src_as_const_value(instr->src[0]);
> > > +   case nir_intrinsic_load_barycentric_at_offset: {
> > > +      const glsl_interp_mode interpolation =
> > > +         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
> > >
> > > -         if (const_offset) {
> > > -            unsigned off_x = MIN2((int)(const_offset->f32[0] * 16),
> 7) &
> > > 0xf;
> > > -            unsigned off_y = MIN2((int)(const_offset->f32[1] * 16),
> 7) &
> > > 0xf;
> > > +      nir_const_value *const_offset =
> > > nir_src_as_const_value(instr->src[0]);
> > >
> > > -            emit_pixel_interpolater_send(bld,
> > > -
> > >  FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
> > > -                                         dst_xy,
> > > -                                         fs_reg(), /* src */
> > > -                                         brw_imm_ud(off_x | (off_y <<
> 4)),
> > > -                                         interpolation);
> > > -         } else {
> > > -            fs_reg src = vgrf(glsl_type::ivec2_type);
> > > -            fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> > > -                                       BRW_REGISTER_TYPE_F);
> > > -            for (int i = 0; i < 2; i++) {
> > > -               fs_reg temp = vgrf(glsl_type::float_type);
> > > -               bld.MUL(temp, offset(offset_src, bld, i),
> > > brw_imm_f(16.0f));
> > > -               fs_reg itemp = vgrf(glsl_type::int_type);
> > > -               /* float to int */
> > > -               bld.MOV(itemp, temp);
> > > -
> > > -               /* Clamp the upper end of the range to +7/16.
> > > -                * ARB_gpu_shader5 requires that we support a maximum
> > > offset
> > > -                * of +0.5, which isn't representable in a S0.4 value
> -- if
> > > -                * we didn't clamp it, we'd end up with -8/16, which
> is the
> > > -                * opposite of what the shader author wanted.
> > > -                *
> > > -                * This is legal due to ARB_gpu_shader5's quantization
> > > -                * rules:
> > > -                *
> > > -                * "Not all values of <offset> may be supported; x and
> y
> > > -                * offsets may be rounded to fixed-point values with
> the
> > > -                * number of fraction bits given by the
> > > -                * implementation-dependent constant
> > > -                * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> > > -                */
> > > -               set_condmod(BRW_CONDITIONAL_L,
> > > -                           bld.SEL(offset(src, bld, i), itemp,
> > > brw_imm_d(7)));
> > > -            }
> > > +      if (const_offset) {
> > > +         unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) &
> 0xf;
> > > +         unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) &
> 0xf;
> > >
> > > -            const enum opcode opcode =
> > > FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
> > > -            emit_pixel_interpolater_send(bld,
> > > -                                         opcode,
> > > -                                         dst_xy,
> > > -                                         src,
> > > -                                         brw_imm_ud(0u),
> > > -                                         interpolation);
> > > +         emit_pixel_interpolater_send(bld,
> > > +
> > > FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
> > > +                                      dest,
> > > +                                      fs_reg(), /* src */
> > > +                                      brw_imm_ud(off_x | (off_y <<
> 4)),
> > > +                                      interpolation);
> > > +      } else {
> > > +         fs_reg src = vgrf(glsl_type::ivec2_type);
> > > +         fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> > > +                                    BRW_REGISTER_TYPE_F);
> > > +         for (int i = 0; i < 2; i++) {
> > > +            fs_reg temp = vgrf(glsl_type::float_type);
> > > +            bld.MUL(temp, offset(offset_src, bld, i),
> brw_imm_f(16.0f));
> > > +            fs_reg itemp = vgrf(glsl_type::int_type);
> > > +            /* float to int */
> > > +            bld.MOV(itemp, temp);
> > > +
> > > +            /* Clamp the upper end of the range to +7/16.
> > > +             * ARB_gpu_shader5 requires that we support a maximum
> offset
> > > +             * of +0.5, which isn't representable in a S0.4 value --
> if
> > > +             * we didn't clamp it, we'd end up with -8/16, which is
> the
> > > +             * opposite of what the shader author wanted.
> > > +             *
> > > +             * This is legal due to ARB_gpu_shader5's quantization
> > > +             * rules:
> > > +             *
> > > +             * "Not all values of <offset> may be supported; x and y
> > > +             * offsets may be rounded to fixed-point values with the
> > > +             * number of fraction bits given by the
> > > +             * implementation-dependent constant
> > > +             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> > > +             */
> > > +            set_condmod(BRW_CONDITIONAL_L,
> > > +                        bld.SEL(offset(src, bld, i), itemp,
> > > brw_imm_d(7)));
> > >           }
> > > +
> > > +         const enum opcode opcode =
> > > FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
> > > +         emit_pixel_interpolater_send(bld,
> > > +                                      opcode,
> > > +                                      dest,
> > > +                                      src,
> > > +                                      brw_imm_ud(0u),
> > > +                                      interpolation);
> > > +      }
> > > +      break;
> > > +   }
> > > +
> > > +   case nir_intrinsic_load_interpolated_input: {
> > > +      if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
> > > +         emit_fragcoord_interpolation(dest);
> > >           break;
> > >        }
> > >
> > > -      default:
> > > -         unreachable("Invalid intrinsic");
> > > +      assert(instr->src[0].ssa &&
> > > +             instr->src[0].ssa->parent_instr->type ==
> > > nir_instr_type_intrinsic);
> > > +      nir_intrinsic_instr *bary_intrinsic =
> > > +         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
> > > +      nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
> > > +      enum glsl_interp_mode interp_mode =
> > > +         (enum glsl_interp_mode)
> > > nir_intrinsic_interp_mode(bary_intrinsic);
> > > +      fs_reg dst_xy;
> > > +
> > > +      if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
> > > +          bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
> > > +         /* Use the result of the PI message */
> > > +         dst_xy = retype(get_nir_src(instr->src[0]),
> BRW_REGISTER_TYPE_F);
> > > +      } else {
> > > +         /* Use the delta_xy values computed from the payload */
> > > +         enum brw_barycentric_mode bary =
> > > +            brw_barycentric_mode(interp_mode, bary_intrin);
> > > +
> > > +         dst_xy = this->delta_xy[bary];
> > >        }
> > >
> > > -      for (unsigned j = 0; j < instr->num_components; j++) {
> > > -         fs_reg src =
> interp_reg(instr->variables[0]->var->data.location,
> > > j);
> > > -         src.type = dest.type;
> > > +      for (unsigned int i = 0; i < instr->num_components; i++) {
> > > +         fs_reg interp =
> > > +            fs_reg(interp_reg(nir_intrinsic_base(instr),
> > > +                              nir_intrinsic_component(instr) + i));
> > > +         interp.type = BRW_REGISTER_TYPE_F;
> > > +         dest.type = BRW_REGISTER_TYPE_F;
> > >
> > > -         bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
> > > -         dest = offset(dest, bld, 1);
> > > +         if (devinfo->needs_unlit_centroid_workaround &&
> > > +             bary_intrin == nir_intrinsic_load_barycentric_centroid) {
> > > +
> > > +            /* Get the pixel/sample mask into f0 so that we know which
> > > +             * pixels are lit.  Then, for each channel that is unlit,
> > > +             * replace the centroid data with non-centroid data.
> > > +             */
> > > +            bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> > > +
> > > +            fs_reg dest_i = offset(dest, bld, i);
> > > +            fs_reg dst_xy_pixel =
> > > +               delta_xy[brw_barycentric_mode(interp_mode,
> > > +                  nir_intrinsic_load_barycentric_pixel)];
> > > +
> > > +            fs_inst *inst;
> > > +            inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy_pixel,
> > > interp);
> > > +            inst->predicate = BRW_PREDICATE_NORMAL;
> > > +            inst->predicate_inverse = true;
> > > +            inst->no_dd_clear = true;
> > > +
> > > +            inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy,
> interp);
> > > +            inst->predicate = BRW_PREDICATE_NORMAL;
> > > +            inst->predicate_inverse = false;
> > > +            inst->no_dd_check = true;
> > > +         } else if (devinfo->gen < 6 && interp_mode ==
> > > INTERP_MODE_SMOOTH) {
> > > +            fs_reg tmp = vgrf(glsl_type::float_type);
> > > +            bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
> > > +            bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
> > > +         } else {
> > > +            bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy,
> > > interp);
> > > +         }
> > >
> >
> > Ugh... I think I see why you're handling the barycentric load here now.
> > Eventually, I think we could probably be doing this better but I'm not
> > seeing an easy path at the moment.
>
> Yeah, perhaps.  I don't think it's that bad as is, though.
>
> > >        }
> > >        break;
> > >     }
> > > +
> > >     default:
> > >        nir_emit_intrinsic(bld, instr);
> > >        break;
> > > @@ -3869,26 +3891,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder
> > > &bld, nir_intrinsic_instr *instr
> > >     }
> > >
> > >     case nir_intrinsic_load_input: {
> > > -      fs_reg src;
> > > +      fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
> > >        unsigned num_components = instr->num_components;
> > >        enum brw_reg_type type = dest.type;
> > >
> > > -      if (stage == MESA_SHADER_VERTEX) {
> > > -         src = fs_reg(ATTR, instr->const_index[0], dest.type);
> > > -      } else {
> > > -         assert(type_sz(type) >= 4);
> > > -         if (type == BRW_REGISTER_TYPE_DF) {
> > > -            /* const_index is in 32-bit type size units that could
> not be
> > > aligned
> > > -             * with DF. We need to read the double vector as if it
> was a
> > > float
> > > -             * vector of twice the number of components to fetch the
> > > right data.
> > > -             */
> > > -            dest = retype(dest, BRW_REGISTER_TYPE_F);
> > > -            num_components *= 2;
> > > -         }
> > > -         src = offset(retype(nir_inputs, dest.type), bld,
> > > -                      instr->const_index[0]);
> > > -      }
> > >
> >
> > This hunk seems odd... It doesn't look like it really has anything to do
> > with FS.  Is that else clause really FS-only or does it apply to GS and
> > tess stages?
>
> Yep, this is FS specific.  TCS/TES/GS input intrinsics are all handled
> by nir_emit_{tcs,tes,gs}_intrinsic.  This code in the generic intrinsic
> handler is only for VS and FS.  And the VS code obviously didn't include
> this hunk.
>
> Sorry this was immensely non-obvious.  This raises a good point, though:
> after this patch, I can move the remaining code to nir_emit_vs_intrinsic
> so it's clear it only applies to the VS.
>

I'd like to see that patch if you don't mind.


>
> > > -
> > >        nir_const_value *const_offset =
> > > nir_src_as_const_value(instr->src[0]);
> > >        assert(const_offset && "Indirect input loads not allowed");
> > >        src = offset(src, bld, const_offset->u32[0]);
> > > diff --git a/src/mesa/drivers/dri/i965/brw_nir.c
> > > b/src/mesa/drivers/dri/i965/brw_nir.c
> > > index caf9fe0..d1a823a 100644
> > > --- a/src/mesa/drivers/dri/i965/brw_nir.c
> > > +++ b/src/mesa/drivers/dri/i965/brw_nir.c
> > > @@ -30,7 +30,8 @@ static bool
> > >  is_input(nir_intrinsic_instr *intrin)
> > >  {
> > >     return intrin->intrinsic == nir_intrinsic_load_input ||
> > > -          intrin->intrinsic == nir_intrinsic_load_per_vertex_input;
> > > +          intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
> > > +          intrin->intrinsic == nir_intrinsic_load_interpolated_input;
> > >  }
> > >
> > >  static bool
> > > @@ -282,9 +283,16 @@ brw_nir_lower_tes_inputs(nir_shader *nir, const
> > > struct brw_vue_map *vue_map)
> > >  void
> > >  brw_nir_lower_fs_inputs(nir_shader *nir)
> > >  {
> > > -   nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
> > > VARYING_SLOT_VAR0,
> > > -                            type_size_scalar);
> > > -   nir_lower_io(nir, nir_var_shader_in, type_size_scalar, false);
> > > +   foreach_list_typed(nir_variable, var, node, &nir->inputs) {
> > > +      var->data.driver_location = var->data.location;
> > > +   }
> > > +
> > > +   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, true);
> > > +
> > > +   /* This pass needs actual constants */
> > > +   nir_opt_constant_folding(nir);
> > > +
> > > +   add_const_offset_to_base(nir, nir_var_shader_in);
> > >  }
> > >
> > >  void
> > > --
> > > 2.9.0
> > >
> > > _______________________________________________
> > > mesa-dev mailing list
> > > mesa-dev@lists.freedesktop.org
> > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> > >
> >
>
>

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 6/7] i965: Rewrite FS input handling to use the new NIR intrinsics.

Reply via email to