Re: [Mesa-dev] [PATCH 6/7] i965: Rewrite FS input handling to use the new NIR intrinsics.

Jason Ekstrand Tue, 19 Jul 2016 13:40:15 -0700

On Mon, Jul 18, 2016 at 1:26 PM, Kenneth Graunke <kenn...@whitecape.org>
wrote:


> This eliminates the need to walk the list of input variables, recurse
> into their types (via logic largely redundant with nir_lower_io), and
> interpolate all possible inputs up front.  The backend no longer has
> to care about variables at all, which eliminates complications from
> trying to pack multiple variables into the same location.  Instead,
> each intrinsic specifies exactly what's needed.
>
> This should unblock Timothy's work on GL_ARB_enhanced_layouts.
>
> Each load_interpolated_input intrinsic corresponds to PLN instructions,
> while load_barycentric_at_* intrinsics correspond to pixel interpolator
> messages.  The pixel/centroid/sample barycentric intrinsics simply refer
> to payload fields (delta_xy[]), and don't actually generate any code.
>
> Because we use a single intrinsic for both centroid-qualified variables
> and interpolateAtCentroid(), they become indistinguishable.  We stop
> sending pixel interpolator messages for those, and instead use the
> payload provided data, which should be considerably faster.
>
> On Broadwell:
>
> total instructions in shared programs: 9067751 -> 9067570 (-0.00%)
> instructions in affected programs: 145902 -> 145721 (-0.12%)
> helped: 422
> HURT: 209
>
> total spills in shared programs: 2849 -> 2899 (1.76%)
> spills in affected programs: 760 -> 810 (6.58%)
> helped: 0
> HURT: 10
>
> total fills in shared programs: 3910 -> 3950 (1.02%)
> fills in affected programs: 617 -> 657 (6.48%)
> helped: 0
> HURT: 10
>
> LOST:   3
> GAINED: 3
>
> The differences mostly appear to be slight changes in MOVs.
>
> Signed-off-by: Kenneth Graunke <kenn...@whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp     | 175 ++++---------
>  src/mesa/drivers/dri/i965/brw_fs.h       |   9 +-
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 410
> ++++++++++++++++---------------
>  src/mesa/drivers/dri/i965/brw_nir.c      |  16 +-
>  4 files changed, 269 insertions(+), 341 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 94127bc..06007fe 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -1067,21 +1067,27 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg
> wpos)
>     bld.MOV(wpos, this->wpos_w);
>  }
>
> -static enum brw_barycentric_mode
> -barycentric_mode(enum glsl_interp_mode mode,
> -                 bool is_centroid, bool is_sample)
> +enum brw_barycentric_mode
> +brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
>  {
> -   unsigned bary;
> -
>     /* Barycentric modes don't make sense for flat inputs. */
>     assert(mode != INTERP_MODE_FLAT);
>
> -   if (is_sample) {
> -      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
> -   } else if (is_centroid) {
> -      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
> -   } else {
> +   unsigned bary;
> +   switch (op) {
> +   case nir_intrinsic_load_barycentric_pixel:
> +   case nir_intrinsic_load_barycentric_at_offset:
>        bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
> +      break;
> +   case nir_intrinsic_load_barycentric_centroid:
> +      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
> +      break;
> +   case nir_intrinsic_load_barycentric_sample:
> +   case nir_intrinsic_load_barycentric_at_sample:
> +      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
> +      break;
> +   default:
> +      assert(!"invalid intrinsic");
>     }
>
>     if (mode == INTERP_MODE_NOPERSPECTIVE)
> @@ -1101,107 +1107,6 @@ centroid_to_pixel(enum brw_barycentric_mode bary)
>     return (enum brw_barycentric_mode) ((unsigned) bary - 1);
>  }
>
> -void
> -fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
> -                                       const glsl_type *type,
> -                                       glsl_interp_mode
> interpolation_mode,
> -                                       int *location, bool mod_centroid,
> -                                       bool mod_sample)
> -{
> -   assert(stage == MESA_SHADER_FRAGMENT);
> -   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> -
> -   if (type->is_array() || type->is_matrix()) {
> -      const glsl_type *elem_type = glsl_get_array_element(type);
> -      const unsigned length = glsl_get_length(type);
> -
> -      for (unsigned i = 0; i < length; i++) {
> -         emit_general_interpolation(attr, name, elem_type,
> interpolation_mode,
> -                                    location, mod_centroid, mod_sample);
> -      }
> -   } else if (type->is_record()) {
> -      for (unsigned i = 0; i < type->length; i++) {
> -         const glsl_type *field_type = type->fields.structure[i].type;
> -         emit_general_interpolation(attr, name, field_type,
> interpolation_mode,
> -                                    location, mod_centroid, mod_sample);
> -      }
> -   } else {
> -      assert(type->is_scalar() || type->is_vector());
> -
> -      if (prog_data->urb_setup[*location] == -1) {
> -         /* If there's no incoming setup data for this slot, don't
> -          * emit interpolation for it.
> -          */
> -         *attr = offset(*attr, bld, type->vector_elements);
> -         (*location)++;
> -         return;
> -      }
> -
> -      attr->type = brw_type_for_base_type(type->get_scalar_type());
> -
> -      if (interpolation_mode == INTERP_MODE_FLAT) {
> -         /* Constant interpolation (flat shading) case. The SF has
> -          * handed us defined values in only the constant offset
> -          * field of the setup reg.
> -          */
> -         unsigned vector_elements = type->vector_elements;
> -
> -         /* Data starts at suboffet 3 in 32-bit units (12 bytes), so it
> is not
> -          * 64-bit aligned and the current implementation fails to read
> the
> -          * data properly. Instead, when there is a double input varying,
> -          * read it as vector of floats with twice the number of
> components.
> -          */
> -         if (attr->type == BRW_REGISTER_TYPE_DF) {
> -            vector_elements *= 2;
> -            attr->type = BRW_REGISTER_TYPE_F;
> -         }
> -         for (unsigned int i = 0; i < vector_elements; i++) {
> -            struct brw_reg interp = interp_reg(*location, i);
> -            interp = suboffset(interp, 3);
> -            interp.type = attr->type;
> -            bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));
> -            *attr = offset(*attr, bld, 1);
> -         }
> -      } else {
> -         /* Smooth/noperspective interpolation case. */
> -         enum brw_barycentric_mode bary =
> -            barycentric_mode(interpolation_mode, mod_centroid,
> mod_sample);
> -
> -         for (unsigned int i = 0; i < type->vector_elements; i++) {
> -            fs_reg interp(interp_reg(*location, i));
> -            if (devinfo->needs_unlit_centroid_workaround && mod_centroid)
> {
> -               /* Get the pixel/sample mask into f0 so that we know
> -                * which pixels are lit.  Then, for each channel that is
> -                * unlit, replace the centroid data with non-centroid
> -                * data.
> -                */
> -               bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> -
> -               fs_inst *inst;
> -               inst = bld.emit(FS_OPCODE_LINTERP, *attr,
> -                               delta_xy[centroid_to_pixel(bary)], interp);
> -               inst->predicate = BRW_PREDICATE_NORMAL;
> -               inst->predicate_inverse = true;
> -               inst->no_dd_clear = true;
> -
> -               inst = bld.emit(FS_OPCODE_LINTERP, *attr,
> -                               delta_xy[bary], interp);
> -               inst->predicate = BRW_PREDICATE_NORMAL;
> -               inst->predicate_inverse = false;
> -               inst->no_dd_check = true;
> -            } else {
> -               bld.emit(FS_OPCODE_LINTERP, *attr, delta_xy[bary], interp);
> -            }
> -            if (devinfo->gen < 6 && interpolation_mode ==
> INTERP_MODE_SMOOTH) {
> -               bld.MUL(*attr, *attr, this->pixel_w);
> -            }
> -            *attr = offset(*attr, bld, 1);
> -         }
> -      }
> -      (*location)++;
> -   }
> -}
> -
>  fs_reg *
>  fs_visitor::emit_frontfacing_interpolation()
>  {
> @@ -6327,6 +6232,10 @@ fs_visitor::run_cs()
>  /**
>   * Return a bitfield where bit n is set if barycentric interpolation mode
> n
>   * (see enum brw_barycentric_mode) is needed by the fragment shader.
> + *
> + * We examine the load_barycentric intrinsics rather than looking at input
> + * variables so that we catch interpolateAtCentroid() messages too, which
> + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
>   */
>  static unsigned
>  brw_compute_barycentric_interp_modes(const struct brw_device_info
> *devinfo,
> @@ -6334,29 +6243,37 @@ brw_compute_barycentric_interp_modes(const struct
> brw_device_info *devinfo,
>  {
>     unsigned barycentric_interp_modes = 0;
>
> -   nir_foreach_variable(var, &shader->inputs) {
> -      /* Ignore WPOS; it doesn't require interpolation. */
> -      if (var->data.location == VARYING_SLOT_POS)
> +   nir_foreach_function(f, shader) {
> +      if (!f->impl)
>           continue;
>
> -      /* Flat inputs don't need barycentric modes. */
> -      if (var->data.interpolation == INTERP_MODE_FLAT)
> -         continue;
> +      nir_foreach_block(block, f->impl) {
> +         nir_foreach_instr(instr, block) {
> +            if (instr->type != nir_instr_type_intrinsic)
> +               continue;
>
> -      /* Determine the set (or sets) of barycentric coordinates needed to
> -       * interpolate this variable.  Note that when
> -       * brw->needs_unlit_centroid_workaround is set, centroid
> interpolation
> -       * uses PIXEL interpolation for unlit pixels and CENTROID
> interpolation
> -       * for lit pixels, so we need both sets of barycentric coordinates.
> -       */
> -      enum brw_barycentric_mode bary_mode =
> -         barycentric_mode((glsl_interp_mode) var->data.interpolation,
> -                          var->data.centroid, var->data.sample);
> +            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
> +            if (intrin->intrinsic !=
> nir_intrinsic_load_interpolated_input)
> +               continue;
>

Any particular reason why you're looking at the source of the load rather
than just looking for the load_barycentric intrinsic directly?


> +
> +            /* Ignore WPOS; it doesn't require interpolation. */
> +            if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
> +               continue;
>

Ugh...


>
> -      barycentric_interp_modes |= 1 << bary_mode;
> +            intrin =
> nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
> +            enum glsl_interp_mode interp = (enum glsl_interp_mode)
> +               nir_intrinsic_interp_mode(intrin);
> +            nir_intrinsic_op bary_op = intrin->intrinsic;
> +            enum brw_barycentric_mode bary =
> +               brw_barycentric_mode(interp, bary_op);
>
> -      if (var->data.centroid && devinfo->needs_unlit_centroid_workaround)
> -         barycentric_interp_modes |= 1 << centroid_to_pixel(bary_mode);
> +            barycentric_interp_modes |= 1 << bary;
> +
> +            if (devinfo->needs_unlit_centroid_workaround &&
> +                bary_op == nir_intrinsic_load_barycentric_centroid)
> +               barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
> +         }
> +      }
>     }
>
>     return barycentric_interp_modes;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h
> b/src/mesa/drivers/dri/i965/brw_fs.h
> index 7998f51..574475f 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -174,11 +174,6 @@ public:
>     fs_reg *emit_samplepos_setup();
>     fs_reg *emit_sampleid_setup();
>     fs_reg *emit_samplemaskin_setup();
> -   void emit_general_interpolation(fs_reg *attr, const char *name,
> -                                   const glsl_type *type,
> -                                   glsl_interp_mode interpolation_mode,
> -                                   int *location, bool mod_centroid,
> -                                   bool mod_sample);
>     fs_reg *emit_vs_system_value(int location);
>     void emit_interpolation_setup_gen4();
>     void emit_interpolation_setup_gen6();
> @@ -195,7 +190,6 @@ public:
>     bool opt_zero_samples();
>
>     void emit_nir_code();
> -   void nir_setup_inputs();
>     void nir_setup_single_output_varying(fs_reg *reg, const glsl_type
> *type,
>                                          unsigned *location);
>     void nir_setup_outputs();
> @@ -511,3 +505,6 @@ void shuffle_64bit_data_for_32bit_write(const
> brw::fs_builder &bld,
>                                          uint32_t components);
>  fs_reg setup_imm_df(const brw::fs_builder &bld,
>                      double v);
> +
> +enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode mode,
> +                                               nir_intrinsic_op op);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index 6265dc6..610c151 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -36,7 +36,6 @@ fs_visitor::emit_nir_code()
>     /* emit the arrays used for inputs and outputs - load/store intrinsics
> will
>      * be converted to reads/writes of these arrays
>      */
> -   nir_setup_inputs();
>     nir_setup_outputs();
>     nir_setup_uniforms();
>     nir_emit_system_values();
> @@ -50,38 +49,6 @@ fs_visitor::emit_nir_code()
>  }
>
>  void
> -fs_visitor::nir_setup_inputs()
> -{
> -   if (stage != MESA_SHADER_FRAGMENT)
> -      return;
> -
> -   nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
> -
> -   nir_foreach_variable(var, &nir->inputs) {
> -      fs_reg input = offset(nir_inputs, bld, var->data.driver_location);
> -
> -      fs_reg reg;
> -      if (var->data.location == VARYING_SLOT_POS) {
> -         emit_fragcoord_interpolation(input);
> -      } else if (var->data.location == VARYING_SLOT_LAYER) {
> -         struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER,
> 1), 3);
> -         reg.type = BRW_REGISTER_TYPE_D;
> -         bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D),
> reg);
> -      } else if (var->data.location == VARYING_SLOT_VIEWPORT) {
> -         struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_VIEWPORT,
> 2), 3);
> -         reg.type = BRW_REGISTER_TYPE_D;
> -         bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D),
> reg);
> -      } else {
> -         int location = var->data.location;
> -         emit_general_interpolation(&input, var->name, var->type,
> -                                    (glsl_interp_mode)
> var->data.interpolation,
> -                                    &location, var->data.centroid,
> -                                    var->data.sample);
> -      }
> -   }
> -}
> -
> -void
>  fs_visitor::nir_setup_single_output_varying(fs_reg *reg,
>                                              const glsl_type *type,
>                                              unsigned *location)
> @@ -3063,7 +3030,6 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder
> &bld,
>                                    nir_intrinsic_instr *instr)
>  {
>     assert(stage == MESA_SHADER_FRAGMENT);
> -   const struct brw_wm_prog_key *wm_key = (const struct brw_wm_prog_key
> *) key;
>
>     fs_reg dest;
>     if (nir_intrinsic_infos[instr->intrinsic].has_dest)
> @@ -3120,189 +3086,245 @@ fs_visitor::nir_emit_fs_intrinsic(const
> fs_builder &bld,
>        break;
>     }
>
> -   case nir_intrinsic_interp_var_at_centroid:
> -   case nir_intrinsic_interp_var_at_sample:
> -   case nir_intrinsic_interp_var_at_offset: {
> -      /* Handle ARB_gpu_shader5 interpolation intrinsics
> -       *
> -       * It's worth a quick word of explanation as to why we handle the
> full
> -       * variable-based interpolation intrinsic rather than a lowered
> version
> -       * with like we do for other inputs.  We have to do that because
> the way
> -       * we set up inputs doesn't allow us to use the already setup
> inputs for
> -       * interpolation.  At the beginning of the shader, we go through
> all of
> -       * the input variables and do the initial interpolation and put it
> in
> -       * the nir_inputs array based on its location as determined in
> -       * nir_lower_io.  If the input isn't used, dead code cleans up and
> -       * everything works fine.  However, when we get to the
> ARB_gpu_shader5
> -       * interpolation intrinsics, we need to reinterpolate the input
> -       * differently.  If we used an intrinsic that just had an index it
> would
> -       * only give us the offset into the nir_inputs array.  However,
> this is
> -       * useless because that value is post-interpolation and we need
> -       * pre-interpolation.  In order to get the actual location of the
> bits
> -       * we get from the vertex fetching hardware, we need the variable.
> -       */
> -      fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
> -      const glsl_interp_mode interpolation =
> -         (glsl_interp_mode) instr->variables[0]->var->data.interpolation;
> +   case nir_intrinsic_load_input: {
> +      /* load_input is only used for flat inputs */
> +      unsigned base = nir_intrinsic_base(instr);
> +      unsigned component = nir_intrinsic_component(instr);
> +      unsigned num_components = instr->num_components;
> +      enum brw_reg_type type = dest.type;
>
> -      switch (instr->intrinsic) {
> -      case nir_intrinsic_interp_var_at_centroid:
> -         emit_pixel_interpolater_send(bld,
> -                                      FS_OPCODE_INTERPOLATE_AT_CENTROID,
> -                                      dst_xy,
> -                                      fs_reg(), /* src */
> -                                      brw_imm_ud(0u),
> -                                      interpolation);
> -         break;
> +      /* Special case fields in the VUE header */
> +      if (base == VARYING_SLOT_LAYER)
> +         component = 1;
> +      else if (base == VARYING_SLOT_VIEWPORT)
> +         component = 2;
>
> -      case nir_intrinsic_interp_var_at_sample: {
> -         if (!wm_key->multisample_fbo) {
> -            /* From the ARB_gpu_shader5 specification:
> -             * "If multisample buffers are not available, the input
> varying
> -             *  will be evaluated at the center of the pixel."
> -             */
> -            emit_pixel_interpolater_send(bld,
> -
>  FS_OPCODE_INTERPOLATE_AT_CENTROID,
> -                                         dst_xy,
> -                                         fs_reg(), /* src */
> -                                         brw_imm_ud(0u),
> -                                         interpolation);
> -            break;
> -         }
> +      if (nir_dest_bit_size(instr->dest) == 64) {
> +         /* const_index is in 32-bit type size units that could not be
> aligned
> +          * with DF. We need to read the double vector as if it was a
> float
> +          * vector of twice the number of components to fetch the right
> data.
> +          */
> +         type = BRW_REGISTER_TYPE_F;
> +         num_components *= 2;
> +      }
>
> -         nir_const_value *const_sample =
> nir_src_as_const_value(instr->src[0]);
> +      for (unsigned int i = 0; i < num_components; i++) {
> +         struct brw_reg interp = interp_reg(base, component + i);
> +         interp = suboffset(interp, 3);
> +         bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
> +                  retype(fs_reg(interp), type));
> +      }
>
> -         if (const_sample) {
> -            unsigned msg_data = const_sample->i32[0] << 4;
> +      if (nir_dest_bit_size(instr->dest) == 64) {
> +         shuffle_32bit_load_result_to_64bit_data(bld,
> +                                                 dest,
> +                                                 retype(dest, type),
> +                                                 instr->num_components);
> +      }
> +      break;
> +   }
> +
> +   case nir_intrinsic_load_barycentric_pixel:
> +   case nir_intrinsic_load_barycentric_centroid:
> +   case nir_intrinsic_load_barycentric_sample:
> +      /* Do nothing - load_interpolated_input handling will handle it
> later. */
> +      break;
>

I'm very courious why you made this choice.  It seems odd to me.


>
> +   case nir_intrinsic_load_barycentric_at_sample: {
> +      const glsl_interp_mode interpolation =
> +         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
> +
> +      nir_const_value *const_sample =
> nir_src_as_const_value(instr->src[0]);
> +
> +      if (const_sample) {
> +         unsigned msg_data = const_sample->i32[0] << 4;
> +
> +         emit_pixel_interpolater_send(bld,
> +                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> +                                      dest,
> +                                      fs_reg(), /* src */
> +                                      brw_imm_ud(msg_data),
> +                                      interpolation);
> +      } else {
> +         const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
> +                                          BRW_REGISTER_TYPE_UD);
> +
> +         if (nir_src_is_dynamically_uniform(instr->src[0])) {
> +            const fs_reg sample_id = bld.emit_uniformize(sample_src);
> +            const fs_reg msg_data = vgrf(glsl_type::uint_type);
> +            bld.exec_all().group(1, 0)
> +               .SHL(msg_data, sample_id, brw_imm_ud(4u));
>              emit_pixel_interpolater_send(bld,
>                                           FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> -                                         dst_xy,
> +                                         dest,
>                                           fs_reg(), /* src */
> -                                         brw_imm_ud(msg_data),
> +                                         msg_data,
>                                           interpolation);
>           } else {
> -            const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
> -                                             BRW_REGISTER_TYPE_UD);
> -
> -            if (nir_src_is_dynamically_uniform(instr->src[0])) {
> -               const fs_reg sample_id = bld.emit_uniformize(sample_src);
> -               const fs_reg msg_data = vgrf(glsl_type::uint_type);
> -               bld.exec_all().group(1, 0)
> -                  .SHL(msg_data, sample_id, brw_imm_ud(4u));
> +            /* Make a loop that sends a message to the pixel interpolater
> +             * for the sample number in each live channel. If there are
> +             * multiple channels with the same sample number then these
> +             * will be handled simultaneously with a single interation of
> +             * the loop.
> +             */
> +            bld.emit(BRW_OPCODE_DO);
> +
> +            /* Get the next live sample number into sample_id_reg */
> +            const fs_reg sample_id = bld.emit_uniformize(sample_src);
> +
> +            /* Set the flag register so that we can perform the send
> +             * message on all channels that have the same sample number
> +             */
> +            bld.CMP(bld.null_reg_ud(),
> +                    sample_src, sample_id,
> +                    BRW_CONDITIONAL_EQ);
> +            const fs_reg msg_data = vgrf(glsl_type::uint_type);
> +            bld.exec_all().group(1, 0)
> +               .SHL(msg_data, sample_id, brw_imm_ud(4u));
> +            fs_inst *inst =
>                 emit_pixel_interpolater_send(bld,
>
>  FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> -                                            dst_xy,
> +                                            dest,
>                                              fs_reg(), /* src */
>                                              msg_data,
>                                              interpolation);
> -            } else {
> -               /* Make a loop that sends a message to the pixel
> interpolater
> -                * for the sample number in each live channel. If there are
> -                * multiple channels with the same sample number then these
> -                * will be handled simultaneously with a single interation
> of
> -                * the loop.
> -                */
> -               bld.emit(BRW_OPCODE_DO);
> -
> -               /* Get the next live sample number into sample_id_reg */
> -               const fs_reg sample_id = bld.emit_uniformize(sample_src);
> +            set_predicate(BRW_PREDICATE_NORMAL, inst);
>
> -               /* Set the flag register so that we can perform the send
> -                * message on all channels that have the same sample number
> -                */
> -               bld.CMP(bld.null_reg_ud(),
> -                       sample_src, sample_id,
> -                       BRW_CONDITIONAL_EQ);
> -               const fs_reg msg_data = vgrf(glsl_type::uint_type);
> -               bld.exec_all().group(1, 0)
> -                  .SHL(msg_data, sample_id, brw_imm_ud(4u));
> -               fs_inst *inst =
> -                  emit_pixel_interpolater_send(bld,
> -
>  FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> -                                               dst_xy,
> -                                               fs_reg(), /* src */
> -                                               msg_data,
> -                                               interpolation);
> -               set_predicate(BRW_PREDICATE_NORMAL, inst);
> -
> -               /* Continue the loop if there are any live channels left */
> -               set_predicate_inv(BRW_PREDICATE_NORMAL,
> -                                 true, /* inverse */
> -                                 bld.emit(BRW_OPCODE_WHILE));
> -            }
> +            /* Continue the loop if there are any live channels left */
> +            set_predicate_inv(BRW_PREDICATE_NORMAL,
> +                              true, /* inverse */
> +                              bld.emit(BRW_OPCODE_WHILE));
>           }
> -
> -         break;
>        }
> +      break;
> +   }
>
> -      case nir_intrinsic_interp_var_at_offset: {
> -         nir_const_value *const_offset =
> nir_src_as_const_value(instr->src[0]);
> +   case nir_intrinsic_load_barycentric_at_offset: {
> +      const glsl_interp_mode interpolation =
> +         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
>
> -         if (const_offset) {
> -            unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) &
> 0xf;
> -            unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) &
> 0xf;
> +      nir_const_value *const_offset =
> nir_src_as_const_value(instr->src[0]);
>
> -            emit_pixel_interpolater_send(bld,
> -
>  FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
> -                                         dst_xy,
> -                                         fs_reg(), /* src */
> -                                         brw_imm_ud(off_x | (off_y << 4)),
> -                                         interpolation);
> -         } else {
> -            fs_reg src = vgrf(glsl_type::ivec2_type);
> -            fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> -                                       BRW_REGISTER_TYPE_F);
> -            for (int i = 0; i < 2; i++) {
> -               fs_reg temp = vgrf(glsl_type::float_type);
> -               bld.MUL(temp, offset(offset_src, bld, i),
> brw_imm_f(16.0f));
> -               fs_reg itemp = vgrf(glsl_type::int_type);
> -               /* float to int */
> -               bld.MOV(itemp, temp);
> -
> -               /* Clamp the upper end of the range to +7/16.
> -                * ARB_gpu_shader5 requires that we support a maximum
> offset
> -                * of +0.5, which isn't representable in a S0.4 value -- if
> -                * we didn't clamp it, we'd end up with -8/16, which is the
> -                * opposite of what the shader author wanted.
> -                *
> -                * This is legal due to ARB_gpu_shader5's quantization
> -                * rules:
> -                *
> -                * "Not all values of <offset> may be supported; x and y
> -                * offsets may be rounded to fixed-point values with the
> -                * number of fraction bits given by the
> -                * implementation-dependent constant
> -                * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> -                */
> -               set_condmod(BRW_CONDITIONAL_L,
> -                           bld.SEL(offset(src, bld, i), itemp,
> brw_imm_d(7)));
> -            }
> +      if (const_offset) {
> +         unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
> +         unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
>
> -            const enum opcode opcode =
> FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
> -            emit_pixel_interpolater_send(bld,
> -                                         opcode,
> -                                         dst_xy,
> -                                         src,
> -                                         brw_imm_ud(0u),
> -                                         interpolation);
> +         emit_pixel_interpolater_send(bld,
> +
> FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
> +                                      dest,
> +                                      fs_reg(), /* src */
> +                                      brw_imm_ud(off_x | (off_y << 4)),
> +                                      interpolation);
> +      } else {
> +         fs_reg src = vgrf(glsl_type::ivec2_type);
> +         fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> +                                    BRW_REGISTER_TYPE_F);
> +         for (int i = 0; i < 2; i++) {
> +            fs_reg temp = vgrf(glsl_type::float_type);
> +            bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
> +            fs_reg itemp = vgrf(glsl_type::int_type);
> +            /* float to int */
> +            bld.MOV(itemp, temp);
> +
> +            /* Clamp the upper end of the range to +7/16.
> +             * ARB_gpu_shader5 requires that we support a maximum offset
> +             * of +0.5, which isn't representable in a S0.4 value -- if
> +             * we didn't clamp it, we'd end up with -8/16, which is the
> +             * opposite of what the shader author wanted.
> +             *
> +             * This is legal due to ARB_gpu_shader5's quantization
> +             * rules:
> +             *
> +             * "Not all values of <offset> may be supported; x and y
> +             * offsets may be rounded to fixed-point values with the
> +             * number of fraction bits given by the
> +             * implementation-dependent constant
> +             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> +             */
> +            set_condmod(BRW_CONDITIONAL_L,
> +                        bld.SEL(offset(src, bld, i), itemp,
> brw_imm_d(7)));
>           }
> +
> +         const enum opcode opcode =
> FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
> +         emit_pixel_interpolater_send(bld,
> +                                      opcode,
> +                                      dest,
> +                                      src,
> +                                      brw_imm_ud(0u),
> +                                      interpolation);
> +      }
> +      break;
> +   }
> +
> +   case nir_intrinsic_load_interpolated_input: {
> +      if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
> +         emit_fragcoord_interpolation(dest);
>           break;
>        }
>
> -      default:
> -         unreachable("Invalid intrinsic");
> +      assert(instr->src[0].ssa &&
> +             instr->src[0].ssa->parent_instr->type ==
> nir_instr_type_intrinsic);
> +      nir_intrinsic_instr *bary_intrinsic =
> +         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
> +      nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
> +      enum glsl_interp_mode interp_mode =
> +         (enum glsl_interp_mode)
> nir_intrinsic_interp_mode(bary_intrinsic);
> +      fs_reg dst_xy;
> +
> +      if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
> +          bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
> +         /* Use the result of the PI message */
> +         dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
> +      } else {
> +         /* Use the delta_xy values computed from the payload */
> +         enum brw_barycentric_mode bary =
> +            brw_barycentric_mode(interp_mode, bary_intrin);
> +
> +         dst_xy = this->delta_xy[bary];
>        }
>
> -      for (unsigned j = 0; j < instr->num_components; j++) {
> -         fs_reg src = interp_reg(instr->variables[0]->var->data.location,
> j);
> -         src.type = dest.type;
> +      for (unsigned int i = 0; i < instr->num_components; i++) {
> +         fs_reg interp =
> +            fs_reg(interp_reg(nir_intrinsic_base(instr),
> +                              nir_intrinsic_component(instr) + i));
> +         interp.type = BRW_REGISTER_TYPE_F;
> +         dest.type = BRW_REGISTER_TYPE_F;
>
> -         bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
> -         dest = offset(dest, bld, 1);
> +         if (devinfo->needs_unlit_centroid_workaround &&
> +             bary_intrin == nir_intrinsic_load_barycentric_centroid) {
> +
> +            /* Get the pixel/sample mask into f0 so that we know which
> +             * pixels are lit.  Then, for each channel that is unlit,
> +             * replace the centroid data with non-centroid data.
> +             */
> +            bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> +
> +            fs_reg dest_i = offset(dest, bld, i);
> +            fs_reg dst_xy_pixel =
> +               delta_xy[brw_barycentric_mode(interp_mode,
> +                  nir_intrinsic_load_barycentric_pixel)];
> +
> +            fs_inst *inst;
> +            inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy_pixel,
> interp);
> +            inst->predicate = BRW_PREDICATE_NORMAL;
> +            inst->predicate_inverse = true;
> +            inst->no_dd_clear = true;
> +
> +            inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy, interp);
> +            inst->predicate = BRW_PREDICATE_NORMAL;
> +            inst->predicate_inverse = false;
> +            inst->no_dd_check = true;
> +         } else if (devinfo->gen < 6 && interp_mode ==
> INTERP_MODE_SMOOTH) {
> +            fs_reg tmp = vgrf(glsl_type::float_type);
> +            bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
> +            bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
> +         } else {
> +            bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy,
> interp);
> +         }
>

Ugh... I think I see why you're handling the barycentric load here now.
Eventually, I think we could probably be doing this better but I'm not
seeing an easy path at the moment.


>        }
>        break;
>     }
> +
>     default:
>        nir_emit_intrinsic(bld, instr);
>        break;
> @@ -3869,26 +3891,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder
> &bld, nir_intrinsic_instr *instr
>     }
>
>     case nir_intrinsic_load_input: {
> -      fs_reg src;
> +      fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
>        unsigned num_components = instr->num_components;
>        enum brw_reg_type type = dest.type;
>
> -      if (stage == MESA_SHADER_VERTEX) {
> -         src = fs_reg(ATTR, instr->const_index[0], dest.type);
> -      } else {
> -         assert(type_sz(type) >= 4);
> -         if (type == BRW_REGISTER_TYPE_DF) {
> -            /* const_index is in 32-bit type size units that could not be
> aligned
> -             * with DF. We need to read the double vector as if it was a
> float
> -             * vector of twice the number of components to fetch the
> right data.
> -             */
> -            dest = retype(dest, BRW_REGISTER_TYPE_F);
> -            num_components *= 2;
> -         }
> -         src = offset(retype(nir_inputs, dest.type), bld,
> -                      instr->const_index[0]);
> -      }
>

This hunk seems odd... It doesn't look like it really has anything to do
with FS.  Is that else clause really FS-only or does it apply to GS and
tess stages?


> -
>        nir_const_value *const_offset =
> nir_src_as_const_value(instr->src[0]);
>        assert(const_offset && "Indirect input loads not allowed");
>        src = offset(src, bld, const_offset->u32[0]);
> diff --git a/src/mesa/drivers/dri/i965/brw_nir.c
> b/src/mesa/drivers/dri/i965/brw_nir.c
> index caf9fe0..d1a823a 100644
> --- a/src/mesa/drivers/dri/i965/brw_nir.c
> +++ b/src/mesa/drivers/dri/i965/brw_nir.c
> @@ -30,7 +30,8 @@ static bool
>  is_input(nir_intrinsic_instr *intrin)
>  {
>     return intrin->intrinsic == nir_intrinsic_load_input ||
> -          intrin->intrinsic == nir_intrinsic_load_per_vertex_input;
> +          intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
> +          intrin->intrinsic == nir_intrinsic_load_interpolated_input;
>  }
>
>  static bool
> @@ -282,9 +283,16 @@ brw_nir_lower_tes_inputs(nir_shader *nir, const
> struct brw_vue_map *vue_map)
>  void
>  brw_nir_lower_fs_inputs(nir_shader *nir)
>  {
> -   nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
> VARYING_SLOT_VAR0,
> -                            type_size_scalar);
> -   nir_lower_io(nir, nir_var_shader_in, type_size_scalar, false);
> +   foreach_list_typed(nir_variable, var, node, &nir->inputs) {
> +      var->data.driver_location = var->data.location;
> +   }
> +
> +   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, true);
> +
> +   /* This pass needs actual constants */
> +   nir_opt_constant_folding(nir);
> +
> +   add_const_offset_to_base(nir, nir_var_shader_in);
>  }
>
>  void
> --
> 2.9.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 6/7] i965: Rewrite FS input handling to use the new NIR intrinsics.

Reply via email to