Re: [Mesa-dev] [PATCH v2 12/13] i965: Add tessellation control shaders.

Jordan Justen Tue, 22 Dec 2015 01:44:37 -0800

We discussed all my questions / comments on IRC...

12 & 13 Reviewed-by: Jordan Justen <jordan.l.jus...@intel.com>


On 2015-12-11 13:24:01, Kenneth Graunke wrote:
> The TCS is the first tessellation shader stage, and the most
> complicated.  It has access to each of the control points in the input
> patch, and computes a new output patch.  There is one logical invocation
> per output control point; all invocations run in parallel, and can
> communicate by reading and writing output variables.
> 
> One of the main responsibilities of the TCS is to write the special
> gl_TessLevelOuter[] and gl_TessLevelInner[] output variables which
> control how much new geometry the hardware tessellation engine will
> produce.  Otherwise, it simply writes outputs that are passed along
> to the TES.
> 
> We run in SIMD4x2 mode, handling two logical invocations per EU thread.
> The hardware doesn't properly manage the dispatch mask for us; it always
> initializes it to 0xFF.  We wrap the whole program in an IF..ENDIF block
> to handle an odd number of invocations, essentially falling back to
> SIMD4x1 on the last thread.
> 
> Signed-off-by: Kenneth Graunke <kenn...@whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/Makefile.sources         |   2 +
>  src/mesa/drivers/dri/i965/brw_compiler.h           |  26 ++
>  src/mesa/drivers/dri/i965/brw_context.h            |   6 +
>  src/mesa/drivers/dri/i965/brw_defines.h            |   8 +
>  src/mesa/drivers/dri/i965/brw_link.cpp             |   4 +
>  src/mesa/drivers/dri/i965/brw_program.h            |   1 +
>  src/mesa/drivers/dri/i965/brw_reg.h                |   1 +
>  src/mesa/drivers/dri/i965/brw_shader.cpp           |  17 +
>  src/mesa/drivers/dri/i965/brw_shader.h             |   3 +
>  src/mesa/drivers/dri/i965/brw_state_upload.c       |   1 +
>  src/mesa/drivers/dri/i965/brw_tcs.c                | 262 +++++++++++
>  src/mesa/drivers/dri/i965/brw_vec4.cpp             |  10 +-
>  src/mesa/drivers/dri/i965/brw_vec4.h               |   1 +
>  src/mesa/drivers/dri/i965/brw_vec4_cse.cpp         |   2 +
>  .../dri/i965/brw_vec4_dead_code_eliminate.cpp      |   3 +
>  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp   | 238 ++++++++++
>  src/mesa/drivers/dri/i965/brw_vec4_nir.cpp         |  23 +-
>  src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp         | 496 +++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_vec4_tcs.h           |  84 ++++
>  19 files changed, 1186 insertions(+), 2 deletions(-)
>  create mode 100644 src/mesa/drivers/dri/i965/brw_tcs.c
>  create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
>  create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_tcs.h
> 
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources 
> b/src/mesa/drivers/dri/i965/Makefile.sources
> index 7354aaf..0b706de 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -75,6 +75,7 @@ i965_compiler_FILES = \
>         brw_vec4_reg_allocate.cpp \
>         brw_vec4_surface_builder.cpp \
>         brw_vec4_surface_builder.h \
> +       brw_vec4_tcs.cpp \
>         brw_vec4_visitor.cpp \
>         brw_vec4_vs_visitor.cpp \
>         brw_vue_map.c \
> @@ -150,6 +151,7 @@ i965_FILES = \
>         brw_state.h \
>         brw_state_upload.c \
>         brw_structs.h \
> +       brw_tcs.c \
>         brw_tcs_surface_state.c \
>         brw_tes.c \
>         brw_tes_surface_state.c \
> diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h 
> b/src/mesa/drivers/dri/i965/brw_compiler.h
> index 64d831d..e6bae8e 100644
> --- a/src/mesa/drivers/dri/i965/brw_compiler.h
> +++ b/src/mesa/drivers/dri/i965/brw_compiler.h
> @@ -191,6 +191,16 @@ struct brw_vs_prog_key {
>     struct brw_sampler_prog_key_data tex;
>  };
>  
> +/** The program key for Tessellation Control Shaders. */
> +struct brw_tcs_prog_key
> +{
> +   unsigned program_string_id;
> +
> +   GLenum tes_primitive_mode;
> +
> +   struct brw_sampler_prog_key_data tex;
> +};
> +
>  /** The program key for Tessellation Evaluation Shaders. */
>  struct brw_tes_prog_key
>  {
> @@ -677,6 +687,22 @@ brw_compile_vs(const struct brw_compiler *compiler, void 
> *log_data,
>                 char **error_str);
>  
>  /**
> + * Compile a tessellation control shader.
> + *
> + * Returns the final assembly and the program's size.
> + */
> +const unsigned *
> +brw_compile_tcs(const struct brw_compiler *compiler,
> +                void *log_data,
> +                void *mem_ctx,
> +                const struct brw_tcs_prog_key *key,
> +                struct brw_tcs_prog_data *prog_data,
> +                const struct nir_shader *nir,
> +                int shader_time_index,
> +                unsigned *final_assembly_size,
> +                char **error_str);
> +
> +/**
>   * Compile a tessellation evaluation shader.
>   *
>   * Returns the final assembly and the program's size.
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
> b/src/mesa/drivers/dri/i965/brw_context.h
> index 5e840d1..1d989f3 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1704,6 +1704,12 @@ brw_vertex_program_const(const struct 
> gl_vertex_program *p)
>     return (const struct brw_vertex_program *) p;
>  }
>  
> +static inline struct brw_tess_ctrl_program *
> +brw_tess_ctrl_program(struct gl_tess_ctrl_program *p)
> +{
> +   return (struct brw_tess_ctrl_program *) p;
> +}
> +
>  static inline struct brw_tess_eval_program *
>  brw_tess_eval_program(struct gl_tess_eval_program *p)
>  {
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
> b/src/mesa/drivers/dri/i965/brw_defines.h
> index 4a184cf..cc19c06 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -1305,6 +1305,14 @@ enum opcode {
>      *           UD immediate).
>      */
>     SHADER_OPCODE_MOV_INDIRECT,
> +
> +   VEC4_OPCODE_URB_READ,
> +   TCS_OPCODE_GET_INSTANCE_ID,
> +   TCS_OPCODE_URB_WRITE,
> +   TCS_OPCODE_SET_INPUT_URB_OFFSETS,
> +   TCS_OPCODE_SET_OUTPUT_URB_OFFSETS,
> +   TCS_OPCODE_GET_PRIMITIVE_ID,
> +   TCS_OPCODE_CREATE_BARRIER_HEADER,
>  };
>  
>  enum brw_urb_write_flags {
> diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp 
> b/src/mesa/drivers/dri/i965/brw_link.cpp
> index f5a7d20..7cdc830 100644
> --- a/src/mesa/drivers/dri/i965/brw_link.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_link.cpp
> @@ -42,6 +42,7 @@ brw_shader_precompile(struct gl_context *ctx,
>                        struct gl_shader_program *sh_prog)
>  {
>     struct gl_shader *vs = sh_prog->_LinkedShaders[MESA_SHADER_VERTEX];
> +   struct gl_shader *tcs = sh_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
>     struct gl_shader *tes = sh_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
>     struct gl_shader *gs = sh_prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
>     struct gl_shader *fs = sh_prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
> @@ -56,6 +57,9 @@ brw_shader_precompile(struct gl_context *ctx,
>     if (tes && !brw_tes_precompile(ctx, sh_prog, tes->Program))
>        return false;
>  
> +   if (tcs && !brw_tcs_precompile(ctx, sh_prog, tcs->Program))
> +      return false;
> +
>     if (vs && !brw_vs_precompile(ctx, sh_prog, vs->Program))
>        return false;
>  
> diff --git a/src/mesa/drivers/dri/i965/brw_program.h 
> b/src/mesa/drivers/dri/i965/brw_program.h
> index 1cdab97..3d9e1b9 100644
> --- a/src/mesa/drivers/dri/i965/brw_program.h
> +++ b/src/mesa/drivers/dri/i965/brw_program.h
> @@ -56,6 +56,7 @@ void
>  brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog,
>              struct gl_shader *shader, struct gl_program *prog);
>  
> +void brw_upload_tcs_prog(struct brw_context *brw);
>  void brw_upload_tes_prog(struct brw_context *brw);
>  
>  #ifdef __cplusplus
> diff --git a/src/mesa/drivers/dri/i965/brw_reg.h 
> b/src/mesa/drivers/dri/i965/brw_reg.h
> index fa912c9..9f2ff9a 100644
> --- a/src/mesa/drivers/dri/i965/brw_reg.h
> +++ b/src/mesa/drivers/dri/i965/brw_reg.h
> @@ -84,6 +84,7 @@ struct brw_device_info;
>  #define BRW_SWIZZLE_YZXW      BRW_SWIZZLE4(1,2,0,3)
>  #define BRW_SWIZZLE_ZXYW      BRW_SWIZZLE4(2,0,1,3)
>  #define BRW_SWIZZLE_ZWZW      BRW_SWIZZLE4(2,3,2,3)
> +#define BRW_SWIZZLE_WZYX      BRW_SWIZZLE4(3,2,1,0)
>  
>  static inline bool
>  brw_is_single_value_swizzle(unsigned swiz)
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
> b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index d954568..9b64ae4 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -85,6 +85,7 @@ brw_compiler_create(void *mem_ctx, const struct 
> brw_device_info *devinfo)
>  
>     compiler->scalar_stage[MESA_SHADER_VERTEX] =
>        devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
> +   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
>     compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = true;
>     compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
>        devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false);
> @@ -137,6 +138,7 @@ brw_compiler_create(void *mem_ctx, const struct 
> brw_device_info *devinfo)
>        compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
>     }
>  
> +   
> compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = 
> false;
>     
> compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = 
> false;
>  
>     if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
> @@ -549,6 +551,21 @@ brw_instruction_name(enum opcode op)
>        return "mulh";
>     case SHADER_OPCODE_MOV_INDIRECT:
>        return "mov_indirect";
> +
> +   case VEC4_OPCODE_URB_READ:
> +      return "urb_read";
> +   case TCS_OPCODE_GET_INSTANCE_ID:
> +      return "tcs_get_instance_id";
> +   case TCS_OPCODE_URB_WRITE:
> +      return "tcs_urb_write";
> +   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
> +      return "tcs_set_input_urb_offsets";
> +   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
> +      return "tcs_set_output_urb_offsets";
> +   case TCS_OPCODE_GET_PRIMITIVE_ID:
> +      return "tcs_get_primitive_id";
> +   case TCS_OPCODE_CREATE_BARRIER_HEADER:
> +      return "tcs_create_barrier_header";
>     }
>  
>     unreachable("not reached");
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.h 
> b/src/mesa/drivers/dri/i965/brw_shader.h
> index 2e73f12..5933613 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.h
> +++ b/src/mesa/drivers/dri/i965/brw_shader.h
> @@ -273,6 +273,9 @@ brw_assign_common_binding_table_offsets(gl_shader_stage 
> stage,
>  bool brw_vs_precompile(struct gl_context *ctx,
>                         struct gl_shader_program *shader_prog,
>                         struct gl_program *prog);
> +bool brw_tcs_precompile(struct gl_context *ctx,
> +                        struct gl_shader_program *shader_prog,
> +                        struct gl_program *prog);
>  bool brw_tes_precompile(struct gl_context *ctx,
>                          struct gl_shader_program *shader_prog,
>                          struct gl_program *prog);
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c 
> b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index c657b25..56962d5 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -678,6 +678,7 @@ brw_upload_programs(struct brw_context *brw,
>  {
>     if (pipeline == BRW_RENDER_PIPELINE) {
>        brw_upload_vs_prog(brw);
> +      brw_upload_tcs_prog(brw);
>        brw_upload_tes_prog(brw);
>  
>        if (brw->gen < 6)
> diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c 
> b/src/mesa/drivers/dri/i965/brw_tcs.c
> new file mode 100644
> index 0000000..4acfaea
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_tcs.c
> @@ -0,0 +1,262 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file brw_tcs.c
> + *
> + * Tessellation control shader state upload code.
> + */
> +
> +#include "brw_context.h"
> +#include "brw_nir.h"
> +#include "brw_program.h"
> +#include "brw_shader.h"
> +#include "brw_state.h"
> +#include "program/prog_parameter.h"
> +
> +static void
> +brw_tcs_debug_recompile(struct brw_context *brw,
> +                       struct gl_shader_program *shader_prog,
> +                       const struct brw_tcs_prog_key *key)
> +{
> +   struct brw_cache_item *c = NULL;
> +   const struct brw_tcs_prog_key *old_key = NULL;
> +   bool found = false;
> +
> +   perf_debug("Recompiling tessellation control shader for program %d\n",
> +              shader_prog->Name);
> +
> +   for (unsigned int i = 0; i < brw->cache.size; i++) {
> +      for (c = brw->cache.items[i]; c; c = c->next) {
> +         if (c->cache_id == BRW_CACHE_TCS_PROG) {
> +            old_key = c->key;
> +
> +            if (old_key->program_string_id == key->program_string_id)
> +               break;
> +         }
> +      }
> +      if (c)
> +         break;
> +   }
> +
> +   if (!c) {
> +      perf_debug("  Didn't find previous compile in the shader cache for "
> +                 "debug\n");
> +      return;
> +   }
> +
> +   found |= key_debug(brw, "TES primitive mode", old_key->tes_primitive_mode,
> +                      key->tes_primitive_mode);
> +   found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
> +
> +   if (!found) {
> +      perf_debug("  Something else\n");
> +   }
> +}
> +
> +static bool
> +brw_codegen_tcs_prog(struct brw_context *brw,
> +                     struct gl_shader_program *shader_prog,
> +                     struct brw_tess_ctrl_program *tcp,
> +                     struct brw_tcs_prog_key *key)
> +{
> +   const struct brw_compiler *compiler = brw->intelScreen->compiler;
> +   struct brw_stage_state *stage_state = &brw->tcs.base;
> +   nir_shader *nir = tcp->program.Base.nir;
> +   struct brw_tcs_prog_data prog_data;
> +   bool start_busy = false;
> +   double start_time = 0;
> +
> +   memset(&prog_data, 0, sizeof(prog_data));
> +
> +   /* Allocate the references to the uniforms that will end up in the
> +    * prog_data associated with the compiled program, and which will be freed
> +    * by the state cache.
> +    *
> +    * Note: param_count needs to be num_uniform_components * 4, since we add
> +    * padding around uniform values below vec4 size, so the worst case is that
> +    * every uniform is a float which gets padded to the size of a vec4.
> +    */
> +   struct gl_shader *tcs = 
> shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
> +   int param_count = nir->num_uniforms;
> +   if (!compiler->scalar_stage[MESA_SHADER_TESS_CTRL])
> +      param_count *= 4;
> +
> +   prog_data.base.base.param =
> +      rzalloc_array(NULL, const gl_constant_value *, param_count);
> +   prog_data.base.base.pull_param =
> +      rzalloc_array(NULL, const gl_constant_value *, param_count);
> +   prog_data.base.base.image_param =
> +      rzalloc_array(NULL, struct brw_image_param, tcs->NumImages);
> +   prog_data.base.base.nr_params = param_count;
> +   prog_data.base.base.nr_image_params = tcs->NumImages;
> +
> +   brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base,
> +                               &prog_data.base.base, false);
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
> +      brw_dump_ir("tessellation control", shader_prog, tcs, NULL);
> +
> +   int st_index = -1;
> +   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
> +      st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TCS);
> +
> +   if (unlikely(brw->perf_debug)) {
> +      start_busy = brw->batch.last_bo && 
> drm_intel_bo_busy(brw->batch.last_bo);
> +      start_time = get_time();
> +   }
> +
> +   void *mem_ctx = ralloc_context(NULL);
> +   unsigned program_size;
> +   char *error_str;
> +   const unsigned *program =
> +      brw_compile_tcs(compiler, brw, mem_ctx, key, &prog_data, nir, st_index,
> +                      &program_size, &error_str);
> +   if (program == NULL) {
> +      if (shader_prog) {
> +         shader_prog->LinkStatus = false;
> +         ralloc_strcat(&shader_prog->InfoLog, error_str);
> +      }
> +
> +      _mesa_problem(NULL, "Failed to compile tessellation control shader: "
> +                    "%s\n", error_str);
> +
> +      ralloc_free(mem_ctx);
> +      return false;
> +   }
> +
> +   if (unlikely(brw->perf_debug)) {
> +      struct brw_shader *btcs = (struct brw_shader *) tcs;
> +      if (btcs->compiled_once) {
> +         brw_tcs_debug_recompile(brw, shader_prog, key);
> +      }
> +      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
> +         perf_debug("TCS compile took %.03f ms and stalled the GPU\n",
> +                    (get_time() - start_time) * 1000);
> +      }
> +      btcs->compiled_once = true;
> +   }
> +
> +   /* Scratch space is used for register spilling */
> +   if (prog_data.base.base.total_scratch) {
> +      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
> +                        prog_data.base.base.total_scratch *
> +                         brw->max_hs_threads);
> +   }
> +
> +   brw_upload_cache(&brw->cache, BRW_CACHE_TCS_PROG,
> +                    key, sizeof(*key),
> +                    program, program_size,
> +                    &prog_data, sizeof(prog_data),
> +                    &stage_state->prog_offset, &brw->tcs.prog_data);
> +   ralloc_free(mem_ctx);
> +
> +   return true;
> +}
> +
> +
> +void
> +brw_upload_tcs_prog(struct brw_context *brw)
> +{
> +   struct gl_context *ctx = &brw->ctx;
> +   struct gl_shader_program **current = ctx->_Shader->CurrentProgram;
> +   struct brw_stage_state *stage_state = &brw->tcs.base;
> +   struct brw_tcs_prog_key key;
> +   /* BRW_NEW_TESS_CTRL_PROGRAM */
> +   struct brw_tess_ctrl_program *tcp =
> +      (struct brw_tess_ctrl_program *) brw->tess_ctrl_program;
> +
> +   if (!brw_state_dirty(brw,
> +                        _NEW_TEXTURE,
> +                        BRW_NEW_TESS_CTRL_PROGRAM |
> +                        BRW_NEW_TESS_EVAL_PROGRAM))
> +      return;
> +
> +   if (tcp == NULL) {
> +      /* Other state atoms had better not try to access prog_data, since
> +       * there's no HS program.
> +       */
> +      brw->tcs.prog_data = NULL;
> +      brw->tcs.base.prog_data = NULL;
> +      return;
> +   }
> +
> +   struct gl_program *prog = &tcp->program.Base;
> +
> +   memset(&key, 0, sizeof(key));
> +
> +   key.program_string_id = tcp->id;
> +
> +   /* _NEW_TEXTURE */
> +   brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count,
> +                                      &key.tex);
> +
> +   /* BRW_NEW_TESS_EVAL_PROGRAM */
> +   /* We need to specialize our code generation for tessellation levels
> +    * based on the domain the DS is expecting to tessellate.
> +    */
> +   struct brw_tess_eval_program *tep =
> +      (struct brw_tess_eval_program *) brw->tess_eval_program;
> +   assert(tep);
> +   key.tes_primitive_mode = tep->program.PrimitiveMode;
> +
> +   if (!brw_search_cache(&brw->cache, BRW_CACHE_TCS_PROG,
> +                         &key, sizeof(key),
> +                         &stage_state->prog_offset, &brw->tcs.prog_data)) {
> +      bool success = brw_codegen_tcs_prog(brw, 
> current[MESA_SHADER_TESS_CTRL],
> +                                          tcp, &key);
> +      assert(success);
> +      (void)success;
> +   }
> +   brw->tcs.base.prog_data = &brw->tcs.prog_data->base.base;
> +}
> +
> +
> +bool
> +brw_tcs_precompile(struct gl_context *ctx,
> +                   struct gl_shader_program *shader_prog,
> +                   struct gl_program *prog)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct brw_tcs_prog_key key;
> +   uint32_t old_prog_offset = brw->tcs.base.prog_offset;
> +   struct brw_tcs_prog_data *old_prog_data = brw->tcs.prog_data;
> +   bool success;
> +
> +   struct gl_tess_ctrl_program *tcp = (struct gl_tess_ctrl_program *)prog;
> +   struct brw_tess_ctrl_program *btcp = brw_tess_ctrl_program(tcp);
> +
> +   memset(&key, 0, sizeof(key));
> +
> +   key.program_string_id = btcp->id;
> +   brw_setup_tex_for_precompile(brw, &key.tex, prog);
> +
> +   key.tes_primitive_mode = GL_TRIANGLES;
> +
> +   success = brw_codegen_tcs_prog(brw, shader_prog, btcp, &key);
> +
> +   brw->tcs.base.prog_offset = old_prog_offset;
> +   brw->tcs.prog_data = old_prog_data;
> +
> +   return success;
> +}
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> index a697bdf..0cded0c 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> @@ -155,6 +155,9 @@ vec4_instruction::is_send_from_grf()
>     case SHADER_OPCODE_TYPED_ATOMIC:
>     case SHADER_OPCODE_TYPED_SURFACE_READ:
>     case SHADER_OPCODE_TYPED_SURFACE_WRITE:
> +   case VEC4_OPCODE_URB_READ:
> +   case TCS_OPCODE_URB_WRITE:
> +   case SHADER_OPCODE_BARRIER:
>        return true;
>     default:
>        return false;
> @@ -184,7 +187,9 @@ bool
>  vec4_instruction::has_source_and_destination_hazard() const
>  {
>     switch (opcode) {
> -   /* Most opcodes in the vec4 world use MRFs. */
> +   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
> +   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
> +      return true;
>     default:
>        return false;
>     }
> @@ -204,6 +209,7 @@ vec4_instruction::regs_read(unsigned arg) const
>     case SHADER_OPCODE_TYPED_ATOMIC:
>     case SHADER_OPCODE_TYPED_SURFACE_READ:
>     case SHADER_OPCODE_TYPED_SURFACE_WRITE:
> +   case TCS_OPCODE_URB_WRITE:
>        return arg == 0 ? mlen : 1;
>  
>     case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
> @@ -281,6 +287,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
>        return 0;
>     case GS_OPCODE_FF_SYNC:
>        return 1;
> +   case TCS_OPCODE_URB_WRITE:
> +      return 0;
>     case SHADER_OPCODE_SHADER_TIME_ADD:
>        return 0;
>     case SHADER_OPCODE_TEX:
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h 
> b/src/mesa/drivers/dri/i965/brw_vec4.h
> index ae5bf69..6bbac83 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.h
> @@ -340,6 +340,7 @@ public:
>                         unsigned num_components = 4);
>     src_reg get_nir_src(nir_src src,
>                         unsigned num_components = 4);
> +   src_reg get_indirect_offset(nir_intrinsic_instr *instr);
>  
>     virtual dst_reg *make_reg_for_system_value(int location,
>                                                const glsl_type *type) = 0;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> index 85cbf24..0c1f0c3 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> @@ -75,6 +75,8 @@ is_expression(const vec4_instruction *const inst)
>     case VEC4_OPCODE_UNPACK_UNIFORM:
>     case SHADER_OPCODE_FIND_LIVE_CHANNEL:
>     case SHADER_OPCODE_BROADCAST:
> +   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
> +   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
>        return true;
>     case SHADER_OPCODE_RCP:
>     case SHADER_OPCODE_RSQ:
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> index 2d0722a..c31e72d 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> @@ -45,6 +45,9 @@ can_do_writemask(const struct brw_device_info *devinfo,
>     case VS_OPCODE_PULL_CONSTANT_LOAD:
>     case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
>     case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
> +   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
> +   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
> +   case VEC4_OPCODE_URB_READ:
>        return false;
>     default:
>        /* The MATH instruction on Gen6 only executes in align1 mode, which 
> does
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> index c3426dd..076b1dd 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> @@ -714,6 +714,211 @@ generate_gs_set_primitive_id(struct brw_codegen *p, 
> struct brw_reg dst)
>  }
>  
>  static void
> +generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
> +{
> +   /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
> +    *
> +    * Since we operate in SIMD4x2 mode, we need to run half as many threads
> +    * as necessary.  So we assign (2i + 1, 2i) as the thread counts.  We
> +    * shift right by one less to accomplish the multiplication by two.
> +    */
> +   dst = retype(dst, BRW_REGISTER_TYPE_UD);
> +   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> +
> +   brw_push_insn_state(p);
> +   brw_set_default_access_mode(p, BRW_ALIGN_1);
> +
> +   const int mask = INTEL_MASK(23, 17);
> +   const int shift = 17;
> +
> +   brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), 
> brw_imm_ud(mask));
> +   brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
> +           brw_imm_ud(shift - 1));
> +   brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1));
> +
> +   brw_pop_insn_state(p);
> +}
> +
> +static void
> +generate_tcs_urb_write(struct brw_codegen *p,
> +                       vec4_instruction *inst,
> +                       struct brw_reg urb_header)
> +{
> +   const struct brw_device_info *devinfo = p->devinfo;
> +
> +   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
> +   brw_set_dest(p, send, brw_null_reg());
> +   brw_set_src0(p, send, urb_header);
> +
> +   brw_set_message_descriptor(p, send, BRW_SFID_URB,
> +                              inst->mlen /* mlen */, 0 /* rlen */,
> +                              true /* header */, false /* eot */);
> +   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
> +   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
> +   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
> +   brw_inst_set_urb_swizzle_control(devinfo, send, 
> BRW_URB_SWIZZLE_INTERLEAVE);
> +
> +   /* what happens to swizzles? */
> +}
> +
> +
> +static void
> +generate_tcs_input_urb_offsets(struct brw_codegen *p,
> +                               struct brw_reg dst,
> +                               struct brw_reg vertex,
> +                               struct brw_reg offset)
> +{
> +   /* Generates an URB read/write message header for HS/DS operation.
> +    * Inputs are a vertex index, and a byte offset from the beginning of
> +    * the vertex. */
> +
> +   /* If `vertex` is not an immediate, we clobber a0.0 */
> +
> +   assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == 
> BRW_GENERAL_REGISTER_FILE);
> +   assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == 
> BRW_REGISTER_TYPE_D);
> +
> +   assert(dst.file == BRW_GENERAL_REGISTER_FILE);
> +
> +   brw_push_insn_state(p);
> +   brw_set_default_access_mode(p, BRW_ALIGN_1);
> +   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
> +   brw_MOV(p, dst, brw_imm_ud(0));
> +
> +   /* m0.5 bits 8-15 are channel enables */
> +   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
> +
> +   /* m0.0-0.1: URB handles */
> +   if (vertex.file == BRW_IMMEDIATE_VALUE) {
> +      uint32_t vertex_index = vertex.ud;
> +      struct brw_reg index_reg = brw_vec1_grf(
> +            1 + (vertex_index >> 3), vertex_index & 7);
> +
> +      brw_MOV(p, vec2(get_element_ud(dst, 0)),
> +              retype(index_reg, BRW_REGISTER_TYPE_UD));
> +   } else {
> +      /* indirect via a0.0 */
> +      struct brw_reg addr = brw_address_reg(0);
> +
> +      /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
> +      brw_ADD(p, addr, get_element_ud(vertex, 0), brw_imm_uw(0x8));
> +      brw_SHL(p, addr, addr, brw_imm_ud(2));
> +      brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));
> +
> +      /* top half: m0.1 = g[1.0 + vertex.4]UD */
> +      brw_ADD(p, addr, get_element_ud(vertex, 4), brw_imm_uw(0x8));
> +      brw_SHL(p, addr, addr, brw_imm_ud(2));
> +      brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
> +   }
> +
> +   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
> +   if (offset.file != ARF)
> +      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
> +
> +   brw_pop_insn_state(p);
> +}
> +
> +
> +static void
> +generate_tcs_output_urb_offsets(struct brw_codegen *p,
> +                                struct brw_reg dst,
> +                                struct brw_reg write_mask,
> +                                struct brw_reg offset)
> +{
> +   /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */
> +   assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == 
> BRW_MESSAGE_REGISTER_FILE);
> +
> +   assert(write_mask.file == BRW_IMMEDIATE_VALUE);
> +   assert(write_mask.type == BRW_REGISTER_TYPE_UD);
> +
> +   brw_push_insn_state(p);
> +
> +   brw_set_default_access_mode(p, BRW_ALIGN_1);
> +   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
> +   brw_MOV(p, dst, brw_imm_ud(0));
> +
> +   unsigned mask = write_mask.ud;
> +
> +   /* m0.5 bits 15:12 and 11:8 are channel enables */
> +   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 
> 12)));
> +
> +   /* HS patch URB handle is delivered in r0.0 */
> +   struct brw_reg urb_handle = brw_vec1_grf(0, 0);
> +
> +   /* m0.0-0.1: URB handles */
> +   brw_MOV(p, vec2(get_element_ud(dst, 0)),
> +           retype(urb_handle, BRW_REGISTER_TYPE_UD));
> +
> +   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
> +   if (offset.file != ARF)
> +      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
> +
> +   brw_pop_insn_state(p);
> +}
> +
> +static void
> +generate_vec4_urb_read(struct brw_codegen *p,
> +                       vec4_instruction *inst,
> +                       struct brw_reg dst,
> +                       struct brw_reg header)
> +{
> +   const struct brw_device_info *devinfo = p->devinfo;
> +
> +   assert(header.file == BRW_GENERAL_REGISTER_FILE);
> +   assert(header.type == BRW_REGISTER_TYPE_UD);
> +
> +   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
> +   brw_set_dest(p, send, dst);
> +   brw_set_src0(p, send, header);
> +
> +   brw_set_message_descriptor(p, send, BRW_SFID_URB,
> +                              1 /* mlen */, 1 /* rlen */,
> +                              true /* header */, false /* eot */);
> +   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
> +   brw_inst_set_urb_swizzle_control(devinfo, send, 
> BRW_URB_SWIZZLE_INTERLEAVE);
> +   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
> +
> +   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
> +}
> +
> +static void
> +generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
> +{
> +   brw_push_insn_state(p);
> +   brw_set_default_access_mode(p, BRW_ALIGN_1);
> +   brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
> +   brw_pop_insn_state(p);
> +}
> +
> +static void
> +generate_tcs_create_barrier_header(struct brw_codegen *p,
> +                                   struct brw_vue_prog_data *prog_data,
> +                                   struct brw_reg dst)
> +{
> +   struct brw_reg m0_2 = get_element_ud(dst, 2);
> +   unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;
> +
> +   brw_push_insn_state(p);
> +   brw_set_default_access_mode(p, BRW_ALIGN_1);
> +   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
> +
> +   /* Zero the message header */
> +   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
> +
> +   /* Copy "Barrier ID" from r0.2, bits 16:13 */
> +   brw_AND(p, m0_2,
> +           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
> +           brw_imm_ud(0x1e000));
> +
> +   /* Shift it into place */
> +   brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(11));
> +
> +   /* Set the Barrier Count and the enable bit */
> +   brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));
> +
> +   brw_pop_insn_state(p);
> +}
> +
> +static void
>  generate_oword_dual_block_offsets(struct brw_codegen *p,
>                                    struct brw_reg m1,
>                                    struct brw_reg index)
> @@ -1538,6 +1743,39 @@ generate_code(struct brw_codegen *p,
>           break;
>        }
>  
> +      case TCS_OPCODE_URB_WRITE:
> +         generate_tcs_urb_write(p, inst, src[0]);
> +         break;
> +
> +      case VEC4_OPCODE_URB_READ:
> +         generate_vec4_urb_read(p, inst, dst, src[0]);
> +         break;
> +
> +      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
> +         generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
> +         break;
> +
> +      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
> +         generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
> +         break;
> +
> +      case TCS_OPCODE_GET_INSTANCE_ID:
> +         generate_tcs_get_instance_id(p, dst);
> +         break;
> +
> +      case TCS_OPCODE_GET_PRIMITIVE_ID:
> +         generate_tcs_get_primitive_id(p, dst);
> +         break;
> +
> +      case TCS_OPCODE_CREATE_BARRIER_HEADER:
> +         generate_tcs_create_barrier_header(p, prog_data, dst);
> +         break;
> +
> +      case SHADER_OPCODE_BARRIER:
> +         brw_barrier(p, src[0]);
> +         brw_WAIT(p);
> +         break;
> +
>        default:
>           unreachable("Unsupported opcode");
>        }
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
> index f965b39..45ff7a3 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
> @@ -327,6 +327,24 @@ vec4_visitor::get_nir_src(nir_src src, unsigned 
> num_components)
>     return get_nir_src(src, nir_type_int, num_components);
>  }
>  
> +src_reg
> +vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
> +{
> +   nir_src *offset_src = nir_get_io_offset_src(instr);
> +   nir_const_value *const_value = nir_src_as_const_value(*offset_src);
> +
> +   if (const_value) {
> +      /* The only constant offset we should find is 0.  brw_nir.c's
> +       * add_const_offset_to_base() will fold other constant offsets
> +       * into instr->const_index[0].
> +       */
> +      assert(const_value->u[0] == 0);
> +      return src_reg();
> +   }
> +
> +   return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1);
> +}
> +
>  void
>  vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
>  {
> @@ -650,7 +668,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr 
> *instr)
>  
>     case nir_intrinsic_load_vertex_id_zero_base:
>     case nir_intrinsic_load_base_vertex:
> -   case nir_intrinsic_load_instance_id: {
> +   case nir_intrinsic_load_instance_id:
> +   case nir_intrinsic_load_invocation_id:
> +   case nir_intrinsic_load_tess_level_inner:
> +   case nir_intrinsic_load_tess_level_outer: {
>        gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
>        src_reg val = src_reg(nir_system_values[sv]);
>        assert(val.file != BAD_FILE);
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp 
> b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
> new file mode 100644
> index 0000000..22224d1
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
> @@ -0,0 +1,496 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file brw_vec4_tcs.cpp
> + *
> + * Tessellation control shader specific code derived from the vec4_visitor 
> class.
> + */
> +
> +#include "brw_nir.h"
> +#include "brw_vec4_tcs.h"
> +
> +namespace brw {
> +
> +vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,
> +                                   void *log_data,
> +                                   const struct brw_tcs_prog_key *key,
> +                                   struct brw_tcs_prog_data *prog_data,
> +                                   const nir_shader *nir,
> +                                   void *mem_ctx,
> +                                   int shader_time_index)
> +   : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
> +                  nir, mem_ctx, false, shader_time_index),
> +     key(key)
> +{
> +}
> +
> +
> +void
> +vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr 
> *instr)
> +{
> +}
> +
> +dst_reg *
> +vec4_tcs_visitor::make_reg_for_system_value(int location, const glsl_type 
> *type)
> +{
> +   return NULL;
> +}
> +
> +
> +void
> +vec4_tcs_visitor::setup_payload()
> +{
> +   int reg = 0;
> +
> +   /* The payload always contains important data in r0, which contains
> +    * the URB handles that are passed on to the URB write at the end
> +    * of the thread.
> +    */
> +   reg++;
> +
> +   /* r1.0 - r4.7 may contain the input control point URB handles,
> +    * which we use to pull vertex data.
> +    */
> +   reg += 4;
> +
> +   /* Push constants may start at r5.0 */
> +   reg = setup_uniforms(reg);
> +
> +   this->first_non_payload_grf = reg;
> +}
> +
> +
> +void
> +vec4_tcs_visitor::emit_prolog()
> +{
> +   invocation_id = src_reg(this, glsl_type::uint_type);
> +   emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
> +
> +   /* HS threads are dispatched with the dispatch mask set to 0xFF.
> +    * If there are an odd number of output vertices, then the final
> +    * HS instance dispatched will only have its bottom half doing real
> +    * work, and so we need to disable the upper half:
> +    */
> +   if (nir->info.tcs.vertices_out % 2) {
> +      emit(CMP(dst_null_d(), invocation_id,
> +               brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L));
> +
> +      /* Matching ENDIF is in emit_thread_end() */
> +      emit(IF(BRW_PREDICATE_NORMAL));
> +   }
> +}
> +
> +
> +void
> +vec4_tcs_visitor::emit_thread_end()
> +{
> +   current_annotation = "thread end";
> +
> +   if (nir->info.tcs.vertices_out % 2) {
> +      emit(BRW_OPCODE_ENDIF);
> +   }
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
> +      emit_shader_time_end();
> +
> +   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
> +   inst->mlen = 1;   /* just the header, no data. */
> +   inst->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE;
> +}
> +
> +
> +void
> +vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
> +                                      const src_reg &vertex_index,
> +                                      unsigned base_offset,
> +                                      const src_reg &indirect_offset)
> +{
> +   vec4_instruction *inst;
> +   dst_reg temp(this, glsl_type::ivec4_type);
> +   temp.type = dst.type;
> +
> +   /* Set up the message header to reference the proper parts of the URB */
> +   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
> +   inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
> +               indirect_offset);
> +   inst->force_writemask_all = true;
> +
> +   /* Read into a temporary, ignoring writemasking. */
> +   inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
> +   inst->offset = base_offset;
> +   inst->mlen = 1;
> +   inst->base_mrf = -1;
> +
> +   /* Copy the temporary to the destination to deal with writemasking.
> +    *
> +    * Also attempt to deal with gl_PointSize being in the .w component.
> +    */
> +   if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
> +      emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW)));
> +   } else {
> +      emit(MOV(dst, src_reg(temp)));
> +   }
> +}
> +
> +void
> +vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
> +                                       unsigned base_offset,
> +                                       const src_reg &indirect_offset)
> +{
> +   vec4_instruction *inst;
> +
> +   /* Set up the message header to reference the proper parts of the URB */
> +   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
> +   inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
> +               brw_imm_ud(dst.writemask), indirect_offset);
> +   inst->force_writemask_all = true;
> +
> +   /* Read directly into the destination. */
> +   vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header));
> +   read->offset = base_offset;
> +   read->mlen = 1;
> +   read->base_mrf = -1;
> +}
> +
> +void
> +vec4_tcs_visitor::emit_urb_write(const src_reg &value,
> +                                 unsigned writemask,
> +                                 unsigned base_offset,
> +                                 const src_reg &indirect_offset)
> +{
> +   if (writemask == 0)
> +      return;
> +
> +   src_reg message(this, glsl_type::uvec4_type, 2);
> +   vec4_instruction *inst;
> +
> +   inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message),
> +               brw_imm_ud(writemask), indirect_offset);
> +   inst->force_writemask_all = true;
> +   inst = emit(MOV(offset(dst_reg(retype(message, value.type)), 1), value));
> +   inst->force_writemask_all = true;
> +
> +   inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message);
> +   inst->offset = base_offset;
> +   inst->mlen = 2;
> +   inst->base_mrf = -1;
> +}
> +
> +static unsigned
> +tesslevel_outer_components(GLenum tes_primitive_mode)
> +{
> +   switch (tes_primitive_mode) {
> +   case GL_QUADS:
> +      return 4;
> +   case GL_TRIANGLES:
> +      return 3;
> +   case GL_ISOLINES:
> +      return 2;
> +   default:
> +      unreachable("Bogus tessellation domain");
> +   }
> +   return 0;
> +}
> +
> +static unsigned
> +tesslevel_inner_components(GLenum tes_primitive_mode)
> +{
> +   switch (tes_primitive_mode) {
> +   case GL_QUADS:
> +      return 2;
> +   case GL_TRIANGLES:
> +      return 1;
> +   case GL_ISOLINES:
> +      return 0;
> +   default:
> +      unreachable("Bogus tessellation domain");
> +   }
> +   return 0;
> +}
> +
> +/**
> + * Given a normal .xyzw writemask, convert it to a writemask for a vector
> + * that's stored backwards, i.e. .wzyx.
> + */
> +static unsigned
> +writemask_for_backwards_vector(unsigned mask)
> +{
> +   unsigned new_mask = 0;
> +
> +   for (int i = 0; i < 4; i++)
> +      new_mask |= ((mask >> i) & 1) << (3 - i);
> +
> +   return new_mask;
> +}
> +
> +void
> +vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
> +{
> +   switch (instr->intrinsic) {
> +   case nir_intrinsic_load_invocation_id:
> +      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD),
> +               invocation_id));
> +      break;
> +   case nir_intrinsic_load_primitive_id:
> +      emit(TCS_OPCODE_GET_PRIMITIVE_ID,
> +           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
> +      break;
> +   case nir_intrinsic_load_patch_vertices_in:
> +      unreachable("XXX: gl_PatchVerticesIn not implemented yet.");
> +      break;
> +   case nir_intrinsic_load_per_vertex_input: {
> +      src_reg indirect_offset = get_indirect_offset(instr);
> +      unsigned imm_offset = instr->const_index[0];
> +
> +      nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
> +      src_reg vertex_index =
> +         vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0]))
> +                      : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
> +
> +      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
> +      dst.writemask = brw_writemask_for_size(instr->num_components);
> +
> +      emit_input_urb_read(dst, vertex_index, imm_offset, indirect_offset);
> +      break;
> +   }
> +   case nir_intrinsic_load_input:
> +      unreachable("nir_lower_io should use load_per_vertex_input 
> intrinsics");
> +      break;
> +   case nir_intrinsic_load_output:
> +   case nir_intrinsic_load_per_vertex_output: {
> +      src_reg indirect_offset = get_indirect_offset(instr);
> +      unsigned imm_offset = instr->const_index[0];;
> +
> +      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
> +      dst.writemask = brw_writemask_for_size(instr->num_components);
> +
> +      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
> +         dst.type = BRW_REGISTER_TYPE_F;
> +
> +         /* This is a read of gl_TessLevelInner[], which lives in the
> +          * Patch URB header.  The layout depends on the domain.
> +          */
> +         switch (key->tes_primitive_mode) {
> +         case GL_QUADS: {
> +            /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */
> +            dst_reg tmp(this, glsl_type::vec4_type);
> +            emit_output_urb_read(tmp, 0, src_reg());
> +            emit(MOV(writemask(dst, WRITEMASK_XY),
> +                     swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
> +            break;
> +         }
> +         case GL_TRIANGLES:
> +            /* DWord 4; use offset 1 but normal swizzle/writemask. */
> +            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, src_reg());
> +            break;
> +         case GL_ISOLINES:
> +            /* All channels are undefined. */
> +            return;
> +         default:
> +            unreachable("Bogus tessellation domain");
> +         }
> +      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
> +         dst.type = BRW_REGISTER_TYPE_F;
> +
> +         /* This is a read of gl_TessLevelOuter[], which lives in the
> +          * high 4 DWords of the Patch URB header, in reverse order.
> +          */
> +         switch (key->tes_primitive_mode) {
> +         case GL_QUADS:
> +            dst.writemask = WRITEMASK_XYZW;
> +            break;
> +         case GL_TRIANGLES:
> +            dst.writemask = WRITEMASK_XYZ;
> +            break;
> +         case GL_ISOLINES:
> +            dst.writemask = WRITEMASK_XY;
> +            return;
> +         default:
> +            unreachable("Bogus tessellation domain");
> +         }
> +
> +         dst_reg tmp(this, glsl_type::vec4_type);
> +         emit_output_urb_read(tmp, 1, src_reg());
> +         emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
> +      } else {
> +         emit_output_urb_read(dst, imm_offset, indirect_offset);
> +      }
> +      break;
> +   }
> +   case nir_intrinsic_store_output:
> +   case nir_intrinsic_store_per_vertex_output: {
> +      src_reg value = get_nir_src(instr->src[0]);
> +      unsigned mask = instr->const_index[1];
> +      unsigned swiz = BRW_SWIZZLE_XYZW;
> +
> +      src_reg indirect_offset = get_indirect_offset(instr);
> +      unsigned imm_offset = instr->const_index[0];
> +
> +      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
> +         value.type = BRW_REGISTER_TYPE_F;
> +
> +         mask &= (1 << tesslevel_inner_components(key->tes_primitive_mode)) 
> - 1;
> +
> +         /* This is a write to gl_TessLevelInner[], which lives in the
> +          * Patch URB header.  The layout depends on the domain.
> +          */
> +         switch (key->tes_primitive_mode) {
> +         case GL_QUADS:
> +            /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
> +             * We use an XXYX swizzle to put .xy, in reverse order, into
> +             * the .wz channels, and use a .zw writemask.
> +             */
> +            swiz = BRW_SWIZZLE4(0, 0, 1, 0);
> +            mask = writemask_for_backwards_vector(mask);
> +            break;
> +         case GL_TRIANGLES:
> +            /* gl_TessLevelInner[].x lives at DWord 4, so we set the
> +             * writemask to X and bump the URB offset by 1.
> +             */
> +            imm_offset = 1;
> +            break;
> +         case GL_ISOLINES:
> +            /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
> +            return;
> +         default:
> +            unreachable("Bogus tessellation domain");
> +         }
> +      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
> +         value.type = BRW_REGISTER_TYPE_F;
> +
> +         mask &= (1 << tesslevel_outer_components(key->tes_primitive_mode)) 
> - 1;
> +
> +         /* This is a write to gl_TessLevelOuter[] which lives in the
> +          * Patch URB Header at DWords 4-7.  However, it's reversed, so
> +          * instead of .xyzw we have .wzyx.
> +          */
> +         swiz = BRW_SWIZZLE_WZYX;
> +         mask = writemask_for_backwards_vector(mask);
> +      }
> +
> +      emit_urb_write(swizzle(value, swiz), mask,
> +                     imm_offset, indirect_offset);
> +      break;
> +   }
> +
> +   case nir_intrinsic_barrier: {
> +      dst_reg header = dst_reg(this, glsl_type::uvec4_type);
> +      emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
> +      emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
> +      break;
> +   }
> +
> +   default:
> +      vec4_visitor::nir_emit_intrinsic(instr);
> +   }
> +}
> +
> +
> +extern "C" const unsigned *
> +brw_compile_tcs(const struct brw_compiler *compiler,
> +                void *log_data,
> +                void *mem_ctx,
> +                const struct brw_tcs_prog_key *key,
> +                struct brw_tcs_prog_data *prog_data,
> +                const nir_shader *src_shader,
> +                int shader_time_index,
> +                unsigned *final_assembly_size,
> +                char **error_str)
> +{
> +   const struct brw_device_info *devinfo = compiler->devinfo;
> +   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
> +   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
> +
> +   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
> +   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
> +   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
> +
> +   prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
> +
> +   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
> +                            nir->info.outputs_written,
> +                            nir->info.patch_outputs_written);
> +
> +   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
> +    * That divides up as follows:
> +    *
> +    *     32 bytes for the patch header (tessellation factors)
> +    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
> +    *              gl_MaxTessPatchComponents = 120)
> +    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
> +    *              gl_MaxPatchVertices = 32 and
> +    *              gl_MaxTessControlOutputComponents = 128)
> +    *
> +    *  15872 bytes left for varying packing overhead
> +    */
> +   const int num_per_patch_slots = 
> vue_prog_data->vue_map.num_per_patch_slots;
> +   const int num_per_vertex_slots = 
> vue_prog_data->vue_map.num_per_vertex_slots;
> +   unsigned output_size_bytes = 0;
> +   /* Note that the patch header is counted in num_per_patch_slots. */
> +   output_size_bytes += num_per_patch_slots * 16;
> +   output_size_bytes += nir->info.tcs.vertices_out * num_per_vertex_slots * 
> 16;
> +
> +   assert(output_size_bytes >= 1);
> +   if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES)
> +      return false;
> +
> +   /* URB entry sizes are stored as a multiple of 64 bytes. */
> +   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
> +
> +   struct brw_vue_map input_vue_map;
> +   brw_compute_vue_map(devinfo, &input_vue_map,
> +                       nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
> +                       true);
> +
> +   /* HS does not use the usual payload pushing from URB to GRFs,
> +    * because we don't have enough registers for a full-size payload, and
> +    * the hardware is broken on Haswell anyway.
> +    */
> +   vue_prog_data->urb_read_length = 0;
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
> +      fprintf(stderr, "TCS Input ");
> +      brw_print_vue_map(stderr, &input_vue_map);
> +      fprintf(stderr, "TCS Output ");
> +      brw_print_vue_map(stderr, &vue_prog_data->vue_map);
> +   }
> +
> +   vec4_tcs_visitor v(compiler, log_data, key, prog_data,
> +                      nir, mem_ctx, shader_time_index);
> +   if (!v.run()) {
> +      if (error_str)
> +         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
> +      return NULL;
> +   }
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
> +      v.dump_instructions();
> +
> +   return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
> +                                     &prog_data->base, v.cfg,
> +                                     final_assembly_size);
> +}
> +
> +
> +} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.h 
> b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
> new file mode 100644
> index 0000000..2bf4885
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
> @@ -0,0 +1,84 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file brw_vec4_tcs.h
> + *
> + * The vec4-mode tessellation control shader compiler backend.
> + */
> +
> +#ifndef BRW_VEC4_TCS_H
> +#define BRW_VEC4_TCS_H
> +
> +#include "brw_compiler.h"
> +#include "brw_vec4.h"
> +
> +#ifdef __cplusplus
> +namespace brw {
> +
> +class vec4_tcs_visitor : public vec4_visitor
> +{
> +public:
> +   vec4_tcs_visitor(const struct brw_compiler *compiler,
> +                    void *log_data,
> +                    const struct brw_tcs_prog_key *key,
> +                    struct brw_tcs_prog_data *prog_data,
> +                    const nir_shader *nir,
> +                    void *mem_ctx,
> +                    int shader_time_index);
> +
> +protected:
> +   virtual dst_reg *make_reg_for_system_value(int location,
> +                                              const glsl_type *type);
> +   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
> +   virtual void setup_payload();
> +   virtual void emit_prolog();
> +   virtual void emit_thread_end();
> +
> +   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
> +
> +   void emit_input_urb_read(const dst_reg &dst,
> +                            const src_reg &vertex_index,
> +                            unsigned base_offset,
> +                            const src_reg &indirect_offset);
> +   void emit_output_urb_read(const dst_reg &dst,
> +                             unsigned base_offset,
> +                             const src_reg &indirect_offset);
> +
> +   void emit_urb_write(const src_reg &value, unsigned writemask,
> +                       unsigned base_offset, const src_reg &indirect_offset);
> +
> +   /* we do not use the normal end-of-shader URB write mechanism -- but 
> every vec4 stage
> +    * must provide implementations of these:
> +    */
> +   virtual void emit_urb_write_header(int mrf) {}
> +   virtual vec4_instruction *emit_urb_write_opcode(bool complete) { return 
> NULL; }
> +
> +   const struct brw_tcs_prog_key *key;
> +   src_reg invocation_id;
> +};
> +
> +} /* namespace brw */
> +#endif /* __cplusplus */
> +
> +#endif /* BRW_VEC4_TCS_H */
> -- 
> 2.6.3
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH v2 12/13] i965: Add tessellation control shaders.

Reply via email to