We discussed all my questions / comments on IRC... 12 & 13 Reviewed-by: Jordan Justen <jordan.l.jus...@intel.com>
On 2015-12-11 13:24:01, Kenneth Graunke wrote: > The TCS is the first tessellation shader stage, and the most > complicated. It has access to each of the control points in the input > patch, and computes a new output patch. There is one logical invocation > per output control point; all invocations run in parallel, and can > communicate by reading and writing output variables. > > One of the main responsibilities of the TCS is to write the special > gl_TessLevelOuter[] and gl_TessLevelInner[] output variables which > control how much new geometry the hardware tessellation engine will > produce. Otherwise, it simply writes outputs that are passed along > to the TES. > > We run in SIMD4x2 mode, handling two logical invocations per EU thread. > The hardware doesn't properly manage the dispatch mask for us; it always > initializes it to 0xFF. We wrap the whole program in an IF..ENDIF block > to handle an odd number of invocations, essentially falling back to > SIMD4x1 on the last thread. > > Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> > --- > src/mesa/drivers/dri/i965/Makefile.sources | 2 + > src/mesa/drivers/dri/i965/brw_compiler.h | 26 ++ > src/mesa/drivers/dri/i965/brw_context.h | 6 + > src/mesa/drivers/dri/i965/brw_defines.h | 8 + > src/mesa/drivers/dri/i965/brw_link.cpp | 4 + > src/mesa/drivers/dri/i965/brw_program.h | 1 + > src/mesa/drivers/dri/i965/brw_reg.h | 1 + > src/mesa/drivers/dri/i965/brw_shader.cpp | 17 + > src/mesa/drivers/dri/i965/brw_shader.h | 3 + > src/mesa/drivers/dri/i965/brw_state_upload.c | 1 + > src/mesa/drivers/dri/i965/brw_tcs.c | 262 +++++++++++ > src/mesa/drivers/dri/i965/brw_vec4.cpp | 10 +- > src/mesa/drivers/dri/i965/brw_vec4.h | 1 + > src/mesa/drivers/dri/i965/brw_vec4_cse.cpp | 2 + > .../dri/i965/brw_vec4_dead_code_eliminate.cpp | 3 + > src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 238 ++++++++++ > src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 23 +- > src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp | 496 > 
+++++++++++++++++++++ > src/mesa/drivers/dri/i965/brw_vec4_tcs.h | 84 ++++ > 19 files changed, 1186 insertions(+), 2 deletions(-) > create mode 100644 src/mesa/drivers/dri/i965/brw_tcs.c > create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp > create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_tcs.h > > diff --git a/src/mesa/drivers/dri/i965/Makefile.sources > b/src/mesa/drivers/dri/i965/Makefile.sources > index 7354aaf..0b706de 100644 > --- a/src/mesa/drivers/dri/i965/Makefile.sources > +++ b/src/mesa/drivers/dri/i965/Makefile.sources > @@ -75,6 +75,7 @@ i965_compiler_FILES = \ > brw_vec4_reg_allocate.cpp \ > brw_vec4_surface_builder.cpp \ > brw_vec4_surface_builder.h \ > + brw_vec4_tcs.cpp \ > brw_vec4_visitor.cpp \ > brw_vec4_vs_visitor.cpp \ > brw_vue_map.c \ > @@ -150,6 +151,7 @@ i965_FILES = \ > brw_state.h \ > brw_state_upload.c \ > brw_structs.h \ > + brw_tcs.c \ > brw_tcs_surface_state.c \ > brw_tes.c \ > brw_tes_surface_state.c \ > diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h > b/src/mesa/drivers/dri/i965/brw_compiler.h > index 64d831d..e6bae8e 100644 > --- a/src/mesa/drivers/dri/i965/brw_compiler.h > +++ b/src/mesa/drivers/dri/i965/brw_compiler.h > @@ -191,6 +191,16 @@ struct brw_vs_prog_key { > struct brw_sampler_prog_key_data tex; > }; > > +/** The program key for Tessellation Control Shaders. */ > +struct brw_tcs_prog_key > +{ > + unsigned program_string_id; > + > + GLenum tes_primitive_mode; > + > + struct brw_sampler_prog_key_data tex; > +}; > + > /** The program key for Tessellation Evaluation Shaders. */ > struct brw_tes_prog_key > { > @@ -677,6 +687,22 @@ brw_compile_vs(const struct brw_compiler *compiler, void > *log_data, > char **error_str); > > /** > + * Compile a tessellation control shader. > + * > + * Returns the final assembly and the program's size. 
> + */ > +const unsigned * > +brw_compile_tcs(const struct brw_compiler *compiler, > + void *log_data, > + void *mem_ctx, > + const struct brw_tcs_prog_key *key, > + struct brw_tcs_prog_data *prog_data, > + const struct nir_shader *nir, > + int shader_time_index, > + unsigned *final_assembly_size, > + char **error_str); > + > +/** > * Compile a tessellation evaluation shader. > * > * Returns the final assembly and the program's size. > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > index 5e840d1..1d989f3 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.h > +++ b/src/mesa/drivers/dri/i965/brw_context.h > @@ -1704,6 +1704,12 @@ brw_vertex_program_const(const struct > gl_vertex_program *p) > return (const struct brw_vertex_program *) p; > } > > +static inline struct brw_tess_ctrl_program * > +brw_tess_ctrl_program(struct gl_tess_ctrl_program *p) > +{ > + return (struct brw_tess_ctrl_program *) p; > +} > + > static inline struct brw_tess_eval_program * > brw_tess_eval_program(struct gl_tess_eval_program *p) > { > diff --git a/src/mesa/drivers/dri/i965/brw_defines.h > b/src/mesa/drivers/dri/i965/brw_defines.h > index 4a184cf..cc19c06 100644 > --- a/src/mesa/drivers/dri/i965/brw_defines.h > +++ b/src/mesa/drivers/dri/i965/brw_defines.h > @@ -1305,6 +1305,14 @@ enum opcode { > * UD immediate). 
> */ > SHADER_OPCODE_MOV_INDIRECT, > + > + VEC4_OPCODE_URB_READ, > + TCS_OPCODE_GET_INSTANCE_ID, > + TCS_OPCODE_URB_WRITE, > + TCS_OPCODE_SET_INPUT_URB_OFFSETS, > + TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, > + TCS_OPCODE_GET_PRIMITIVE_ID, > + TCS_OPCODE_CREATE_BARRIER_HEADER, > }; > > enum brw_urb_write_flags { > diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp > b/src/mesa/drivers/dri/i965/brw_link.cpp > index f5a7d20..7cdc830 100644 > --- a/src/mesa/drivers/dri/i965/brw_link.cpp > +++ b/src/mesa/drivers/dri/i965/brw_link.cpp > @@ -42,6 +42,7 @@ brw_shader_precompile(struct gl_context *ctx, > struct gl_shader_program *sh_prog) > { > struct gl_shader *vs = sh_prog->_LinkedShaders[MESA_SHADER_VERTEX]; > + struct gl_shader *tcs = sh_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]; > struct gl_shader *tes = sh_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL]; > struct gl_shader *gs = sh_prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; > struct gl_shader *fs = sh_prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; > @@ -56,6 +57,9 @@ brw_shader_precompile(struct gl_context *ctx, > if (tes && !brw_tes_precompile(ctx, sh_prog, tes->Program)) > return false; > > + if (tcs && !brw_tcs_precompile(ctx, sh_prog, tcs->Program)) > + return false; > + > if (vs && !brw_vs_precompile(ctx, sh_prog, vs->Program)) > return false; > > diff --git a/src/mesa/drivers/dri/i965/brw_program.h > b/src/mesa/drivers/dri/i965/brw_program.h > index 1cdab97..3d9e1b9 100644 > --- a/src/mesa/drivers/dri/i965/brw_program.h > +++ b/src/mesa/drivers/dri/i965/brw_program.h > @@ -56,6 +56,7 @@ void > brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog, > struct gl_shader *shader, struct gl_program *prog); > > +void brw_upload_tcs_prog(struct brw_context *brw); > void brw_upload_tes_prog(struct brw_context *brw); > > #ifdef __cplusplus > diff --git a/src/mesa/drivers/dri/i965/brw_reg.h > b/src/mesa/drivers/dri/i965/brw_reg.h > index fa912c9..9f2ff9a 100644 > --- a/src/mesa/drivers/dri/i965/brw_reg.h > +++ 
b/src/mesa/drivers/dri/i965/brw_reg.h > @@ -84,6 +84,7 @@ struct brw_device_info; > #define BRW_SWIZZLE_YZXW BRW_SWIZZLE4(1,2,0,3) > #define BRW_SWIZZLE_ZXYW BRW_SWIZZLE4(2,0,1,3) > #define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3) > +#define BRW_SWIZZLE_WZYX BRW_SWIZZLE4(3,2,1,0) > > static inline bool > brw_is_single_value_swizzle(unsigned swiz) > diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp > b/src/mesa/drivers/dri/i965/brw_shader.cpp > index d954568..9b64ae4 100644 > --- a/src/mesa/drivers/dri/i965/brw_shader.cpp > +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp > @@ -85,6 +85,7 @@ brw_compiler_create(void *mem_ctx, const struct > brw_device_info *devinfo) > > compiler->scalar_stage[MESA_SHADER_VERTEX] = > devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); > + compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false; > compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = true; > compiler->scalar_stage[MESA_SHADER_GEOMETRY] = > devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false); > @@ -137,6 +138,7 @@ brw_compiler_create(void *mem_ctx, const struct > brw_device_info *devinfo) > compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; > } > > + > compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = > false; > > compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = > false; > > if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) > @@ -549,6 +551,21 @@ brw_instruction_name(enum opcode op) > return "mulh"; > case SHADER_OPCODE_MOV_INDIRECT: > return "mov_indirect"; > + > + case VEC4_OPCODE_URB_READ: > + return "urb_read"; > + case TCS_OPCODE_GET_INSTANCE_ID: > + return "tcs_get_instance_id"; > + case TCS_OPCODE_URB_WRITE: > + return "tcs_urb_write"; > + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: > + return "tcs_set_input_urb_offsets"; > + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: > + return "tcs_set_output_urb_offsets"; > + case TCS_OPCODE_GET_PRIMITIVE_ID: > + return "tcs_get_primitive_id"; > + case 
TCS_OPCODE_CREATE_BARRIER_HEADER: > + return "tcs_create_barrier_header"; > } > > unreachable("not reached"); > diff --git a/src/mesa/drivers/dri/i965/brw_shader.h > b/src/mesa/drivers/dri/i965/brw_shader.h > index 2e73f12..5933613 100644 > --- a/src/mesa/drivers/dri/i965/brw_shader.h > +++ b/src/mesa/drivers/dri/i965/brw_shader.h > @@ -273,6 +273,9 @@ brw_assign_common_binding_table_offsets(gl_shader_stage > stage, > bool brw_vs_precompile(struct gl_context *ctx, > struct gl_shader_program *shader_prog, > struct gl_program *prog); > +bool brw_tcs_precompile(struct gl_context *ctx, > + struct gl_shader_program *shader_prog, > + struct gl_program *prog); > bool brw_tes_precompile(struct gl_context *ctx, > struct gl_shader_program *shader_prog, > struct gl_program *prog); > diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c > b/src/mesa/drivers/dri/i965/brw_state_upload.c > index c657b25..56962d5 100644 > --- a/src/mesa/drivers/dri/i965/brw_state_upload.c > +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c > @@ -678,6 +678,7 @@ brw_upload_programs(struct brw_context *brw, > { > if (pipeline == BRW_RENDER_PIPELINE) { > brw_upload_vs_prog(brw); > + brw_upload_tcs_prog(brw); > brw_upload_tes_prog(brw); > > if (brw->gen < 6) > diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c > b/src/mesa/drivers/dri/i965/brw_tcs.c > new file mode 100644 > index 0000000..4acfaea > --- /dev/null > +++ b/src/mesa/drivers/dri/i965/brw_tcs.c > @@ -0,0 +1,262 @@ > +/* > + * Copyright © 2013 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * 
The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > + * DEALINGS IN THE SOFTWARE. > + */ > + > +/** > + * \file brw_tcs.c > + * > + * Tessellation control shader state upload code. > + */ > + > +#include "brw_context.h" > +#include "brw_nir.h" > +#include "brw_program.h" > +#include "brw_shader.h" > +#include "brw_state.h" > +#include "program/prog_parameter.h" > + > +static void > +brw_tcs_debug_recompile(struct brw_context *brw, > + struct gl_shader_program *shader_prog, > + const struct brw_tcs_prog_key *key) > +{ > + struct brw_cache_item *c = NULL; > + const struct brw_tcs_prog_key *old_key = NULL; > + bool found = false; > + > + perf_debug("Recompiling tessellation control shader for program %d\n", > + shader_prog->Name); > + > + for (unsigned int i = 0; i < brw->cache.size; i++) { > + for (c = brw->cache.items[i]; c; c = c->next) { > + if (c->cache_id == BRW_CACHE_TCS_PROG) { > + old_key = c->key; > + > + if (old_key->program_string_id == key->program_string_id) > + break; > + } > + } > + if (c) > + break; > + } > + > + if (!c) { > + perf_debug(" Didn't find previous compile in the shader cache for " > + "debug\n"); > + return; > + } > + > + found |= key_debug(brw, "TES primitive mode", old_key->tes_primitive_mode, > + key->tes_primitive_mode); > + found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex); > + > + if (!found) { > + perf_debug(" Something 
else\n"); > + } > +} > + > +static bool > +brw_codegen_tcs_prog(struct brw_context *brw, > + struct gl_shader_program *shader_prog, > + struct brw_tess_ctrl_program *tcp, > + struct brw_tcs_prog_key *key) > +{ > + const struct brw_compiler *compiler = brw->intelScreen->compiler; > + struct brw_stage_state *stage_state = &brw->tcs.base; > + nir_shader *nir = tcp->program.Base.nir; > + struct brw_tcs_prog_data prog_data; > + bool start_busy = false; > + double start_time = 0; > + > + memset(&prog_data, 0, sizeof(prog_data)); > + > + /* Allocate the references to the uniforms that will end up in the > + * prog_data associated with the compiled program, and which will be freed > + * by the state cache. > + * > + * Note: param_count needs to be num_uniform_components * 4, since we add > + * padding around uniform values below vec4 size, so the worst case is > that > + * every uniform is a float which gets padded to the size of a vec4. > + */ > + struct gl_shader *tcs = > shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]; > + int param_count = nir->num_uniforms; > + if (!compiler->scalar_stage[MESA_SHADER_TESS_CTRL]) > + param_count *= 4; > + > + prog_data.base.base.param = > + rzalloc_array(NULL, const gl_constant_value *, param_count); > + prog_data.base.base.pull_param = > + rzalloc_array(NULL, const gl_constant_value *, param_count); > + prog_data.base.base.image_param = > + rzalloc_array(NULL, struct brw_image_param, tcs->NumImages); > + prog_data.base.base.nr_params = param_count; > + prog_data.base.base.nr_image_params = tcs->NumImages; > + > + brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base, > + &prog_data.base.base, false); > + > + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) > + brw_dump_ir("tessellation control", shader_prog, tcs, NULL); > + > + int st_index = -1; > + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) > + st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TCS); > + > + if (unlikely(brw->perf_debug)) { > + start_busy = 
brw->batch.last_bo && > drm_intel_bo_busy(brw->batch.last_bo); > + start_time = get_time(); > + } > + > + void *mem_ctx = ralloc_context(NULL); > + unsigned program_size; > + char *error_str; > + const unsigned *program = > + brw_compile_tcs(compiler, brw, mem_ctx, key, &prog_data, nir, st_index, > + &program_size, &error_str); > + if (program == NULL) { > + if (shader_prog) { > + shader_prog->LinkStatus = false; > + ralloc_strcat(&shader_prog->InfoLog, error_str); > + } > + > + _mesa_problem(NULL, "Failed to compile tessellation control shader: " > + "%s\n", error_str); > + > + ralloc_free(mem_ctx); > + return false; > + } > + > + if (unlikely(brw->perf_debug)) { > + struct brw_shader *btcs = (struct brw_shader *) tcs; > + if (btcs->compiled_once) { > + brw_tcs_debug_recompile(brw, shader_prog, key); > + } > + if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { > + perf_debug("TCS compile took %.03f ms and stalled the GPU\n", > + (get_time() - start_time) * 1000); > + } > + btcs->compiled_once = true; > + } > + > + /* Scratch space is used for register spilling */ > + if (prog_data.base.base.total_scratch) { > + brw_get_scratch_bo(brw, &stage_state->scratch_bo, > + prog_data.base.base.total_scratch * > + brw->max_hs_threads); > + } > + > + brw_upload_cache(&brw->cache, BRW_CACHE_TCS_PROG, > + key, sizeof(*key), > + program, program_size, > + &prog_data, sizeof(prog_data), > + &stage_state->prog_offset, &brw->tcs.prog_data); > + ralloc_free(mem_ctx); > + > + return true; > +} > + > + > +void > +brw_upload_tcs_prog(struct brw_context *brw) > +{ > + struct gl_context *ctx = &brw->ctx; > + struct gl_shader_program **current = ctx->_Shader->CurrentProgram; > + struct brw_stage_state *stage_state = &brw->tcs.base; > + struct brw_tcs_prog_key key; > + /* BRW_NEW_TESS_CTRL_PROGRAM */ > + struct brw_tess_ctrl_program *tcp = > + (struct brw_tess_ctrl_program *) brw->tess_ctrl_program; > + > + if (!brw_state_dirty(brw, > + _NEW_TEXTURE, > + BRW_NEW_TESS_CTRL_PROGRAM 
| > + BRW_NEW_TESS_EVAL_PROGRAM)) > + return; > + > + if (tcp == NULL) { > + /* Other state atoms had better not try to access prog_data, since > + * there's no HS program. > + */ > + brw->tcs.prog_data = NULL; > + brw->tcs.base.prog_data = NULL; > + return; > + } > + > + struct gl_program *prog = &tcp->program.Base; > + > + memset(&key, 0, sizeof(key)); > + > + key.program_string_id = tcp->id; > + > + /* _NEW_TEXTURE */ > + brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count, > + &key.tex); > + > + /* BRW_NEW_TESS_EVAL_PROGRAM */ > + /* We need to specialize our code generation for tessellation levels > + * based on the domain the DS is expecting to tessellate. > + */ > + struct brw_tess_eval_program *tep = > + (struct brw_tess_eval_program *) brw->tess_eval_program; > + assert(tep); > + key.tes_primitive_mode = tep->program.PrimitiveMode; > + > + if (!brw_search_cache(&brw->cache, BRW_CACHE_TCS_PROG, > + &key, sizeof(key), > + &stage_state->prog_offset, &brw->tcs.prog_data)) { > + bool success = brw_codegen_tcs_prog(brw, > current[MESA_SHADER_TESS_CTRL], > + tcp, &key); > + assert(success); > + (void)success; > + } > + brw->tcs.base.prog_data = &brw->tcs.prog_data->base.base; > +} > + > + > +bool > +brw_tcs_precompile(struct gl_context *ctx, > + struct gl_shader_program *shader_prog, > + struct gl_program *prog) > +{ > + struct brw_context *brw = brw_context(ctx); > + struct brw_tcs_prog_key key; > + uint32_t old_prog_offset = brw->tcs.base.prog_offset; > + struct brw_tcs_prog_data *old_prog_data = brw->tcs.prog_data; > + bool success; > + > + struct gl_tess_ctrl_program *tcp = (struct gl_tess_ctrl_program *)prog; > + struct brw_tess_ctrl_program *btcp = brw_tess_ctrl_program(tcp); > + > + memset(&key, 0, sizeof(key)); > + > + key.program_string_id = btcp->id; > + brw_setup_tex_for_precompile(brw, &key.tex, prog); > + > + key.tes_primitive_mode = GL_TRIANGLES; > + > + success = brw_codegen_tcs_prog(brw, shader_prog, btcp, &key); > + > + 
brw->tcs.base.prog_offset = old_prog_offset; > + brw->tcs.prog_data = old_prog_data; > + > + return success; > +} > diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp > b/src/mesa/drivers/dri/i965/brw_vec4.cpp > index a697bdf..0cded0c 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp > @@ -155,6 +155,9 @@ vec4_instruction::is_send_from_grf() > case SHADER_OPCODE_TYPED_ATOMIC: > case SHADER_OPCODE_TYPED_SURFACE_READ: > case SHADER_OPCODE_TYPED_SURFACE_WRITE: > + case VEC4_OPCODE_URB_READ: > + case TCS_OPCODE_URB_WRITE: > + case SHADER_OPCODE_BARRIER: > return true; > default: > return false; > @@ -184,7 +187,9 @@ bool > vec4_instruction::has_source_and_destination_hazard() const > { > switch (opcode) { > - /* Most opcodes in the vec4 world use MRFs. */ > + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: > + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: > + return true; > default: > return false; > } > @@ -204,6 +209,7 @@ vec4_instruction::regs_read(unsigned arg) const > case SHADER_OPCODE_TYPED_ATOMIC: > case SHADER_OPCODE_TYPED_SURFACE_READ: > case SHADER_OPCODE_TYPED_SURFACE_WRITE: > + case TCS_OPCODE_URB_WRITE: > return arg == 0 ? 
mlen : 1; > > case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: > @@ -281,6 +287,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst) > return 0; > case GS_OPCODE_FF_SYNC: > return 1; > + case TCS_OPCODE_URB_WRITE: > + return 0; > case SHADER_OPCODE_SHADER_TIME_ADD: > return 0; > case SHADER_OPCODE_TEX: > diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h > b/src/mesa/drivers/dri/i965/brw_vec4.h > index ae5bf69..6bbac83 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4.h > +++ b/src/mesa/drivers/dri/i965/brw_vec4.h > @@ -340,6 +340,7 @@ public: > unsigned num_components = 4); > src_reg get_nir_src(nir_src src, > unsigned num_components = 4); > + src_reg get_indirect_offset(nir_intrinsic_instr *instr); > > virtual dst_reg *make_reg_for_system_value(int location, > const glsl_type *type) = 0; > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp > index 85cbf24..0c1f0c3 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp > @@ -75,6 +75,8 @@ is_expression(const vec4_instruction *const inst) > case VEC4_OPCODE_UNPACK_UNIFORM: > case SHADER_OPCODE_FIND_LIVE_CHANNEL: > case SHADER_OPCODE_BROADCAST: > + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: > + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: > return true; > case SHADER_OPCODE_RCP: > case SHADER_OPCODE_RSQ: > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > index 2d0722a..c31e72d 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > @@ -45,6 +45,9 @@ can_do_writemask(const struct brw_device_info *devinfo, > case VS_OPCODE_PULL_CONSTANT_LOAD: > case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: > case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: > + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: > + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: > + case VEC4_OPCODE_URB_READ: > return 
false; > default: > /* The MATH instruction on Gen6 only executes in align1 mode, which > does > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > index c3426dd..076b1dd 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > @@ -714,6 +714,211 @@ generate_gs_set_primitive_id(struct brw_codegen *p, > struct brw_reg dst) > } > > static void > +generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst) > +{ > + /* "Instance Count" comes as part of the payload in r0.2 bits 23:17. > + * > + * Since we operate in SIMD4x2 mode, we need to run half as many threads > + * as necessary. So we assign (2i + 1, 2i) as the thread counts. We > + * shift right by one less to accomplish the multiplication by two. > + */ > + dst = retype(dst, BRW_REGISTER_TYPE_UD); > + struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); > + > + brw_push_insn_state(p); > + brw_set_default_access_mode(p, BRW_ALIGN_1); > + > + const int mask = INTEL_MASK(23, 17); > + const int shift = 17; > + > + brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), > brw_imm_ud(mask)); > + brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0), > + brw_imm_ud(shift - 1)); > + brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1)); > + > + brw_pop_insn_state(p); > +} > + > +static void > +generate_tcs_urb_write(struct brw_codegen *p, > + vec4_instruction *inst, > + struct brw_reg urb_header) > +{ > + const struct brw_device_info *devinfo = p->devinfo; > + > + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); > + brw_set_dest(p, send, brw_null_reg()); > + brw_set_src0(p, send, urb_header); > + > + brw_set_message_descriptor(p, send, BRW_SFID_URB, > + inst->mlen /* mlen */, 0 /* rlen */, > + true /* header */, false /* eot */); > + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD); > + 
brw_inst_set_urb_global_offset(devinfo, send, inst->offset); > + brw_inst_set_urb_per_slot_offset(devinfo, send, 1); > + brw_inst_set_urb_swizzle_control(devinfo, send, > BRW_URB_SWIZZLE_INTERLEAVE); > + > + /* what happens to swizzles? */ > +} > + > + > +static void > +generate_tcs_input_urb_offsets(struct brw_codegen *p, > + struct brw_reg dst, > + struct brw_reg vertex, > + struct brw_reg offset) > +{ > + /* Generates an URB read/write message header for HS/DS operation. > + * Inputs are a vertex index, and a byte offset from the beginning of > + * the vertex. */ > + > + /* If `vertex` is not an immediate, we clobber a0.0 */ > + > + assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == > BRW_GENERAL_REGISTER_FILE); > + assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == > BRW_REGISTER_TYPE_D); > + > + assert(dst.file == BRW_GENERAL_REGISTER_FILE); > + > + brw_push_insn_state(p); > + brw_set_default_access_mode(p, BRW_ALIGN_1); > + brw_set_default_mask_control(p, BRW_MASK_DISABLE); > + brw_MOV(p, dst, brw_imm_ud(0)); > + > + /* m0.5 bits 8-15 are channel enables */ > + brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); > + > + /* m0.0-0.1: URB handles */ > + if (vertex.file == BRW_IMMEDIATE_VALUE) { > + uint32_t vertex_index = vertex.ud; > + struct brw_reg index_reg = brw_vec1_grf( > + 1 + (vertex_index >> 3), vertex_index & 7); > + > + brw_MOV(p, vec2(get_element_ud(dst, 0)), > + retype(index_reg, BRW_REGISTER_TYPE_UD)); > + } else { > + /* indirect via a0.0 */ > + struct brw_reg addr = brw_address_reg(0); > + > + /* bottom half: m0.0 = g[1.0 + vertex.0]UD */ > + brw_ADD(p, addr, get_element_ud(vertex, 0), brw_imm_uw(0x8)); > + brw_SHL(p, addr, addr, brw_imm_ud(2)); > + brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0)); > + > + /* top half: m0.1 = g[1.0 + vertex.4]UD */ > + brw_ADD(p, addr, get_element_ud(vertex, 4), brw_imm_uw(0x8)); > + brw_SHL(p, addr, addr, brw_imm_ud(2)); > + brw_MOV(p, get_element_ud(dst, 1), 
deref_1ud(brw_indirect(0, 0), 0)); > + } > + > + /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ > + if (offset.file != ARF) > + brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); > + > + brw_pop_insn_state(p); > +} > + > + > +static void > +generate_tcs_output_urb_offsets(struct brw_codegen *p, > + struct brw_reg dst, > + struct brw_reg write_mask, > + struct brw_reg offset) > +{ > + /* Generates an URB read/write message header for HS/DS operation, for > the patch URB entry. */ > + assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == > BRW_MESSAGE_REGISTER_FILE); > + > + assert(write_mask.file == BRW_IMMEDIATE_VALUE); > + assert(write_mask.type == BRW_REGISTER_TYPE_UD); > + > + brw_push_insn_state(p); > + > + brw_set_default_access_mode(p, BRW_ALIGN_1); > + brw_set_default_mask_control(p, BRW_MASK_DISABLE); > + brw_MOV(p, dst, brw_imm_ud(0)); > + > + unsigned mask = write_mask.ud; > + > + /* m0.5 bits 15:12 and 11:8 are channel enables */ > + brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << > 12))); > + > + /* HS patch URB handle is delivered in r0.0 */ > + struct brw_reg urb_handle = brw_vec1_grf(0, 0); > + > + /* m0.0-0.1: URB handles */ > + brw_MOV(p, vec2(get_element_ud(dst, 0)), > + retype(urb_handle, BRW_REGISTER_TYPE_UD)); > + > + /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ > + if (offset.file != ARF) > + brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); > + > + brw_pop_insn_state(p); > +} > + > +static void > +generate_vec4_urb_read(struct brw_codegen *p, > + vec4_instruction *inst, > + struct brw_reg dst, > + struct brw_reg header) > +{ > + const struct brw_device_info *devinfo = p->devinfo; > + > + assert(header.file == BRW_GENERAL_REGISTER_FILE); > + assert(header.type == BRW_REGISTER_TYPE_UD); > + > + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); > + brw_set_dest(p, send, dst); > + brw_set_src0(p, send, header); > + > + 
brw_set_message_descriptor(p, send, BRW_SFID_URB, > + 1 /* mlen */, 1 /* rlen */, > + true /* header */, false /* eot */); > + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); > + brw_inst_set_urb_swizzle_control(devinfo, send, > BRW_URB_SWIZZLE_INTERLEAVE); > + brw_inst_set_urb_per_slot_offset(devinfo, send, 1); > + > + brw_inst_set_urb_global_offset(devinfo, send, inst->offset); > +} > + > +static void > +generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) > +{ > + brw_push_insn_state(p); > + brw_set_default_access_mode(p, BRW_ALIGN_1); > + brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); > + brw_pop_insn_state(p); > +} > + > +static void > +generate_tcs_create_barrier_header(struct brw_codegen *p, > + struct brw_vue_prog_data *prog_data, > + struct brw_reg dst) > +{ > + struct brw_reg m0_2 = get_element_ud(dst, 2); > + unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances; > + > + brw_push_insn_state(p); > + brw_set_default_access_mode(p, BRW_ALIGN_1); > + brw_set_default_mask_control(p, BRW_MASK_DISABLE); > + > + /* Zero the message header */ > + brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)); > + > + /* Copy "Barrier ID" from DW0 bits 16:13 */ > + brw_AND(p, m0_2, > + retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), > + brw_imm_ud(0x1e000)); > + > + /* Shift it into place */ > + brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(11)); > + > + /* Set the Barrier Count and the enable bit */ > + brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15))); > + > + brw_pop_insn_state(p); > +} > + > +static void > generate_oword_dual_block_offsets(struct brw_codegen *p, > struct brw_reg m1, > struct brw_reg index) > @@ -1538,6 +1743,39 @@ generate_code(struct brw_codegen *p, > break; > } > > + case TCS_OPCODE_URB_WRITE: > + generate_tcs_urb_write(p, inst, src[0]); > + break; > + > + case VEC4_OPCODE_URB_READ: > + generate_vec4_urb_read(p, inst, dst, src[0]); > + break; > + 
> + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: > + generate_tcs_input_urb_offsets(p, dst, src[0], src[1]); > + break; > + > + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: > + generate_tcs_output_urb_offsets(p, dst, src[0], src[1]); > + break; > + > + case TCS_OPCODE_GET_INSTANCE_ID: > + generate_tcs_get_instance_id(p, dst); > + break; > + > + case TCS_OPCODE_GET_PRIMITIVE_ID: > + generate_tcs_get_primitive_id(p, dst); > + break; > + > + case TCS_OPCODE_CREATE_BARRIER_HEADER: > + generate_tcs_create_barrier_header(p, prog_data, dst); > + break; > + > + case SHADER_OPCODE_BARRIER: > + brw_barrier(p, src[0]); > + brw_WAIT(p); > + break; > + > default: > unreachable("Unsupported opcode"); > } > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp > index f965b39..45ff7a3 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp > @@ -327,6 +327,24 @@ vec4_visitor::get_nir_src(nir_src src, unsigned > num_components) > return get_nir_src(src, nir_type_int, num_components); > } > > +src_reg > +vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr) > +{ > + nir_src *offset_src = nir_get_io_offset_src(instr); > + nir_const_value *const_value = nir_src_as_const_value(*offset_src); > + > + if (const_value) { > + /* The only constant offset we should find is 0. brw_nir.c's > + * add_const_offset_to_base() will fold other constant offsets > + * into instr->const_index[0]. 
> + */ > + assert(const_value->u[0] == 0); > + return src_reg(); > + } > + > + return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1); > +} > + > void > vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr) > { > @@ -650,7 +668,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr > *instr) > > case nir_intrinsic_load_vertex_id_zero_base: > case nir_intrinsic_load_base_vertex: > - case nir_intrinsic_load_instance_id: { > + case nir_intrinsic_load_instance_id: > + case nir_intrinsic_load_invocation_id: > + case nir_intrinsic_load_tess_level_inner: > + case nir_intrinsic_load_tess_level_outer: { > gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); > src_reg val = src_reg(nir_system_values[sv]); > assert(val.file != BAD_FILE); > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp > new file mode 100644 > index 0000000..22224d1 > --- /dev/null > +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp > @@ -0,0 +1,496 @@ > +/* > + * Copyright © 2013 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > + * DEALINGS IN THE SOFTWARE. > + */ > + > +/** > + * \file brw_vec4_tcs.cpp > + * > + * Tessellation control shader specific code derived from the vec4_visitor > class. > + */ > + > +#include "brw_nir.h" > +#include "brw_vec4_tcs.h" > + > +namespace brw { > + > +vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler, > + void *log_data, > + const struct brw_tcs_prog_key *key, > + struct brw_tcs_prog_data *prog_data, > + const nir_shader *nir, > + void *mem_ctx, > + int shader_time_index) > + : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base, > + nir, mem_ctx, false, shader_time_index), > + key(key) > +{ > +} > + > + > +void > +vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr > *instr) > +{ > +} > + > +dst_reg * > +vec4_tcs_visitor::make_reg_for_system_value(int location, const glsl_type > *type) > +{ > + return NULL; > +} > + > + > +void > +vec4_tcs_visitor::setup_payload() > +{ > + int reg = 0; > + > + /* The payload always contains important data in r0, which contains > + * the URB handles that are passed on to the URB write at the end > + * of the thread. > + */ > + reg++; > + > + /* r1.0 - r4.7 may contain the input control point URB handles, > + * which we use to pull vertex data. > + */ > + reg += 4; > + > + /* Push constants may start at r5.0 */ > + reg = setup_uniforms(reg); > + > + this->first_non_payload_grf = reg; > +} > + > + > +void > +vec4_tcs_visitor::emit_prolog() > +{ > + invocation_id = src_reg(this, glsl_type::uint_type); > + emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id)); > + > + /* HS threads are dispatched with the dispatch mask set to 0xFF. 
> + * If there are an odd number of output vertices, then the final > + * HS instance dispatched will only have its bottom half doing real > + * work, and so we need to disable the upper half: > + */ > + if (nir->info.tcs.vertices_out % 2) { > + emit(CMP(dst_null_d(), invocation_id, > + brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L)); > + > + /* Matching ENDIF is in emit_thread_end() */ > + emit(IF(BRW_PREDICATE_NORMAL)); > + } > +} > + > + > +void > +vec4_tcs_visitor::emit_thread_end() > +{ > + current_annotation = "thread end"; > + > + if (nir->info.tcs.vertices_out % 2) { > + emit(BRW_OPCODE_ENDIF); > + } > + > + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) > + emit_shader_time_end(); > + > + vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); > + inst->mlen = 1; /* just the header, no data. */ > + inst->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE; > +} > + > + > +void > +vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst, > + const src_reg &vertex_index, > + unsigned base_offset, > + const src_reg &indirect_offset) > +{ > + vec4_instruction *inst; > + dst_reg temp(this, glsl_type::ivec4_type); > + temp.type = dst.type; > + > + /* Set up the message header to reference the proper parts of the URB */ > + dst_reg header = dst_reg(this, glsl_type::uvec4_type); > + inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index, > + indirect_offset); > + inst->force_writemask_all = true; > + > + /* Read into a temporary, ignoring writemasking. */ > + inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header)); > + inst->offset = base_offset; > + inst->mlen = 1; > + inst->base_mrf = -1; > + > + /* Copy the temporary to the destination to deal with writemasking. > + * > + * Also attempt to deal with gl_PointSize being in the .w component. 
> + */ > + if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { > + emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW))); > + } else { > + emit(MOV(dst, src_reg(temp))); > + } > +} > + > +void > +vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst, > + unsigned base_offset, > + const src_reg &indirect_offset) > +{ > + vec4_instruction *inst; > + > + /* Set up the message header to reference the proper parts of the URB */ > + dst_reg header = dst_reg(this, glsl_type::uvec4_type); > + inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header, > + brw_imm_ud(dst.writemask), indirect_offset); > + inst->force_writemask_all = true; > + > + /* Read into a temporary, ignoring writemasking. */ > + vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header)); > + read->offset = base_offset; > + read->mlen = 1; > + read->base_mrf = -1; > +} > + > +void > +vec4_tcs_visitor::emit_urb_write(const src_reg &value, > + unsigned writemask, > + unsigned base_offset, > + const src_reg &indirect_offset) > +{ > + if (writemask == 0) > + return; > + > + src_reg message(this, glsl_type::uvec4_type, 2); > + vec4_instruction *inst; > + > + inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message), > + brw_imm_ud(writemask), indirect_offset); > + inst->force_writemask_all = true; > + inst = emit(MOV(offset(dst_reg(retype(message, value.type)), 1), value)); > + inst->force_writemask_all = true; > + > + inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message); > + inst->offset = base_offset; > + inst->mlen = 2; > + inst->base_mrf = -1; > +} > + > +static unsigned > +tesslevel_outer_components(GLenum tes_primitive_mode) > +{ > + switch (tes_primitive_mode) { > + case GL_QUADS: > + return 4; > + case GL_TRIANGLES: > + return 3; > + case GL_ISOLINES: > + return 2; > + default: > + unreachable("Bogus tessellation domain"); > + } > + return 0; > +} > + > +static unsigned > +tesslevel_inner_components(GLenum tes_primitive_mode) > +{ > + switch (tes_primitive_mode) { 
> + case GL_QUADS: > + return 2; > + case GL_TRIANGLES: > + return 1; > + case GL_ISOLINES: > + return 0; > + default: > + unreachable("Bogus tessellation domain"); > + } > + return 0; > +} > + > +/** > + * Given a normal .xyzw writemask, convert it to a writemask for a vector > + * that's stored backwards, i.e. .wzyx. > + */ > +static unsigned > +writemask_for_backwards_vector(unsigned mask) > +{ > + unsigned new_mask = 0; > + > + for (int i = 0; i < 4; i++) > + new_mask |= ((mask >> i) & 1) << (3 - i); > + > + return new_mask; > +} > + > +void > +vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) > +{ > + switch (instr->intrinsic) { > + case nir_intrinsic_load_invocation_id: > + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD), > + invocation_id)); > + break; > + case nir_intrinsic_load_primitive_id: > + emit(TCS_OPCODE_GET_PRIMITIVE_ID, > + get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD)); > + break; > + case nir_intrinsic_load_patch_vertices_in: > + unreachable("XXX: gl_PatchVerticesIn not implemented yet."); > + break; > + case nir_intrinsic_load_per_vertex_input: { > + src_reg indirect_offset = get_indirect_offset(instr); > + unsigned imm_offset = instr->const_index[0]; > + > + nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]); > + src_reg vertex_index = > + vertex_const ? 
src_reg(brw_imm_ud(vertex_const->u[0])) > + : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1); > + > + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); > + dst.writemask = brw_writemask_for_size(instr->num_components); > + > + emit_input_urb_read(dst, vertex_index, imm_offset, indirect_offset); > + break; > + } > + case nir_intrinsic_load_input: > + unreachable("nir_lower_io should use load_per_vertex_input > intrinsics"); > + break; > + case nir_intrinsic_load_output: > + case nir_intrinsic_load_per_vertex_output: { > + src_reg indirect_offset = get_indirect_offset(instr); > + unsigned imm_offset = instr->const_index[0];; > + > + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); > + dst.writemask = brw_writemask_for_size(instr->num_components); > + > + if (imm_offset == 0 && indirect_offset.file == BAD_FILE) { > + dst.type = BRW_REGISTER_TYPE_F; > + > + /* This is a read of gl_TessLevelInner[], which lives in the > + * Patch URB header. The layout depends on the domain. > + */ > + switch (key->tes_primitive_mode) { > + case GL_QUADS: { > + /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */ > + dst_reg tmp(this, glsl_type::vec4_type); > + emit_output_urb_read(tmp, 0, src_reg()); > + emit(MOV(writemask(dst, WRITEMASK_XY), > + swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX))); > + break; > + } > + case GL_TRIANGLES: > + /* DWord 4; use offset 1 but normal swizzle/writemask. */ > + emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, src_reg()); > + break; > + case GL_ISOLINES: > + /* All channels are undefined. */ > + return; > + default: > + unreachable("Bogus tessellation domain"); > + } > + } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) { > + dst.type = BRW_REGISTER_TYPE_F; > + > + /* This is a read of gl_TessLevelOuter[], which lives in the > + * high 4 DWords of the Patch URB header, in reverse order. 
> + */ > + switch (key->tes_primitive_mode) { > + case GL_QUADS: > + dst.writemask = WRITEMASK_XYZW; > + break; > + case GL_TRIANGLES: > + dst.writemask = WRITEMASK_XYZ; > + break; > + case GL_ISOLINES: > + dst.writemask = WRITEMASK_XY; > + return; > + default: > + unreachable("Bogus tessellation domain"); > + } > + > + dst_reg tmp(this, glsl_type::vec4_type); > + emit_output_urb_read(tmp, 1, src_reg()); > + emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX))); > + } else { > + emit_output_urb_read(dst, imm_offset, indirect_offset); > + } > + break; > + } > + case nir_intrinsic_store_output: > + case nir_intrinsic_store_per_vertex_output: { > + src_reg value = get_nir_src(instr->src[0]); > + unsigned mask = instr->const_index[1]; > + unsigned swiz = BRW_SWIZZLE_XYZW; > + > + src_reg indirect_offset = get_indirect_offset(instr); > + unsigned imm_offset = instr->const_index[0]; > + > + if (imm_offset == 0 && indirect_offset.file == BAD_FILE) { > + value.type = BRW_REGISTER_TYPE_F; > + > + mask &= (1 << tesslevel_inner_components(key->tes_primitive_mode)) > - 1; > + > + /* This is a write to gl_TessLevelInner[], which lives in the > + * Patch URB header. The layout depends on the domain. > + */ > + switch (key->tes_primitive_mode) { > + case GL_QUADS: > + /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed). > + * We use an XXYX swizzle to reverse put .xy in the .wz > + * channels, and use a .zw writemask. > + */ > + swiz = BRW_SWIZZLE4(0, 0, 1, 0); > + mask = writemask_for_backwards_vector(mask); > + break; > + case GL_TRIANGLES: > + /* gl_TessLevelInner[].x lives at DWord 4, so we set the > + * writemask to X and bump the URB offset by 1. > + */ > + imm_offset = 1; > + break; > + case GL_ISOLINES: > + /* Skip; gl_TessLevelInner[] doesn't exist for isolines. 
*/ > + return; > + default: > + unreachable("Bogus tessellation domain"); > + } > + } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) { > + value.type = BRW_REGISTER_TYPE_F; > + > + mask &= (1 << tesslevel_outer_components(key->tes_primitive_mode)) > - 1; > + > + /* This is a write to gl_TessLevelOuter[] which lives in the > + * Patch URB Header at DWords 4-7. However, it's reversed, so > + * instead of .xyzw we have .wzyx. > + */ > + swiz = BRW_SWIZZLE_WZYX; > + mask = writemask_for_backwards_vector(mask); > + } > + > + emit_urb_write(swizzle(value, swiz), mask, > + imm_offset, indirect_offset); > + break; > + } > + > + case nir_intrinsic_barrier: { > + dst_reg header = dst_reg(this, glsl_type::uvec4_type); > + emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); > + emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); > + break; > + } > + > + default: > + vec4_visitor::nir_emit_intrinsic(instr); > + } > +} > + > + > +extern "C" const unsigned * > +brw_compile_tcs(const struct brw_compiler *compiler, > + void *log_data, > + void *mem_ctx, > + const struct brw_tcs_prog_key *key, > + struct brw_tcs_prog_data *prog_data, > + const nir_shader *src_shader, > + int shader_time_index, > + unsigned *final_assembly_size, > + char **error_str) > +{ > + const struct brw_device_info *devinfo = compiler->devinfo; > + struct brw_vue_prog_data *vue_prog_data = &prog_data->base; > + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL]; > + > + nir_shader *nir = nir_shader_clone(mem_ctx, src_shader); > + nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar); > + nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar); > + > + prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2); > + > + brw_compute_tess_vue_map(&vue_prog_data->vue_map, > + nir->info.outputs_written, > + nir->info.patch_outputs_written); > + > + /* Compute URB entry size. The maximum allowed URB entry size is 32k. 
> + * That divides up as follows: > + * > + * 32 bytes for the patch header (tessellation factors) > + * 480 bytes for per-patch varyings (a varying component is 4 bytes and > + * gl_MaxTessPatchComponents = 120) > + * 16384 bytes for per-vertex varyings (a varying component is 4 bytes, > + * gl_MaxPatchVertices = 32 and > + * gl_MaxTessControlOutputComponents = 128) > + * > + * 15808 bytes left for varying packing overhead > + */ > + const int num_per_patch_slots = > vue_prog_data->vue_map.num_per_patch_slots; > + const int num_per_vertex_slots = > vue_prog_data->vue_map.num_per_vertex_slots; > + unsigned output_size_bytes = 0; > + /* Note that the patch header is counted in num_per_patch_slots. */ > + output_size_bytes += num_per_patch_slots * 16; > + output_size_bytes += nir->info.tcs.vertices_out * num_per_vertex_slots * > 16; > + > + assert(output_size_bytes >= 1); > + if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES) > + return false; > + > + /* URB entry sizes are stored as a multiple of 64 bytes. */ > + vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64; > + > + struct brw_vue_map input_vue_map; > + brw_compute_vue_map(devinfo, &input_vue_map, > + nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID, > + true); > + > + /* HS does not use the usual payload pushing from URB to GRFs, > + * because we don't have enough registers for a full-size payload, and > + * the hardware is broken on Haswell anyway. 
> + */ > + vue_prog_data->urb_read_length = 0; > + > + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) { > + fprintf(stderr, "TCS Input "); > + brw_print_vue_map(stderr, &input_vue_map); > + fprintf(stderr, "TCS Output "); > + brw_print_vue_map(stderr, &vue_prog_data->vue_map); > + } > + > + vec4_tcs_visitor v(compiler, log_data, key, prog_data, > + nir, mem_ctx, shader_time_index); > + if (!v.run()) { > + if (error_str) > + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); > + return NULL; > + } > + > + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) > + v.dump_instructions(); > + > + return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir, > + &prog_data->base, v.cfg, > + final_assembly_size); > +} > + > + > +} /* namespace brw */ > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.h > b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h > new file mode 100644 > index 0000000..2bf4885 > --- /dev/null > +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h > @@ -0,0 +1,84 @@ > +/* > + * Copyright © 2013 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > + * DEALINGS IN THE SOFTWARE. > + */ > + > +/** > + * \file brw_vec4_tcs.h > + * > + * The vec4-mode tessellation control shader compiler backend. > + */ > + > +#ifndef BRW_VEC4_TCS_H > +#define BRW_VEC4_TCS_H > + > +#include "brw_compiler.h" > +#include "brw_vec4.h" > + > +#ifdef __cplusplus > +namespace brw { > + > +class vec4_tcs_visitor : public vec4_visitor > +{ > +public: > + vec4_tcs_visitor(const struct brw_compiler *compiler, > + void *log_data, > + const struct brw_tcs_prog_key *key, > + struct brw_tcs_prog_data *prog_data, > + const nir_shader *nir, > + void *mem_ctx, > + int shader_time_index); > + > +protected: > + virtual dst_reg *make_reg_for_system_value(int location, > + const glsl_type *type); > + virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr); > + virtual void setup_payload(); > + virtual void emit_prolog(); > + virtual void emit_thread_end(); > + > + virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); > + > + void emit_input_urb_read(const dst_reg &dst, > + const src_reg &vertex_index, > + unsigned base_offset, > + const src_reg &indirect_offset); > + void emit_output_urb_read(const dst_reg &dst, > + unsigned base_offset, > + const src_reg &indirect_offset); > + > + void emit_urb_write(const src_reg &value, unsigned writemask, > + unsigned base_offset, const src_reg &indirect_offset); > + > + /* we do not use the normal end-of-shader URB write mechanism -- but > every vec4 stage > + * must provide implementations of these: > + */ > + virtual void emit_urb_write_header(int mrf) {} > + virtual vec4_instruction *emit_urb_write_opcode(bool complete) { return > NULL; } > + > + const struct brw_tcs_prog_key *key; > + src_reg invocation_id; > +}; > + > +} /* namespace 
brw */ > +#endif /* __cplusplus */ > + > +#endif /* BRW_VEC4_TCS_H */ > -- > 2.6.3 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev