--- src/gallium/drivers/nvfx/nvfx_fragprog.c | 146 ++++++++++++++++++---------- src/gallium/drivers/nvfx/nvfx_shader.h | 1 + src/gallium/drivers/nvfx/nvfx_state.c | 4 + src/gallium/drivers/nvfx/nvfx_state.h | 15 +++ src/gallium/drivers/nvfx/nvfx_state_emit.c | 2 +- src/gallium/drivers/nvfx/nvfx_vertprog.c | 40 ++++++-- 6 files changed, 143 insertions(+), 65 deletions(-)
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c index 5fa825a..b4b63e2 100644 --- a/src/gallium/drivers/nvfx/nvfx_fragprog.c +++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c @@ -1,6 +1,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_state.h" +#include "util/u_semantics.h" #include "util/u_inlines.h" #include "pipe/p_shader_tokens.h" @@ -16,8 +17,6 @@ struct nvfx_fpc { struct nvfx_fragment_program *fp; - uint attrib_map[PIPE_MAX_SHADER_INPUTS]; - unsigned r_temps; unsigned r_temps_discard; struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; @@ -36,6 +35,8 @@ struct nvfx_fpc { struct nvfx_sreg imm[MAX_IMM]; unsigned nr_imm; + + unsigned char sem_table[256]; /* semantic idx for each input semantic */ }; static INLINE struct nvfx_sreg @@ -111,6 +112,11 @@ emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src) sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT); sr |= (src.index << NVFX_FP_REG_SRC_SHIFT); break; + case NVFXSR_RELOCATED: + sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT); + printf("adding relocation at %x for %x\n", fpc->inst_offset, src.index); + util_dynarray_append(&fpc->fp->sem_relocs[src.index], unsigned, fpc->inst_offset); + break; case NVFXSR_CONST: if (!fpc->have_const) { grow_insns(fpc, 4); @@ -241,8 +247,28 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc) switch (fsrc->Register.File) { case TGSI_FILE_INPUT: - src = nvfx_sr(NVFXSR_INPUT, - fpc->attrib_map[fsrc->Register.Index]); + if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_POSITION) { + assert(fpc->fp->info.input_semantic_index[fsrc->Register.Index] == 0); + src = nvfx_sr(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_POSITION); + } else if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_COLOR) { + if(fpc->fp->info.input_semantic_index[fsrc->Register.Index] == 0) + src = nvfx_sr(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL0); + else if(fpc->fp->info.input_semantic_index[fsrc->Register.Index] == 1) + src = nvfx_sr(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL1); + else + assert(0); + } else if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG) { + assert(fpc->fp->info.input_semantic_index[fsrc->Register.Index] == 0); + src = nvfx_sr(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_FOGC); + } else if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FACE) { + /* TODO: check this has the correct values */ + /* XXX: what do we do for nv30 here (assuming it lacks facing)?! */ + assert(fpc->fp->info.input_semantic_index[fsrc->Register.Index] == 0); + src = nvfx_sr(NVFXSR_INPUT, NV40_FP_OP_INPUT_SRC_FACING); + } else { + assert(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_GENERIC); + src = nvfx_sr(NVFXSR_RELOCATED, fpc->sem_table[fpc->fp->info.input_semantic_index[fsrc->Register.Index]]); + } break; case TGSI_FILE_CONSTANT: src = constant(fpc, fsrc->Register.Index, NULL); @@ -611,48 +637,6 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, } static boolean -nvfx_fragprog_parse_decl_attrib(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, - const struct tgsi_full_declaration *fdec) -{ - int hw; - - switch (fdec->Semantic.Name) { - case TGSI_SEMANTIC_POSITION: - hw = NVFX_FP_OP_INPUT_SRC_POSITION; - break; - case TGSI_SEMANTIC_COLOR: - if (fdec->Semantic.Index == 0) { - hw = NVFX_FP_OP_INPUT_SRC_COL0; - } else - if (fdec->Semantic.Index == 1) { - hw = NVFX_FP_OP_INPUT_SRC_COL1; - } else { - NOUVEAU_ERR("bad colour semantic index\n"); - return FALSE; - } - break; - case TGSI_SEMANTIC_FOG: - hw = NVFX_FP_OP_INPUT_SRC_FOGC; - break; - case TGSI_SEMANTIC_GENERIC: - if (fdec->Semantic.Index <= 7) { - hw = NVFX_FP_OP_INPUT_SRC_TC(fdec->Semantic. - Index); - } else { - NOUVEAU_ERR("bad generic semantic index\n"); - return FALSE; - } - break; - default: - NOUVEAU_ERR("bad input semantic\n"); - return FALSE; - } - - fpc->attrib_map[fdec->Range.First] = hw; - return TRUE; -} - -static boolean nvfx_fragprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, const struct tgsi_full_declaration *fdec) { @@ -691,6 +675,15 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc) { struct tgsi_parse_context p; int high_temp = -1, i; + struct util_semantic_set set; + + fpc->fp->num_semantics = util_semantic_set_from_program_file(&set, fpc->fp->pipe.tokens, TGSI_FILE_INPUT); + if(fpc->fp->num_semantics > 8) + return FALSE; + util_semantic_layout_from_set(fpc->fp->semantics, &set, 0, 8); + util_semantic_table_from_layout(fpc->sem_table, fpc->fp->semantics, 0, 8); + + memset(fpc->fp->cur_slots, 0xff, sizeof(fpc->fp->cur_slots)); tgsi_parse_init(&p, fpc->fp->pipe.tokens); while (!tgsi_parse_end_of_tokens(&p)) { @@ -703,10 +696,6 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc) const struct tgsi_full_declaration *fdec; fdec = &p.FullToken.FullDeclaration; switch (fdec->Declaration.File) { - case TGSI_FILE_INPUT: - if (!nvfx_fragprog_parse_decl_attrib(nvfx, fpc, fdec)) - goto out_err; - break; case TGSI_FILE_OUTPUT: if (!nvfx_fragprog_parse_decl_output(nvfx, fpc, fdec)) goto out_err; @@ -878,6 +867,31 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) if (nvfx->dirty & NVFX_NEW_FRAGCONST) update = TRUE; + struct nvfx_vertex_program* vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog; + if (fp->last_vp_id != vp->id) { + char* vp_sem_table = vp->sem_table; + unsigned char* fp_semantics = fp->semantics; + unsigned diff = 0; + fp->last_vp_id = nvfx->vertprog->id; + unsigned char* cur_slots = fp->cur_slots; + for(unsigned i = 0; i < fp->num_semantics; ++i) { + unsigned char slot_mask = vp_sem_table[fp_semantics[i]]; + diff |= (slot_mask >> 4) & (slot_mask ^ cur_slots[i]); + } + + if(diff) + { + fp->cur_slots_progs_left = fp->progs; + for(unsigned i = 0; i < fp->num_semantics; ++i) { + /* if 0xff, then this will write to the dummy value at fp->last_layout_mask[0] */ + fp->cur_slots[i] = vp_sem_table[fp_semantics[i]] & 0xf; + printf("fp: GENERIC[%i] from fpreg %i\n", fp_semantics[i], fp->cur_slots[i]); + } + + update = TRUE; + } + } + if(update) { ++fp->bo_prog_idx; if(fp->bo_prog_idx >= fp->progs_per_bo) @@ -888,7 +902,9 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } else { - struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + fp->prog_size * fp->progs_per_bo, 16); + struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + (fp->prog_size + 8) * fp->progs_per_bo, 16); + fpbo->slots = &fpbo->insn[(fp->prog_size) * fp->progs_per_bo]; + memset(fpbo->slots, 0, 8 * fp->progs_per_bo); if(fp->fpbo) { fpbo->next = fp->fpbo->next; @@ -898,6 +914,8 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) fpbo->next = fpbo; fp->fpbo = fpbo; fpbo->bo = 0; + fp->progs += fp->progs_per_bo; + fp->cur_slots_progs_left += fp->progs_per_bo; nouveau_bo_new(nvfx->screen->base.device, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 64, fp->prog_size * fp->progs_per_bo, &fpbo->bo); nouveau_bo_map(fpbo->bo, NOUVEAU_BO_NOSYNC); @@ -915,6 +933,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } int offset = fp->bo_prog_idx * fp->prog_size; + uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset); if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) { struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT]; @@ -922,7 +941,6 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) struct pipe_transfer* transfer; // TODO: does this check make any sense, or should we do this unconditionally? uint32_t* map = pipe_buffer_map(&nvfx->pipe, constbuf, PIPE_TRANSFER_READ, &transfer); - uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset); uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset); for (i = 0; i < fp->nr_consts; ++i) { unsigned off = fp->consts[i].offset; @@ -936,6 +954,25 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer); } + + if(fp->cur_slots_progs_left) { + unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8]; + for(unsigned i = 0; i < fp->num_semantics; ++i) { + unsigned value = fp->cur_slots[i];; + if(value != fpbo_slots[i]) { + unsigned* p = (unsigned*)fp->sem_relocs[i].data; + unsigned* pend = (unsigned*)((char*)fp->sem_relocs[i].data + fp->sem_relocs[i].size); + for(; p != pend; ++p) { + unsigned off = *p; + unsigned dw = fp->insn[off]; + dw = (dw & ~NVFX_FP_OP_INPUT_SRC_MASK) | (value << NVFX_FP_OP_INPUT_SRC_SHIFT); + nvfx_fp_memcpy(&fpmap[*p], &dw, sizeof(dw)); + } + fpbo_slots[i] = value; + } + } + --fp->cur_slots_progs_left; + } } if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) { @@ -977,6 +1014,7 @@ void nvfx_fragprog_destroy(struct nvfx_context *nvfx, struct nvfx_fragment_program *fp) { + unsigned i; struct nvfx_fragment_program_bo* fpbo = fp->fpbo; if(fpbo) { @@ -991,7 +1029,9 @@ nvfx_fragprog_destroy(struct nvfx_context *nvfx, while(fpbo != fp->fpbo); } + for(i = 0; i < 8; ++i) + util_dynarray_fini(&fp->sem_relocs[i]); + if (fp->insn_len) FREE(fp->insn); } - diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h index 50830b3..88cf91b 100644 --- a/src/gallium/drivers/nvfx/nvfx_shader.h +++ b/src/gallium/drivers/nvfx/nvfx_shader.h @@ -323,6 +323,7 @@ #define NVFXSR_INPUT 2 #define NVFXSR_TEMP 3 #define NVFXSR_CONST 4 +#define NVFXSR_RELOCATED 5 #define NVFX_COND_FL 0 #define NVFX_COND_LT 1 diff --git a/src/gallium/drivers/nvfx/nvfx_state.c b/src/gallium/drivers/nvfx/nvfx_state.c index 315de49..3f0c8e6 100644 --- a/src/gallium/drivers/nvfx/nvfx_state.c +++ b/src/gallium/drivers/nvfx/nvfx_state.c @@ -411,9 +411,13 @@ nvfx_vp_state_create(struct pipe_context *pipe, struct nvfx_context *nvfx = nvfx_context(pipe); struct nvfx_vertex_program *vp; + // TODO: use a 64-bit atomic here! + static unsigned long long id = 0; + vp = CALLOC(1, sizeof(struct nvfx_vertex_program)); vp->pipe.tokens = tgsi_dup_tokens(cso->tokens); vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe); + vp->id = ++id; return (void *)vp; } diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h index 9ceb257..3cd7981 100644 --- a/src/gallium/drivers/nvfx/nvfx_state.h +++ b/src/gallium/drivers/nvfx/nvfx_state.h @@ -4,6 +4,8 @@ #include "pipe/p_state.h" #include "tgsi/tgsi_scan.h" #include "nouveau/nouveau_statebuf.h" +#include "util/u_dynarray.h" +#include "util/u_linkage.h" struct nvfx_vertex_program_exec { uint32_t data[4]; @@ -18,6 +20,7 @@ struct nvfx_vertex_program_data { struct nvfx_vertex_program { struct pipe_shader_state pipe; + unsigned long long id; struct draw_vertex_shader *draw; @@ -30,6 +33,8 @@ struct nvfx_vertex_program { struct nvfx_vertex_program_data *consts; unsigned nr_consts; + char sem_table[256]; + struct nouveau_resource *exec; unsigned exec_start; struct nouveau_resource *data; @@ -49,6 +54,7 @@ struct nvfx_fragment_program_data { struct nvfx_fragment_program_bo { struct nvfx_fragment_program_bo* next; struct nouveau_bo* bo; + unsigned char* slots; char insn[] __attribute__((aligned(16))); }; @@ -65,11 +71,20 @@ struct nvfx_fragment_program { struct nvfx_fragment_program_data *consts; unsigned nr_consts; + unsigned num_semantics; /* how many input semantics? */ + unsigned char semantics[8]; /* semantics */ + unsigned char cur_slots[8]; /* current assignment of slots for each used semantic */ + unsigned cur_slots_progs_left; + unsigned long long last_vp_id; + struct util_dynarray sem_relocs[8]; /* semantic relocation offset */ + uint32_t fp_control; unsigned bo_prog_idx; unsigned prog_size; unsigned progs_per_bo; + unsigned progs; + struct nvfx_fragment_program_bo* fpbo; }; diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c index 4137849..1398597 100644 --- a/src/gallium/drivers/nvfx/nvfx_state_emit.c +++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c @@ -47,7 +47,7 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) if(dirty & NVFX_NEW_STIPPLE) nvfx_state_stipple_validate(nvfx); - if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST)) + if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST | NVFX_NEW_VERTPROG)) nvfx_fragprog_validate(nvfx); if(dirty & NVFX_NEW_SAMPLER) diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c index b405fd9..4241e73 100644 --- a/src/gallium/drivers/nvfx/nvfx_vertprog.c +++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c @@ -1,7 +1,8 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_state.h" -#include "util/u_inlines.h" +#include "util/u_semantics.h" +#include "util/u_linkage.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" @@ -60,7 +61,7 @@ temp(struct nvfx_vpc *vpc) return nvfx_sr(NVFXSR_TEMP, idx); } -static INLINE void +static inline void release_temps(struct nvfx_vpc *vpc) { vpc->r_temps &= ~vpc->r_temps_discard; @@ -332,7 +333,7 @@ nvfx_vp_arith(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, int slot, int op, emit_src(nvfx, vpc, hw, 2, s2); } -static INLINE struct nvfx_sreg +static inline struct nvfx_sreg tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) { struct nvfx_sreg src; @@ -378,14 +379,14 @@ tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) { dst = vpc->r_address[fdst->Register.Index]; break; default: - NOUVEAU_ERR("bad dst file\n"); + NOUVEAU_ERR("bad dst file %i\n", fdst->Register.File); break; } return dst; } -static INLINE int +static inline int tgsi_mask(uint tgsi) { int mask = 0; @@ -643,12 +644,8 @@ nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, hw = NVFX_VP(INST_DEST_PSZ); break; case TGSI_SEMANTIC_GENERIC: - if (fdec->Semantic.Index <= 7) { - hw = NVFX_VP(INST_DEST_TC(fdec->Semantic.Index)); - } else { - NOUVEAU_ERR("bad generic semantic index\n"); - return FALSE; - } + hw = (vpc->vp->sem_table[fdec->Semantic.Index] & 0xf) + + NVFX_VP(INST_DEST_TC(0)) - NVFX_FP_OP_INPUT_SRC_TC(0); break; case TGSI_SEMANTIC_EDGEFLAG: /* not really an error just a fallback */ @@ -668,6 +665,27 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc) { struct tgsi_parse_context p; int high_temp = -1, high_addr = -1, nr_imm = 0, i; + struct util_semantic_set set; + unsigned char sem_layout[8]; + unsigned sem_layout_size; + unsigned num_outputs; + + num_outputs = util_semantic_set_from_program_file(&set, vpc->vp->pipe.tokens, TGSI_FILE_OUTPUT); + + if(num_outputs > 8) { + NOUVEAU_ERR("too many vertex program outputs: %i\n", num_outputs); + return FALSE; + } + util_semantic_layout_from_set(sem_layout, &set, 8, 8); + + /* hope 0xf is (0, 0, 0, 1) initialized; otherwise, we are _probably_ not required to do this */ + memset(vpc->vp->sem_table, 0x0f, sizeof(vpc->vp->sem_table)); + for(int i = 0; i < 8; ++i) { + if(sem_layout[i] == 0xff) + continue; + printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i); + vpc->vp->sem_table[sem_layout[i]] = 0xf0 | (NVFX_FP_OP_INPUT_SRC_TC(0) + i); + } tgsi_parse_init(&p, vpc->vp->pipe.tokens); while (!tgsi_parse_end_of_tokens(&p)) { -- 1.7.0.1.147.g6d84b ------------------------------------------------------------------------------ Download Intel® Parallel Studio Eval Try the new software tools for yourself. Speed compiling, find bugs proactively, and fine-tune applications for parallel performance. See why Intel Parallel Studio got high marks during beta. http://p.sf.net/sfu/intel-sw-dev _______________________________________________ Mesa3d-dev mailing list Mesa3d-dev@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/mesa3d-dev