Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> --- Untested beyond compiling a few shaders to see if they look like they might work. nvdisasm agrees with envydis's decoding of these things.
Will definitely get ahold of a G200 to run tests on before pushing this. .../drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 94 ++++++++++++++++++--- .../nouveau/codegen/nv50_ir_lowering_nv50.cpp | 97 +++++++++++++++++++++- .../nouveau/codegen/nv50_ir_target_nv50.cpp | 2 +- src/gallium/drivers/nouveau/nv50/nv50_screen.c | 4 + 4 files changed, 185 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index b1e7409..7c6f7da 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -96,9 +96,12 @@ private: void emitUADD(const Instruction *); void emitAADD(const Instruction *); void emitFADD(const Instruction *); + void emitDADD(const Instruction *); void emitIMUL(const Instruction *); void emitFMUL(const Instruction *); + void emitDMUL(const Instruction *); void emitFMAD(const Instruction *); + void emitDMAD(const Instruction *); void emitIMAD(const Instruction *); void emitISAD(const Instruction *); @@ -923,11 +926,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i) assert(0); break; } - code[1] |= i->src(0).mod.abs() << 20; - code[1] |= i->src(0).mod.neg() << 26; - code[1] |= i->src(1).mod.abs() << 19; - code[1] |= i->src(1).mod.neg() << 27; } + + code[1] |= i->src(0).mod.abs() << 20; + code[1] |= i->src(0).mod.neg() << 26; + code[1] |= i->src(1).mod.abs() << 19; + code[1] |= i->src(1).mod.neg() << 27; + emitForm_MAD(i); } @@ -963,6 +968,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i) } void +CodeEmitterNV50::emitDMAD(const Instruction *i) +{ + const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg(); + const int neg_add = i->src(2).mod.neg(); + + assert(i->encSize == 8); + assert(!i->saturate); + + code[1] = 0x40000000; + code[0] = 0xe0000000; + + code[1] |= neg_mul << 26; + code[1] |= neg_add << 27; + + roundMode_MAD(i); + + emitForm_MAD(i); +} + +void CodeEmitterNV50::emitFADD(const Instruction *i) { const int neg0 = i->src(0).mod.neg(); @@ -997,6 +1022,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i) } void +CodeEmitterNV50::emitDADD(const Instruction *i) +{ + const int neg0 = i->src(0).mod.neg(); + const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0); + + assert(!(i->src(0).mod | i->src(1).mod).abs()); + assert(!i->saturate); + assert(i->encSize == 8); + + code[1] = 0x60000000; + code[0] = 0xe0000000; + + emitForm_ADD(i); + + code[1] |= neg0 << 26; + code[1] |= neg1 << 27; +} + +void CodeEmitterNV50::emitUADD(const Instruction *i) { const int neg0 = i->src(0).mod.neg(); @@ -1090,6 +1134,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i) } void +CodeEmitterNV50::emitDMUL(const Instruction *i) +{ + const int neg = (i->src(0).mod ^ i->src(1).mod).neg(); + + assert(!i->saturate); + assert(i->encSize == 8); + + code[1] = 0x80000000; + code[0] = 0xe0000000; + + if (neg) + code[1] |= 0x08000000; + + roundMode_CVT(i->rnd); + + emitForm_MAD(i); +} + +void CodeEmitterNV50::emitIMAD(const Instruction *i) { code[0] = 0x60000000; @@ -1150,9 +1213,11 @@ CodeEmitterNV50::emitSET(const Instruction *i) code[0] = 0x30000000; code[1] = 0x60000000; - emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); - switch (i->sType) { + case TYPE_F64: + code[0] = 0xe0000000; + code[1] = 0xe0000000; + break; case TYPE_F32: code[0] |= 0x80000000; break; case TYPE_S32: code[1] |= 0x0c000000; break; case TYPE_U32: code[1] |= 0x04000000; break; @@ -1162,6 +1227,9 @@ CodeEmitterNV50::emitSET(const Instruction *i) assert(0); break; } + + emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); + if (i->src(0).mod.neg()) code[1] |= 0x04000000; if (i->src(1).mod.neg()) code[1] |= 0x08000000; if (i->src(0).mod.abs()) code[1] |= 0x00100000; @@ -1725,7 +1793,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) break; case OP_ADD: case OP_SUB: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDADD(insn); + else if (isFloatType(insn->dType)) emitFADD(insn); else if (insn->getDef(0)->reg.file == FILE_ADDRESS) emitAADD(insn); @@ -1733,14 +1803,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) emitUADD(insn); break; case OP_MUL: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMUL(insn); + else if (isFloatType(insn->dType)) emitFMUL(insn); else emitIMUL(insn); break; case OP_MAD: case OP_FMA: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMAD(insn); + else if (isFloatType(insn->dType)) emitFMAD(insn); else emitIMAD(insn); @@ -1912,7 +1986,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const { const Target::OpInfo &info = targ->getOpInfo(i); - if (info.minEncSize > 4) + if (info.minEncSize > 4 || i->dType == TYPE_F64) return 8; // check constraints on dst and src operands diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 1ad0860..d5dadc2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -314,6 +314,7 @@ private: void handleDIV(Instruction *); void handleMOD(Instruction *); void handleMUL(Instruction *); + void handleDRCPRSQ(Instruction *); void handleAddrDef(Instruction *); inline bool isARL(const Instruction *) const; @@ -552,6 +553,95 @@ NV50LegalizeSSA::handleMOD(Instruction *mod) mod->setSrc(1, m); } +void +NV50LegalizeSSA::handleDRCPRSQ(Instruction *i) +{ + /* We need to replace this instruction with a sequence that computes the + * appropriate function. As a first guess, we use the "quake" style + * approximation for RSQ: + * + * 0x5fe6eb50c7b537a9 - num >> 1 + * + * For RCP, we will then square it. + */ + Value *abs, *guess, *parts[2], *input[2], *shr[4], *pred; + + bld.setPosition(i, false); + + abs = bld.mkOp1v(OP_ABS, TYPE_F64, bld.getSSA(8), i->getSrc(0)); + + parts[0] = bld.loadImm(NULL, 0xc7b537a9); + parts[1] = bld.loadImm(NULL, 0x5fe6eb50); + guess = bld.mkOp2v(OP_MERGE, TYPE_F64, bld.getSSA(8), parts[0], parts[1]); + + bld.mkSplit(input, 4, abs); + shr[0] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(4), input[0], bld.mkImm(1)); + shr[1] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(4), input[1], bld.mkImm(1)); + + // If the bottom bit of the high word was set, set the high bit of the + // bottom word. + pred = bld.getSSA(1, FILE_FLAGS); + bld.mkOp2(OP_AND, TYPE_U32, NULL, input[1], bld.loadImm(NULL, 1)) + ->setFlagsDef(0, pred); + shr[2] = bld.getSSA(4); shr[3] = bld.getSSA(4); + bld.mkOp2(OP_OR, TYPE_U32, shr[2], shr[0], bld.loadImm(NULL, 0x80000000)) + ->setPredicate(CC_S, pred); + bld.mkMov(shr[3], shr[0]) + ->setPredicate(CC_NS, pred); + shr[0] = bld.mkOp2v(OP_UNION, TYPE_U32, bld.getSSA(4), shr[2], shr[3]); + + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), guess, + bld.mkOp2v(OP_MERGE, TYPE_F64, bld.getSSA(8), shr[0], shr[1])); + + if (i->op == OP_RCP) { + Value *two = bld.getSSA(8), *neg = bld.getSSA(8), *copy = bld.getSSA(8); + + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); + + /* Square the guess first, since it was for RSQ */ + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess); + + // RCP: x_{n+1} = 2 * x_n - input * x_n^2 + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), abs, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), abs, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); + + // Restore the sign on the output + bld.mkSplit(input, 4, i->getSrc(0)); + bld.mkOp2(OP_AND, TYPE_U32, NULL, input[1], bld.loadImm(NULL, 0x80000000)) + ->setFlagsDef(0, (pred = bld.getSSA(1, FILE_FLAGS))); + bld.mkOp1(OP_NEG, TYPE_F64, neg, guess) + ->setPredicate(CC_S, pred); + bld.mkMov(copy, guess) + ->setPredicate(CC_NS, pred); + guess = bld.mkOp2v(OP_UNION, TYPE_U64, bld.getSSA(8), neg, copy); + } else { + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8); + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f)); + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f)); + + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, abs); + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), + three_half)); + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), + three_half)); + } + + i->op = OP_MOV; + i->setSrc(0, guess); +} + + bool NV50LegalizeSSA::visit(BasicBlock *bb) { @@ -578,6 +668,11 @@ NV50LegalizeSSA::visit(BasicBlock *bb) case OP_MUL: handleMUL(insn); break; + case OP_RCP: + case OP_RSQ: + if (insn->dType == TYPE_F64) + handleDRCPRSQ(insn); + break; default: break; } @@ -1162,7 +1257,7 @@ NV50LoweringPreSSA::handleDIV(Instruction *i) bool NV50LoweringPreSSA::handleSQRT(Instruction *i) { - Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, + Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType, bld.getSSA(), i->getSrc(0)); i->op = OP_MUL; i->setSrc(1, rsq->getDef(0)); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index 178a167..f3d8733 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -388,7 +388,7 @@ TargetNV50::isAccessSupported(DataFile file, DataType ty) const bool TargetNV50::isOpSupported(operation op, DataType ty) const { - if (ty == TYPE_F64 && chipset < 0xa0) + if (ty == TYPE_F64 && chipset != 0xa0) return false; switch (op) { diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index ed07ba4..4532957 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -237,6 +237,8 @@ static int nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, enum pipe_shader_cap param) { + struct nouveau_device *dev = nouveau_screen(pscreen)->device; + switch (shader) { case PIPE_SHADER_VERTEX: case PIPE_SHADER_GEOMETRY: @@ -287,7 +289,9 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: return MIN2(32, PIPE_MAX_SAMPLERS); case PIPE_SHADER_CAP_DOUBLES: + return dev->chipset == 0xa0; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: + return dev->chipset == 0xa0; case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: return 0; default: -- 2.0.5 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev