[Beignet] [PATCH 6/8] Backend: Implement FDIV64 on BDW.
From: Junyan He <junyan...@linux.intel.com> According to the document, we use a set of instructions to implement double type division. Signed-off-by: Junyan He <junyan...@linux.intel.com> --- backend/src/backend/gen8_context.cpp | 68 backend/src/backend/gen8_context.hpp | 2 ++ 2 files changed, 70 insertions(+) diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp index b497ee5..f465832 100644 --- a/backend/src/backend/gen8_context.cpp +++ b/backend/src/backend/gen8_context.cpp @@ -924,6 +924,74 @@ namespace gbe this->unpackLongVec(src, dst, p->curr.execWidth); } + void Gen8Context::emitF64DIVInstruction(const SelectionInstruction &insn) { +/* Macro for Double Precision IEEE Compliant fdiv + + Set Rounding Mode in CR to RNE + GRF are initialized: r0 = 0, r6 = a, r7 = b, r1 = 1 + The default data type for the macro is :df + + math.eo.f0.0 (4) r8.acc2 r6.noacc r7.noacc 0xE + (-f0.0) if + madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2 // Step(1), q0=a*y0 + madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2 // Step(2), e0=(1-b*y0) + madm (4) r11.acc5 r6.noacc -r7.noacc r9.acc3 // Step(3), r0=a-b*q0 + madm (4) r12.acc6 r8.acc2 r10.acc4 r8.acc2 // Step(4), y1=y0+e0*y0 + madm (4) r13.acc7 r1.noacc -r7.noacc r12.acc6// Step(5), e1=(1-b*y1) + madm (4) r8.acc8 r8.acc2 r10.acc4 r12.acc6 // Step(6), y2=y0+e0*y1 + madm (4) r9.acc9 r9.acc3 r11.acc5 r12.acc6 // Step(7), q1=q0+r0*y1 + madm (4) r12.acc2 r12.acc6 r8.acc8 r13.acc7 // Step(8), y3=y1+e1*y2 + madm (4) r11.acc3 r6.noacc -r7.noacc r9.acc9 // Step(9), r1=a-b*q1 + + Change Rounding Mode in CR if required + Implicit Accumulator for destination is NULL + + madm (4) r8.noacc r9.acc9 r11.acc3 r12.acc2 // Step(10), q=q1+r1*y3 + endif */ +GenRegister r6 = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_DF); +GenRegister r7 = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_DF); +GenRegister r8 = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_DF); +const GenRegister r0 = GenRegister::retype(ra->genReg(insn.dst(1)), GEN_TYPE_DF); +const GenRegister r1 
= GenRegister::retype(ra->genReg(insn.dst(2)), GEN_TYPE_DF); +const GenRegister r9 = GenRegister::retype(ra->genReg(insn.dst(3)), GEN_TYPE_DF); +const GenRegister r10 = GenRegister::retype(ra->genReg(insn.dst(4)), GEN_TYPE_DF); +const GenRegister r11 = GenRegister::retype(ra->genReg(insn.dst(5)), GEN_TYPE_DF); +const GenRegister r12 = GenRegister::retype(ra->genReg(insn.dst(6)), GEN_TYPE_DF); +const GenRegister r13 = GenRegister::retype(ra->genReg(insn.dst(7)), GEN_TYPE_DF); +Gen8Encoder *p8 = reinterpret_cast<Gen8Encoder *>(p); +p->push(); { + p->curr.execWidth = 4; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask= 1; + p->MOV(r1, GenRegister::immdf(1.0d)); + p->MOV(r0, GenRegister::immdf(0.0d)); + + for (int i = 0; i < (simdWidth == 16 ? 4 : 2); i++) { +p->curr.predicate = GEN_PREDICATE_NONE; +p8->MATH_WITH_ACC(r8, GEN8_MATH_FUNCTION_INVM, r6, r7, GEN8_INSN_ACC2, GEN8_INSN_NOACC, GEN8_INSN_NOACC); +p->curr.useFlag(insn.state.flag, insn.state.subFlag); +p->curr.predicate = GEN_PREDICATE_NORMAL; +p->curr.inversePredicate = 1; +p->curr.noMask= 0; +p8->MADM(r9, r0, r6, r8, GEN8_INSN_ACC3, GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC2); +p8->MADM(r10, r1, GenRegister::negate(r7), r8, GEN8_INSN_ACC4, GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC2); +p8->MADM(r11, r6, GenRegister::negate(r7), r9, GEN8_INSN_ACC5, GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC3); +p8->MADM(r12, r8, r10, r8, GEN8_INSN_ACC6, GEN8_INSN_ACC2, GEN8_INSN_ACC4, GEN8_INSN_ACC2); +p8->MADM(r13, r1, GenRegister::negate(r7), r12, GEN8_INSN_ACC7, GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC6); +p8->MADM(r8, r8, r10, r12, GEN8_INSN_ACC8, GEN8_INSN_ACC2, GEN8_INSN_ACC4, GEN8_INSN_ACC6); +p8->MADM(r9, r9, r11, r12, GEN8_INSN_ACC9, GEN8_INSN_ACC3, GEN8_INSN_ACC5, GEN8_INSN_ACC6); +p8->MADM(r12, r12, r8, r13, GEN8_INSN_ACC2, GEN8_INSN_ACC6, GEN8_INSN_ACC8, GEN8_INSN_ACC7); +p8->MADM(r11, r6, GenRegister::negate(r7), r9, GEN8_INSN_ACC3, GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC9); + +p8->MADM(r8, 
r9, r11, r12, GEN8_INSN_NOACC, GEN8_INSN_ACC9, GEN8_INSN_ACC3, GEN8_INSN_ACC2); + +r6 = GenRegister::offset(r6, 1); +r7 = GenRegister::offset(r7, 1); +r8 = GenRegister::offset(r8, 1); + } +} p->pop(); + } + void Gen8Context::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) { if (sz == 0) sz = 16; diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp index 84508e9..386f7f3 100644 ---
Re: [Beignet] [PATCH 6/8] Backend: Implement FDIV64 on BDW.
On Tue, Sep 15, 2015 at 06:00:57AM -0700, Matt Turner wrote: > Date: Tue, 15 Sep 2015 06:00:57 -0700 > From: Matt Turner <matts...@gmail.com> > To: "junyan.he" <junyan...@inbox.com> > Cc: "beignet@lists.freedesktop.org" <beignet@lists.freedesktop.org> > Subject: Re: [Beignet] [PATCH 6/8] Backend: Implement FDIV64 on BDW. > > On Tue, Sep 15, 2015 at 4:15 AM, <junyan...@inbox.com> wrote: > > From: Junyan He <junyan...@linux.intel.com> > > > > According to the document, we use a set of instructions > > to implement double type division. > > > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > > --- > > backend/src/backend/gen8_context.cpp | 68 > > > > backend/src/backend/gen8_context.hpp | 2 ++ > > 2 files changed, 70 insertions(+) > > > > diff --git a/backend/src/backend/gen8_context.cpp > > b/backend/src/backend/gen8_context.cpp > > index b497ee5..f465832 100644 > > --- a/backend/src/backend/gen8_context.cpp > > +++ b/backend/src/backend/gen8_context.cpp > > @@ -924,6 +924,74 @@ namespace gbe > > this->unpackLongVec(src, dst, p->curr.execWidth); > >} > > > > + void Gen8Context::emitF64DIVInstruction(const SelectionInstruction > > &insn) { > > +/* Macro for Double Precision IEEE Compliant fdiv > > + > > + Set Rounding Mode in CR to RNE > > + GRF are initialized: r0 = 0, r6 = a, r7 = b, r1 = 1 > > + The default data type for the macro is :df > > + > > + math.eo.f0.0 (4) r8.acc2 r6.noacc r7.noacc 0xE > > + (-f0.0) if > > + madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2 // Step(1), q0=a*y0 > > + madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2 // Step(2), > > e0=(1-b*y0) > > + madm (4) r11.acc5 r6.noacc -r7.noacc r9.acc3 // Step(3), > > r0=a-b*q0 > > + madm (4) r12.acc6 r8.acc2 r10.acc4 r8.acc2 // Step(4), > > y1=y0+e0*y0 > > + madm (4) r13.acc7 r1.noacc -r7.noacc r12.acc6// Step(5), > > e1=(1-b*y1) > > + madm (4) r8.acc8 r8.acc2 r10.acc4 r12.acc6 // Step(6), > > y2=y0+e0*y1 > > + madm (4) r9.acc9 r9.acc3 r11.acc5 r12.acc6 // Step(7), > > q1=q0+r0*y1 > > + madm (4) r12.acc2 
r12.acc6 r8.acc8 r13.acc7 // Step(8), > > y3=y1+e1*y2 > > + madm (4) r11.acc3 r6.noacc -r7.noacc r9.acc9 // Step(9), > > r1=a-b*q1 > > + > > + Change Rounding Mode in CR if required > > + Implicit Accumulator for destination is NULL > > + > > + madm (4) r8.noacc r9.acc9 r11.acc3 r12.acc2 // Step(10), > > q=q1+r1*y3 > > + endif */ > > I don't see an IF or an ENDIF instruction emitted in the code below. > Is that intentional, or am I misreading the code? > Here, we use f0.1 as the predicate for all of the instructions, like: (-f0.1) madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2 (-f0.1) madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2 ... I avoided using IF/ENDIF here because we would need to count the instructions inside the IF clause, which is not convenient. > > +GenRegister r6 = GenRegister::retype(ra->genReg(insn.src(0)), > > GEN_TYPE_DF); > > +GenRegister r7 = GenRegister::retype(ra->genReg(insn.src(1)), > > GEN_TYPE_DF); > > +GenRegister r8 = GenRegister::retype(ra->genReg(insn.dst(0)), > > GEN_TYPE_DF); > > +const GenRegister r0 = GenRegister::retype(ra->genReg(insn.dst(1)), > > GEN_TYPE_DF); > > +const GenRegister r1 = GenRegister::retype(ra->genReg(insn.dst(2)), > > GEN_TYPE_DF); > > +const GenRegister r9 = GenRegister::retype(ra->genReg(insn.dst(3)), > > GEN_TYPE_DF); > > +const GenRegister r10 = GenRegister::retype(ra->genReg(insn.dst(4)), > > GEN_TYPE_DF); > > +const GenRegister r11 = GenRegister::retype(ra->genReg(insn.dst(5)), > > GEN_TYPE_DF); > > +const GenRegister r12 = GenRegister::retype(ra->genReg(insn.dst(6)), > > GEN_TYPE_DF); > > +const GenRegister r13 = GenRegister::retype(ra->genReg(insn.dst(7)), > > GEN_TYPE_DF); > > +Gen8Encoder *p8 = reinterpret_cast<Gen8Encoder *>(p); > > +p->push(); { > > + p->curr.execWidth = 4; > > + p->curr.predicate = GEN_PREDICATE_NONE; > > + p->curr.noMask= 1; > > + p->MOV(r1, GenRegister::immdf(1.0d)); > > + p->MOV(r0, GenRegister::immdf(0.0d)); > > + > > + for (int i = 0; i < (simdWidth == 16 ? 4 : 2); i++) { > > +p->curr.predicate
Re: [Beignet] [PATCH 6/8] Backend: Implement FDIV64 on BDW.
On Tue, Sep 15, 2015 at 4:15 AM, <junyan...@inbox.com> wrote: > From: Junyan He <junyan...@linux.intel.com> > > According to the document, we use a set of instructions > to implement double type division. > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > --- > backend/src/backend/gen8_context.cpp | 68 > > backend/src/backend/gen8_context.hpp | 2 ++ > 2 files changed, 70 insertions(+) > > diff --git a/backend/src/backend/gen8_context.cpp > b/backend/src/backend/gen8_context.cpp > index b497ee5..f465832 100644 > --- a/backend/src/backend/gen8_context.cpp > +++ b/backend/src/backend/gen8_context.cpp > @@ -924,6 +924,74 @@ namespace gbe > this->unpackLongVec(src, dst, p->curr.execWidth); >} > > + void Gen8Context::emitF64DIVInstruction(const SelectionInstruction &insn) { > +/* Macro for Double Precision IEEE Compliant fdiv > + > + Set Rounding Mode in CR to RNE > + GRF are initialized: r0 = 0, r6 = a, r7 = b, r1 = 1 > + The default data type for the macro is :df > + > + math.eo.f0.0 (4) r8.acc2 r6.noacc r7.noacc 0xE > + (-f0.0) if > + madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2 // Step(1), q0=a*y0 > + madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2 // Step(2), > e0=(1-b*y0) > + madm (4) r11.acc5 r6.noacc -r7.noacc r9.acc3 // Step(3), r0=a-b*q0 > + madm (4) r12.acc6 r8.acc2 r10.acc4 r8.acc2 // Step(4), > y1=y0+e0*y0 > + madm (4) r13.acc7 r1.noacc -r7.noacc r12.acc6// Step(5), > e1=(1-b*y1) > + madm (4) r8.acc8 r8.acc2 r10.acc4 r12.acc6 // Step(6), > y2=y0+e0*y1 > + madm (4) r9.acc9 r9.acc3 r11.acc5 r12.acc6 // Step(7), > q1=q0+r0*y1 > + madm (4) r12.acc2 r12.acc6 r8.acc8 r13.acc7 // Step(8), > y3=y1+e1*y2 > + madm (4) r11.acc3 r6.noacc -r7.noacc r9.acc9 // Step(9), r1=a-b*q1 > + > + Change Rounding Mode in CR if required > + Implicit Accumulator for destination is NULL > + > + madm (4) r8.noacc r9.acc9 r11.acc3 r12.acc2 // Step(10), > q=q1+r1*y3 > + endif */ I don't see an IF or an ENDIF instruction emitted in the code below. Is that intentional, or am I misreading the code? 
> +GenRegister r6 = GenRegister::retype(ra->genReg(insn.src(0)), > GEN_TYPE_DF); > +GenRegister r7 = GenRegister::retype(ra->genReg(insn.src(1)), > GEN_TYPE_DF); > +GenRegister r8 = GenRegister::retype(ra->genReg(insn.dst(0)), > GEN_TYPE_DF); > +const GenRegister r0 = GenRegister::retype(ra->genReg(insn.dst(1)), > GEN_TYPE_DF); > +const GenRegister r1 = GenRegister::retype(ra->genReg(insn.dst(2)), > GEN_TYPE_DF); > +const GenRegister r9 = GenRegister::retype(ra->genReg(insn.dst(3)), > GEN_TYPE_DF); > +const GenRegister r10 = GenRegister::retype(ra->genReg(insn.dst(4)), > GEN_TYPE_DF); > +const GenRegister r11 = GenRegister::retype(ra->genReg(insn.dst(5)), > GEN_TYPE_DF); > +const GenRegister r12 = GenRegister::retype(ra->genReg(insn.dst(6)), > GEN_TYPE_DF); > +const GenRegister r13 = GenRegister::retype(ra->genReg(insn.dst(7)), > GEN_TYPE_DF); > +Gen8Encoder *p8 = reinterpret_cast<Gen8Encoder *>(p); > +p->push(); { > + p->curr.execWidth = 4; > + p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask= 1; > + p->MOV(r1, GenRegister::immdf(1.0d)); > + p->MOV(r0, GenRegister::immdf(0.0d)); > + > + for (int i = 0; i < (simdWidth == 16 ? 
4 : 2); i++) { > +p->curr.predicate = GEN_PREDICATE_NONE; > +p8->MATH_WITH_ACC(r8, GEN8_MATH_FUNCTION_INVM, r6, r7, > GEN8_INSN_ACC2, GEN8_INSN_NOACC, GEN8_INSN_NOACC); > +p->curr.useFlag(insn.state.flag, insn.state.subFlag); > +p->curr.predicate = GEN_PREDICATE_NORMAL; > +p->curr.inversePredicate = 1; > +p->curr.noMask= 0; > +p8->MADM(r9, r0, r6, r8, GEN8_INSN_ACC3, GEN8_INSN_NOACC, > GEN8_INSN_NOACC, GEN8_INSN_ACC2); > +p8->MADM(r10, r1, GenRegister::negate(r7), r8, GEN8_INSN_ACC4, > GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC2); > +p8->MADM(r11, r6, GenRegister::negate(r7), r9, GEN8_INSN_ACC5, > GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC3); > +p8->MADM(r12, r8, r10, r8, GEN8_INSN_ACC6, GEN8_INSN_ACC2, > GEN8_INSN_ACC4, GEN8_INSN_ACC2); > +p8->MADM(r13, r1, GenRegister::negate(r7), r12, GEN8_INSN_ACC7, > GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC6); > +p8->MADM(r8, r8, r10, r12, GEN8_INSN_ACC8, GEN8_INSN_ACC2, > GEN8_INSN_ACC4, GEN8_INSN_ACC6); > +p8->MADM(r9, r9, r11, r12, GEN8_INSN_ACC9, GEN8_INSN_ACC3, > GEN8_INSN_ACC5, GEN8_INSN_ACC6); > +p8->MADM(r12, r12, r8, r13, GEN8_INSN_ACC2, GEN8_INSN_ACC6, > GEN8_INSN_ACC8, GEN8_INSN_ACC7); > +p8->MADM(r11, r6, GenRegister::negate(r7), r9, GEN8_INSN_ACC3, > GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC9); > + > +p8->MADM(r8, r9, r11, r12, GEN8_INSN_NOACC, GEN8_INSN_ACC9, > GEN8_INSN_ACC3, GEN8_INSN_ACC2); > + > +