Re: [Beignet] [PATCH 19/19] Backend: Implement StoreProfilingInstruction in GenContext.

2015-11-03 Thread Yang, Rong R


> -----Original Message-----
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> junyan...@inbox.com
> Sent: Wednesday, September 9, 2015 8:01
> To: beignet@lists.freedesktop.org
> Subject: [Beignet] [PATCH 19/19] Backend: Implement
> StoreProfilingInstruction in GenContext.
> 
> From: Junyan He <junyan...@linux.intel.com>
> 
> Offset 0 of the profiling buffer holds the log count.
> We use an atomic instruction to increment it every time a log is generated.
> We generate one log for each HW GPU thread. Each log contains the XYZ
> range of the global work items executed on that thread, the EU id, the
> Sub Slice id, the thread number, and the timestamps of the 20 points we
> are interested in.
> 
> Signed-off-by: Junyan He <junyan...@linux.intel.com>
> ---
>  backend/src/backend/gen_context.cpp |  173
> +++
>  1 file changed, 173 insertions(+)
> 
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 26af4cd..df36e9a 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -2469,6 +2469,179 @@ namespace gbe
>}
> 
>void GenContext::emitStoreProfilingInstruction(const SelectionInstruction
> ) {
> +uint32_t simdType;
> +if (this->simdWidth == 16) {
> +  simdType = ir::ProfilingInfo::ProfilingSimdType16;
> +} else if (this->simdWidth == 8) {
> +  simdType = ir::ProfilingInfo::ProfilingSimdType8;
> +} else {
> +  simdType = ir::ProfilingInfo::ProfilingSimdType1;
> +  GBE_ASSERT(0);
> +}
> +
> +p->NOP();
> +p->NOP();
> +
> +GenRegister tmArf = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
> +0xc0,
> +0,
> +GEN_TYPE_UW,
> +GEN_VERTICAL_STRIDE_4,
> +GEN_WIDTH_4,
> +GEN_HORIZONTAL_STRIDE_1);
It would be better to add a tmArf function to the GenRegister class and obtain
the register by calling that function, instead of hard-coding it in the patch.

> +GenRegister profilingReg[5];
> +if (p->curr.execWidth == 16) {
> +  profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)),
> GEN_TYPE_UD);
> +  profilingReg[1] = GenRegister::offset(profilingReg[0], 1);
> +  profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(1)),
> GEN_TYPE_UD);
> +  profilingReg[3] = GenRegister::offset(profilingReg[2], 1);
> +  profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(2)),
> GEN_TYPE_UD);
> +} else {
> +  GBE_ASSERT(p->curr.execWidth == 8);
> +  profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)),
> GEN_TYPE_UD);
> +  profilingReg[1] = GenRegister::retype(ra->genReg(insn.src(1)),
> GEN_TYPE_UD);
> +  profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(2)),
> GEN_TYPE_UD);
> +  profilingReg[3] = GenRegister::retype(ra->genReg(insn.src(3)),
> GEN_TYPE_UD);
> +  profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(4)),
> GEN_TYPE_UD);
> +}
> +GenRegister tmp = ra->genReg(insn.dst(0));
> +uint32_t profilingType = insn.extra.profilingType;
> +uint32_t bti = insn.extra.profilingBTI;
> +GBE_ASSERT(profilingType == 1);
> +GenRegister flagReg = GenRegister::flag(insn.state.flag,
> insn.state.subFlag);
> +GenRegister lastTsReg = GenRegister::toUniform(profilingReg[3],
> GEN_TYPE_UL);
> +lastTsReg = GenRegister::offset(lastTsReg, 0, 2*sizeof(uint64_t));
> +GenRegister realClock = GenRegister::offset(lastTsReg, 0,
> sizeof(uint64_t));
> +GenRegister tmp0 = GenRegister::toUniform(profilingReg[3],
> + GEN_TYPE_UL);
> +
> +/* MOV(4)   tmp0<1>:UW  arf_tm<4,4,1>:UW  */
> +p->push(); {
> +  p->curr.execWidth = 4;
> +  p->curr.predicate = GEN_PREDICATE_NONE;
> +  p->curr.noMask = 1;
> +  GenRegister _tmp0 = tmp0;
> +  _tmp0.type = GEN_TYPE_UW;
> +  _tmp0.hstride = GEN_HORIZONTAL_STRIDE_1;
> +  _tmp0.vstride = GEN_VERTICAL_STRIDE_4;
> +  _tmp0.width = GEN_WIDTH_4;
> +  p->MOV(_tmp0, tmArf);
> +} p->pop();
> +
> +/* Calc the time elapsed. */
> +subTimestamps(tmp0, lastTsReg, tmp);
> +/* Update the real clock */
> +addTimestamps(realClock, tmp0, tmp);
> +
> +//the epilog, record the last timestamp and return.
> +/* MOV(1)   epilog<1>:UL   realclock<0,1,0>:UL  */
> +/* ADD(1)   epilog<1>:UL   prolog<0,1,0>:UL  */
> +GenRegister prolog = GenRegister::toUniform(profilingReg[2],
> GEN_TYPE_UD);
> +prolog = GenRegister::offset(prolog, 0, 4*sizeof(uint32_t));
> +GenRegis

[Beignet] [PATCH 19/19] Backend: Implement StoreProfilingInstruction in GenContext.

2015-09-08 Thread junyan . he
From: Junyan He 

Offset 0 of the profiling buffer holds the log count.
We use an atomic instruction to increment it every time a log
is generated.
We generate one log for each HW GPU thread. Each log contains
the XYZ range of the global work items executed on that thread,
the EU id, the Sub Slice id, the thread number, and the
timestamps of the 20 points we are interested in.

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_context.cpp |  173 +++
 1 file changed, 173 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 26af4cd..df36e9a 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2469,6 +2469,179 @@ namespace gbe
   }
 
   void GenContext::emitStoreProfilingInstruction(const SelectionInstruction 
) {
+uint32_t simdType;
+if (this->simdWidth == 16) {
+  simdType = ir::ProfilingInfo::ProfilingSimdType16;
+} else if (this->simdWidth == 8) {
+  simdType = ir::ProfilingInfo::ProfilingSimdType8;
+} else {
+  simdType = ir::ProfilingInfo::ProfilingSimdType1;
+  GBE_ASSERT(0);
+}
+
+p->NOP();
+p->NOP();
+
+GenRegister tmArf = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+0xc0,
+0,
+GEN_TYPE_UW,
+GEN_VERTICAL_STRIDE_4,
+GEN_WIDTH_4,
+GEN_HORIZONTAL_STRIDE_1);
+GenRegister profilingReg[5];
+if (p->curr.execWidth == 16) {
+  profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+  profilingReg[1] = GenRegister::offset(profilingReg[0], 1);
+  profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(1)), 
GEN_TYPE_UD);
+  profilingReg[3] = GenRegister::offset(profilingReg[2], 1);
+  profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(2)), 
GEN_TYPE_UD);
+} else {
+  GBE_ASSERT(p->curr.execWidth == 8);
+  profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+  profilingReg[1] = GenRegister::retype(ra->genReg(insn.src(1)), 
GEN_TYPE_UD);
+  profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(2)), 
GEN_TYPE_UD);
+  profilingReg[3] = GenRegister::retype(ra->genReg(insn.src(3)), 
GEN_TYPE_UD);
+  profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(4)), 
GEN_TYPE_UD);
+}
+GenRegister tmp = ra->genReg(insn.dst(0));
+uint32_t profilingType = insn.extra.profilingType;
+uint32_t bti = insn.extra.profilingBTI;
+GBE_ASSERT(profilingType == 1);
+GenRegister flagReg = GenRegister::flag(insn.state.flag, 
insn.state.subFlag);
+GenRegister lastTsReg = GenRegister::toUniform(profilingReg[3], 
GEN_TYPE_UL);
+lastTsReg = GenRegister::offset(lastTsReg, 0, 2*sizeof(uint64_t));
+GenRegister realClock = GenRegister::offset(lastTsReg, 0, 
sizeof(uint64_t));
+GenRegister tmp0 = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL);
+
+/* MOV(4)   tmp0<1>:UW  arf_tm<4,4,1>:UW  */
+p->push(); {
+  p->curr.execWidth = 4;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  GenRegister _tmp0 = tmp0;
+  _tmp0.type = GEN_TYPE_UW;
+  _tmp0.hstride = GEN_HORIZONTAL_STRIDE_1;
+  _tmp0.vstride = GEN_VERTICAL_STRIDE_4;
+  _tmp0.width = GEN_WIDTH_4;
+  p->MOV(_tmp0, tmArf);
+} p->pop();
+
+/* Calc the time elapsed. */
+subTimestamps(tmp0, lastTsReg, tmp);
+/* Update the real clock */
+addTimestamps(realClock, tmp0, tmp);
+
+//the epilog, record the last timestamp and return.
+/* MOV(1)   epilog<1>:UL   realclock<0,1,0>:UL  */
+/* ADD(1)   epilog<1>:UL   prolog<0,1,0>:UL  */
+GenRegister prolog = GenRegister::toUniform(profilingReg[2], GEN_TYPE_UD);
+prolog = GenRegister::offset(prolog, 0, 4*sizeof(uint32_t));
+GenRegister epilog = GenRegister::offset(prolog, 0, 2*sizeof(uint32_t));
+p->push(); {
+  p->curr.execWidth = 1;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  p->MOV(epilog, GenRegister::retype(realClock, GEN_TYPE_UD));
+  p->MOV(GenRegister::offset(epilog, 0, sizeof(uint32_t)),
+  GenRegister::offset(GenRegister::retype(realClock, GEN_TYPE_UD), 0, 
sizeof(uint32_t)));
+  addTimestamps(epilog, prolog, tmp);
+} p->pop();
+
+/* Now, begin to write the results out. */
+// Inc the log items number.
+p->push(); {
+  //ptr[0] is the total count of the log items.
+  GenRegister sndMsg = GenRegister::retype(tmp, GEN_TYPE_UD);
+  sndMsg.width = GEN_WIDTH_8;
+  sndMsg.hstride = GEN_HORIZONTAL_STRIDE_1;
+  sndMsg.vstride = GEN_VERTICAL_STRIDE_8;
+  p->curr.execWidth = 8;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  p->MOV(sndMsg, GenRegister::immud(0x0));
+
+  GenRegister incRes = GenRegister::offset(sndMsg, 1);
+  p->push();
+  {
+