[Beignet] [PATCH] support sends for long write

2016-11-28 Thread Guo, Yejun
Signed-off-by: Guo, Yejun 
---
 backend/src/backend/gen_insn_selection.cpp | 28 +++-
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 8090250..9722423 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -1594,7 +1594,6 @@ namespace gbe
   // dst: srcNum, (flagTemp)
   // src: srcNum, addr, srcNum, bti.
   insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum*2 + 2);
-  vector = this->appendVector();
 
   for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
 insn->src(elemID) = src[elemID];
@@ -1615,10 +1614,29 @@ namespace gbe
   }
   insn->extra.elem = srcNum;
 
-  vector->regNum = srcNum + 1;
-  vector->offsetID = srcNum;
-  vector->reg = &insn->src(srcNum);
-  vector->isSrc = 1;
+  if (hasSends()) {
+insn->extra.splitSend = 1;
+
+//addr regs
+vector = this->appendVector();
+vector->regNum = 1;
+vector->offsetID = srcNum;
+vector->reg = &insn->src(srcNum);
+vector->isSrc = 1;
+
+//data regs
+vector = this->appendVector();
+vector->regNum = srcNum;
+vector->offsetID = srcNum+1;
+vector->reg = &insn->src(srcNum+1);
+vector->isSrc = 1;
+  } else {
+vector = this->appendVector();
+vector->regNum = srcNum + 1;
+vector->offsetID = srcNum;
+vector->reg = &insn->src(srcNum);
+vector->isSrc = 1;
+  }
 }
 
 if (bti.file != GEN_IMMEDIATE_VALUE) {
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 1/2] move function setDPByteScatterGather into class GenEncoder

2016-11-28 Thread Guo, Yejun
setDPByteScatterGather will be reused by gen9 sends. As for the
function of the same name in gen8encoder, just leave it until a reuse case
appears (for now, just rename it so that the build passes).

Signed-off-by: Guo, Yejun 
---
 backend/src/backend/gen8_encoder.cpp |  6 +++---
 backend/src/backend/gen_encoder.cpp  | 15 ++-
 backend/src/backend/gen_encoder.hpp  |  2 ++
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/backend/src/backend/gen8_encoder.cpp 
b/backend/src/backend/gen8_encoder.cpp
index 4239e84..8f73346 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -84,7 +84,7 @@ namespace gbe
   NOT_SUPPORTED;
   }
 
-  static void setDPByteScatterGather(GenEncoder *p,
+  static void setDPByteScatterGatherA64(GenEncoder *p,
  GenNativeInstruction *insn,
  uint32_t bti,
  uint32_t block_size,
@@ -350,7 +350,7 @@ namespace gbe
 GBE_ASSERT(this->curr.execWidth == 8);
 const uint32_t msg_length = 2;
 const uint32_t response_length = 1;
-setDPByteScatterGather(this,
+setDPByteScatterGatherA64(this,
insn,
0xff,
0x0,
@@ -375,7 +375,7 @@ namespace gbe
 this->setSrc1(insn, GenRegister::immud(0));
 const uint32_t msg_length = 3;
 const uint32_t response_length = 0;
-setDPByteScatterGather(this,
+setDPByteScatterGatherA64(this,
insn,
0xff,
0x0,
diff --git a/backend/src/backend/gen_encoder.cpp 
b/backend/src/backend/gen_encoder.cpp
index b379419..637403c 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -237,8 +237,7 @@ namespace gbe
   NOT_SUPPORTED;
   }
 
-  static void setDPByteScatterGather(GenEncoder *p,
- GenNativeInstruction *insn,
+  void GenEncoder::setDPByteScatterGather(GenNativeInstruction *insn,
  uint32_t bti,
  uint32_t elem_size,
  uint32_t msg_type,
@@ -246,13 +245,13 @@ namespace gbe
  uint32_t response_length)
   {
 const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
-p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+setMessageDescriptor(insn, sfid, msg_length, response_length);
 insn->bits3.gen7_byte_rw.msg_type = msg_type;
 insn->bits3.gen7_byte_rw.bti = bti;
 insn->bits3.gen7_byte_rw.data_size = elem_size;
-if (p->curr.execWidth == 8)
+if (curr.execWidth == 8)
   insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD8;
-else if (p->curr.execWidth == 16)
+else if (curr.execWidth == 16)
   insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD16;
 else
   NOT_SUPPORTED;
@@ -472,8 +471,7 @@ namespace gbe
   response_length = 2;
 } else
   NOT_IMPLEMENTED;
-setDPByteScatterGather(this,
-   insn,
+setDPByteScatterGather(insn,
bti,
elemSize,
GEN7_BYTE_GATHER,
@@ -515,8 +513,7 @@ namespace gbe
 } else
   NOT_IMPLEMENTED;
 
-setDPByteScatterGather(this,
-   insn,
+setDPByteScatterGather(insn,
bti,
elemSize,
GEN7_BYTE_SCATTER,
diff --git a/backend/src/backend/gen_encoder.hpp 
b/backend/src/backend/gen_encoder.hpp
index e6f362b..b9446e6 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -247,6 +247,8 @@ namespace gbe
 
 // Helper functions to encode
 
+void setDPByteScatterGather(GenNativeInstruction *insn, uint32_t bti, 
uint32_t elem_size,
+ uint32_t msg_type, uint32_t msg_length, 
uint32_t response_length);
 virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, 
uint32_t rgba,
 uint32_t msg_type, uint32_t msg_length,
 uint32_t response_length);
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 2/2] add sends support for byte write

2016-11-28 Thread Guo, Yejun
Signed-off-by: Guo, Yejun 
---
 backend/src/backend/gen9_encoder.cpp   | 47 ++
 backend/src/backend/gen9_encoder.hpp   |  2 ++
 backend/src/backend/gen_context.cpp| 15 +++---
 backend/src/backend/gen_encoder.cpp| 14 -
 backend/src/backend/gen_encoder.hpp|  4 ++-
 backend/src/backend/gen_insn_selection.cpp | 26 +
 6 files changed, 96 insertions(+), 12 deletions(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 68ab7ae..b5be852 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -143,4 +143,51 @@ namespace gbe
 gen9_insn->bits2.sends.sel_reg32_desc = 1;
 }
   }
+
+  unsigned Gen9Encoder::setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize)
+  {
+uint32_t msg_length = 0;
+uint32_t response_length = 0;
+if (this->curr.execWidth == 8) {
+  msg_length = 1;
+} else if (this->curr.execWidth == 16) {
+  msg_length = 2;
+} else
+  NOT_IMPLEMENTED;
+
+setDPByteScatterGather(insn,
+   bti,
+   elemSize,
+   GEN7_BYTE_SCATTER,
+   msg_length,
+   response_length);
+return insn->bits3.ud;
+  }
+
+  void Gen9Encoder::BYTE_SCATTER(GenRegister addr, GenRegister data, 
GenRegister bti, uint32_t elemSize)
+  {
+if (addr.reg() == data.reg())
+  Gen8Encoder::BYTE_SCATTER(addr, data, bti, elemSize);
+else {
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = &insn->gen9_insn;
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), addr, data);
+  if (this->curr.execWidth == 8)
+gen9_insn->bits2.sends.src1_length = 1;
+  else if (this->curr.execWidth == 16)
+gen9_insn->bits2.sends.src1_length = 2;
+  else
+assert(!"unsupported");
+
+  if (bti.file == GEN_IMMEDIATE_VALUE) {
+gen9_insn->bits2.sends.sel_reg32_desc = 0;
+setByteScatterSendsMessageDesc(insn, bti.value.ud, elemSize);
+  } else
+gen9_insn->bits2.sends.sel_reg32_desc = 1;
+}
+  }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index 5b6328d..1c40b92 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -50,6 +50,8 @@ namespace gbe
 void setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, 
GenRegister src0, GenRegister src1);
 virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemNum);
+virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize);
+virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize);
   };
 }
 #endif /* __GBE_GEN9_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 848933e..9505592 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2220,16 +2220,23 @@ namespace gbe
   }
 
   void GenContext::emitByteScatterInstruction(const SelectionInstruction 
&insn) {
-const GenRegister src = ra->genReg(insn.src(0));
+const GenRegister addr = ra->genReg(insn.src(0));
+GenRegister data = ra->genReg(insn.src(1));
+if (!insn.extra.splitSend)
+  data = addr;
 const uint32_t elemSize = insn.extra.elem;
 const GenRegister bti = ra->genReg(insn.src(2));
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->BYTE_SCATTER(src, bti, elemSize);
+  p->BYTE_SCATTER(addr, data, bti, elemSize);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(0));
   const GenRegister btiTmp = ra->genReg(insn.dst(1));
-  unsigned desc = p->generateByteScatterMessageDesc(0, elemSize);
+  unsigned desc = 0;
+  if (insn.extra.splitSend)
+desc = p->generateByteScatterSendsMessageDesc(0, elemSize);
+  else
+desc = p->generateByteScatterMessageDesc(0, elemSize);
 
   unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
 
@@ -2237,7 +2244,7 @@ namespace gbe
   p->push();
 p->curr.predicate = GEN_PREDICATE_NORMAL;
 p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-p->BYTE_SCATTER(src, GenRegister::addr1(0), elemSize);
+p->BYTE_SCATTER(addr, data, GenRegister::addr1(0), elemSize);
   p->pop();
   afterMessage(insn, bti, tmp, btiTmp, jip0);
 }
diff --git a/backend/src/backend/gen_encoder.cpp 
b/backend/src/backend/gen_encoder.cpp
index 637403c..381

Re: [Beignet] [PATCH 1/3] do not touch src1 when setting instruction header

2016-11-28 Thread Song, Ruiling
The patchset LGTM.

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH V3 2/3] prepare gen9 sends binary format and enable the ASM dump for sends

2016-11-28 Thread Guo, Yejun
v2: output dst register for sends
v3: check the dst reg file when outputting the dst register
Signed-off-by: Guo, Yejun 
---
 backend/src/backend/gen/gen_mesa_disasm.c | 31 ++--
 backend/src/backend/gen9_instruction.hpp  | 84 +++
 backend/src/backend/gen_defs.hpp  |  3 ++
 3 files changed, 114 insertions(+), 4 deletions(-)
 create mode 100644 backend/src/backend/gen9_instruction.hpp

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c 
b/backend/src/backend/gen/gen_mesa_disasm.c
index c30f168..56fda89 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -50,6 +50,7 @@
 
 #include "backend/gen_defs.hpp"
 #include "backend/gen7_instruction.hpp"
+#include "backend/gen9_instruction.hpp"
 #include "src/cl_device_data.h"
 
 static const struct {
@@ -104,6 +105,7 @@ static const struct {
 
   [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SENDS] = { .name = "sends", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
@@ -1411,7 +1413,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
 }
 
   } else if (OPCODE(inst) != GEN_OPCODE_SEND &&
- OPCODE(inst) != GEN_OPCODE_SENDC) {
+ OPCODE(inst) != GEN_OPCODE_SENDC &&
+ OPCODE(inst) != GEN_OPCODE_SENDS) {
 err |= control(file, "conditional modifier", conditional_modifier,
COND_DST_OR_MODIFIER(inst), NULL);
 if (COND_DST_OR_MODIFIER(inst))
@@ -1426,7 +1429,20 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
 string(file, ")");
   }
 
-  if (opcode[OPCODE(inst)].nsrc == 3) {
+  if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+const union Gen9NativeInstruction *gen9_insn = (const union 
Gen9NativeInstruction *)inst;
+pad(file, 16);
+if (gen9_insn->bits1.sends.dest_reg_file_0 == 0)
+  reg(file, GEN_ARCHITECTURE_REGISTER_FILE, 
gen9_insn->bits1.sends.dest_reg_nr);
+else
+  format(file, "g%d", gen9_insn->bits1.sends.dest_reg_nr);
+pad(file, 32);
+format(file, "g%d(addLen:%d)", gen9_insn->bits2.sends.src0_reg_nr, 
GENERIC_MSG_LENGTH(inst));
+pad(file, 48);
+format(file, "g%d(dataLen:%d)", gen9_insn->bits1.sends.src1_reg_nr, 
gen9_insn->bits2.sends.src1_length);
+pad(file, 64);
+format(file, "0x%08x", gen9_insn->bits3.ud);
+  } else if (opcode[OPCODE(inst)].nsrc == 3) {
 pad(file, 16);
 err |= dest_3src(file, inst);
 
@@ -1469,7 +1485,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
   }
 
   if (OPCODE(inst) == GEN_OPCODE_SEND ||
-  OPCODE(inst) == GEN_OPCODE_SENDC) {
+  OPCODE(inst) == GEN_OPCODE_SENDC ||
+  OPCODE(inst) == GEN_OPCODE_SENDS) {
 enum GenMessageTarget target = COND_DST_OR_MODIFIER(inst);
 
 newline(file);
@@ -1484,7 +1501,13 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
  target, &space);
 }
 
-if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
+int immbti = 0;
+if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+  const union Gen9NativeInstruction *gen9_insn = (const union 
Gen9NativeInstruction *)inst;
+  immbti = !(gen9_insn->bits2.sends.sel_reg32_desc);
+} else
+  immbti = (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE);
+if (immbti) {
   switch (target) {
 case GEN_SFID_VIDEO_MOTION_EST:
   format(file, " (bti: %d, msg_type: %d)",
diff --git a/backend/src/backend/gen9_instruction.hpp 
b/backend/src/backend/gen9_instruction.hpp
new file mode 100644
index 000..16114ca
--- /dev/null
+++ b/backend/src/backend/gen9_instruction.hpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ *
+ * Author: Guo, Yejun 
+ */
+
+
+#ifndef __GEN9_INSTRUCTION_HPP__
+#define __GEN9_INSTRUCTION_HPP__
+
+union Gen9NativeInstruction
+{
+  struct {
+struct {
+  uint32_t opcode:7;
+  uint32_t pad:1

[Beignet] [PATCH V3 3/3] support sends (split send) for untyped write

2016-11-28 Thread Guo, Yejun
sends is a new instruction, available starting from gen9, that splits the
address and data registers for writes. Register pressure is reduced
since these registers no longer need to be contiguous.

more patches for sends will be sent out.

We can choose send or sends based on hasSends() in the selection stage;
it is only enabled by default for Skylake now.

v2: add function setSendsOperands
v3: reuse function setDPUntypedRW
Signed-off-by: Guo, Yejun 
---
 backend/src/backend/gen75_encoder.cpp  |  2 +-
 backend/src/backend/gen75_encoder.hpp  |  2 +-
 backend/src/backend/gen8_context.cpp   | 21 +---
 backend/src/backend/gen8_encoder.cpp   |  2 +-
 backend/src/backend/gen8_encoder.hpp   |  2 +-
 backend/src/backend/gen9_encoder.cpp   | 77 ++
 backend/src/backend/gen9_encoder.hpp   |  4 +-
 backend/src/backend/gen_context.cpp| 41 +---
 backend/src/backend/gen_encoder.cpp| 14 +-
 backend/src/backend/gen_encoder.hpp|  4 +-
 backend/src/backend/gen_insn_selection.cpp | 22 -
 backend/src/backend/gen_insn_selection.hpp |  1 +
 12 files changed, 159 insertions(+), 33 deletions(-)

diff --git a/backend/src/backend/gen75_encoder.cpp 
b/backend/src/backend/gen75_encoder.cpp
index fc37991..9cafaa7 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -199,7 +199,7 @@ namespace gbe
 return insn->bits3.ud;
   }
 
-  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t 
elemNum) {
+  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, 
GenRegister bti, uint32_t elemNum) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 assert(elemNum >= 1 || elemNum <= 4);
 this->setHeader(insn);
diff --git a/backend/src/backend/gen75_encoder.hpp 
b/backend/src/backend/gen75_encoder.hpp
index d06f393..517afff 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -44,7 +44,7 @@ namespace gbe
 virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
 virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
-virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t 
elemNum);
+virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual void setHeader(GenNativeInstruction *insn);
 virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, 
uint32_t rgba,
uint32_t msg_type, uint32_t msg_length, uint32_t 
response_length);
diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index 71c54fb..95b1013 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -968,6 +968,9 @@ namespace gbe
 GBE_ASSERT(elemNum == 1);
 const GenRegister addr = ra->genReg(insn.src(elemNum));
 const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
+GenRegister data = ra->genReg(insn.src(elemNum+1));
+if (!insn.extra.splitSend)
+  data = addr;
 
 /* Because BDW's store and load send instructions for 64 bits require the 
bti to be surfaceless,
which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -978,11 +981,15 @@ namespace gbe
 }
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->UNTYPED_WRITE(addr, bti, elemNum*2);
+  p->UNTYPED_WRITE(addr, data, bti, elemNum*2);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(elemNum));
   const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
-  unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
+  unsigned desc = 0;
+  if (insn.extra.splitSend)
+desc = p->generateUntypedWriteSendsMessageDesc(0, elemNum*2);
+  else
+desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
 
   unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
 
@@ -990,7 +997,7 @@ namespace gbe
   p->push();
 p->curr.predicate = GEN_PREDICATE_NORMAL;
 p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
+p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2);
   p->pop();
   afterMessage(insn, bti, tmp, btiTmp, jip0);
 }
@@ -1351,7 +1358,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst, nextSrc);
 p->pop();
-p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
 
 p->push();
@@ -1367,7 +1374,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst, nextSrc);
 p->pop();
-p->UNTYPED_WRITE(addr, Gen

[Beignet] [PATCH 1/3] do not touch src1 when setting instruction header

2016-11-28 Thread Guo, Yejun
Signed-off-by: Guo, Yejun 
---
 backend/src/backend/gen9_encoder.cpp | 1 +
 backend/src/backend/gen_encoder.cpp  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 80df50d..e66ae08 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -60,6 +60,7 @@ namespace gbe
  this->setHeader(insn);
  this->setDst(insn, dest);
  this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
  setSamplerMessage(insn, bti, sampler, msg_type,
response_length, msg_length,
header_present,
diff --git a/backend/src/backend/gen_encoder.cpp 
b/backend/src/backend/gen_encoder.cpp
index a69adc7..060d65f 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -203,7 +203,6 @@ namespace gbe
 unsigned msg_length, unsigned 
response_length,
 bool header_present, bool 
end_of_thread)
   {
- setSrc1(inst, GenRegister::immud(0));
  inst->bits3.generic_gen5.header_present = header_present;
  inst->bits3.generic_gen5.response_length = response_length;
  inst->bits3.generic_gen5.msg_length = msg_length;
@@ -1178,6 +1177,7 @@ namespace gbe
  this->setHeader(insn);
  this->setDst(insn, dest);
  this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
  setSamplerMessage(insn, bti, sampler, msg_type,
response_length, msg_length,
header_present,
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 17/19] OCL20: handle device enqueue in runtime.

2016-11-28 Thread Yang Rong
There are several steps to handle device enqueue:
1. Allocate the device enqueue bo to store the device enqueue
information for the parent kernel. All global buffers must also be converted
to SVM buffers to make sure the child kernels have the same GPU address.
2. When flushing the command, check whether there is a device enqueue. If
there is, we must wait for it to finish and parse the device enqueue info.
3. Start the child ndrange according to the device enqueue info, with the
parent's global buffers as the exec info.

Because of non-uniform workgroup sizes, one enqueue API call will flush
several times, but device enqueue only needs to be handled once, so add a flag
to function cl_command_queue_flush to indicate the last flush.

Signed-off-by: Yang Rong 
Reviewed-by: Pan Xiuli 
---
 src/CMakeLists.txt  |2 +
 src/cl_api_kernel.c |   97 +++-
 src/cl_command_queue.c  |   88 +---
 src/cl_command_queue.h  |   11 +-
 src/cl_command_queue_gen7.c |   15 +-
 src/cl_context.c|   17 +
 src/cl_context.h|3 +
 src/cl_device_enqueue.c |  198 +++
 src/cl_device_enqueue.h |   31 ++
 src/cl_enqueue.c|7 +
 src/cl_enqueue.h|3 +
 src/cl_kernel.c |6 +
 src/cl_kernel.h |6 +-
 src/cl_mem.c|   30 +-
 src/intel/intel_driver.c| 1189 ++-
 15 files changed, 1003 insertions(+), 700 deletions(-)
 create mode 100644 src/cl_device_enqueue.c
 create mode 100644 src/cl_device_enqueue.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 26cccea..b7fc13d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -88,6 +88,8 @@ set(OPENCL_SRC
 cl_context.c
 cl_command_queue.c
 cl_command_queue.h
+cl_device_enqueue.c
+cl_device_enqueue.h
 cl_command_queue_gen7.c
 cl_command_queue_enqueue.c
 cl_driver.h
diff --git a/src/cl_api_kernel.c b/src/cl_api_kernel.c
index 70140b2..7812acf 100644
--- a/src/cl_api_kernel.c
+++ b/src/cl_api_kernel.c
@@ -160,35 +160,86 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
   break;
 }
 
-e = cl_event_create(command_queue->ctx, command_queue, 
num_events_in_wait_list,
-event_wait_list, CL_COMMAND_NDRANGE_KERNEL, &err);
-if (err != CL_SUCCESS) {
-  break;
-}
+int i,j,k;
+const size_t global_wk_sz_div[3] = {
+  fixed_global_sz[0] / fixed_local_sz[0] * fixed_local_sz[0],
+  fixed_global_sz[1] / fixed_local_sz[1] * fixed_local_sz[1],
+  fixed_global_sz[2] / fixed_local_sz[2] * fixed_local_sz[2]
+};
+
+const size_t global_wk_sz_rem[3] = {
+  fixed_global_sz[0] % fixed_local_sz[0],
+  fixed_global_sz[1] % fixed_local_sz[1],
+  fixed_global_sz[2] % fixed_local_sz[2]
+};
+cl_uint count;
+count = global_wk_sz_rem[0] ? 2 : 1;
+count *= global_wk_sz_rem[1] ? 2 : 1;
+count *= global_wk_sz_rem[2] ? 2 : 1;
+
+const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
+/* Go through the at most 8 cases and euque if there is work items left */
+for (i = 0; i < 2;i++) {
+  for (j = 0; j < 2;j++) {
+for (k = 0; k < 2; k++) {
+  size_t global_wk_sz_use[3] = {global_wk_all[k][0], 
global_wk_all[j][1], global_wk_all[i][2]};
+  size_t global_dim_off[3] = {
+k * global_wk_sz_div[0] / fixed_local_sz[0],
+j * global_wk_sz_div[1] / fixed_local_sz[1],
+i * global_wk_sz_div[2] / fixed_local_sz[2]
+  };
+  size_t local_wk_sz_use[3] = {
+k ? global_wk_sz_rem[0] : fixed_local_sz[0],
+j ? global_wk_sz_rem[1] : fixed_local_sz[1],
+i ? global_wk_sz_rem[2] : fixed_local_sz[2]
+  };
+  if (local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || 
local_wk_sz_use[2] == 0)
+continue;
+
+  e = cl_event_create(command_queue->ctx, command_queue, 
num_events_in_wait_list,
+  event_wait_list, CL_COMMAND_NDRANGE_KERNEL, 
&err);
+  if (err != CL_SUCCESS) {
+break;
+  }
 
-/* Do device specific checks are enqueue the kernel */
-err = cl_command_queue_ND_range(command_queue, kernel, e, work_dim,
-fixed_global_off, fixed_global_sz, 
fixed_local_sz);
-if (err != CL_SUCCESS) {
-  break;
-}
+  /* Do device specific checks are enqueue the kernel */
+  err = cl_command_queue_ND_range(command_queue, kernel, e, work_dim,
+  fixed_global_off, global_dim_off, 
fixed_global_sz,
+  global_wk_sz_use, fixed_local_sz, 
local_wk_sz_use);
+  if (err != CL_SUCCESS) {
+break;
+  }
+  e->exec_data.mid_event_of_enq = (count > 1);
+  count--;
+
+  /* We will flush the ndrange if no event depend. Else we will add it 
to queue list.
+

[Beignet] [PATCH 19/19] CMake: add an option to enable OpenCL 2.0.

2016-11-28 Thread Yang Rong
OpenCL 2.0 is disabled by default; to enable it, use the option
-DENABLE_OPENCL_20.
Currently Skylake and newer devices support OpenCL 2.0. If OpenCL 2.0 is
enabled, LLVM 3.9 and libdrm 2.4.66 are required.

Signed-off-by: Yang Rong 
---
 CMakeLists.txt | 56 +---
 GetGenID.sh| 50 +-
 2 files changed, 74 insertions(+), 32 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 713cfa9..22090ba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,15 +16,6 @@ endif ()
 
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
-set (LIBCL_DRIVER_VERSION_MAJOR 1)
-set (LIBCL_DRIVER_VERSION_MINOR 3)
-if (ENABLE_OPENCL_20)
-  set (LIBCL_C_VERSION_MAJOR 2)
-  set (LIBCL_C_VERSION_MINOR 0)
-else (ENABLE_OPENCL_20)
-  set (LIBCL_C_VERSION_MAJOR 1)
-  set (LIBCL_C_VERSION_MINOR 2)
-endif (ENABLE_OPENCL_20)
 if( ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
   set(COMPILER "CLANG")
 elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
@@ -32,11 +23,6 @@ elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
 elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
   set(COMPILER "ICC")
 endif()
-configure_file (
-  "src/OCLConfig.h.in"
-  "src/OCLConfig.h"
-)
-
 set (NOT_BUILD_STAND_ALONE_UTEST 1)
 
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
@@ -236,10 +222,50 @@ ENDIF(OCLIcd_FOUND)
 Find_Package(PythonInterp)
 
 OPTION(EXPERIMENTAL_DOUBLE "Enable experimental double support" OFF)
-IF(EXPERIMENTAL_DOUBLE)
+IF (EXPERIMENTAL_DOUBLE)
   ADD_DEFINITIONS(-DENABLE_FP64)
 ENDIF(EXPERIMENTAL_DOUBLE)
 
+OPTION(ENABLE_OPENCL_20 "Enable opencl 2.0 support" OFF)
+IF (ENABLE_OPENCL_20)
+  Find_Program(LSPCI lspci)
+  IF (NOT LSPCI)
+MESSAGE(FATAL_ERROR "Looking for lspci - not found")
+  ENDIF (NOT LSPCI)
+  EXECUTE_PROCESS(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh"
+  RESULT_VARIABLE SUPPORT_OCL20_DEVICE
+  OUTPUT_VARIABLE PCI_ID_NOT_USED)
+
+  IF (NOT SUPPORT_OCL20_DEVICE EQUAL 1)
+MESSAGE(FATAL_ERROR "Only SKL and newer devices support OpenCL 2.0 now, 
your device don't support.")
+  ENDIF (NOT SUPPORT_OCL20_DEVICE EQUAL 1)
+
+  IF (NOT HAVE_DRM_INTEL_BO_SET_SOFTPIN)
+MESSAGE(FATAL_ERROR "Please update libdrm to version 2.4.66 or later to 
enable OpenCL 2.0.")
+  ENDIF (NOT HAVE_DRM_INTEL_BO_SET_SOFTPIN)
+
+  IF (LLVM_VERSION_NODOT VERSION_LESS 39)
+MESSAGE(FATAL_ERROR "Please update LLVM to version 3.9 or later to enable 
OpenCL 2.0.")
+  ENDIF (LLVM_VERSION_NODOT VERSION_LESS 39)
+
+  ADD_DEFINITIONS(-DENABLE_OPENCL_20)
+ENDIF(ENABLE_OPENCL_20)
+
+set (LIBCL_DRIVER_VERSION_MAJOR 1)
+set (LIBCL_DRIVER_VERSION_MINOR 3)
+if (ENABLE_OPENCL_20)
+  set (LIBCL_C_VERSION_MAJOR 2)
+  set (LIBCL_C_VERSION_MINOR 0)
+else (ENABLE_OPENCL_20)
+  set (LIBCL_C_VERSION_MAJOR 1)
+  set (LIBCL_C_VERSION_MINOR 2)
+endif (ENABLE_OPENCL_20)
+configure_file (
+  "src/OCLConfig.h.in"
+  "src/OCLConfig.h"
+)
+
+
 OPTION(BUILD_EXAMPLES "Build examples" OFF)
 IF(BUILD_EXAMPLES)
 IF(NOT X11_FOUND)
diff --git a/GetGenID.sh b/GetGenID.sh
index a0e5f85..5e5cafd 100755
--- a/GetGenID.sh
+++ b/GetGenID.sh
@@ -12,34 +12,50 @@ genpciid+=(0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26 
0d0b 0d1b 0d2b 0d0e 0d1e
 genpciid+=(1602 1606 160a 160d 160e 1612 1616 161a 161d 161e 1622 1626 162a 
162d 162e)
 #BSW
 genpciid+=(22b0 22b1 22b2 22b3)
+#Only enable OpenCL 2.0 after SKL.
 #SKL
-genpciid+=(1906 1916 1926 190e 191e 1902 1912 1932 190b 191b 192b 193b 190a 
191a 192a 193a)
+genpciid_20=(1906 1916 1926 190e 191e 1902 1912 1932 190b 191b 192b 193b 190a 
191a 192a 193a)
 #BXT
-genpciid+=(5a84 5a85)
+genpciid_20+=(5a84 5a85 1a84 1a85)
 #KBL
-genpciid+=(5906 5916 5926 5913 5921 5923 5927 5902 5912 5917)
-genpciid+=(590b 591b 593b 5908 590e 591e 5915 590a 591a 591d)
+genpciid_20+=(5906 5916 5926 5913 5921 5923 5927 5902 5912 5917)
+genpciid_20+=(590b 591b 593b 5908 590e 591e 5915 590a 591a 591d)
 pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F ] 
'{print $1}'))
 n=${#pciid[*]}
 i=0
 m=${#genpciid[*]}
+t=${#genpciid_20[*]}
 j=0
 while [ $i -lt $n ]
 do
-id1=${pciid[$i]}
-let j=0
+  id1=${pciid[$i]}
+  let j=0
 
-while [ $j -lt $m ]
-do
-   id2=${genpciid[$j]}
+  while [ $j -lt $m ]
+  do
+id2=${genpciid[$j]}
 
-   if [ ${id1} == ${id2} ]
-   then
-   echo ${id1}
-   exit 0
-   fi
-   let j=j+1
-done
+if [ ${id1} == ${id2} ]
+then
+  echo ${id1}
+  exit 0
+fi
+let j=j+1
+  done
 
-let i=i+1
+  let j=0
+  while [ $j -lt $t ]
+  do
+id2=${genpciid_20[$j]}
+
+if [ ${id1} == ${id2} ]
+then
+  echo ${id1}
+  exit 1
+fi
+let j=j+1
+  done
+
+  let i=i+1
 done
+exit -1
-- 
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 14/19] OCL20: handle device enqueue helper functions in the backend.

2016-11-28 Thread Yang Rong
Add useDeviceEnqueue to the kernel to indicate whether the kernel uses
device enqueue or not.

V2: Remove and correct debug info.
Signed-off-by: Yang Rong 
Reviewed-by: Pan Xiuli 
Reviewed-by: Ruiling Song 
---
 backend/src/backend/context.cpp|  1 +
 backend/src/backend/program.cpp|  4 ++-
 backend/src/backend/program.hpp| 15 +++
 backend/src/ir/function.cpp|  2 +-
 backend/src/ir/function.hpp|  7 +
 backend/src/llvm/llvm_gen_backend.cpp  | 43 +++---
 backend/src/llvm/llvm_gen_ocl_function.hxx |  5 
 7 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 8174954..e9ddd17 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -393,6 +393,7 @@ namespace gbe
 if(this->kernel != NULL) {
   this->kernel->scratchSize = 
this->alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
   this->kernel->ctx = this;
+  this->kernel->setUseDeviceEnqueue(fn.getUseDeviceEnqueue());
 }
 return this->kernel;
   }
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index e0107dc..dcbaaf4 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -89,7 +89,8 @@ namespace gbe {
   Kernel::Kernel(const std::string &name) :
 name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), 
useSLM(false),
 slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL), 
printfSet(NULL),
-profilingInfo(NULL) {}
+profilingInfo(NULL), useDeviceEnqueue(false) {}
+
   Kernel::~Kernel(void) {
 if(ctx) GBE_DELETE(ctx);
 if(samplerSet) GBE_DELETE(samplerSet);
@@ -181,6 +182,7 @@ namespace gbe {
   bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
 constantSet = new ir::ConstantSet(unit.getConstantSet());
 relocTable = new ir::RelocTable(unit.getRelocTable());
+blockFuncs = unit.blockFuncs;
 const auto &set = unit.getFunctionSet();
 const uint32_t kernelNum = set.size();
 if (OCL_OUTPUT_GEN_IR) std::cout << unit;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 35ebad5..1aff8b9 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -232,6 +232,12 @@ namespace gbe {
 virtual uint32_t serializeToBin(std::ostream& outs);
 virtual uint32_t deserializeFromBin(std::istream& ins);
 virtual void printStatus(int indent, std::ostream& outs);
+/*! Does kernel use device enqueue */
+INLINE bool getUseDeviceEnqueue(void) const { return 
this->useDeviceEnqueue; }
+/*! Change the device enqueue info of the function */
+INLINE bool setUseDeviceEnqueue(bool useDeviceEnqueue) {
+  return this->useDeviceEnqueue = useDeviceEnqueue;
+}
 
   protected:
 friend class Context;  //!< Owns the kernels
@@ -254,6 +260,7 @@ namespace gbe {
 ir::ProfilingInfo *profilingInfo;  //!< Copy from the corresponding 
function.
 uint32_t compileWgSize[3]; //!< required work group size by kernel 
attribute.
 std::string functionAttributes; //!< function attribute qualifiers 
combined.
+bool useDeviceEnqueue;  //!< Has device enqueue?
 GBE_CLASS(Kernel); //!< Use custom allocators
   };
 
@@ -290,6 +297,12 @@ namespace gbe {
   }
   return kernel;
 }
+
+const char *getDeviceEnqueueKernelName(uint32_t index) const {
+  if(index >= blockFuncs.size())
+return NULL;
+  return blockFuncs[index].c_str();
+}
 /*! Build a program from a ir::Unit */
 bool buildFromUnit(const ir::Unit &unit, std::string &error);
 /*! Buils a program from a LLVM source code */
@@ -336,6 +349,8 @@ namespace gbe {
 ir::ConstantSet *constantSet;
 /*! relocation table */
 ir::RelocTable *relocTable;
+/*! device enqueue functions */
+vector blockFuncs;
 /*! Use custom allocators */
 GBE_CLASS(Program);
   };
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 29be0a4..4c19a42 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -44,7 +44,7 @@ namespace ir {
 
   Function::Function(const std::string &name, const Unit &unit, Profile 
profile) :
 name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), 
slmSize(0), stackSize(0),
-wgBroadcastSLM(-1), tidMapSLM(-1)
+wgBroadcastSLM(-1), tidMapSLM(-1), useDeviceEnqueue(false)
   {
 initProfile(*this);
 samplerSet = GBE_NEW(SamplerSet);
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 8582508..5fcb14a 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -557,6 +557,12 @@ namespace ir {
 /*! Output the control flow graph to .dot file */
 void outputCFG();
 uint32_t getOclVersion(void) const;
+/*! Does it use device enqueue */
+INLINE bool 

[Beignet] [PATCH 09/19] OCL20: Add __OPENCL_VERSION__ and CL_VERSION_2_0 define.

2016-11-28 Thread Yang Rong
Because the data layouts of spir and spir64 are different, copy the ll files
and change the data layout and triple to spir64.
Also correct the ocl2.0 include typo.

Signed-off-by: Yang Rong 
---
 backend/src/libocl/include/ocl.h|  3 +-
 backend/src/libocl/include/ocl_types.h  |  8 +++-
 backend/src/libocl/src/ocl_barrier_20.ll| 25 +++
 backend/src/libocl/src/ocl_clz_20.ll| 65 +
 backend/src/libocl/src/ocl_geometric.cl |  4 ++
 backend/src/libocl/src/ocl_image.cl |  4 ++
 backend/src/libocl/tmpl/ocl_defines.tmpl.h  |  7 +++-
 backend/src/libocl/tmpl/ocl_math_20.tmpl.cl |  2 +-
 backend/src/libocl/tmpl/ocl_math_20.tmpl.h  |  4 +-
 9 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 backend/src/libocl/src/ocl_barrier_20.ll
 create mode 100644 backend/src/libocl/src/ocl_clz_20.ll

diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index e6f2567..677d2d4 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -89,7 +89,6 @@
 #include "ocl_geometric.h"
 #include "ocl_image.h"
 #include "ocl_integer.h"
-#include "ocl_math.h"
 #include "ocl_memcpy.h"
 #include "ocl_memset.h"
 #include "ocl_misc.h"
@@ -100,9 +99,11 @@
 #include "ocl_vload_20.h"
 #include "ocl_atom_20.h"
 #include "ocl_pipe.h"
+#include "ocl_math_20.h"
 #else
 #include "ocl_vload.h"
 #include "ocl_atom.h"
+#include "ocl_math.h"
 #endif
 #include "ocl_workitem.h"
 #include "ocl_simd.h"
diff --git a/backend/src/libocl/include/ocl_types.h 
b/backend/src/libocl/include/ocl_types.h
index 824262d..327624b 100644
--- a/backend/src/libocl/include/ocl_types.h
+++ b/backend/src/libocl/include/ocl_types.h
@@ -47,8 +47,12 @@ typedef unsigned int uint;
 typedef unsigned long ulong;
 typedef __typeof__(sizeof(int)) size_t;
 typedef __typeof__((int *)0-(int *)0) ptrdiff_t;
-typedef signed int intptr_t;
-typedef unsigned int uintptr_t;
+#define __int_t_type(a,b,c) a##b##c
+#define __int_type(type,n) __int_t_type(type,n,_TYPE__)
+typedef __int_type(__INT,__INTPTR_WIDTH__) intptr_t;
+typedef __int_type(__UINT,__INTPTR_WIDTH__) uintptr_t;
+#undef __int_type
+#undef __int_t_type
 
 /
 // OpenCL address space
diff --git a/backend/src/libocl/src/ocl_barrier_20.ll 
b/backend/src/libocl/src/ocl_barrier_20.ll
new file mode 100644
index 000..8935076
--- /dev/null
+++ b/backend/src/libocl/src/ocl_barrier_20.ll
@@ -0,0 +1,25 @@
+;XXX FIXME as llvm can't use macros, we hardcoded 3, 1, 2
+;here, we may need to use a more grace way to handle this type
+;of values latter.
+;#define CLK_LOCAL_MEM_FENCE  (1 << 0)
+;#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+target datalayout = 
"e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+declare i32 @_get_local_mem_fence() nounwind alwaysinline
+declare i32 @_get_global_mem_fence() nounwind alwaysinline
+declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_debugwait() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier(i32) nounwind alwaysinline noduplicate
+
+define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
+  call void @__gen_ocl_barrier(i32 %flags)
+  ret void
+}
+
+define void @_Z9debugwaitv() nounwind noduplicate alwaysinline {
+  call void @__gen_ocl_debugwait()
+  ret void
+}
diff --git a/backend/src/libocl/src/ocl_clz_20.ll 
b/backend/src/libocl/src/ocl_clz_20.ll
new file mode 100644
index 000..19f4e35
--- /dev/null
+++ b/backend/src/libocl/src/ocl_clz_20.ll
@@ -0,0 +1,65 @@
+target datalayout = 
"e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @clz_s8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+  ret i8 %call
+}
+
+define i8 @clz_u8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+  ret i8 %call
+}
+
+define i16 @clz_s16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+  ret i16 %call
+}
+
+define i16 @clz_u16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+  ret i16 %call
+}
+
+define i32 @clz_s32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %call
+}
+
+define i32 @clz_u32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %call
+}
+
+define i64 @clz_s64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %

[Beignet] [PATCH 13/19] OCL20: add ir register enqueuebufptr for enqueue global buffer.

2016-11-28 Thread Yang Rong
Signed-off-by: Yang Rong 
Reviewed-by: Pan Xiuli 
---
 backend/src/backend/program.h | 9 -
 backend/src/ir/profile.cpp| 3 ++-
 backend/src/ir/profile.hpp| 3 ++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 6373282..e601c97 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -116,6 +116,7 @@ enum gbe_curbe_type {
   GBE_CURBE_THREAD_ID,
   GBE_CURBE_CONSTANT_ADDRSPACE,
   GBE_CURBE_STACK_SIZE,
+  GBE_CURBE_ENQUEUE_BUF_POINTER,
   GBE_GEN_REG,
 };
 
@@ -301,6 +302,9 @@ extern gbe_program_get_kernel_by_name_cb 
*gbe_program_get_kernel_by_name;
 typedef gbe_kernel (gbe_program_get_kernel_cb)(gbe_program, uint32_t ID);
 extern gbe_program_get_kernel_cb *gbe_program_get_kernel;
 
+typedef const char* 
(gbe_program_get_device_enqueue_kernel_name_cb)(gbe_program, uint32_t ID);
+extern gbe_program_get_device_enqueue_kernel_name_cb 
*gbe_program_get_device_enqueue_kernel_name;
+
 /*! Get the kernel name */
 typedef const char *(gbe_kernel_get_name_cb)(gbe_kernel);
 extern gbe_kernel_get_name_cb *gbe_kernel_get_name;
@@ -373,9 +377,12 @@ extern gbe_kernel_use_slm_cb *gbe_kernel_use_slm;
 /*! Get slm size needed for kernel local variables */
 typedef int32_t (gbe_kernel_get_slm_size_cb)(gbe_kernel);
 extern gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size;
-
+/*! Get the kernel's opencl version. */
 typedef uint32_t (gbe_kernel_get_ocl_version_cb)(gbe_kernel);
 extern gbe_kernel_get_ocl_version_cb *gbe_kernel_get_ocl_version;
+/* Kernel use device enqueue or not.  */
+typedef uint32_t (gbe_kernel_use_device_enqueue_cb)(gbe_kernel);
+extern gbe_kernel_use_device_enqueue_cb *gbe_kernel_use_device_enqueue;
 
 /*mutex to lock global llvmcontext access.*/
 extern void acquireLLVMContextLock();
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index dbd2599..212af0d 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -50,7 +50,7 @@ namespace ir {
 "profiling_timestamps4",
 "threadid",
 "constant_addrspace_start",
-"stack_size"
+"stack_size", "enqueue_buffer_pointer",
 };
 
 #if GBE_DEBUG
@@ -107,6 +107,7 @@ namespace ir {
   DECL_NEW_REG(FAMILY_DWORD, threadid, 1, GBE_CURBE_THREAD_ID);
   DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1, 
GBE_CURBE_CONSTANT_ADDRSPACE);
   DECL_NEW_REG(FAMILY_QWORD, stacksize, 1, GBE_CURBE_STACK_SIZE);
+  DECL_NEW_REG(FAMILY_QWORD, enqueuebufptr, 1, 
GBE_CURBE_ENQUEUE_BUF_POINTER);
 }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 3494beb..ebd5142 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -81,7 +81,8 @@ namespace ir {
 static const Register threadid = Register(37); // the thread id of this 
thread.
 static const Register constant_addrspace = Register(38);  // starting 
address of program-scope constant
 static const Register stacksize = Register(39); // stack buffer total size
-static const uint32_t regNum = 40; // number of special 
registers
+static const Register enqueuebufptr = Register(40); // enqueue buffer 
address .
+static const uint32_t regNum = 41; // number of special 
registers
 extern const char *specialRegMean[];   // special register name.
   } /* namespace ocl */
 
-- 
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 07/19] OCL20: Add read_write image type of image apis.

2016-11-28 Thread Yang Rong
Use macros to declare the read_image and write_image built-ins.
Also add the read_write helper functions.

Signed-off-by: Yang Rong 
---
 backend/src/libocl/include/ocl_image.h | 222 -
 backend/src/libocl/src/ocl_image.cl| 214 ---
 2 files changed, 305 insertions(+), 131 deletions(-)

diff --git a/backend/src/libocl/include/ocl_image.h 
b/backend/src/libocl/include/ocl_image.h
index cdb3411..5a679aa 100644
--- a/backend/src/libocl/include/ocl_image.h
+++ b/backend/src/libocl/include/ocl_image.h
@@ -20,28 +20,77 @@
 
 #include "ocl_types.h"
 
-OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, const sampler_t 
sampler, int coord);
-OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, const sampler_t 
sampler, float coord);
-OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, int coord);
-OVERLOADABLE void write_imagei(write_only image1d_t cl_image, int coord, int4 
color);
-OVERLOADABLE void write_imagei(write_only image1d_t cl_image, float coord, 
int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, const sampler_t 
sampler, int coord);
-OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, const sampler_t 
sampler, float coord);
-OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, int coord);
-OVERLOADABLE void write_imageui(write_only image1d_t cl_image, int coord, 
uint4 color);
-OVERLOADABLE void write_imageui(write_only image1d_t cl_image, float coord, 
uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, const sampler_t 
sampler, int coord);
-OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, const sampler_t 
sampler, float coord);
-OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, int coord);
-OVERLOADABLE void write_imagef(write_only image1d_t cl_image, int coord, 
float4 color);
-OVERLOADABLE void write_imagef(write_only image1d_t cl_image, float coord, 
float4 color);
-OVERLOADABLE int4 read_imagei(read_only image1d_buffer_t cl_image, int coord);
-OVERLOADABLE void write_imagei(write_only image1d_buffer_t cl_image, int 
coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image1d_buffer_t cl_image, int 
coord);
-OVERLOADABLE void write_imageui(write_only image1d_buffer_t cl_image, int 
coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image1d_buffer_t cl_image, float 
coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image1d_buffer_t cl_image, int 
coord);
-OVERLOADABLE void write_imagef(write_only image1d_buffer_t cl_image, int 
coord, float4 color);
+#define int1 int
+#define float1 float
+
+#define DECL_IMAGE_READ_SAMPLE_RETTYPE(IMG_TYPE, DATA_YPE, SUFFIX, N) \
+  OVERLOADABLE DATA_YPE read_image ## SUFFIX(IMG_TYPE cl_image, const 
sampler_t sampler, int##N coord); \
+  OVERLOADABLE DATA_YPE read_image ## SUFFIX(IMG_TYPE cl_image, const 
sampler_t sampler, float##N coord);
+
+#define DECL_IMAGE_READ_NO_SAMPLE_RETTYPE(IMG_TYPE, DATA_YPE, SUFFIX, N) \
+  OVERLOADABLE DATA_YPE read_image ## SUFFIX(IMG_TYPE cl_image, int##N coord);
+
+#define DECL_IMAGE_WRITE_RETTYPE(IMG_TYPE, DATA_YPE, SUFFIX, N) \
+  OVERLOADABLE void write_image ## SUFFIX(IMG_TYPE cl_image, int##N coord, 
DATA_YPE color);
+
+#define DECL_IMAGE_TYPE_READ_NO_SAMPLE(IMG_TYPE, N)\
+DECL_IMAGE_READ_NO_SAMPLE_RETTYPE(IMG_TYPE, int4, i, N) \
+DECL_IMAGE_READ_NO_SAMPLE_RETTYPE(IMG_TYPE, uint4, ui, N) \
+DECL_IMAGE_READ_NO_SAMPLE_RETTYPE(IMG_TYPE, float4, f, N)
+
+#define DECL_IMAGE_TYPE_READ_SAMPLE(IMG_TYPE, N)\
+DECL_IMAGE_READ_SAMPLE_RETTYPE(IMG_TYPE, int4, i, N) \
+DECL_IMAGE_READ_SAMPLE_RETTYPE(IMG_TYPE, uint4, ui, N) \
+DECL_IMAGE_READ_SAMPLE_RETTYPE(IMG_TYPE, float4, f, N)
+
+#define DECL_IMAGE_TYPE_WRITE(IMG_TYPE, N)\
+DECL_IMAGE_WRITE_RETTYPE(IMG_TYPE, int4, i, N) \
+DECL_IMAGE_WRITE_RETTYPE(IMG_TYPE, uint4, ui, N) \
+DECL_IMAGE_WRITE_RETTYPE(IMG_TYPE, float4, f, N)
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#define DECL_IMAGE(IMG_TYPE, N) \
+DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_only IMG_TYPE, N) \
+DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_write IMG_TYPE, N) \
+DECL_IMAGE_TYPE_READ_SAMPLE(read_only IMG_TYPE, N) \
+DECL_IMAGE_TYPE_WRITE(write_only IMG_TYPE, N) \
+DECL_IMAGE_TYPE_WRITE(read_write IMG_TYPE, N)
+#else
+#define DECL_IMAGE(IMG_TYPE, N) \
+DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_only IMG_TYPE, N) \
+DECL_IMAGE_TYPE_READ_SAMPLE(read_only IMG_TYPE, N) \
+DECL_IMAGE_TYPE_WRITE(write_only IMG_TYPE, N)
+#endif
+
+DECL_IMAGE(image1d_t, 1)
+DECL_IMAGE(image2d_t, 2)
+DECL_IMAGE(image1d_array_t, 2)
+DECL_IMAGE(image3d_t, 3)
+DECL_IMAGE(image3d_t, 4)
+DECL_IMAGE(image2d_array_t, 3)
+DECL_IMAGE(image2d_array_t, 4)
+
+#undef DECL_IMAGE
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#define DECL_IMAGE(IMG_TYPE, N) \
+DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_only IMG_TYPE, N) \
+DECL_IMAGE_TYPE_READ_NO_SAMPLE(read_write IMG_TYPE, N) \
+DECL_IMAGE_TYPE_WRITE(write_only IMG_T

[Beignet] [PATCH 12/19] OCL20: add device enqueue builtins.

2016-11-28 Thread Yang Rong
Add three gen helper function calls for the enqueue builtins.
Store the ndrange info on the stack, and write the device enqueue info
to the auxiliary global buffer.
Store the slm information in the global buffer.
Skip all events: because we run device enqueues in order, all of the
parent's enqueues are guaranteed to have finished.

Signed-off-by: Yang Rong 
Reviewed-by: Pan Xiuli 
---
 backend/src/backend/program.cpp  |   5 +-
 backend/src/libocl/CMakeLists.txt|   4 +-
 backend/src/libocl/include/ocl.h |   1 +
 backend/src/libocl/include/ocl_enqueue.h |  90 
 backend/src/libocl/src/ocl_enqueue.cl| 238 +++
 5 files changed, 334 insertions(+), 4 deletions(-)
 create mode 100644 backend/src/libocl/include/ocl_enqueue.h
 create mode 100644 backend/src/libocl/src/ocl_enqueue.cl

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 413f397..e0107dc 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -690,9 +690,10 @@ namespace gbe {
 args.push_back("-x");
 args.push_back("cl");
 args.push_back("-triple");
-if (oclVersion >= 200)
+if (oclVersion >= 200) {
   args.push_back("spir64");
-else
+  args.push_back("-fblocks");
+} else
   args.push_back("spir");
 #endif /* LLVM_VERSION_MINOR <= 2 */
 args.push_back("stringInput.cl");
diff --git a/backend/src/libocl/CMakeLists.txt 
b/backend/src/libocl/CMakeLists.txt
index a4f575f..c68ecb0 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -68,7 +68,7 @@ FOREACH(M ${OCL_COPY_MODULES_12})
 COPY_THE_SOURCE(OCL_SOURCE_FILES_12 ${M})
 ENDFOREACH(M)
 
-SET (OCL_COPY_MODULES_20 ocl_vload_20 ocl_atom_20 ocl_pipe)
+SET (OCL_COPY_MODULES_20 ocl_vload_20 ocl_atom_20 ocl_pipe ocl_enqueue)
 FOREACH(M ${OCL_COPY_MODULES_20})
 COPY_THE_HEADER(${M})
 COPY_THE_SOURCE(OCL_SOURCE_FILES_20 ${M})
@@ -157,7 +157,7 @@ FOREACH(M ${OCL_BASH_GENERATED_MODULES})
 ENDFOREACH(M) 
 
 SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -triple spir 
-cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL1.2" 
-D__OPENCL_C_VERSION__=120)
-SET (CLANG_OCL_FLAGS_20 -fno-builtin -ffp-contract=off -triple spir64 
-cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL2.0" 
-D__OPENCL_C_VERSION__=200)
+SET (CLANG_OCL_FLAGS_20 -fno-builtin -ffp-contract=off -triple spir64 
-cl-kernel-arg-info -fblocks -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND 
"-cl-std=CL2.0" -D__OPENCL_C_VERSION__=200)
 
 MACRO(ADD_CL_TO_BC_TARGET _file _output _clang_flag)
 # CMake seems can not add pattern rule, use MACRO to replace.
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index 677d2d4..2548cb7 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -100,6 +100,7 @@
 #include "ocl_atom_20.h"
 #include "ocl_pipe.h"
 #include "ocl_math_20.h"
+#include "ocl_enqueue.h"
 #else
 #include "ocl_vload.h"
 #include "ocl_atom.h"
diff --git a/backend/src/libocl/include/ocl_enqueue.h 
b/backend/src/libocl/include/ocl_enqueue.h
new file mode 100644
index 000..6479df7
--- /dev/null
+++ b/backend/src/libocl/include/ocl_enqueue.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ *
+ */
+
+#ifndef __OCL_ENQUEUE_H__
+#define __OCL_ENQUEUE_H__
+
+#include "ocl_types.h"
+#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0
+#define CLK_ENQUEUE_FLAGS_NO_WAIT 1
+#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 2
+#define CLK_SUCCESS 0
+#define CL_COMPLETE 0
+#define CLK_PROFILING_COMMAND_EXEC_TIME 0
+
+struct ndrange_info_t {
+  int type;
+  int global_work_size[3];
+  int local_work_size[3];
+  int global_work_offset[3];
+};
+
+struct Block_literal {
+  void *isa; // initialized to &_NSConcreteStackBlock or 
&_NSConcreteGlobalBlock
+  int flags;
+  int reserved;
+  __global void (*invoke)(void *, ...);
+  struct Block_descriptor_1 {
+unsigned long int reserved; // NULL
+unsigned long int size; // sizeof(struct Block_literal_1)
+// optional helper functions
+void (*copy_helper)(void *dst, void *src); // IFF (1<<25)
+void (*dispose_helper)(void *src); // IFF (1<<25)
+// required ABI.2010.3.16
+const char *signature;

[Beignet] [PATCH 08/19] OCL20: add beignet_20.pch and beignet_20.bc.

2016-11-28 Thread Yang Rong
Always build beignet.bc and beignet.pch, and build beignet_20.bc and
beignet_20.pch when OpenCL 2.0 is enabled.

Signed-off-by: Yang Rong 
---
 backend/CMakeLists.txt |   3 +-
 backend/src/CMakeLists.txt |  12 
 backend/src/GBEConfig.h.in |   2 +
 backend/src/libocl/CMakeLists.txt  | 111 +++--
 backend/src/libocl/src/ocl_vload_20.cl |   2 +-
 5 files changed, 81 insertions(+), 49 deletions(-)

diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index 915d60f..57df8a5 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -50,7 +50,8 @@ set(LOCAL_GBE_OBJECT_DIR ${LOCAL_GBE_OBJECT_DIR} PARENT_SCOPE)
 set(LOCAL_INTERP_OBJECT_DIR ${LOCAL_INTERP_OBJECT_DIR} PARENT_SCOPE)
 
 set (GBE_BIN_GENERATER
- env OCL_BITCODE_LIB_PATH=${LOCAL_OCL_BITCODE_BIN} 
OCL_HEADER_FILE_DIR=${LOCAL_OCL_HEADER_DIR} 
OCL_PCH_PATH=${LOCAL_OCL_PCH_OBJECT})
+ env OCL_BITCODE_LIB_PATH=${LOCAL_OCL_BITCODE_BIN} 
OCL_HEADER_FILE_DIR=${LOCAL_OCL_HEADER_DIR} OCL_PCH_PATH=${LOCAL_OCL_PCH_OBJECT}
+ OCL_BITCODE_LIB_20_PATH=${LOCAL_OCL_BITCODE_BIN_20} 
OCL_PCH_20_PATH=${LOCAL_OCL_PCH_OBJECT_20})
 
 if (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
 set (GBE_BIN_GENERATER
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 6ff25e7..2af0844 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -3,6 +3,10 @@ set (OCL_HEADER_DIR "${BEIGNET_INSTALL_DIR}/include")
 set (OCL_PCH_OBJECT "${BEIGNET_INSTALL_DIR}/beignet.pch")
 set (GBE_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbe.so")
 set (INTERP_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbeinterp.so")
+if (ENABLE_OPENCL_20)
+set (OCL_BITCODE_BIN_20 "${BEIGNET_INSTALL_DIR}/beignet_20.bc")
+set (OCL_PCH_OBJECT_20 "${BEIGNET_INSTALL_DIR}/beignet_20.pch")
+endif (ENABLE_OPENCL_20)
 
 configure_file (
 "GBEConfig.h.in"
@@ -19,6 +23,10 @@ set (LOCAL_INTERP_OBJECT_DIR 
"${CMAKE_CURRENT_BINARY_DIR}/libgbeinterp.so" PAREN
 set (LOCAL_OCL_BITCODE_BIN "${OCL_OBJECT_DIR}/beignet.bc" PARENT_SCOPE)
 set (LOCAL_OCL_HEADER_DIR "${OCL_OBJECT_DIR}/include/" PARENT_SCOPE)
 set (LOCAL_OCL_PCH_OBJECT "${OCL_OBJECT_DIR}/beignet.local.pch" PARENT_SCOPE)
+if (ENABLE_OPENCL_20)
+set (LOCAL_OCL_BITCODE_BIN_20 "${OCL_OBJECT_DIR}/beignet_20.bc" PARENT_SCOPE)
+set (LOCAL_OCL_PCH_OBJECT_20 "${OCL_OBJECT_DIR}/beignet_20.local.pch" 
PARENT_SCOPE)
+endif (ENABLE_OPENCL_20)
 
 set (GBE_SRC
 ${ocl_blob_file}
@@ -197,6 +205,10 @@ endif ()
 install (TARGETS gbe LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
 install (FILES ${OCL_OBJECT_DIR}/beignet.bc DESTINATION ${BEIGNET_INSTALL_DIR})
 install (FILES ${OCL_OBJECT_DIR}/beignet.pch DESTINATION 
${BEIGNET_INSTALL_DIR})
+if (ENABLE_OPENCL_20)
+install (FILES ${OCL_OBJECT_DIR}/beignet_20.bc DESTINATION 
${BEIGNET_INSTALL_DIR})
+install (FILES ${OCL_OBJECT_DIR}/beignet_20.pch DESTINATION 
${BEIGNET_INSTALL_DIR})
+endif (ENABLE_OPENCL_20)
 install (FILES ${OCL_HEADER_FILES} DESTINATION ${BEIGNET_INSTALL_DIR}/include)
 endif (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
 
diff --git a/backend/src/GBEConfig.h.in b/backend/src/GBEConfig.h.in
index b5bec14..9514483 100644
--- a/backend/src/GBEConfig.h.in
+++ b/backend/src/GBEConfig.h.in
@@ -6,3 +6,5 @@
 #define OCL_BITCODE_BIN "@OCL_BITCODE_BIN@"
 #define OCL_HEADER_DIR "@OCL_HEADER_DIR@"
 #define OCL_PCH_OBJECT "@OCL_PCH_OBJECT@"
+#define OCL_BITCODE_BIN_20 "@OCL_BITCODE_BIN_20@"
+#define OCL_PCH_OBJECT_20 "@OCL_PCH_OBJECT_20@"
diff --git a/backend/src/libocl/CMakeLists.txt 
b/backend/src/libocl/CMakeLists.txt
index e828fd0..a4f575f 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -68,7 +68,7 @@ FOREACH(M ${OCL_COPY_MODULES_12})
 COPY_THE_SOURCE(OCL_SOURCE_FILES_12 ${M})
 ENDFOREACH(M)
 
-SET (OCL_COPY_MODULES_20 ocl_vload_20 ocl_atom ocl_atom_20 ocl_pipe)
+SET (OCL_COPY_MODULES_20 ocl_vload_20 ocl_atom_20 ocl_pipe)
 FOREACH(M ${OCL_COPY_MODULES_20})
 COPY_THE_HEADER(${M})
 COPY_THE_SOURCE(OCL_SOURCE_FILES_20 ${M})
@@ -156,15 +156,15 @@ FOREACH(M ${OCL_BASH_GENERATED_MODULES})
 GENERATE_SOURCE_BASH(${M})
 ENDFOREACH(M) 
 
+SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -triple spir 
-cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL1.2" 
-D__OPENCL_C_VERSION__=120)
+SET (CLANG_OCL_FLAGS_20 -fno-builtin -ffp-contract=off -triple spir64 
-cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL2.0" 
-D__OPENCL_C_VERSION__=200)
 
-SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -cl-kernel-arg-info 
-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL1.2")
-MACRO(ADD_CL_TO_BC_TARGET _file)
+MACRO(ADD_CL_TO_BC_TARGET _file _output _clang_flag)
 # CMake seems can not add pattern rule, use MACRO to replace.
-STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" 
"${OCL_OBJECT_DIR}/\\1.bc" output_name ${_file})
-ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+ADD_CUSTOM_COMMAND(OUTPUT ${_ou

[Beignet] [PATCH 10/19] OCL20: enable -cl-std=CL2.0.

2016-11-28 Thread Yang Rong
When building from source, get the OpenCL version from the options. Use
the spir64 triple if it is OpenCL 2.0.
Get the OpenCL version from the llvm module's metadata. If the OpenCL
version is 2.0, set the unit's pointer size to 64 bits before using
unit.getPointerSize().

Signed-off-by: Yang Rong 
---
 backend/src/backend/context.cpp|  2 +-
 backend/src/backend/gen_program.cpp|  6 ++--
 backend/src/backend/program.cpp| 56 +-
 backend/src/ir/unit.hpp|  3 +-
 backend/src/llvm/llvm_bitcode_link.cpp | 19 +---
 backend/src/llvm/llvm_gen_backend.cpp  |  3 ++
 backend/src/llvm/llvm_gen_backend.hpp  |  4 ++-
 backend/src/llvm/llvm_passes.cpp   | 25 +--
 backend/src/llvm/llvm_to_gen.cpp   |  5 +--
 9 files changed, 94 insertions(+), 29 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 1426e0c..8174954 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -345,7 +345,7 @@ namespace gbe
   Context::Context(const ir::Unit &unit, const std::string &name) :
 unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), 
dag(NULL), useDWLabel(false)
   {
-GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
+GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS || 
unit.getPointerSize() == ir::POINTER_64_BITS);
 this->liveness = GBE_NEW(ir::Liveness, const_cast(fn), 
true);
 this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
 // r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT
diff --git a/backend/src/backend/gen_program.cpp 
b/backend/src/backend/gen_program.cpp
index ebba7d4..1872b01 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -352,8 +352,10 @@ namespace gbe {
 #endif
 // if load 32 bit spir binary, the triple should be spir-unknown-unknown.
 llvm::Triple triple(module->getTargetTriple());
-if(triple.getArchName() == "spir" && triple.getVendorName() == "unknown" 
&& triple.getOSName() == "unknown"){
+if (triple.getArchName() == "spir" && triple.getVendorName() == "unknown" 
&& triple.getOSName() == "unknown"){
   module->setTargetTriple("spir");
+} else if (triple.getArchName() == "spir64" && triple.getVendorName() == 
"unknown" && triple.getOSName() == "unknown"){
+  module->setTargetTriple("spir64");
 }
 releaseLLVMContextLock();
 if(module == NULL){
@@ -525,7 +527,7 @@ namespace gbe {
   size_t stringSize,
   char *err,
   size_t *errSize,
-  const char *  options)
+  const char * options)
   {
 #ifdef GBE_COMPILER_AVAILABLE
 using namespace gbe;
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index c68d604..413f397 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -122,7 +122,10 @@ namespace gbe {
   IVAR(OCL_PROFILING_LOG, 0, 0, 1); // Int for different profiling types.
   BVAR(OCL_OUTPUT_BUILD_LOG, false);
 
-  bool Program::buildFromLLVMFile(const char *fileName, const void* module, 
std::string &error, int optLevel) {
+  bool Program::buildFromLLVMFile(const char *fileName,
+ const void* module,
+ std::string &error,
+ int optLevel) {
 ir::Unit *unit = new ir::Unit();
 llvm::Module * cloned_module = NULL;
 bool ret = false;
@@ -649,7 +652,7 @@ namespace gbe {
 #ifdef GBE_COMPILER_AVAILABLE
   static bool buildModuleFromSource(const char *source, llvm::Module** 
out_module, llvm::LLVMContext* llvm_ctx,
 std::string dumpLLVMFileName, std::string 
dumpSPIRBinaryName, std::vector& options, size_t stringSize, char 
*err,
-size_t *errSize) {
+size_t *errSize, uint32_t oclVersion) {
 // Arguments to pass to the clang frontend
 vector args;
 bool bFastMath = false;
@@ -687,7 +690,10 @@ namespace gbe {
 args.push_back("-x");
 args.push_back("cl");
 args.push_back("-triple");
-args.push_back("spir");
+if (oclVersion >= 200)
+  args.push_back("spir64");
+else
+  args.push_back("spir");
 #endif /* LLVM_VERSION_MINOR <= 2 */
 args.push_back("stringInput.cl");
 args.push_back("-ffp-contract=on");
@@ -829,6 +835,7 @@ namespace gbe {
 
 
   SVAR(OCL_PCH_PATH, OCL_PCH_OBJECT);
+  SVAR(OCL_PCH_20_PATH, OCL_PCH_OBJECT_20);
   SVAR(OCL_HEADER_FILE_DIR, OCL_HEADER_DIR);
   BVAR(OCL_OUTPUT_KERNEL_SOURCE, false);
 
@@ -842,10 +849,9 @@ namespace gbe {
  int& optLevel,
  size_t stringSize,
  

[Beignet] [PATCH 18/19] OCL20: add device enqueue test case.

2016-11-28 Thread Yang Rong
Signed-off-by: Yang Rong 
Reviewed-by: Pan Xiuli 
---
 kernels/compiler_device_enqueue.cl | 18 ++
 utests/CMakeLists.txt  |  3 ++-
 utests/compiler_device_enqueue.cpp | 36 
 3 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 kernels/compiler_device_enqueue.cl
 create mode 100644 utests/compiler_device_enqueue.cpp

diff --git a/kernels/compiler_device_enqueue.cl 
b/kernels/compiler_device_enqueue.cl
new file mode 100644
index 000..cb20142
--- /dev/null
+++ b/kernels/compiler_device_enqueue.cl
@@ -0,0 +1,18 @@
+void block_fn(__global uint* val)
+{
+  atomic_add(val, get_global_id(0));
+}
+
+kernel void compiler_device_enqueue(uint glob_size_arr, __global uint* val)
+{
+  size_t tid = get_global_id(0);
+
+  for(int i = 0; i < glob_size_arr; i++)
+  {
+ndrange_t ndrange = ndrange_1D(glob_size_arr);
+__global uint * v = val + tid;
+void (^kernelBlock)(void) = ^{ block_fn(v); };
+queue_t q = get_default_queue();
+enqueue_kernel(q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+  }
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 0ca7f77..969b695 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -306,7 +306,8 @@ if (ENABLE_OPENCL_20)
   compiler_sampler.cpp
   compiler_generic_pointer.cpp
   runtime_pipe_query.cpp
-  compiler_pipe_builtin.cpp)
+  compiler_pipe_builtin.cpp
+  compiler_device_enqueue.cpp)
 endif (ENABLE_OPENCL_20)
 
 if (NOT_BUILD_STAND_ALONE_UTEST)
diff --git a/utests/compiler_device_enqueue.cpp 
b/utests/compiler_device_enqueue.cpp
new file mode 100644
index 000..a9e3e2d
--- /dev/null
+++ b/utests/compiler_device_enqueue.cpp
@@ -0,0 +1,36 @@
+#include "utest_helper.hpp"
+
+// Unit test for OpenCL 2.0 device-side enqueue: runs the
+// compiler_device_enqueue kernel over n work-items and checks every
+// element of the output buffer against the expected accumulated value.
+void compiler_device_enqueue(void)
+{
+  const size_t n = 32;
+  const uint32_t global_sz = 3;
+  uint32_t result = 0;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_device_enqueue");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(uint32_t), &global_sz);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
+
+  // Zero exactly the n uint32_t elements the buffer was created with; the
+  // kernel accumulates into them with atomic_add, so they must start at 0.
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+    ((uint32_t *)buf_data[0])[i] = 0;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Expected value per element: the kernel loops global_sz times, each time
+  // enqueueing a 1D range of global_sz work-items that add their global id,
+  // i.e. (0 + 1 + ... + global_sz-1) * global_sz.
+  for (uint32_t i = 0; i < global_sz; ++i) {
+    result += i;
+  }
+  result *= global_sz;
+
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == result);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_device_enqueue);
-- 
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 01/19] GBE: fix ctz fail.

2016-11-28 Thread Yang Rong
LZD requires an unsigned 32-bit (UD) type.

Signed-off-by: Yang Rong 
---
 backend/src/llvm/llvm_gen_backend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index 397c721..dea031b 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -4628,7 +4628,7 @@ namespace gbe
   ir::Type revType = ir::TYPE_U32;
   ir::Register revTmp = ctx.reg(getFamily(revType));
   ctx.ALU1(ir::OP_BFREV, revType, revTmp, src);
-  ctx.ALU1(ir::OP_LZD, dstType, dst, revTmp);
+  ctx.ALU1(ir::OP_LZD, ir::TYPE_U32, dst, revTmp);
 }
   }
   break;
-- 
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 11/19] OCL20: add device enqueue helper functions in backend.

2016-11-28 Thread Yang Rong
These functions collect all block infos and convert unnamed calls to named
function calls. They also collect the device enqueue invoke functions, store
them in the unit, and set these functions as OpenCL kernel functions.
Because this changes the module's kernel functions, it must be called before
linking; otherwise, the built-in functions called in the invoke functions may
not be materialized.

Signed-off-by: Yang Rong 
Reviewed-by: Pan Xiuli 
---
 backend/src/CMakeLists.txt   |   1 +
 backend/src/ir/unit.hpp  |   1 +
 backend/src/llvm/llvm_device_enqueue.cpp | 414 +++
 backend/src/llvm/llvm_gen_backend.hpp|   3 +
 backend/src/llvm/llvm_to_gen.cpp |  11 +
 5 files changed, 430 insertions(+)
 create mode 100644 backend/src/llvm/llvm_device_enqueue.cpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 2af0844..7c1f4db 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -101,6 +101,7 @@ set (GBE_SRC
 llvm/ExpandUtils.cpp
 llvm/PromoteIntegers.cpp
 llvm/ExpandLargeIntegers.cpp
+llvm/llvm_device_enqueue.cpp
 llvm/StripAttributes.cpp
 llvm/llvm_to_gen.cpp
 llvm/llvm_loadstore_optimization.cpp
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 08dc646..46d7be7 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -47,6 +47,7 @@ namespace ir {
 typedef map FunctionSet;
 /*! Moved from printf pass */
 map printfs;
+vector blockFuncs;
 /*! Create an empty unit */
 Unit(PointerSize pointerSize = POINTER_32_BITS);
 /*! Release everything (*including* the function pointers) */
diff --git a/backend/src/llvm/llvm_device_enqueue.cpp 
b/backend/src/llvm/llvm_device_enqueue.cpp
new file mode 100644
index 000..ff6fbbb
--- /dev/null
+++ b/backend/src/llvm/llvm_device_enqueue.cpp
@@ -0,0 +1,414 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ *
+ */
+
+#include "llvm_includes.hpp"
+
+#include "ir/unit.hpp"
+#include "llvm_gen_backend.hpp"
+#include "ocl_common_defines.h"
+
+using namespace llvm;
+
+namespace gbe {
+  BitCastInst *isInvokeBitcast(Instruction *I) {
+BitCastInst* bt = dyn_cast(I);
+if (bt == NULL)
+  return NULL;
+
+Type* type = bt->getOperand(0)->getType();
+if(!type->isPointerTy())
+  return NULL;
+
+PointerType *pointerType = dyn_cast(type);
+Type *pointed = pointerType->getElementType();
+if(!pointed->isFunctionTy())
+  return NULL;
+
+Function *Fn = dyn_cast(bt->getOperand(0));
+if(Fn == NULL)
+  return NULL;
+
+/* This is a fake, to check the function bitcast is for block or not */
+std::string fnName = Fn->getName();
+if(fnName.find("_invoke") == std::string::npos)
+  return NULL;
+
+return bt;
+  }
+
+  /* Walk the use chain starting at 'arg' and retype every pointer-typed value
+   * reached so that it points into address space 1 (presumably __global --
+   * TODO confirm against the address space numbering used by the backend).
+   * Users matched by the isa<> filter below are not followed any further. */
+  void mutateArgAddressSpace(Argument *arg)
+  {
+    std::listWorkList;
+    WorkList.push_back(arg);
+
+    while(!WorkList.empty()) {
+      Value *v = WorkList.front();
+      // Pop right away: the 'continue' below used to skip the pop at the end
+      // of the loop body, which left a non-pointer value at the head of the
+      // list forever (endless loop, unbounded list growth).
+      WorkList.pop_front();
+
+      for (Value::use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
+        // After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
+        // which is more straightforward.
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+        User *theUser = *iter;
+#else
+        User *theUser = iter->getUser();
+#endif
+        // Be careful with sub operations: do not follow these users.
+        if (isa(theUser) || isa(theUser))
+          continue;
+
+        WorkList.push_back(theUser);
+      }
+
+      PointerType *ty = dyn_cast(v->getType());
+      if(ty == NULL) continue;   // expected to be a single argument of private pointer type
+      ty = PointerType::get(ty->getPointerElementType(), 1);
+      v->mutateType(ty);
+    }
+  }
+
+  Function* setFunctionAsKernel(Module *mod, Function *Fn)
+  {
+LLVMContext &Context = mod->getContext();
+Type *intTy = IntegerType::get(mod->getContext(), 32);
+SmallVector kernelMDArgs;
+
+// MDNode for the kernel argument address space qualifiers.
+SmallVector addressQuals;
+
+// MDNode for the kernel argument access qualifiers (images only).
+SmallVector accessQuals;
+
+// MDNode for the kernel argument type names.
+SmallVector argTypeNames;
+
+// MDNode for the kernel argument base type names.
+SmallVector a

[Beignet] [PATCH 04/19] GBE: remove image type's access qual from image type name.

2016-11-28 Thread Yang Rong
The OpenCL spec requires that the type name not include the access qualifier, so remove it.

Signed-off-by: Yang Rong 
---
 backend/src/llvm/llvm_gen_backend.cpp | 8 
 1 file changed, 8 insertions(+)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index dea031b..17eaec6 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2351,6 +2351,14 @@ namespace gbe
 }
 if(typeNameNode) {
   llvmInfo.typeName = 
(cast(typeNameNode->getOperand(opID)))->getString();
+  //LLVM 3.9 image's type name include access qual, don't match OpenCL 
spec, erase them.
+  std::vector filters = {"__read_only ", "__write_only "};
+  for (uint32_t i = 0; i < filters.size(); i++) {
+size_t pos = llvmInfo.typeName.find(filters[i]);
+if (pos != std::string::npos) {
+  llvmInfo.typeName = llvmInfo.typeName.erase(pos, 
filters[i].length());
+}
+  }
 }
 if(typeBaseNameNode){
   llvmInfo.typeBaseName = 
(cast(typeBaseNameNode->getOperand(opID)))->getString();
-- 
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 03/19] GBE: don't use call->getCalledFunction() to decide the materialize function.

2016-11-28 Thread Yang Rong
If the call inst is a bitcast value, call->getCalledFunction() will
return NULL. Use the call->getCalledValue()->stripPointerCasts()->getName()
to check.

Signed-off-by: Yang Rong 
---
 backend/src/llvm/llvm_bitcode_link.cpp | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/src/llvm/llvm_bitcode_link.cpp 
b/backend/src/llvm/llvm_bitcode_link.cpp
index a3f9886..0b01929 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -86,11 +86,11 @@ namespace gbe
 }
 
 llvm::Function * callFunc = call->getCalledFunction();
-if(!callFunc) {
-  continue;
-}
+//if(!callFunc) {
+//  continue;
+//}
 
-if (callFunc->getIntrinsicID() != 0)
+if (callFunc && callFunc->getIntrinsicID() != 0)
   continue;
 
 std::string fnName = 
call->getCalledValue()->stripPointerCasts()->getName();
-- 
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 05/19] Runtime: fix fill image event assert and some SVM rebase error.

2016-11-28 Thread Yang Rong
Also remove the useless function cl_context_add_svm.

Signed-off-by: Yang Rong 
---
 src/cl_api_mem.c|  2 +-
 src/cl_context.c| 14 +-
 src/cl_context.h|  2 --
 src/cl_event.c  |  2 +-
 src/cl_mem.c| 35 +--
 src/cl_mem.h|  2 +-
 src/intel/intel_gpgpu.c | 41 +
 7 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 7314a48..36a1421 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -2215,7 +2215,7 @@ clEnqueueFillImage(cl_command_queue command_queue,
   break;
 }
 
-err = cl_image_fill(command_queue, fill_color, image, origin, region);
+err = cl_image_fill(command_queue, e, fill_color, image, origin, region);
 if (err != CL_SUCCESS) {
   break;
 }
diff --git a/src/cl_context.c b/src/cl_context.c
index 4417e3b..c2adf3f 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -81,19 +81,6 @@ cl_context_add_mem(cl_context ctx, cl_mem mem) {
 }
 
 LOCAL void
-cl_context_add_svm(cl_context ctx, cl_mem mem) {
-  assert(mem->ctx == NULL);
-  cl_context_add_ref(ctx);
-
-  CL_OBJECT_LOCK(ctx);
-  list_add_tail(&mem->base.node, &ctx->svm_objects);
-  ctx->svm_object_num++;
-  CL_OBJECT_UNLOCK(ctx);
-
-  mem->ctx = ctx;
-}
-
-LOCAL void
 cl_context_remove_mem(cl_context ctx, cl_mem mem) {
   assert(mem->ctx == ctx);
   CL_OBJECT_LOCK(ctx);
@@ -476,6 +463,7 @@ cl_context_get_svm_from_ptr(cl_context ctx, const void * p)
 buf = (cl_mem)list_entry(pos, _cl_base_object, node);
 if(buf->host_ptr == NULL) continue;
 if(buf->is_svm == 0) continue;
+if(buf->type != CL_MEM_SVM_TYPE) continue;
 if((size_t)buf->host_ptr <= (size_t)p &&
(size_t)p < ((size_t)buf->host_ptr + buf->size))
   return buf;
diff --git a/src/cl_context.h b/src/cl_context.h
index 268e7b9..caa57dc 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -107,8 +107,6 @@ struct _cl_context {
   cl_uint queue_cookie; /* Cookie will change every time we change 
queue list. */
   list_head mem_objects;/* All memory object currently allocated */
   cl_uint mem_object_num;   /* All memory number currently allocated */
-  list_head svm_objects;/* All svm object currently allocated */
-  cl_uint svm_object_num;   /* All svm number currently allocated */
   list_head samplers;   /* All sampler object currently allocated 
*/
   cl_uint sampler_num;  /* All sampler number currently allocated 
*/
   list_head events; /* All event object currently allocated */
diff --git a/src/cl_event.c b/src/cl_event.c
index 0804dbd..58ec2c5 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -101,7 +101,7 @@ cl_event_new(cl_context ctx, cl_command_queue queue, 
cl_command_type type,
   list_init(&e->callbacks);
   list_init(&e->enqueue_node);
 
-  assert(type >= CL_COMMAND_NDRANGE_KERNEL && type <= CL_COMMAND_FILL_IMAGE);
+  assert(type >= CL_COMMAND_NDRANGE_KERNEL && type <= CL_COMMAND_SVM_UNMAP);
   e->event_type = type;
   if (type == CL_COMMAND_USER) {
 e->status = CL_SUBMITTED;
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 798daaf..f856ba3 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -345,19 +345,19 @@ cl_mem_allocate(enum cl_mem_type type,
 mem->is_svm = 1;
   /* userptr not support tiling */
   if (!is_tiled) {
-if ((ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned 
long)host_ptr) &&
+if(svm_mem != NULL) {  //SVM always paged alignment
+  mem->offset = 0;
+  mem->is_userptr = 1;
+  mem->bo = svm_mem->bo;
+  cl_mem_add_ref(svm_mem);
+  bufCreated = 1;
+} else if ((ALIGN((unsigned long)host_ptr, cacheline_size) == 
(unsigned long)host_ptr) &&
 (ALIGN((unsigned long)sz, cacheline_size) == (unsigned 
long)sz)) {
   void* aligned_host_ptr = (void*)(((unsigned long)host_ptr) & 
(~(page_size - 1)));
   mem->offset = host_ptr - aligned_host_ptr;
   mem->is_userptr = 1;
   size_t aligned_sz = ALIGN((mem->offset + sz), page_size);
-
-  if(svm_mem != NULL) {
-mem->bo = svm_mem->bo;
-cl_mem_add_ref(svm_mem);
-  } else
-mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory 
object", aligned_host_ptr, aligned_sz, 0);
-
+  mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory 
object", aligned_host_ptr, aligned_sz, 0);
   bufCreated = 1;
 }
   }
@@ -1404,14 +1404,6 @@ cl_mem_delete(cl_mem mem)
 }
   }
 
-  if(mem->is_svm && mem->type != CL_MEM_SVM_TYPE) {
-cl_mem svm_mem = cl_context_get_svm_from_ptr(mem->ctx, mem->host_ptr);
-if(svm_mem)
-  cl_mem_delete(svm_mem);
-  }
-  /* Remove it from the list */
-  c

[Beignet] [PATCH 06/19] OCL20: Add generic address space memcpy and memset.

2016-11-28 Thread Yang Rong
Signed-off-by: Yang Rong 
---
 backend/src/libocl/src/ocl_memcpy.cl | 15 +++
 backend/src/libocl/src/ocl_memset.cl |  3 +++
 backend/src/llvm/llvm_bitcode_link.cpp   | 22 ++
 backend/src/llvm/llvm_intrinsic_lowering.cpp |  2 ++
 4 files changed, 42 insertions(+)

diff --git a/backend/src/libocl/src/ocl_memcpy.cl 
b/backend/src/libocl/src/ocl_memcpy.cl
index 85f490f..131574d 100644
--- a/backend/src/libocl/src/ocl_memcpy.cl
+++ b/backend/src/libocl/src/ocl_memcpy.cl
@@ -37,13 +37,28 @@ void __gen_memcpy_ ##NAME (DST_SPACE uchar* dst, SRC_SPACE 
uchar* src, size_t si
   } \
 }
 
+#if (__OPENCL_C_VERSION__ >= 200)
 #define DECL_ONE_SPACE_MEMCOPY_FN(NAME, DST_SPACE) \
   DECL_TWO_SPACE_MEMCOPY_FN( NAME## g, DST_SPACE, __global) \
   DECL_TWO_SPACE_MEMCOPY_FN( NAME## l, DST_SPACE, __local) \
   DECL_TWO_SPACE_MEMCOPY_FN( NAME## p, DST_SPACE, __private) \
+  DECL_TWO_SPACE_MEMCOPY_FN( NAME## n, DST_SPACE, __generic) \
   DECL_TWO_SPACE_MEMCOPY_FN( NAME## c, DST_SPACE, __constant)
 
 DECL_ONE_SPACE_MEMCOPY_FN(g, __global)
 DECL_ONE_SPACE_MEMCOPY_FN(l, __local)
 DECL_ONE_SPACE_MEMCOPY_FN(p, __private)
+DECL_ONE_SPACE_MEMCOPY_FN(n, __generic)
+#else
+#define DECL_ONE_SPACE_MEMCOPY_FN(NAME, DST_SPACE) \
+  DECL_TWO_SPACE_MEMCOPY_FN( NAME## g, DST_SPACE, __global) \
+  DECL_TWO_SPACE_MEMCOPY_FN( NAME## l, DST_SPACE, __local) \
+  DECL_TWO_SPACE_MEMCOPY_FN( NAME## p, DST_SPACE, __private) \
+  DECL_TWO_SPACE_MEMCOPY_FN( NAME## c, DST_SPACE, __constant)
+
+DECL_ONE_SPACE_MEMCOPY_FN(g, __global)
+DECL_ONE_SPACE_MEMCOPY_FN(l, __local)
+DECL_ONE_SPACE_MEMCOPY_FN(p, __private)
+
+#endif
 
diff --git a/backend/src/libocl/src/ocl_memset.cl 
b/backend/src/libocl/src/ocl_memset.cl
index d8bc5df..dda7e55 100644
--- a/backend/src/libocl/src/ocl_memset.cl
+++ b/backend/src/libocl/src/ocl_memset.cl
@@ -41,4 +41,7 @@ void __gen_memset_ ##NAME (DST_SPACE uchar* dst, uchar val, 
size_t size) { \
 DECL_MEMSET_FN(g, __global)
 DECL_MEMSET_FN(l, __local)
 DECL_MEMSET_FN(p, __private)
+#if (__OPENCL_C_VERSION__ >= 200)
+DECL_MEMSET_FN(n, __generic)
+#endif
 
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp 
b/backend/src/llvm/llvm_bitcode_link.cpp
index 0b01929..934dd2e 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -182,6 +182,28 @@ namespace gbe
 builtinFuncs.push_back("__gen_memcpy_gc_align");
 builtinFuncs.push_back("__gen_memcpy_lc_align");
 
+if (oclVersion >= 200) {
+  builtinFuncs.push_back("__gen_memcpy_gn");
+  builtinFuncs.push_back("__gen_memcpy_pn");
+  builtinFuncs.push_back("__gen_memcpy_ln");
+  builtinFuncs.push_back("__gen_memcpy_ng");
+  builtinFuncs.push_back("__gen_memcpy_np");
+  builtinFuncs.push_back("__gen_memcpy_nl");
+  builtinFuncs.push_back("__gen_memcpy_nc");
+  builtinFuncs.push_back("__gen_memcpy_nn");
+  builtinFuncs.push_back("__gen_memset_n");
+
+  builtinFuncs.push_back("__gen_memcpy_gn_align");
+  builtinFuncs.push_back("__gen_memcpy_pn_align");
+  builtinFuncs.push_back("__gen_memcpy_ln_align");
+  builtinFuncs.push_back("__gen_memcpy_ng_align");
+  builtinFuncs.push_back("__gen_memcpy_np_align");
+  builtinFuncs.push_back("__gen_memcpy_nl_align");
+  builtinFuncs.push_back("__gen_memcpy_nc_align");
+  builtinFuncs.push_back("__gen_memcpy_nn_align");
+  builtinFuncs.push_back("__gen_memset_n_align");
+}
+
 for (Module::iterator SF = mod->begin(), E = mod->end(); SF != E; ++SF) {
   if (SF->isDeclaration()) continue;
   if (!isKernelFunction(*SF)) continue;
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp 
b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index c26e96a..f01bb51 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -54,6 +54,8 @@ namespace gbe {
 return 'c';
   case 3:
 return 'l';
+  case 4:
+return 'n';
   default:
 assert(0 && "Non support address space");
 return '\0';
-- 
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 02/19] Runtime: fix clEnqueueMigrateMemObjects fail.

2016-11-28 Thread Yang Rong
clEnqueueMigrateMemObjects's mem object parameters may be clBuffer or clImage
objects, so the check should use CL_OBJECT_IS_MEM.

Signed-off-by: Yang Rong 
---
 src/cl_api_mem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 054c37a..7314a48 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1177,7 +1177,7 @@ clEnqueueMigrateMemObjects(cl_command_queue command_queue,
 }
 
 for (i = 0; i < num_mem_objects; i++) {
-  if (!CL_OBJECT_IS_BUFFER(mem_objects[i])) {
+  if (!CL_OBJECT_IS_MEM(mem_objects[i])) {
 err = CL_INVALID_MEM_OBJECT;
 break;
   }
-- 
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 16/19] OCL20: add a cl_kernel pointer to gpgpu.

2016-11-28 Thread Yang Rong
When flushing the command queue, we must check whether the command queue
currently being flushed has a device enqueue or not, which needs the cl_kernel.
So store the cl_kernel pointer in gpgpu, and add two functions,
intel_gpgpu_set_kernel and intel_gpgpu_get_kernel, for it.

Signed-off-by: Yang Rong 
Reviewed-by: Pan Xiuli 
---
 src/cl_driver.h | 18 +-
 src/cl_driver_defs.c|  3 +++
 src/intel/intel_gpgpu.c | 14 ++
 src/intel/intel_gpgpu.h |  1 +
 4 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/src/cl_driver.h b/src/cl_driver.h
index b45e2fb..3a8abf5 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -123,7 +123,7 @@ typedef enum gpu_command_status {
 typedef struct cl_gpgpu_kernel {
   const char *name;/* kernel name and bo name */
   uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */
-  uint32_t curbe_sz; /* total size of all curbes */
+  uint32_t curbe_sz;   /* total size of all curbes */
   cl_buffer bo;/* kernel code in the proper addr space */
   int32_t barrierID;   /* barrierID for _this_ kernel */
   uint32_t use_slm:1;  /* For gen7 (automatic barrier management) */
@@ -147,6 +147,12 @@ extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
 typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, 
uint32_t internal_offset, size_t size, uint8_t bti);
 extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 
+typedef void (cl_gpgpu_set_kernel_cb)(cl_gpgpu, void *);
+extern cl_gpgpu_set_kernel_cb *cl_gpgpu_set_kernel;
+
+typedef void* (cl_gpgpu_get_kernel_cb)(cl_gpgpu);
+extern cl_gpgpu_get_kernel_cb *cl_gpgpu_get_kernel;
+
 /* bind samplers defined in both kernel and kernel args. */
 typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t 
sampler_sz);
 extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
@@ -330,7 +336,6 @@ typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
   const size_t global_wk_sz[3],
   const size_t local_wk_sz[3]);
 extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
-
 /**
  * Buffer
  **/
@@ -341,14 +346,17 @@ extern cl_buffer_alloc_cb *cl_buffer_alloc;
 typedef cl_buffer (cl_buffer_alloc_userptr_cb)(cl_buffer_mgr, const char*, 
void *, size_t, unsigned long);
 extern cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr;
 
-typedef cl_buffer (cl_buffer_set_softpin_offset_cb)(cl_buffer, uint64_t);
+typedef int (cl_buffer_set_softpin_offset_cb)(cl_buffer, uint64_t);
 extern cl_buffer_set_softpin_offset_cb *cl_buffer_set_softpin_offset;
 
-typedef cl_buffer (cl_buffer_set_bo_use_full_range_cb)(cl_buffer, uint32_t);
+typedef int (cl_buffer_set_bo_use_full_range_cb)(cl_buffer, uint32_t);
 extern cl_buffer_set_bo_use_full_range_cb *cl_buffer_set_bo_use_full_range;
 
+typedef int (cl_buffer_disable_reuse_cb)(cl_buffer);
+extern cl_buffer_disable_reuse_cb *cl_buffer_disable_reuse;
+
 /* Set a buffer's tiling mode */
-typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t 
stride);
+typedef int (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
 extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
 
 #include "cl_context.h"
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index f5f5fe2..18ab473 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -35,6 +35,7 @@ LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
 LOCAL cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr = NULL;
 LOCAL cl_buffer_set_softpin_offset_cb *cl_buffer_set_softpin_offset = NULL;
 LOCAL cl_buffer_set_bo_use_full_range_cb *cl_buffer_set_bo_use_full_range = 
NULL;
+LOCAL cl_buffer_disable_reuse_cb *cl_buffer_disable_reuse = NULL;
 LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
 LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL;
 LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL;
@@ -102,4 +103,6 @@ LOCAL cl_gpgpu_unmap_printf_buffer_cb 
*cl_gpgpu_unmap_printf_buffer = NULL;
 LOCAL cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info = NULL;
 LOCAL cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info = NULL;
 LOCAL cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer = NULL;
+LOCAL cl_gpgpu_set_kernel_cb *cl_gpgpu_set_kernel = NULL;
+LOCAL cl_gpgpu_get_kernel_cb *cl_gpgpu_get_kernel = NULL;
 
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index b36c21a..c851f42 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -2448,6 +2448,18 @@ intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu)
   return gpgpu->printf_info;
 }
 
+static void
+intel_gpgpu_set_kernel(intel_gpgpu_t *gpgpu, void * kernel)
+{
+  gpgpu->kernel = kernel;
+}
+
+static void*
+intel_gpgpu_get_kernel(intel_gpgpu_t *gpgpu)
+{
+  return gpgpu->kernel;
+}
+
 LOCAL void
 intel_set_gpgp

[Beignet] [PATCH 15/19] OCL20: Add runtime functions to get the device enqueue info.

2016-11-28 Thread Yang Rong
Add two functions gbe_kernel_use_device_enqueue and
gbe_program_get_device_enqueue_kernel_name.

Signed-off-by: Yang Rong 
Reviewed-by: Pan Xiuli 
---
 backend/src/backend/program.cpp | 16 
 backend/src/gbe_bin_interpreter.cpp |  2 ++
 src/cl_gbe_loader.cpp   | 10 ++
 src/cl_gbe_loader.h |  2 ++
 4 files changed, 30 insertions(+)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index dcbaaf4..808974d 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -1266,6 +1266,12 @@ EXTEND_QUOTE:
 return program->getKernelNum();
   }
 
+  const static char* programGetDeviceEnqueueKernelName(gbe_program gbeProgram, 
uint32_t index) {
+if (gbeProgram == NULL) return 0;
+const gbe::Program *program = (const gbe::Program*) gbeProgram;
+return program->getDeviceEnqueueKernelName(index);
+  }
+
   static gbe_kernel programGetKernelByName(gbe_program gbeProgram, const char 
*name) {
 if (gbeProgram == NULL) return NULL;
 const gbe::Program *program = (gbe::Program*) gbeProgram;
@@ -1431,6 +1437,12 @@ EXTEND_QUOTE:
 return ps->getPrintfNum();
   }
 
+  static uint32_t kernelUseDeviceEnqueue(gbe_kernel gbeKernel) {
+if (gbeKernel == NULL) return 0;
+const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+return kernel->getUseDeviceEnqueue();
+  }
+
   static void* kernelDupPrintfSet(gbe_kernel gbeKernel) {
 if (gbeKernel == NULL) return NULL;
 const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
@@ -1516,6 +1528,7 @@ GBE_EXPORT_SYMBOL gbe_program_delete_cb 
*gbe_program_delete = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num = 
NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_kernel_by_name_cb 
*gbe_program_get_kernel_by_name = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_kernel_cb *gbe_program_get_kernel = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_device_enqueue_kernel_name_cb 
*gbe_program_get_device_enqueue_kernel_name = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_name_cb *gbe_kernel_get_name = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_attributes_cb *gbe_kernel_get_attributes = 
NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_code_cb *gbe_kernel_get_code = NULL;
@@ -1548,6 +1561,7 @@ GBE_EXPORT_SYMBOL gbe_dup_printfset_cb *gbe_dup_printfset 
= NULL;
 GBE_EXPORT_SYMBOL gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti = NULL;
 GBE_EXPORT_SYMBOL gbe_release_printf_info_cb *gbe_release_printf_info = NULL;
 GBE_EXPORT_SYMBOL gbe_output_printf_cb *gbe_output_printf = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_use_device_enqueue_cb 
*gbe_kernel_use_device_enqueue = NULL;
 
 #ifdef GBE_COMPILER_AVAILABLE
 namespace gbe
@@ -1567,6 +1581,7 @@ namespace gbe
   gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource;
   gbe_program_delete = gbe::programDelete;
   gbe_program_get_kernel_num = gbe::programGetKernelNum;
+  gbe_program_get_device_enqueue_kernel_name = 
gbe::programGetDeviceEnqueueKernelName;
   gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
   gbe_program_get_kernel = gbe::programGetKernel;
   gbe_kernel_get_name = gbe::kernelGetName;
@@ -1601,6 +1616,7 @@ namespace gbe
   gbe_dup_printfset = gbe::kernelDupPrintfSet;
   gbe_release_printf_info = gbe::kernelReleasePrintfSet;
   gbe_output_printf = gbe::kernelOutputPrintf;
+  gbe_kernel_use_device_enqueue = gbe::kernelUseDeviceEnqueue;
   genSetupCallBacks();
 }
 
diff --git a/backend/src/gbe_bin_interpreter.cpp 
b/backend/src/gbe_bin_interpreter.cpp
index dd3ce01..64cacd9 100644
--- a/backend/src/gbe_bin_interpreter.cpp
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -40,6 +40,7 @@ struct BinInterpCallBackInitializer
 gbe_program_get_kernel_num = gbe::programGetKernelNum;
 gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
 gbe_program_get_kernel = gbe::programGetKernel;
+gbe_program_get_device_enqueue_kernel_name = 
gbe::programGetDeviceEnqueueKernelName;
 gbe_kernel_get_code_size = gbe::kernelGetCodeSize;
 gbe_kernel_get_code = gbe::kernelGetCode;
 gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
@@ -77,6 +78,7 @@ struct BinInterpCallBackInitializer
 gbe_dup_printfset = gbe::kernelDupPrintfSet;
 gbe_release_printf_info = gbe::kernelReleasePrintfSet;
 gbe_output_printf = gbe::kernelOutputPrintf;
+gbe_kernel_use_device_enqueue = gbe::kernelUseDeviceEnqueue;
   }
 
   ~BinInterpCallBackInitializer() {
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index 3736c86..f190b0d 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -44,6 +44,7 @@ gbe_program_delete_cb *interp_program_delete = NULL;
 gbe_program_get_kernel_num_cb *interp_program_get_kernel_num = NULL;
 gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name = NULL;
 gbe_program_get_kernel_cb *interp_program_get_kernel = NULL;
+gbe_program_get_device_enqueue_kernel_name_cb 
*inte