Re: [Beignet] [PATCH] do constant folding for kernel struct args

2017-06-13 Thread Guo, Yejun
In the current implementation, only LOADI and ADD are considered.

In the example, since %22 is the dst of a MOV, it will not be recorded. It would be 
recorded, and so impact the IR, only if %22 were the dst of an ADD.

-Original Message-
From: Yang, Rong R 
Sent: Tuesday, June 13, 2017 4:59 PM
To: Guo, Yejun; Wang, Rander; Pan, Xiuli; beignet@lists.freedesktop.org
Subject: RE: [Beignet] [PATCH] do constant folding for kernel struct args

foldFunctionStructArgConstOffset is called before lowerFunctionArguments.
If foldFunctionStructArgConstOffset is wrong, the INDIRECT_MOV generated in 
lowerFunctionArguments will also be wrong.

I am afraid that the following IR:

BB2:
LOADI %30, 4
Add %20, %10, %30//%10 is a struct argument
MOV %22, %20   //phi-mov

BB3:
LOADI %31, 8
Add %21, %11, %31//%11 is another struct argument
MOV %22, %21   //phi-mov

BB4:
LOADI %32, 4
Add %33, %22, %32

Will be converted to:
LOADI %42, 8
Add %33, %10, %42

If so, lowerFunctionArguments will be wrong.

> -Original Message-
> From: Guo, Yejun
> Sent: Tuesday, June 13, 2017 16:39
> To: Yang, Rong R <rong.r.y...@intel.com>; Wang, Rander 
> <rander.w...@intel.com>; Pan, Xiuli <xiuli@intel.com>; 
> beignet@lists.freedesktop.org
> Subject: RE: [Beignet] [PATCH] do constant folding for kernel struct 
> args
> 
> I just tried such a kernel; the generated GEN IR is INDIRECT_MOV, so it 
> has nothing to do with this patch.
> 
> Thanks
> Yejun
> 
> -Original Message-

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] do constant folding for kernel struct args

2017-06-13 Thread Guo, Yejun
I just tried such a kernel; the generated GEN IR is INDIRECT_MOV, so it has 
nothing to do with this patch.

Thanks
Yejun

-Original Message-
From: Yang, Rong R 
Sent: Tuesday, June 13, 2017 3:54 PM
To: Guo, Yejun; Wang, Rander; Pan, Xiuli; beignet@lists.freedesktop.org
Subject: RE: [Beignet] [PATCH] do constant folding for kernel struct args

Have you considered the case where the value comes from two arguments? For example:

Struct  s1{
int i,
   float4 f4;
}

Struct  s2{
int i;
short s;
   float4 f4;
}

__kernel void k(s1, s2, __global float *dst) {
int gid = get_global_id(0);
float4 *p;
   if (gid % 2) {
  p = &s1.f4;
   } else {
  p = &s2.f4;
   }
dst[gid] = *p.s1;
}

> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf 
> Of Guo, Yejun
> Sent: Thursday, June 8, 2017 21:08
> To: Wang, Rander <rander.w...@intel.com>; Pan, Xiuli 
> <xiuli@intel.com>; beignet@lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH] do constant folding for kernel struct 
> args
> 
> Yes, the constant folding for kernel struct arg is a must here.
> 
> As for the general constant folding and propagation optimization, I do 
> not have a position on whether sel IR or GEN IR is better.
> 
> -Original Message-
> From: Wang, Rander
> Sent: Thursday, June 08, 2017 1:14 PM
> To: Pan, Xiuli; Guo, Yejun; beignet@lists.freedesktop.org
> Cc: Guo, Yejun
> Subject: RE: [Beignet] [PATCH] do constant folding for kernel struct 
> args
> 
> Yes, so I may be able to give some advice
> 
> -Original Message-
> From: Pan, Xiuli
> Sent: Thursday, June 8, 2017 1:09 PM
> To: Guo, Yejun <yejun@intel.com>; beignet@lists.freedesktop.org
> Cc: Guo, Yejun <yejun@intel.com>; Wang, Rander 
> <rander.w...@intel.com>
> Subject: RE: [Beignet] [PATCH] do constant folding for kernel struct 
> args
> 
> Rander seems to have a similar optimization for imm values at the sel IR level.
> If your case here needs the optimization to be done at the GEN IR level, then 
> Rander's patch may no longer be needed.
> 
> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf 
> Of Guo, Yejun
> Sent: Thursday, June 8, 2017 12:41
> To: beignet@lists.freedesktop.org
> Cc: Guo, Yejun <yejun@intel.com>
> Subject: [Beignet] [PATCH] do constant folding for kernel struct args
> 
> For the following GEN IR, %41 is a kernel argument (struct). The first 
> LOAD will be a mov, and the second LOAD will be an indirect mov (see 
> lowerFunctionArguments). This hurts performance, and even impacts the 
> correctness of the register liveness of the indirect mov.
> 
> LOADI.uint64 %1114 72
> ADD.int64 %78 %41 %1114
> LOAD.int64.private.aligned {%79} %78 bti:255
> LOADI.int64 %1115 8
> ADD.int64 %1116 %78 %1115
> LOAD.int64.private.aligned {%80} %1116 bti:255
> 
> this function folds the constants of 72 and 8 together, and so it will 
> be direct mov.
> the GEN IR looks like:
> LOADI.int64 %1115 80
> ADD.int64 %1116 %41 %1115
> ---
>  backend/src/CMakeLists.txt |   2 +
>  backend/src/ir/constopt.cpp| 144
> +
>  backend/src/ir/constopt.hpp|  54 
>  backend/src/ir/context.cpp |   5 ++
>  backend/src/ir/instruction.cpp |   7 ++
>  backend/src/ir/instruction.hpp |   1 +
>  6 files changed, 213 insertions(+)
>  create mode 100644 backend/src/ir/constopt.cpp  create mode 100644 
> backend/src/ir/constopt.hpp
> 
> diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt 
> index c9ff833..74d7bab 100644
> --- a/backend/src/CMakeLists.txt
> +++ b/backend/src/CMakeLists.txt
> @@ -73,6 +73,8 @@ set (GBE_SRC
>  ir/value.hpp
>  ir/lowering.cpp
>  ir/lowering.hpp
> +ir/constopt.cpp
> +ir/constopt.hpp
>  ir/profiling.cpp
>  ir/profiling.hpp
>  ir/printf.cpp
> diff --git a/backend/src/ir/constopt.cpp b/backend/src/ir/constopt.cpp 
> new file mode 100644 index 000..24878b8
> --- /dev/null
> +++ b/backend/src/ir/constopt.cpp
> @@ -0,0 +1,144 @@
> +/*
> + * Copyright © 2017 Intel Corporation
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have 

Re: [Beignet] [PATCH] do constant folding for kernel struct args

2017-06-08 Thread Guo, Yejun
Yes, the constant folding for kernel struct arg is a must here.

As for the general constant folding and propagation optimization, I do not have 
a position on whether sel IR or GEN IR is better.

-Original Message-
From: Wang, Rander 
Sent: Thursday, June 08, 2017 1:14 PM
To: Pan, Xiuli; Guo, Yejun; beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: RE: [Beignet] [PATCH] do constant folding for kernel struct args

Yes, so I may be able to give some advice

-Original Message-
From: Pan, Xiuli
Sent: Thursday, June 8, 2017 1:09 PM
To: Guo, Yejun <yejun@intel.com>; beignet@lists.freedesktop.org
Cc: Guo, Yejun <yejun@intel.com>; Wang, Rander <rander.w...@intel.com>
Subject: RE: [Beignet] [PATCH] do constant folding for kernel struct args

Rander seems to have a similar optimization for imm values at the sel IR level.
If your case here needs the optimization to be done at the GEN IR level, then 
Rander's patch may no longer be needed.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Guo, 
Yejun
Sent: Thursday, June 8, 2017 12:41
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun <yejun@intel.com>
Subject: [Beignet] [PATCH] do constant folding for kernel struct args

For the following GEN IR, %41 is a kernel argument (struct). The first LOAD will 
be a mov, and the second LOAD will be an indirect mov (see lowerFunctionArguments). 
This hurts performance, and even impacts the correctness of the register liveness 
of the indirect mov.

LOADI.uint64 %1114 72
ADD.int64 %78 %41 %1114
LOAD.int64.private.aligned {%79} %78 bti:255
LOADI.int64 %1115 8
ADD.int64 %1116 %78 %1115
LOAD.int64.private.aligned {%80} %1116 bti:255

this function folds the constants of 72 and 8 together, and so it will be 
direct mov.
the GEN IR looks like:
LOADI.int64 %1115 80
ADD.int64 %1116 %41 %1115
---
 backend/src/CMakeLists.txt |   2 +
 backend/src/ir/constopt.cpp| 144 +
 backend/src/ir/constopt.hpp|  54 
 backend/src/ir/context.cpp |   5 ++
 backend/src/ir/instruction.cpp |   7 ++
 backend/src/ir/instruction.hpp |   1 +
 6 files changed, 213 insertions(+)
 create mode 100644 backend/src/ir/constopt.cpp  create mode 100644 
backend/src/ir/constopt.hpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt index 
c9ff833..74d7bab 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -73,6 +73,8 @@ set (GBE_SRC
 ir/value.hpp
 ir/lowering.cpp
 ir/lowering.hpp
+ir/constopt.cpp
+ir/constopt.hpp
 ir/profiling.cpp
 ir/profiling.hpp
 ir/printf.cpp
diff --git a/backend/src/ir/constopt.cpp b/backend/src/ir/constopt.cpp new file 
mode 100644 index 000..24878b8
--- /dev/null
+++ b/backend/src/ir/constopt.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Guo Yejun <yejun@intel.com>  */
+
+#include 
+#include "ir/context.hpp"
+#include "ir/value.hpp"
+#include "ir/constopt.hpp"
+#include "sys/set.hpp"
+
+namespace gbe {
+namespace ir {
+
+  class FunctionStructArgConstOffsetFolder : public Context  {
+  public:
+/*! Build the helper structure */
+FunctionStructArgConstOffsetFolder(Unit ) : Context(unit) {
+  records.clear();
+  loadImms.clear();
+}
+/*! Free everything we needed */
+virtual ~FunctionStructArgConstOffsetFolder() {
+  for (size_t i = 0; i < records.size(); ++i) {
+delete records[i];
+  }
+  records.clear();
+  loadImms.clear();
+}
+/*! Perform all function arguments substitution if needed */
+void folding(const std::string );
+
+  private:
+class Record {  //add dst, arg (kernel struct arg base reg), imm_value
+public:
+  Record(Register dst, Register arg, int64_t immv) :
+_dst(dst), _arg(arg), _immv(immv) { }
+  Register _dst;
+  Register _arg;
+  int64_t _immv;
+};
+std::vector<Record*> records;
+std::map<Register, LoadImmInstruction*> loadImms; //
+
+void AddRecord(Register dst, Register arg, int64_t immv) {
+  Record* rec = new Record(dst, arg, immv);
+  records.push_back(rec);
+}
+  };
+
+  void FunctionStr

[Beignet] [PATCH] do constant folding for kernel struct args

2017-06-07 Thread Guo, Yejun
For the following GEN IR, %41 is a kernel argument (struct).
The first LOAD will be a mov, and the second LOAD will be an indirect mov
(see lowerFunctionArguments). This hurts performance,
and even impacts the correctness of the register liveness of the indirect mov.

LOADI.uint64 %1114 72
ADD.int64 %78 %41 %1114
LOAD.int64.private.aligned {%79} %78 bti:255
LOADI.int64 %1115 8
ADD.int64 %1116 %78 %1115
LOAD.int64.private.aligned {%80} %1116 bti:255

this function folds the constants of 72 and 8 together,
and so it will be direct mov.
the GEN IR looks like:
LOADI.int64 %1115 80
ADD.int64 %1116 %41 %1115
---
 backend/src/CMakeLists.txt |   2 +
 backend/src/ir/constopt.cpp| 144 +
 backend/src/ir/constopt.hpp|  54 
 backend/src/ir/context.cpp |   5 ++
 backend/src/ir/instruction.cpp |   7 ++
 backend/src/ir/instruction.hpp |   1 +
 6 files changed, 213 insertions(+)
 create mode 100644 backend/src/ir/constopt.cpp
 create mode 100644 backend/src/ir/constopt.hpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index c9ff833..74d7bab 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -73,6 +73,8 @@ set (GBE_SRC
 ir/value.hpp
 ir/lowering.cpp
 ir/lowering.hpp
+ir/constopt.cpp
+ir/constopt.hpp
 ir/profiling.cpp
 ir/profiling.hpp
 ir/printf.cpp
diff --git a/backend/src/ir/constopt.cpp b/backend/src/ir/constopt.cpp
new file mode 100644
index 000..24878b8
--- /dev/null
+++ b/backend/src/ir/constopt.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Guo Yejun <yejun@intel.com>
+ */
+
+#include 
+#include "ir/context.hpp"
+#include "ir/value.hpp"
+#include "ir/constopt.hpp"
+#include "sys/set.hpp"
+
+namespace gbe {
+namespace ir {
+
+  class FunctionStructArgConstOffsetFolder : public Context
+  {
+  public:
+/*! Build the helper structure */
+FunctionStructArgConstOffsetFolder(Unit ) : Context(unit) {
+  records.clear();
+  loadImms.clear();
+}
+/*! Free everything we needed */
+virtual ~FunctionStructArgConstOffsetFolder() {
+  for (size_t i = 0; i < records.size(); ++i) {
+delete records[i];
+  }
+  records.clear();
+  loadImms.clear();
+}
+/*! Perform all function arguments substitution if needed */
+void folding(const std::string );
+
+  private:
+class Record {  //add dst, arg (kernel struct arg base reg), imm_value
+public:
+  Record(Register dst, Register arg, int64_t immv) :
+_dst(dst), _arg(arg), _immv(immv) { }
+  Register _dst;
+  Register _arg;
+  int64_t _immv;
+};
+std::vector<Record*> records;
+std::map<Register, LoadImmInstruction*> loadImms; //
+
+void AddRecord(Register dst, Register arg, int64_t immv) {
+  Record* rec = new Record(dst, arg, immv);
+  records.push_back(rec);
+}
+  };
+
+  void FunctionStructArgConstOffsetFolder::folding(const std::string ) {
+Function *fn = unit.getFunction(name);
+if (fn == NULL)
+  return;
+
+const uint32_t argNum = fn->argNum();
+for (uint32_t argID = 0; argID < argNum; ++argID) {
+  FunctionArgument  = fn->getArg(argID);
+  if (arg.type != FunctionArgument::STRUCTURE)
+continue;
+  AddRecord(arg.reg, arg.reg, 0);
+}
+
+fn->foreachInstruction([&](Instruction ) {
+  if (insn.getOpcode() == OP_LOADI) {
+LoadImmInstruction *loadImm = cast();
+if(!loadImm)
+  return;
+
+//to avoid regression, limit for the case: LOADI.int64 %1164 32
+//we can loose the limit if necessary
+if (loadImm->getImmediate().getType() != TYPE_S64 &&
+loadImm->getImmediate().getType() != TYPE_U64)
+  return;
+
+Register dst = insn.getDst();
+loadImms[dst] = loadImm;
+return;
+  }
+
+  //we will change imm of loadi directly, so it should not be dst
+  for (size_t i = 0; i < insn.getDstNum(); ++i) {
+Register dst = insn.getDst(i);
+assert(loadImms.find(dst) == loadImms.end());
+  }
+
+  if (insn.getOpcode() != OP_ADD)
+return;

[Beignet] [PATCH] keep GEN IR as SSA style

2017-06-07 Thread Guo, Yejun
---
 backend/src/llvm/llvm_gen_backend.cpp | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index 831666e..31b8bf2 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2984,10 +2984,12 @@ namespace gbe
   this->newRegister(const_cast());
   ir::Register reg = 
regTranslator.getScalar(const_cast(), 0);
   ir::Constant  = unit.getConstantSet().getConstant(v.getName());
-  ctx.LOADI(getType(ctx, v.getType()), reg, 
ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType(;
   if (!legacyMode) {
-ctx.ADD(getType(ctx, v.getType()), reg, 
ir::ocl::constant_addrspace, reg);
-  }
+ir::Register regload = ctx.reg(getFamily(getType(ctx, 
v.getType(;
+ctx.LOADI(getType(ctx, v.getType()), regload, 
ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType(;
+ctx.ADD(getType(ctx, v.getType()), reg, 
ir::ocl::constant_addrspace, regload);
+  } else
+ctx.LOADI(getType(ctx, v.getType()), reg, 
ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType(;
 }
   } else if(addrSpace == ir::MEM_PRIVATE) {
   this->newRegister(const_cast());
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Backend: Fix llvm40 assert about literal structs

2017-05-17 Thread Guo, Yejun
Looks fine to me, thanks.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Pan, 
Xiuli
Sent: Wednesday, May 17, 2017 3:15 PM
To: beignet@lists.freedesktop.org
Subject: Re: [Beignet] [PATCH] Backend: Fix llvm40 assert about literal structs

Ping for review.
If LLVM is a debug build, this will cause an assert in the device enqueue cases.

-Original Message-
From: Pan, Xiuli 
Sent: Tuesday, April 25, 2017 13:27
To: beignet@lists.freedesktop.org
Cc: Pan, Xiuli 
Subject: [PATCH] Backend: Fix llvm40 assert about literal structs

From: Pan Xiuli 

In llvm literal structs have no name, so check it first.

Signed-off-by: Pan Xiuli 
---
 backend/src/llvm/llvm_gen_backend.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index 9954021..831666e 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -362,7 +362,8 @@ namespace gbe
 Type *eltTy = dyn_cast(type)->getElementType();
 if (eltTy->isStructTy()) {
   StructType *strTy = dyn_cast(eltTy);
-  if (strTy->getName().data() && strstr(strTy->getName().data(), 
"sampler"))
+  if (!strTy->isLiteral() && strTy->getName().data() &&
+  strstr(strTy->getName().data(), "sampler"))
 type = Type::getInt32Ty(value->getContext());
 }
   }
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] refresh DAG when an arg has both direct and indirect read

2017-05-17 Thread Guo, Yejun
When the return value is ARG_INDIRECT_READ, it is still possible
that some IRs read the argument directly; these are handled in buildConstantPush(),
so we need to refresh the DAG after buildConstantPush() has run.

Another method is to update the DAG accordingly, but I don't think that
is easy compared with the refresh method, so I did not choose it.
---
 backend/src/ir/lowering.cpp | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 93bd96a..bcf5940 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -199,6 +199,7 @@ namespace ir {
 GBE_SAFE_DELETE(liveness);
 this->liveness = GBE_NEW(ir::Liveness, *fn);
 this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
+bool needRefreshDag = false;
 
 // Process all structure arguments and find all the direct loads we can
 // replace
@@ -207,13 +208,27 @@ namespace ir {
 for (uint32_t argID = 0; argID < argNum; ++argID) {
   FunctionArgument  = fn->getArg(argID);
   if (arg.type != FunctionArgument::STRUCTURE) continue;
-  if(this->lower(argID) == ARG_INDIRECT_READ)
+  if(this->lower(argID) == ARG_INDIRECT_READ) {
 indirctReadArgs.push_back(argID);
+//when the return value is ARG_INDIRECT_READ, there is still possible
+//that some IRs read it directly, and will be handled in 
buildConstantPush()
+//so we need to refresh the dag afer function buildConstantPush
+for (const auto  : seq) {
+  if (loadAddImm.argID == argID)
+needRefreshDag = true;
+}
+  }
 }
 
 // Build the constant push description and remove the instruction that
 // therefore become useless
 this->buildConstantPush();
+if (needRefreshDag) {
+  GBE_SAFE_DELETE(dag);
+  GBE_SAFE_DELETE(liveness);
+  this->liveness = GBE_NEW(ir::Liveness, *fn);
+  this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
+}
 for (uint32_t i = 0; i < indirctReadArgs.size(); ++i){
   lowerIndirectRead(indirctReadArgs[i]);
 }
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] defer dead insn removal since lowerIndirectRead still needs them

2017-04-26 Thread Guo, Yejun
Please ignore this patch; there is still something incorrect in it. Thanks.

-Original Message-
From: Guo, Yejun 
Sent: Wednesday, April 26, 2017 4:43 PM
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: [PATCH] defer dead insn removal since lowerIndirectRead still needs 
them

---
 backend/src/ir/lowering.cpp | 54 ++---
 1 file changed, 26 insertions(+), 28 deletions(-)

diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 93bd96a..53aafa4 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -117,7 +117,7 @@ namespace ir {
 /*! Lower the given function argument accesses */
 ArgUse lower(uint32_t argID);
 /*! Build the constant push for the function */
-void buildConstantPush(void);
+void buildConstantPush(set& dead);
 /* Lower indirect Read to indirct Mov */
 void lowerIndirectRead(uint32_t argID);
 /* Convert indirectLoad to indirect Mov */
@@ -192,6 +192,25 @@ namespace ir {
 GBE_SAFE_DELETE(liveness);
   }
 
+// Remove all the given instructions from the stream (if dead)
+#define REMOVE_INSN(WHICH) \
+  for (const auto  : seq) { \
+Instruction *WHICH = loadAddImm.WHICH; \
+if (WHICH == NULL) continue; \
+const UseSet  = dag->getUse(WHICH, 0); \
+bool isDead = true; \
+for (auto use : useSet) { \
+  if (dead.contains(use->getInstruction()) == false) { \
+isDead = false; \
+break; \
+  } \
+} \
+if (isDead && !dead.contains(WHICH)) { \
+  dead.insert(WHICH); \
+  WHICH->remove(); \
+} \
+  }
+
   void FunctionArgumentLowerer::lower(const std::string ) {
 if ((this->fn = unit.getFunction(functionName)) == NULL)
   return;
@@ -213,40 +232,22 @@ namespace ir {
 
 // Build the constant push description and remove the instruction that
 // therefore become useless
-this->buildConstantPush();
+set dead;
+this->buildConstantPush(dead);
 for (uint32_t i = 0; i < indirctReadArgs.size(); ++i){
   lowerIndirectRead(indirctReadArgs[i]);
 }
-ReplaceIndirectLoad();
-  }
+REMOVE_INSN(add)
+REMOVE_INSN(loadImm)
 
-// Remove all the given instructions from the stream (if dead)
-#define REMOVE_INSN(WHICH) \
-  for (const auto  : seq) { \
-Instruction *WHICH = loadAddImm.WHICH; \
-if (WHICH == NULL) continue; \
-const UseSet  = dag->getUse(WHICH, 0); \
-bool isDead = true; \
-for (auto use : useSet) { \
-  if (dead.contains(use->getInstruction()) == false) { \
-isDead = false; \
-break; \
-  } \
-} \
-if (isDead && !dead.contains(WHICH)) { \
-  dead.insert(WHICH); \
-  WHICH->remove(); \
-} \
+ReplaceIndirectLoad();
   }
 
-  void FunctionArgumentLowerer::buildConstantPush(void)
+  void FunctionArgumentLowerer::buildConstantPush(set& 
dead)
   {
 if (seq.size() == 0)
   return;
 
-// Track instructions we remove to recursively kill them properly
-set dead;
-
 // The argument location we already pushed (since the same argument 
location
 // can be used several times)
 set inserted;
@@ -291,9 +292,6 @@ namespace ir {
 load->remove();
   }
 }
-
-REMOVE_INSN(add)
-REMOVE_INSN(loadImm)
   }
 
 #undef REMOVE_INSN
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] defer dead insn removal since lowerIndirectRead still needs them

2017-04-26 Thread Guo, Yejun
---
 backend/src/ir/lowering.cpp | 54 ++---
 1 file changed, 26 insertions(+), 28 deletions(-)

diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 93bd96a..53aafa4 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -117,7 +117,7 @@ namespace ir {
 /*! Lower the given function argument accesses */
 ArgUse lower(uint32_t argID);
 /*! Build the constant push for the function */
-void buildConstantPush(void);
+void buildConstantPush(set& dead);
 /* Lower indirect Read to indirct Mov */
 void lowerIndirectRead(uint32_t argID);
 /* Convert indirectLoad to indirect Mov */
@@ -192,6 +192,25 @@ namespace ir {
 GBE_SAFE_DELETE(liveness);
   }
 
+// Remove all the given instructions from the stream (if dead)
+#define REMOVE_INSN(WHICH) \
+  for (const auto  : seq) { \
+Instruction *WHICH = loadAddImm.WHICH; \
+if (WHICH == NULL) continue; \
+const UseSet  = dag->getUse(WHICH, 0); \
+bool isDead = true; \
+for (auto use : useSet) { \
+  if (dead.contains(use->getInstruction()) == false) { \
+isDead = false; \
+break; \
+  } \
+} \
+if (isDead && !dead.contains(WHICH)) { \
+  dead.insert(WHICH); \
+  WHICH->remove(); \
+} \
+  }
+
   void FunctionArgumentLowerer::lower(const std::string ) {
 if ((this->fn = unit.getFunction(functionName)) == NULL)
   return;
@@ -213,40 +232,22 @@ namespace ir {
 
 // Build the constant push description and remove the instruction that
 // therefore become useless
-this->buildConstantPush();
+set dead;
+this->buildConstantPush(dead);
 for (uint32_t i = 0; i < indirctReadArgs.size(); ++i){
   lowerIndirectRead(indirctReadArgs[i]);
 }
-ReplaceIndirectLoad();
-  }
+REMOVE_INSN(add)
+REMOVE_INSN(loadImm)
 
-// Remove all the given instructions from the stream (if dead)
-#define REMOVE_INSN(WHICH) \
-  for (const auto  : seq) { \
-Instruction *WHICH = loadAddImm.WHICH; \
-if (WHICH == NULL) continue; \
-const UseSet  = dag->getUse(WHICH, 0); \
-bool isDead = true; \
-for (auto use : useSet) { \
-  if (dead.contains(use->getInstruction()) == false) { \
-isDead = false; \
-break; \
-  } \
-} \
-if (isDead && !dead.contains(WHICH)) { \
-  dead.insert(WHICH); \
-  WHICH->remove(); \
-} \
+ReplaceIndirectLoad();
   }
 
-  void FunctionArgumentLowerer::buildConstantPush(void)
+  void FunctionArgumentLowerer::buildConstantPush(set& 
dead)
   {
 if (seq.size() == 0)
   return;
 
-// Track instructions we remove to recursively kill them properly
-set dead;
-
 // The argument location we already pushed (since the same argument 
location
 // can be used several times)
 set inserted;
@@ -291,9 +292,6 @@ namespace ir {
 load->remove();
   }
 }
-
-REMOVE_INSN(add)
-REMOVE_INSN(loadImm)
   }
 
 #undef REMOVE_INSN
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] search llvm-config first

2017-01-10 Thread Guo, Yejun
On a system with multiple LLVM/Clang versions, we can create a soft
link named llvm-config that points to the desired version.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 CMake/FindLLVM.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
index 6129909..8ca8553 100644
--- a/CMake/FindLLVM.cmake
+++ b/CMake/FindLLVM.cmake
@@ -8,12 +8,12 @@
 # LLVM_FOUND   - True if llvm found.
 if (LLVM_INSTALL_DIR)
   find_program(LLVM_CONFIG_EXECUTABLE
-   NAMES llvm-config-37 llvm-config-3.7 llvm-config-36 
llvm-config-3.6 llvm-config-38 llvm-config-3.8 llvm-config llvm-config-35 
llvm-config-3.5 llvm-config-34 llvm-config-3.4
+   NAMES llvm-config llvm-config-37 llvm-config-3.7 llvm-config-36 
llvm-config-3.6 llvm-config-38 llvm-config-3.8 llvm-config-35 llvm-config-3.5 
llvm-config-34 llvm-config-3.4
DOC "llvm-config executable"
PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
 else (LLVM_INSTALL_DIR)
   find_program(LLVM_CONFIG_EXECUTABLE
-   NAMES llvm-config-37 llvm-config-3.7 llvm-config-36 
llvm-config-3.6 llvm-config-38 llvm-config-3.8 llvm-config llvm-config-35 
llvm-config-3.5 llvm-config-34 llvm-config-3.4
+   NAMES llvm-config llvm-config-37 llvm-config-3.7 llvm-config-36 
llvm-config-3.6 llvm-config-38 llvm-config-3.8 llvm-config-35 llvm-config-3.5 
llvm-config-34 llvm-config-3.4
DOC "llvm-config executable")
 endif (LLVM_INSTALL_DIR)
 
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH V2] add sends support for oword/media block write

2016-12-27 Thread Guo, Yejun
v2: should also change the virtual function prototype for gen7
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen7_encoder.cpp   |  2 +-
 backend/src/backend/gen7_encoder.hpp   |  2 +-
 backend/src/backend/gen9_encoder.cpp   | 57 ++
 backend/src/backend/gen9_encoder.hpp   |  2 ++
 backend/src/backend/gen_context.cpp|  6 ++--
 backend/src/backend/gen_encoder.cpp|  4 +--
 backend/src/backend/gen_encoder.hpp|  4 +--
 backend/src/backend/gen_insn_selection.cpp | 54 +---
 8 files changed, 111 insertions(+), 20 deletions(-)

diff --git a/backend/src/backend/gen7_encoder.cpp 
b/backend/src/backend/gen7_encoder.cpp
index 4f35491..4b2cd9a 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -280,7 +280,7 @@ namespace gbe
 response_length);
   }
 
-  void Gen7Encoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+  void Gen7Encoder::MBWRITE(GenRegister header, GenRegister data, uint32_t 
bti, uint32_t size, bool useSends) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 const uint32_t msg_length = 1 + size;
 const uint32_t response_length = 0; // Size of registers
diff --git a/backend/src/backend/gen7_encoder.hpp 
b/backend/src/backend/gen7_encoder.hpp
index edb711d..7585b34 100644
--- a/backend/src/backend/gen7_encoder.hpp
+++ b/backend/src/backend/gen7_encoder.hpp
@@ -45,7 +45,7 @@ namespace gbe
 /*! MBlock read */
 virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, 
uint32_t elemSize);
 /*! MBlock write */
-virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+virtual void MBWRITE(GenRegister header, GenRegister data, uint32_t bti, 
uint32_t elemSize, bool useSends);
   };
 }
 #endif /* __GBE_GEN7_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 940809b..b37fd98 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -244,4 +244,61 @@ namespace gbe
 gen9_insn->bits2.sends.sel_reg32_desc = 1;
 }
   }
+
+  void Gen9Encoder::OBWRITE(GenRegister header, GenRegister data, uint32_t 
bti, uint32_t ow_size, bool useSends)
+  {
+if (!useSends)
+  Gen8Encoder::OBWRITE(header, data, bti, ow_size, false);
+else {
+  GBE_ASSERT(data.reg() != header.reg());
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+
+  uint32_t dataRegs = ow_size / 2;
+  // half reg should also have size 1
+  if (dataRegs == 0)
+dataRegs = 1;
+  gen9_insn->bits2.sends.src1_length = dataRegs;
+
+  const uint32_t block_size = getOBlockSize(ow_size);
+  const uint32_t msg_length = 1;
+  const uint32_t response_length = 0;
+  setOBlockRW(insn,
+bti,
+block_size,
+GEN7_OBLOCK_WRITE,
+msg_length,
+response_length);
+}
+  }
+
+  void Gen9Encoder::MBWRITE(GenRegister header, GenRegister data, uint32_t 
bti, uint32_t data_size, bool useSends)
+  {
+if (!useSends)
+  Gen8Encoder::MBWRITE(header, data, bti, data_size, false);
+else {
+  GBE_ASSERT(data.reg() != header.reg());
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+  gen9_insn->bits2.sends.src1_length = data_size;
+
+  const uint32_t msg_length = 1;
+  const uint32_t response_length = 0;
+  setMBlockRW(insn,
+bti,
+GEN75_P1_MEDIA_TYPED_BWRITE,
+msg_length,
+response_length);
+}
+  }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index d754d59..2eaa538 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -54,6 +54,8 @@ namespace gbe
 virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize, bool useSends);
 virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize);
 virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, 
GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
+virtual void OBWRITE(GenRegister header, GenRegister data, uint32_t bti, 
uint32_t ow_size, bool useSends);
+virtual void MBWRITE(GenR

Re: [Beignet] [PATCH] add sends support for oword/media block write

2016-12-27 Thread Guo, Yejun
Nice catch, I will send a v2. Thanks.

-Original Message-
From: Pan, Xiuli 
Sent: Wednesday, December 28, 2016 2:04 PM
To: Guo, Yejun; beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: RE: [Beignet] [PATCH] add sends support for oword/media block write

It seems you missed the gen7_encoder part for the media block write: Gen7 has a 
different data port, so the virtual MBWRITE function should also be changed 
for Gen7Encoder.

The rest LGTM.


-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Guo, 
Yejun
Sent: Monday, December 19, 2016 6:13 PM
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun <yejun@intel.com>
Subject: [Beignet] [PATCH] add sends support for oword/media block write

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen9_encoder.cpp   | 57 ++
 backend/src/backend/gen9_encoder.hpp   |  2 ++
 backend/src/backend/gen_context.cpp|  6 ++--
 backend/src/backend/gen_encoder.cpp|  4 +--
 backend/src/backend/gen_encoder.hpp|  4 +--
 backend/src/backend/gen_insn_selection.cpp | 54 +---
 6 files changed, 109 insertions(+), 18 deletions(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 940809b..b37fd98 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -244,4 +244,61 @@ namespace gbe
 gen9_insn->bits2.sends.sel_reg32_desc = 1;
 }
   }
+
+  void Gen9Encoder::OBWRITE(GenRegister header, GenRegister data, 
+ uint32_t bti, uint32_t ow_size, bool useSends)  {
+if (!useSends)
+  Gen8Encoder::OBWRITE(header, data, bti, ow_size, false);
+else {
+  GBE_ASSERT(data.reg() != header.reg());
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+
+  uint32_t dataRegs = ow_size / 2;
+  // half reg should also have size 1
+  if (dataRegs == 0)
+dataRegs = 1;
+  gen9_insn->bits2.sends.src1_length = dataRegs;
+
+  const uint32_t block_size = getOBlockSize(ow_size);
+  const uint32_t msg_length = 1;
+  const uint32_t response_length = 0;
+  setOBlockRW(insn,
+bti,
+block_size,
+GEN7_OBLOCK_WRITE,
+msg_length,
+response_length);
+}
+  }
+
+  void Gen9Encoder::MBWRITE(GenRegister header, GenRegister data, 
+ uint32_t bti, uint32_t data_size, bool useSends)  {
+if (!useSends)
+  Gen8Encoder::MBWRITE(header, data, bti, data_size, false);
+else {
+  GBE_ASSERT(data.reg() != header.reg());
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+  gen9_insn->bits2.sends.src1_length = data_size;
+
+  const uint32_t msg_length = 1;
+  const uint32_t response_length = 0;
+  setMBlockRW(insn,
+bti,
+GEN75_P1_MEDIA_TYPED_BWRITE,
+msg_length,
+response_length);
+}
+  }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index d754d59..2eaa538 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -54,6 +54,8 @@ namespace gbe
 virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize, bool useSends);
 virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize);
 virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, 
GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
+virtual void OBWRITE(GenRegister header, GenRegister data, uint32_t bti, 
uint32_t ow_size, bool useSends);
+virtual void MBWRITE(GenRegister header, GenRegister data, uint32_t 
+ bti, uint32_t data_size, bool useSends);
   };
 }
 #endif /* __GBE_GEN9_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 10e2c9e..5f60ff3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3569,13 +3569,14 @@ namespace gbe
 
   void GenContext::emitOBWriteInstruction(const SelectionInstruction ) {
 const GenRegister header = ra->genReg(insn.src(0));
+const GenRegister data = ra->genReg(insn.src(1));
 const uint32_t bti = insn.getbti();
 const uint32_t ow_size = insn.extra.elem;
 bool isA64 = bti =

[Beignet] [PATCH V2] output more detail of GEN IR for workgroup op

2016-12-21 Thread Guo, Yejun
v2: the number of sources differs between ops, so print every source
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/ir/instruction.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 0687dbf..f0c3957 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1917,7 +1917,8 @@ namespace ir {
   }
 
   out << " %" << this->getDst(fn, 0);
-  out << " %" << this->getSrc(fn, 0);
+  for (uint32_t i = 0; i < this->getSrcNum(); ++i)
+out << " %" << this->getSrc(fn, i);
 
   if (this->workGroupOp == WORKGROUP_OP_BROADCAST) {
 do {
@@ -1942,7 +1943,7 @@ namespace ir {
 } while(0);
   }
 
-  out << "TheadID Map at SLM: " << this->slmAddr;
+  out << " (TheadID Map at SLM: " << this->slmAddr << ")";
 }
 
 INLINE void SubGroupInstruction::out(std::ostream , const Function 
) const {
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] output more detail of GEN IR for workgroup op

2016-12-19 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/ir/instruction.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 0687dbf..12ffbdb 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1918,6 +1918,8 @@ namespace ir {
 
   out << " %" << this->getDst(fn, 0);
   out << " %" << this->getSrc(fn, 0);
+  out << " %" << this->getSrc(fn, 1);
+  out << " %" << this->getSrc(fn, 2);
 
   if (this->workGroupOp == WORKGROUP_OP_BROADCAST) {
 do {
@@ -1942,7 +1944,7 @@ namespace ir {
 } while(0);
   }
 
-  out << "TheadID Map at SLM: " << this->slmAddr;
+  out << "(TheadID Map at SLM: " << this->slmAddr << ")";
 }
 
 INLINE void SubGroupInstruction::out(std::ostream , const Function 
) const {
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] add sends support for oword/media block write

2016-12-19 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen9_encoder.cpp   | 57 ++
 backend/src/backend/gen9_encoder.hpp   |  2 ++
 backend/src/backend/gen_context.cpp|  6 ++--
 backend/src/backend/gen_encoder.cpp|  4 +--
 backend/src/backend/gen_encoder.hpp|  4 +--
 backend/src/backend/gen_insn_selection.cpp | 54 +---
 6 files changed, 109 insertions(+), 18 deletions(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 940809b..b37fd98 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -244,4 +244,61 @@ namespace gbe
 gen9_insn->bits2.sends.sel_reg32_desc = 1;
 }
   }
+
+  void Gen9Encoder::OBWRITE(GenRegister header, GenRegister data, uint32_t 
bti, uint32_t ow_size, bool useSends)
+  {
+if (!useSends)
+  Gen8Encoder::OBWRITE(header, data, bti, ow_size, false);
+else {
+  GBE_ASSERT(data.reg() != header.reg());
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+
+  uint32_t dataRegs = ow_size / 2;
+  // half reg should also have size 1
+  if (dataRegs == 0)
+dataRegs = 1;
+  gen9_insn->bits2.sends.src1_length = dataRegs;
+
+  const uint32_t block_size = getOBlockSize(ow_size);
+  const uint32_t msg_length = 1;
+  const uint32_t response_length = 0;
+  setOBlockRW(insn,
+bti,
+block_size,
+GEN7_OBLOCK_WRITE,
+msg_length,
+response_length);
+}
+  }
+
+  void Gen9Encoder::MBWRITE(GenRegister header, GenRegister data, uint32_t 
bti, uint32_t data_size, bool useSends)
+  {
+if (!useSends)
+  Gen8Encoder::MBWRITE(header, data, bti, data_size, false);
+else {
+  GBE_ASSERT(data.reg() != header.reg());
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+  gen9_insn->bits2.sends.src1_length = data_size;
+
+  const uint32_t msg_length = 1;
+  const uint32_t response_length = 0;
+  setMBlockRW(insn,
+bti,
+GEN75_P1_MEDIA_TYPED_BWRITE,
+msg_length,
+response_length);
+}
+  }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index d754d59..2eaa538 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -54,6 +54,8 @@ namespace gbe
 virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize, bool useSends);
 virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize);
 virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, 
GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
+virtual void OBWRITE(GenRegister header, GenRegister data, uint32_t bti, 
uint32_t ow_size, bool useSends);
+virtual void MBWRITE(GenRegister header, GenRegister data, uint32_t bti, 
uint32_t data_size, bool useSends);
   };
 }
 #endif /* __GBE_GEN9_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 10e2c9e..5f60ff3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3569,13 +3569,14 @@ namespace gbe
 
   void GenContext::emitOBWriteInstruction(const SelectionInstruction ) {
 const GenRegister header = ra->genReg(insn.src(0));
+const GenRegister data = ra->genReg(insn.src(1));
 const uint32_t bti = insn.getbti();
 const uint32_t ow_size = insn.extra.elem;
 bool isA64 = bti == 255;
 if (isA64)
p->OBWRITEA64(header, bti, ow_size);
 else
-   p->OBWRITE(header, bti, ow_size);
+   p->OBWRITE(header, data, bti, ow_size, insn.extra.splitSend);
   }
 
   void GenContext::emitMBReadInstruction(const SelectionInstruction ) {
@@ -3587,8 +3588,9 @@ namespace gbe
 
   void GenContext::emitMBWriteInstruction(const SelectionInstruction ) {
 const GenRegister header = ra->genReg(insn.dst(0));
+const GenRegister data = ra->genReg(insn.dst(1));
 const size_t data_size = insn.extra.elem;
-p->MBWRITE(header, insn.getbti(), data_size);
+p->MBWRITE(header, data, insn.getbti(), data_size, insn.extra.splitSend);
   }
 
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
diff --git a/backend/src/ba

Re: [Beignet] [PATCH V4] Backend: Refine block read/write instruction selection

2016-12-19 Thread Guo, Yejun
LGTM, thanks.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli 
Pan
Sent: Monday, December 19, 2016 3:58 PM
To: beignet@lists.freedesktop.org
Cc: Pan, Xiuli
Subject: [Beignet] [PATCH V4] Backend: Refine block read/write instruction 
selection

From: Pan Xiuli 

Move the block pack/unpack into instruction selection so that it can be
optimized. Also rename some variables to avoid confusion, and add some
new functions to the GenEncoder class.
V2: Use ud8grf instead of f8grf to save a retype.
V3: Merge the rename patch and fix some comments.
V4: Fix some SIMD8-related bugs and comment typos.

Signed-off-by: Pan Xiuli 
---
 backend/src/backend/gen8_encoder.cpp   |  40 ++-
 backend/src/backend/gen_context.cpp| 459 ++---
 backend/src/backend/gen_encoder.cpp| 105 ---
 backend/src/backend/gen_encoder.hpp|  18 +-
 backend/src/backend/gen_insn_selection.cpp | 448 +---
 5 files changed, 440 insertions(+), 630 deletions(-)

diff --git a/backend/src/backend/gen8_encoder.cpp 
b/backend/src/backend/gen8_encoder.cpp
index 8f73346..39dcfd3 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -840,20 +840,15 @@ namespace gbe
 gen8_insn->bits3.gen8_block_rw_a64.header_present = 1;
   }
 
-  void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t 
bti, uint32_t size) {
-   GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+  void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t 
bti, uint32_t ow_size) {
+GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 const uint32_t msg_length = 1;
-uint32_t rsize = size / 2;
-uint32_t msgsize = size;
-// When size is 1 OWord, which means half a reg, we need to know which 
half to use
-if (size == 1) {
-  if (dst.subnr == 0)
-msgsize = 0;
-  else
-msgsize = 1;
-}
-rsize = rsize == 0 ? 1 : rsize;
-const uint32_t response_length = rsize; // Size is in regs
+uint32_t sizeinreg = ow_size / 2;
+// half reg should also have size 1
+sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
+const uint32_t response_length = sizeinreg; // Size is in reg
+
 this->setHeader(insn);
 this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
 this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
@@ -861,21 +856,22 @@ namespace gbe
 setOBlockRWA64(this,
insn,
bti,
-   msgsize,
+   block_size,
GEN8_P1_BLOCK_READ_A64,
msg_length,
response_length);
 
   }
 
-  void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t 
size) {
-   GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-uint32_t rsize = size / 2;
-rsize = rsize == 0 ? 1 : rsize;
-const uint32_t msg_length = 1 + rsize; // Size is in owords
+  void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t 
ow_size) {
+GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+uint32_t sizeinreg = ow_size / 2;
+// half reg should also have size 1
+sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
 const uint32_t response_length = 0;
-uint32_t msgsize = size;
-msgsize = msgsize == 1 ? 0 : msgsize;
+const uint32_t block_size = getOBlockSize(ow_size);
+
 this->setHeader(insn);
 this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
 this->setSrc1(insn, GenRegister::immud(0));
@@ -883,7 +879,7 @@ namespace gbe
 setOBlockRWA64(this,
insn,
bti,
-   msgsize,
+   block_size,
GEN8_P1_BLOCK_WRITE_A64,
msg_length,
response_length);
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 8288fa5..791e607 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3551,458 +3551,39 @@ namespace gbe
   }
 
   void GenContext::emitOBReadInstruction(const SelectionInstruction ) {
-const GenRegister dst= ra->genReg(insn.dst(1));
-const GenRegister addrreg = ra->genReg(insn.src(0));
-uint32_t type = dst.type;
-uint32_t typesize = typeSize(type);
-const uint32_t vec_size = insn.extra.elem;
-const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + 
vec_size)), type);
-const uint32_t simdWidth = p->curr.execWidth;
-const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), 
GEN_TYPE_UD);
-const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
-GenRegister headeraddr;
-bool 

[Beignet] [PATCH] add sends support for printf

2016-12-18 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_context.cpp| 21 +++--
 backend/src/backend/gen_context.hpp|  2 +-
 backend/src/backend/gen_insn_selection.cpp | 25 -
 backend/src/backend/gen_insn_selection.hpp |  1 +
 4 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index c8019e3..cb25b73 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3477,13 +3477,13 @@ namespace gbe
   }
 
   void GenContext::emitPrintfLongInstruction(GenRegister& addr, GenRegister& 
data,
- GenRegister& src, uint32_t bti) {
+ GenRegister& src, uint32_t bti, 
bool useSends) {
 p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src.bottom_half());
-p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
+p->UNTYPED_WRITE(addr, data, GenRegister::immud(bti), 1, useSends);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
 
 p->MOV(GenRegister::retype(data, GEN_TYPE_UD), 
src.top_half(this->simdWidth));
-p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
+p->UNTYPED_WRITE(addr, data, GenRegister::immud(bti), 1, useSends);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
   }
 
@@ -3495,6 +3495,7 @@ namespace gbe
 
 GenRegister addr = GenRegister::retype(tmp0, GEN_TYPE_UD);
 GenRegister data = GenRegister::retype(tmp1, GEN_TYPE_UD);
+bool useSends = insn.extra.printfSplitSend;
 
 if (!insn.extra.continueFlag) {
   p->push(); {
@@ -3505,18 +3506,18 @@ namespace gbe
 p->MOV(data, GenRegister::immud(insn.extra.printfSize + 12));
   } p->pop();
 
-  p->ATOMIC(addr, GEN_ATOMIC_OP_ADD, addr, addr, 
GenRegister::immud(insn.extra.printfBTI), 2, false);
+  p->ATOMIC(addr, GEN_ATOMIC_OP_ADD, addr, data, 
GenRegister::immud(insn.extra.printfBTI), 2, useSends);
   /* Write out the header. */
   p->MOV(data, GenRegister::immud(0xAABBCCDD));
-  p->UNTYPED_WRITE(addr, addr, GenRegister::immud(insn.extra.printfBTI), 
1, false);
+  p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 
1, useSends);
 
   p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
   p->MOV(data, GenRegister::immud(insn.extra.printfSize + 12));
-  p->UNTYPED_WRITE(addr, addr, GenRegister::immud(insn.extra.printfBTI), 
1, false);
+  p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 
1, useSends);
 
   p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
   p->MOV(data, GenRegister::immud(insn.extra.printfNum));
-  p->UNTYPED_WRITE(addr, addr, GenRegister::immud(insn.extra.printfBTI), 
1, false);
+  p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 
1, useSends);
 
   p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
 }
@@ -3526,14 +3527,14 @@ namespace gbe
   src = ra->genReg(insn.src(i));
   if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D || src.type == 
GEN_TYPE_F) {
 p->MOV(GenRegister::retype(data, src.type), src);
-p->UNTYPED_WRITE(addr, addr, GenRegister::immud(insn.extra.printfBTI), 
1, false);
+p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 
1, useSends);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
   } else if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_UB ) {
 p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src);
-p->UNTYPED_WRITE(addr, addr, GenRegister::immud(insn.extra.printfBTI), 
1, false);
+p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 
1, useSends);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
   } else if (src.type == GEN_TYPE_L || src.type == GEN_TYPE_UL ) {
-emitPrintfLongInstruction(addr, data, src, insn.extra.printfBTI);
+emitPrintfLongInstruction(addr, data, src, insn.extra.printfBTI, 
useSends);
   }
 }
   }
diff --git a/backend/src/backend/gen_context.hpp 
b/backend/src/backend/gen_context.hpp
index fe7317d..7fd40d1 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -243,7 +243,7 @@ namespace gbe
 void calcGlobalXYZRange(GenRegister& reg, GenRegister& tmp, int flag, int 
subFlag);
 virtual void subTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& 
tmp);
 virtual void addTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& 
tmp);
-virtual void emitPrintfLongInstruction(GenRegister& addr, GenRegister& 
data, GenRegister& src, uint32_t bti);
+virtual void emitPrintfLongInstr

[Beignet] [PATCH V2 2/5] support sends for long write

2016-12-14 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 28 +++-
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 1cd6137..f46207f 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -1625,7 +1625,6 @@ namespace gbe
   // dst: srcNum, (flagTemp)
   // src: srcNum, addr, srcNum, bti.
   insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum*2 + 2);
-  vector = this->appendVector();
 
   for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
 insn->src(elemID) = src[elemID];
@@ -1646,10 +1645,29 @@ namespace gbe
   }
   insn->extra.elem = srcNum;
 
-  vector->regNum = srcNum + 1;
-  vector->offsetID = srcNum;
-  vector->reg = >src(srcNum);
-  vector->isSrc = 1;
+  if (hasSends()) {
+insn->extra.splitSend = 1;
+
+//addr regs
+vector = this->appendVector();
+vector->regNum = 1;
+vector->offsetID = srcNum;
+vector->reg = >src(srcNum);
+vector->isSrc = 1;
+
+//data regs
+vector = this->appendVector();
+vector->regNum = srcNum;
+vector->offsetID = srcNum+1;
+vector->reg = >src(srcNum+1);
+vector->isSrc = 1;
+  } else {
+vector = this->appendVector();
+vector->regNum = srcNum + 1;
+vector->offsetID = srcNum;
+vector->reg = >src(srcNum);
+vector->isSrc = 1;
+  }
 }
 
 if (bti.file != GEN_IMMEDIATE_VALUE) {
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH V2 5/5] enable sends for typed write

2016-12-14 Thread Guo, Yejun
v2: use GBE_ASSERT
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen9_encoder.cpp   | 22 +
 backend/src/backend/gen9_encoder.hpp   |  1 +
 backend/src/backend/gen_context.cpp|  3 ++-
 backend/src/backend/gen_encoder.cpp|  2 +-
 backend/src/backend/gen_encoder.hpp|  4 +++-
 backend/src/backend/gen_insn_selection.cpp | 31 --
 backend/src/backend/gen_insn_selection.hpp |  1 +
 7 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 8d3e39a..940809b 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -146,6 +146,28 @@ namespace gbe
 }
   }
 
+  void Gen9Encoder::TYPED_WRITE(GenRegister header, GenRegister data, bool 
header_present, unsigned char bti, bool useSends)
+  {
+if (!useSends)
+  Gen8Encoder::TYPED_WRITE(header, data, header_present, bti, false);
+else {
+  GBE_ASSERT(header.reg() != data.reg());
+
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+  assert(header_present);
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+  gen9_insn->bits2.sends.src1_length = 4;   //src0_length: 
5(header+u+v+w+lod), src1_length: 4(data)
+
+  gen9_insn->bits2.sends.sel_reg32_desc = 0;
+  setTypedWriteMessage(insn, bti, GEN_TYPED_WRITE, 5, header_present);
+}
+  }
+
   unsigned Gen9Encoder::setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize)
   {
 uint32_t msg_length = 0;
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index 9b3af13..d754d59 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -49,6 +49,7 @@ namespace gbe
 bool isUniform);
 void setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, 
GenRegister src0, GenRegister src1);
 virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemNum, bool useSends);
+virtual void TYPED_WRITE(GenRegister header, GenRegister data, bool 
header_present, unsigned char bti, bool useSends);
 virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemNum);
 virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize, bool useSends);
 virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize);
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index d161ebf..c8019e3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2465,8 +2465,9 @@ namespace gbe
 
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction ) 
{
 const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+GenRegister data = ra->genReg(insn.src(5));
 const uint32_t bti = insn.getbti();
-p->TYPED_WRITE(header, true, bti);
+p->TYPED_WRITE(header, data, true, bti, insn.extra.typedWriteSplitSend);
   }
 
   static void calcGID(GenRegister& reg, GenRegister& tmp, int flag, int 
subFlag, int dim, GenContext *gc)
diff --git a/backend/src/backend/gen_encoder.cpp 
b/backend/src/backend/gen_encoder.cpp
index a9bdd3a..5dea48a 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1257,7 +1257,7 @@ namespace gbe
   msg_type, vme_search_path_lut, lut_sub);
   }
 
-  void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned 
char bti)
+  void GenEncoder::TYPED_WRITE(GenRegister msg, GenRegister data, bool 
header_present, unsigned char bti, bool useSends)
   {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 uint32_t msg_type = GEN_TYPED_WRITE;
diff --git a/backend/src/backend/gen_encoder.hpp 
b/backend/src/backend/gen_encoder.hpp
index b86e9e4..66aa9cb 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -234,8 +234,10 @@ namespace gbe
 
 /*! TypedWrite instruction for texture */
 virtual void TYPED_WRITE(GenRegister header,
+ GenRegister data,
  bool header_present,
- unsigned char bti);
+ unsigned char bti,
+ bool useSends);
 /*! Extended math function (2 sources) */
 void MATH(GenRegister dst, uint32_t function, GenRegister src0, 
GenRegister src1);
 /*! Extended math function (1 source) */
diff --git a/backend/src/backen

[Beignet] [PATCH V2 1/5] refine code to pass insn.extra.splitSend as an encoder function parameter

2016-12-14 Thread Guo, Yejun
This makes it possible to switch between send and sends within the encoder function.

v2: use GBE_ASSERT etc.
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen8_context.cpp | 14 ++---
 backend/src/backend/gen8_encoder.cpp |  2 +-
 backend/src/backend/gen8_encoder.hpp |  2 +-
 backend/src/backend/gen9_encoder.cpp | 22 -
 backend/src/backend/gen9_encoder.hpp |  4 ++--
 backend/src/backend/gen_context.cpp  | 38 
 backend/src/backend/gen_encoder.cpp  |  4 ++--
 backend/src/backend/gen_encoder.hpp  |  4 ++--
 8 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index 95b1013..a3045ce 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -969,8 +969,6 @@ namespace gbe
 const GenRegister addr = ra->genReg(insn.src(elemNum));
 const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
 GenRegister data = ra->genReg(insn.src(elemNum+1));
-if (!insn.extra.splitSend)
-  data = addr;
 
 /* Because BDW's store and load send instructions for 64 bits require the 
bti to be surfaceless,
which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -981,7 +979,7 @@ namespace gbe
 }
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->UNTYPED_WRITE(addr, data, bti, elemNum*2);
+  p->UNTYPED_WRITE(addr, data, bti, elemNum*2, insn.extra.splitSend);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(elemNum));
   const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
@@ -997,7 +995,7 @@ namespace gbe
   p->push();
 p->curr.predicate = GEN_PREDICATE_NORMAL;
 p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2);
+p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2, 
insn.extra.splitSend);
   p->pop();
   afterMessage(insn, bti, tmp, btiTmp, jip0);
 }
@@ -1358,7 +1356,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst, nextSrc);
 p->pop();
-p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1);
+p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
 
 p->push();
@@ -1374,7 +1372,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst, nextSrc);
 p->pop();
-p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1);
+p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
   }
 
@@ -1801,7 +1799,7 @@ namespace gbe
   p->curr.execWidth = 8;
   p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
   p->ADD(msgAddr, msgAddr, msgSlmOff);
-  p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 2);
+  p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 2, false);
 }
 else
 {
@@ -1809,7 +1807,7 @@ namespace gbe
   p->MOV(msgData, threadData);
   p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
   p->ADD(msgAddr, msgAddr, msgSlmOff);
-  p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 1);
+  p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 1, false);
 }
 
 /* init partialData register, it will hold the final result */
diff --git a/backend/src/backend/gen8_encoder.cpp 
b/backend/src/backend/gen8_encoder.cpp
index 8f73346..2928943 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -268,7 +268,7 @@ namespace gbe
 return insn->bits3.ud;
   }
 
-  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, 
GenRegister bti, uint32_t elemNum) {
+  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, 
GenRegister bti, uint32_t elemNum, bool useSends) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 assert(elemNum >= 1 || elemNum <= 4);
 this->setHeader(insn);
diff --git a/backend/src/backend/gen8_encoder.hpp 
b/backend/src/backend/gen8_encoder.hpp
index f6a91a0..4afec0c 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -47,7 +47,7 @@ namespace gbe
 virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
 virtual void ATOMICA64(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
-virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
+virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum

[Beignet] [PATCH V2 4/5] refine code starting from header in typedwrite

2016-12-14 Thread Guo, Yejun
With this refinement, virtual registers and physical registers map logically
1:1, which helps the later support for the sends instruction.

v2: use macro NOT_SUPPORTED
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 145 -
 1 file changed, 78 insertions(+), 67 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 1ebbbe6..852d347 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -6809,86 +6809,97 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   {
 INLINE bool emitOne(Selection::Opaque , const 
ir::TypedWriteInstruction , bool ) const
 {
-  using namespace ir;
-  const uint32_t simdWidth = sel.ctx.getSimdWidth();
-  GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
-  const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
-  const uint32_t dim = insn.getSrcNum() - 4;
-
-  if (simdWidth == 16) {
-for(uint32_t i = 0; i < msgNum; i++)
-  msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-  } else {
-uint32_t valueID = 0;
-uint32_t msgID = 0;
-msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-for(; msgID < 1 + dim; msgID++, valueID++)
-  msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), 
insn.getCoordType());
-
-// fake v.
-if (dim < 2)
-  msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-// fake w.
-if (dim < 3)
-  msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-// LOD.
-msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-for(; valueID < insn.getSrcNum(); msgID++, valueID++)
-  msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
-  }
-
+  const GenRegister header = GenRegister::ud8grf(sel.reg(ir::FAMILY_REG));
   sel.push();
   sel.curr.predicate = GEN_PREDICATE_NONE;
   sel.curr.noMask = 1;
-  sel.MOV(msgs[0], GenRegister::immud(0));
+  sel.MOV(header, GenRegister::immud(0));
   sel.curr.execWidth = 1;
-
-  GenRegister channelEn = sel.getOffsetReg(msgs[0], 0, 7*4);
+  GenRegister channelEn = sel.getOffsetReg(header, 0, 7*4);
   // Enable all channels.
   sel.MOV(channelEn, GenRegister::immud(0x));
-  sel.curr.execWidth = 8;
-  // Set zero LOD.
-  if (simdWidth == 8)
-sel.MOV(msgs[4], GenRegister::immud(0));
-  else
-sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0));
   sel.pop();
 
+  const uint32_t simdWidth = sel.ctx.getSimdWidth();
+  if (simdWidth == 16)
+emitWithSimd16(sel, insn, markChildren, header);
+  else if (simdWidth == 8)
+emitWithSimd8(sel, insn, markChildren, header);
+  else
+NOT_SUPPORTED;
+  return true;
+}
+
+INLINE bool emitWithSimd16(Selection::Opaque , const 
ir::TypedWriteInstruction , bool , const GenRegister& header) 
const
+{
+  using namespace ir;
+
+  GenRegister msgs[9]; // (header + U + V + W + LOD + 4)
+  msgs[0] = header;
+  for (uint32_t i = 1; i < 9; ++i) {
+//SIMD16 will be split into two SIMD8,
+//each virtual reg in msgs requires one physical reg with 8 DWORDs (32 
bytes),
+//so, declare with FAMILY_WORD, and the allocated size will be 
sizeof(WORD)*SIMD16 = 32 bytes
+msgs[i] = sel.selReg(sel.reg(FAMILY_WORD), TYPE_U32);
+  }
+
+  const uint32_t dims = insn.getSrcNum() - 4;
   uint32_t bti = insn.getImageIndex();
-  if (simdWidth == 8)
-sel.TYPED_WRITE(msgs, msgNum, bti, dim == 3);
-  else {
-sel.push();
-sel.curr.execWidth = 8;
-for( uint32_t quarter = 0; quarter < 2; quarter++)
-{
-  #define QUARTER_MOV0(msgs, msgid, src) \
-sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], 
GEN_TYPE_UD), msgid % 2), \
-GenRegister::Qn(src, quarter))
-
-  #define QUARTER_MOV1(msgs, msgid, src) \
-  sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], 
src.type), msgid % 2), \
-  GenRegister::Qn(src, quarter))
-  sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : 
GEN_COMPRESSION_Q2;
-  // Set U,V,W
-  QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), 
insn.getCoordType()));
-  if (dim > 1)
-QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), 
insn.getCoordType()));
-  if (dim > 2)
-QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), 
insn.getCoordType()));
-  // Set R, G, B, A
-  QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(dim), 
insn.getSrcType()));
-  QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(dim + 1), 
insn.getSrcType

[Beignet] [PATCH V2 3/5] add sends for atomic operation, only for ocl 1.2

2016-12-14 Thread Guo, Yejun
v2: use GBE_ASSERT etc
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen75_encoder.cpp  |  2 +-
 backend/src/backend/gen75_encoder.hpp  |  2 +-
 backend/src/backend/gen8_encoder.cpp   |  2 +-
 backend/src/backend/gen8_encoder.hpp   |  2 +-
 backend/src/backend/gen9_encoder.cpp   | 28 
 backend/src/backend/gen9_encoder.hpp   |  1 +
 backend/src/backend/gen_context.cpp| 20 ++--
 backend/src/backend/gen_encoder.cpp|  4 ++--
 backend/src/backend/gen_encoder.hpp|  2 +-
 backend/src/backend/gen_insn_selection.cpp | 25 -
 10 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/backend/src/backend/gen75_encoder.cpp 
b/backend/src/backend/gen75_encoder.cpp
index 9cafaa7..725c774 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -126,7 +126,7 @@ namespace gbe
 return gen7_insn->bits3.ud;
   }
 
-  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum) {
+  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum, bool useSends) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 
 this->setHeader(insn);
diff --git a/backend/src/backend/gen75_encoder.hpp 
b/backend/src/backend/gen75_encoder.hpp
index 517afff..2a226cc 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -42,7 +42,7 @@ namespace gbe
 virtual void JMPI(GenRegister src, bool longjmp = false);
 /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump 
distance */
 virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
-virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
+virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum, bool useSends);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
 virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual void setHeader(GenNativeInstruction *insn);
diff --git a/backend/src/backend/gen8_encoder.cpp 
b/backend/src/backend/gen8_encoder.cpp
index 2928943..277acda 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -153,7 +153,7 @@ namespace gbe
 return gen8_insn->bits3.ud;
   }
 
-  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum) {
+  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 
 this->setHeader(insn);
diff --git a/backend/src/backend/gen8_encoder.hpp 
b/backend/src/backend/gen8_encoder.hpp
index 4afec0c..fa62a8d 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -44,7 +44,7 @@ namespace gbe
 virtual void F16TO32(GenRegister dest, GenRegister src0);
 virtual void F32TO16(GenRegister dest, GenRegister src0);
 virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
-virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
+virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, 
GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
 virtual void ATOMICA64(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
 virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum, bool useSends);
diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 47175f6..8d3e39a 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -194,4 +194,32 @@ namespace gbe
 gen9_insn->bits2.sends.sel_reg32_desc = 1;
 }
   }
+
+  void Gen9Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends)
+  {
+if (!useSends)
+  Gen8Encoder::ATOMIC(dst, function, addr, data, bti, srcNum, false);
+else {
+  GBE_ASSERT(addr.reg() != data.reg());
+
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+  setSendsOperands(gen9_insn, dst, addr, data);
+  if (this->curr.execWidth == 8)
+gen9_insn->bits

Re: [Beignet] [PATCH 3/5] add sends for atomic operation, only for ocl 1.2

2016-12-14 Thread Guo, Yejun
Yes, it is better to use our GBE macros. I just copied the assert from
somewhere in the file, and will update it after all the reviews.

Profiling, printf and maybe more will be the next step. It is hard to put
them all together, so this patch focuses on the atomic functions.

-Original Message-
From: Pan, Xiuli 
Sent: Wednesday, December 14, 2016 4:48 PM
To: Guo, Yejun; beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: RE: [Beignet] [PATCH 3/5] add sends for atomic operation, only for ocl 
1.2

We have the GBE_ASSERT, NOT_IMPLEMENTED and NOT_SUPPORTED macros for GBE. Maybe
we should not use assert directly.
Also, I see that we do not use split sends for some atomics in profiling and
printf. Is there any difference, or do they not need to use sends?

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Guo, 
Yejun
Sent: Friday, December 9, 2016 6:08 PM
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun <yejun@intel.com>
Subject: [Beignet] [PATCH 3/5] add sends for atomic operation, only for ocl 1.2

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen75_encoder.cpp  |  2 +-
 backend/src/backend/gen75_encoder.hpp  |  2 +-
 backend/src/backend/gen8_encoder.cpp   |  2 +-
 backend/src/backend/gen8_encoder.hpp   |  2 +-
 backend/src/backend/gen9_encoder.cpp   | 28 
 backend/src/backend/gen9_encoder.hpp   |  1 +
 backend/src/backend/gen_context.cpp| 20 ++--
 backend/src/backend/gen_encoder.cpp|  4 ++--
 backend/src/backend/gen_encoder.hpp|  2 +-
 backend/src/backend/gen_insn_selection.cpp | 25 -
 10 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/backend/src/backend/gen75_encoder.cpp 
b/backend/src/backend/gen75_encoder.cpp
index 9cafaa7..725c774 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -126,7 +126,7 @@ namespace gbe
 return gen7_insn->bits3.ud;
   }
 
-  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum) {
+  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, 
+ GenRegister src, GenRegister bti, uint32_t srcNum, bool useSends) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 
 this->setHeader(insn);
diff --git a/backend/src/backend/gen75_encoder.hpp 
b/backend/src/backend/gen75_encoder.hpp
index 517afff..2a226cc 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -42,7 +42,7 @@ namespace gbe
 virtual void JMPI(GenRegister src, bool longjmp = false);
 /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump 
distance */
 virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
-virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
+virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister 
+ src, GenRegister bti, uint32_t srcNum, bool useSends);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
 virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual void setHeader(GenNativeInstruction *insn); diff --git 
a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 2928943..277acda 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -153,7 +153,7 @@ namespace gbe
 return gen8_insn->bits3.ud;
   }
 
-  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum) {
+  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, 
+ GenRegister src, GenRegister data, GenRegister bti, uint32_t srcNum, 
+ bool useSends) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 
 this->setHeader(insn);
diff --git a/backend/src/backend/gen8_encoder.hpp 
b/backend/src/backend/gen8_encoder.hpp
index 4afec0c..fa62a8d 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -44,7 +44,7 @@ namespace gbe
 virtual void F16TO32(GenRegister dest, GenRegister src0);
 virtual void F32TO16(GenRegister dest, GenRegister src0);
 virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
-virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
+virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister 
+ addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool 
+ useSends);
 virtual void ATOMICA64(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
 virtual void UNTYPED_WRITE(GenRegister src, GenRegister da

Re: [Beignet] [PATCH] Backend: Refine block read/write instruction selection

2016-12-13 Thread Guo, Yejun
two comments, thanks.

1.  for header register, we can call:
 const GenRegister header = GenRegister::ud8grf(sel.reg(ir::FAMILY_REG));
instead of:
const GenRegister header = 
GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD);

2. How about separating the logic for SIMD8 and SIMD16?
Two considerations:
a) In the current patch, I see you have written an elaborate algorithm to handle
all the cases. If we separate it, the logic can be simpler and easier to
understand.

b) At SIMD8, it helps to decrease the register pressure. For example, the
following instructions:
[46]MOV(8)  %66<1>:UD   :   %56<8,8,1>:UD
[48]MOV(8)  %67<1>:UD   :   %57<8,8,1>:UD
[50]MOV(8)  %68<1>:UD   :   %58<8,8,1>:UD
[52]MOV(8)  %69<1>:UD   :   %59<8,8,1>:UD
[54]OBWRITE(8)  :   %65<8,8,1>:UD   %66<8,8,1>:UD   
%67<8,8,1>:UD   %68<8,8,1>:UD   %69<8,8,1>:UD
can be replaced with:
OBWRITE(8)  :   %65<8,8,1>:UD   %56<8,8,1>:UD   %57<8,8,1>:UD   
%58<8,8,1>:UD   %59<8,8,1>:UD

Due to the reg vector requirement, the instructions above cannot be
optimized at the sel IR level; however, it is feasible for us to generate the
optimized IR.

thanks
yejun

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli 
Pan
Sent: Friday, December 09, 2016 3:01 PM
To: beignet@lists.freedesktop.org
Cc: Pan, Xiuli
Subject: [Beignet] [PATCH] Backend: Refine block read/write instruction 
selection

From: Pan Xiuli 

Move the block pack/unpack into instruction selection in order to get 
optimization.

Signed-off-by: Pan Xiuli 
---
 backend/src/backend/gen_context.cpp| 459 ++---
 backend/src/backend/gen_insn_selection.cpp | 439 ---
 2 files changed, 346 insertions(+), 552 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 798fac8..4e971a2 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3551,458 +3551,39 @@ namespace gbe
   }
 
   void GenContext::emitOBReadInstruction(const SelectionInstruction ) {
-const GenRegister dst= ra->genReg(insn.dst(1));
-const GenRegister addrreg = ra->genReg(insn.src(0));
-uint32_t type = dst.type;
-uint32_t typesize = typeSize(type);
-const uint32_t vec_size = insn.extra.elem;
-const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + 
vec_size)), type);
-const uint32_t simdWidth = p->curr.execWidth;
-const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), 
GEN_TYPE_UD);
-const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
-GenRegister headeraddr;
-bool isA64 = insn.getbti() == 255;
+const GenRegister header = ra->genReg(insn.src(0));
+const GenRegister tmp = ra->genReg(insn.dst(0));
+const uint32_t bti = insn.getbti();
+const uint32_t ow_size = insn.extra.elem;
+bool isA64 = bti == 255;
 if (isA64)
-  headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), 
GEN_TYPE_UL);
+   p->OBREADA64(tmp, header, bti, ow_size);
 else
-  headeraddr = GenRegister::offset(header, 0, 2*4);
-
-// Make header
-p->push();
-{
-  // Copy r0 into the header first
-  p->curr.execWidth = 8;
-  p->curr.predicate = GEN_PREDICATE_NONE;
-  p->curr.noMask = 1;
-  p->MOV(header, GenRegister::ud8grf(0, 0));
-
-  // Update the header with the current address
-  p->curr.execWidth = 1;
-  p->MOV(headeraddr, addr);
-
-  // Put zero in the general state base address
-  if (!isA64)
-p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
-
-}
-p->pop();
-// Now read the data, oword block read can only work with simd16 and no 
mask
-if (vec_size == 1) {
-  p->push();
-  {
-p->curr.execWidth = 16;
-p->curr.noMask = 1;
-if (isA64) {
-  //p->curr.execWidth = 8;
-  p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16);
-}
-else
-  p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
-  }
-  p->pop();
-} else if (vec_size == 2) {
-  p->push();
-  {
-p->curr.execWidth = 16;
-p->curr.noMask = 1;
-if (isA64)
-  p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8);
-else
-  p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
-  }
-  p->pop();
-  p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
-  p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * 
typesize ));
-} else if (vec_size == 4) {
-  if (simdWidth == 8) {
-p->push();
-{
-  p->curr.execWidth = 16;
-  p->curr.noMask = 1;
-

Re: [Beignet] [PATCH 02/19] Runtime: fix clEnqueueMigrateMemObjects fail.

2016-12-13 Thread Guo, Yejun
this [PATCH 02/19] looks good to me, thanks.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Yang 
Rong
Sent: Monday, November 28, 2016 7:32 PM
To: beignet@lists.freedesktop.org
Cc: Yang, Rong R
Subject: [Beignet] [PATCH 02/19] Runtime: fix clEnqueueMigrateMemObjects fail.

clEnqueueMigrateMemObjects's parameter may be a clBuffer or a clImage, so we
should call CL_OBJECT_IS_MEM.

Signed-off-by: Yang Rong 
---
 src/cl_api_mem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c index 054c37a..7314a48 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1177,7 +1177,7 @@ clEnqueueMigrateMemObjects(cl_command_queue command_queue,
 }
 
 for (i = 0; i < num_mem_objects; i++) {
-  if (!CL_OBJECT_IS_BUFFER(mem_objects[i])) {
+  if (!CL_OBJECT_IS_MEM(mem_objects[i])) {
 err = CL_INVALID_MEM_OBJECT;
 break;
   }
--
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] fix cts issue for clEnqueueMigrateMemObjects

2016-12-13 Thread Guo, Yejun
sure.

-Original Message-
From: Yang, Rong R 
Sent: Wednesday, December 14, 2016 10:37 AM
To: Guo, Yejun; beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: RE: [Beignet] [PATCH] fix cts issue for clEnqueueMigrateMemObjects

I have sent the same patch, "[PATCH 02/19] Runtime: fix
clEnqueueMigrateMemObjects fail." Can you try it?

> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf 
> Of Guo, Yejun
> Sent: Monday, December 12, 2016 11:14
> To: beignet@lists.freedesktop.org
> Cc: Guo, Yejun <yejun@intel.com>
> Subject: [Beignet] [PATCH] fix cts issue for 
> clEnqueueMigrateMemObjects
> 
> test case: test_buffers image_migrate
> according to spec, the input parameter is valid with mem object
> 
> Signed-off-by: Guo, Yejun <yejun@intel.com>
> ---
>  src/cl_api_mem.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c index 
> 054c37a..7314a48
> 100644
> --- a/src/cl_api_mem.c
> +++ b/src/cl_api_mem.c
> @@ -1177,7 +1177,7 @@
> clEnqueueMigrateMemObjects(cl_command_queue command_queue,
>  }
> 
>  for (i = 0; i < num_mem_objects; i++) {
> -  if (!CL_OBJECT_IS_BUFFER(mem_objects[i])) {
> +  if (!CL_OBJECT_IS_MEM(mem_objects[i])) {
>  err = CL_INVALID_MEM_OBJECT;
>  break;
>}
> --
> 1.9.1
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] fix cts issue for clEnqueueMigrateMemObjects

2016-12-11 Thread Guo, Yejun
Test case: test_buffers image_migrate
According to the spec, any mem object is valid as the input parameter.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 src/cl_api_mem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 054c37a..7314a48 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1177,7 +1177,7 @@ clEnqueueMigrateMemObjects(cl_command_queue command_queue,
 }
 
 for (i = 0; i < num_mem_objects; i++) {
-  if (!CL_OBJECT_IS_BUFFER(mem_objects[i])) {
+  if (!CL_OBJECT_IS_MEM(mem_objects[i])) {
 err = CL_INVALID_MEM_OBJECT;
 break;
   }
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 5/5] enable sends for typed write

2016-12-09 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen9_encoder.cpp   | 22 +
 backend/src/backend/gen9_encoder.hpp   |  1 +
 backend/src/backend/gen_context.cpp|  3 ++-
 backend/src/backend/gen_encoder.cpp|  2 +-
 backend/src/backend/gen_encoder.hpp|  4 +++-
 backend/src/backend/gen_insn_selection.cpp | 31 --
 backend/src/backend/gen_insn_selection.hpp |  1 +
 7 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index b42c833..cfbf985 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -146,6 +146,28 @@ namespace gbe
 }
   }
 
+  void Gen9Encoder::TYPED_WRITE(GenRegister header, GenRegister data, bool 
header_present, unsigned char bti, bool useSends)
+  {
+if (!useSends)
+  Gen8Encoder::TYPED_WRITE(header, data, header_present, bti, false);
+else {
+  assert(header.reg() != data.reg());
+
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+  assert(header_present);
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+  gen9_insn->bits2.sends.src1_length = 4;   //src0_length: 
5(header+u+v+w+lod), src1_length: 4(data)
+
+  gen9_insn->bits2.sends.sel_reg32_desc = 0;
+  setTypedWriteMessage(insn, bti, GEN_TYPED_WRITE, 5, header_present);
+}
+  }
+
   unsigned Gen9Encoder::setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize)
   {
 uint32_t msg_length = 0;
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index 9b3af13..d754d59 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -49,6 +49,7 @@ namespace gbe
 bool isUniform);
 void setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, 
GenRegister src0, GenRegister src1);
 virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemNum, bool useSends);
+virtual void TYPED_WRITE(GenRegister header, GenRegister data, bool 
header_present, unsigned char bti, bool useSends);
 virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemNum);
 virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize, bool useSends);
 virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize);
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index d161ebf..c8019e3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2465,8 +2465,9 @@ namespace gbe
 
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction ) 
{
 const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+GenRegister data = ra->genReg(insn.src(5));
 const uint32_t bti = insn.getbti();
-p->TYPED_WRITE(header, true, bti);
+p->TYPED_WRITE(header, data, true, bti, insn.extra.typedWriteSplitSend);
   }
 
   static void calcGID(GenRegister& reg, GenRegister& tmp, int flag, int 
subFlag, int dim, GenContext *gc)
diff --git a/backend/src/backend/gen_encoder.cpp 
b/backend/src/backend/gen_encoder.cpp
index a9bdd3a..5dea48a 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1257,7 +1257,7 @@ namespace gbe
   msg_type, vme_search_path_lut, lut_sub);
   }
 
-  void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned 
char bti)
+  void GenEncoder::TYPED_WRITE(GenRegister msg, GenRegister data, bool 
header_present, unsigned char bti, bool useSends)
   {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 uint32_t msg_type = GEN_TYPED_WRITE;
diff --git a/backend/src/backend/gen_encoder.hpp 
b/backend/src/backend/gen_encoder.hpp
index b86e9e4..66aa9cb 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -234,8 +234,10 @@ namespace gbe
 
 /*! TypedWrite instruction for texture */
 virtual void TYPED_WRITE(GenRegister header,
+ GenRegister data,
  bool header_present,
- unsigned char bti);
+ unsigned char bti,
+ bool useSends);
 /*! Extended math function (2 sources) */
 void MATH(GenRegister dst, uint32_t function, GenRegister src0, 
GenRegister src1);
 /*! Extended math function (1 source) */
diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/b

[Beignet] [PATCH 4/5] refine code starting from header in typedwrite

2016-12-09 Thread Guo, Yejun
With this refinement, the virtual regs and physical regs logically map
1:1, which helps the later use of the sends instruction

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 145 -
 1 file changed, 78 insertions(+), 67 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 1ebbbe6..ec0897c 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -6809,86 +6809,97 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   {
 INLINE bool emitOne(Selection::Opaque , const 
ir::TypedWriteInstruction , bool ) const
 {
-  using namespace ir;
-  const uint32_t simdWidth = sel.ctx.getSimdWidth();
-  GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
-  const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
-  const uint32_t dim = insn.getSrcNum() - 4;
-
-  if (simdWidth == 16) {
-for(uint32_t i = 0; i < msgNum; i++)
-  msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-  } else {
-uint32_t valueID = 0;
-uint32_t msgID = 0;
-msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-for(; msgID < 1 + dim; msgID++, valueID++)
-  msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), 
insn.getCoordType());
-
-// fake v.
-if (dim < 2)
-  msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-// fake w.
-if (dim < 3)
-  msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-// LOD.
-msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-for(; valueID < insn.getSrcNum(); msgID++, valueID++)
-  msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
-  }
-
+  const GenRegister header = GenRegister::ud8grf(sel.reg(ir::FAMILY_REG));
   sel.push();
   sel.curr.predicate = GEN_PREDICATE_NONE;
   sel.curr.noMask = 1;
-  sel.MOV(msgs[0], GenRegister::immud(0));
+  sel.MOV(header, GenRegister::immud(0));
   sel.curr.execWidth = 1;
-
-  GenRegister channelEn = sel.getOffsetReg(msgs[0], 0, 7*4);
+  GenRegister channelEn = sel.getOffsetReg(header, 0, 7*4);
   // Enable all channels.
   sel.MOV(channelEn, GenRegister::immud(0x));
-  sel.curr.execWidth = 8;
-  // Set zero LOD.
-  if (simdWidth == 8)
-sel.MOV(msgs[4], GenRegister::immud(0));
-  else
-sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0));
   sel.pop();
 
+  const uint32_t simdWidth = sel.ctx.getSimdWidth();
+  if (simdWidth == 16)
+emitWithSimd16(sel, insn, markChildren, header);
+  else if (simdWidth == 8)
+emitWithSimd8(sel, insn, markChildren, header);
+  else
+assert(!"not supported");
+  return true;
+}
+
+INLINE bool emitWithSimd16(Selection::Opaque , const 
ir::TypedWriteInstruction , bool , const GenRegister& header) 
const
+{
+  using namespace ir;
+
+  GenRegister msgs[9]; // (header + U + V + W + LOD + 4)
+  msgs[0] = header;
+  for (uint32_t i = 1; i < 9; ++i) {
+//SIMD16 will be split into two SIMD8,
+//each virtual reg in msgs requires one physical reg with 8 DWORDs (32 
bytes),
+//so, declare with FAMILY_WORD, and the allocated size will be 
sizeof(WORD)*SIMD16 = 32 bytes
+msgs[i] = sel.selReg(sel.reg(FAMILY_WORD), TYPE_U32);
+  }
+
+  const uint32_t dims = insn.getSrcNum() - 4;
   uint32_t bti = insn.getImageIndex();
-  if (simdWidth == 8)
-sel.TYPED_WRITE(msgs, msgNum, bti, dim == 3);
-  else {
-sel.push();
-sel.curr.execWidth = 8;
-for( uint32_t quarter = 0; quarter < 2; quarter++)
-{
-  #define QUARTER_MOV0(msgs, msgid, src) \
-sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], 
GEN_TYPE_UD), msgid % 2), \
-GenRegister::Qn(src, quarter))
-
-  #define QUARTER_MOV1(msgs, msgid, src) \
-  sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], 
src.type), msgid % 2), \
-  GenRegister::Qn(src, quarter))
-  sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : 
GEN_COMPRESSION_Q2;
-  // Set U,V,W
-  QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), 
insn.getCoordType()));
-  if (dim > 1)
-QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), 
insn.getCoordType()));
-  if (dim > 2)
-QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), 
insn.getCoordType()));
-  // Set R, G, B, A
-  QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(dim), 
insn.getSrcType()));
-  QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(dim + 1), 
insn.getSrcType()));
-   

[Beignet] [PATCH 3/5] add sends for atomic operation, only for ocl 1.2

2016-12-09 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen75_encoder.cpp  |  2 +-
 backend/src/backend/gen75_encoder.hpp  |  2 +-
 backend/src/backend/gen8_encoder.cpp   |  2 +-
 backend/src/backend/gen8_encoder.hpp   |  2 +-
 backend/src/backend/gen9_encoder.cpp   | 28 
 backend/src/backend/gen9_encoder.hpp   |  1 +
 backend/src/backend/gen_context.cpp| 20 ++--
 backend/src/backend/gen_encoder.cpp|  4 ++--
 backend/src/backend/gen_encoder.hpp|  2 +-
 backend/src/backend/gen_insn_selection.cpp | 25 -
 10 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/backend/src/backend/gen75_encoder.cpp 
b/backend/src/backend/gen75_encoder.cpp
index 9cafaa7..725c774 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -126,7 +126,7 @@ namespace gbe
 return gen7_insn->bits3.ud;
   }
 
-  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum) {
+  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum, bool useSends) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 
 this->setHeader(insn);
diff --git a/backend/src/backend/gen75_encoder.hpp 
b/backend/src/backend/gen75_encoder.hpp
index 517afff..2a226cc 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -42,7 +42,7 @@ namespace gbe
 virtual void JMPI(GenRegister src, bool longjmp = false);
 /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump 
distance */
 virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
-virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
+virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum, bool useSends);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
 virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual void setHeader(GenNativeInstruction *insn);
diff --git a/backend/src/backend/gen8_encoder.cpp 
b/backend/src/backend/gen8_encoder.cpp
index 2928943..277acda 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -153,7 +153,7 @@ namespace gbe
 return gen8_insn->bits3.ud;
   }
 
-  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum) {
+  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 
 this->setHeader(insn);
diff --git a/backend/src/backend/gen8_encoder.hpp 
b/backend/src/backend/gen8_encoder.hpp
index 4afec0c..fa62a8d 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -44,7 +44,7 @@ namespace gbe
 virtual void F16TO32(GenRegister dest, GenRegister src0);
 virtual void F32TO16(GenRegister dest, GenRegister src0);
 virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
-virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
+virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, 
GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
 virtual void ATOMICA64(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
 virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum, bool useSends);
diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 37ffb0d..b42c833 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -194,4 +194,32 @@ namespace gbe
 gen9_insn->bits2.sends.sel_reg32_desc = 1;
 }
   }
+
+  void Gen9Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends)
+  {
+if (!useSends)
+  Gen8Encoder::ATOMIC(dst, function, addr, data, bti, srcNum, false);
+else {
+  assert(addr.reg() != data.reg());
+
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+  setSendsOperands(gen9_insn, dst, addr, data);
+  if (this->curr.execWidth == 8)
+gen9_insn->bits2.sends.src1_length = srcNum - 

[Beignet] [PATCH 2/5] support sends for long write

2016-12-09 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 28 +++-
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 1cd6137..f46207f 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -1625,7 +1625,6 @@ namespace gbe
   // dst: srcNum, (flagTemp)
   // src: srcNum, addr, srcNum, bti.
   insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum*2 + 2);
-  vector = this->appendVector();
 
   for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
 insn->src(elemID) = src[elemID];
@@ -1646,10 +1645,29 @@ namespace gbe
   }
   insn->extra.elem = srcNum;
 
-  vector->regNum = srcNum + 1;
-  vector->offsetID = srcNum;
-  vector->reg = >src(srcNum);
-  vector->isSrc = 1;
+  if (hasSends()) {
+insn->extra.splitSend = 1;
+
+//addr regs
+vector = this->appendVector();
+vector->regNum = 1;
+vector->offsetID = srcNum;
+vector->reg = >src(srcNum);
+vector->isSrc = 1;
+
+//data regs
+vector = this->appendVector();
+vector->regNum = srcNum;
+vector->offsetID = srcNum+1;
+vector->reg = >src(srcNum+1);
+vector->isSrc = 1;
+  } else {
+vector = this->appendVector();
+vector->regNum = srcNum + 1;
+vector->offsetID = srcNum;
+vector->reg = >src(srcNum);
+vector->isSrc = 1;
+  }
 }
 
 if (bti.file != GEN_IMMEDIATE_VALUE) {
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 1/5] refine code to change insn.extra.splitSend as encoder function parameter

2016-12-09 Thread Guo, Yejun
This makes it possible to switch between send and sends within the encoder function.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen8_context.cpp | 14 ++---
 backend/src/backend/gen8_encoder.cpp |  2 +-
 backend/src/backend/gen8_encoder.hpp |  2 +-
 backend/src/backend/gen9_encoder.cpp | 16 +--
 backend/src/backend/gen9_encoder.hpp |  4 ++--
 backend/src/backend/gen_context.cpp  | 38 
 backend/src/backend/gen_encoder.cpp  |  4 ++--
 backend/src/backend/gen_encoder.hpp  |  4 ++--
 8 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index 95b1013..a3045ce 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -969,8 +969,6 @@ namespace gbe
 const GenRegister addr = ra->genReg(insn.src(elemNum));
 const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
 GenRegister data = ra->genReg(insn.src(elemNum+1));
-if (!insn.extra.splitSend)
-  data = addr;
 
 /* Because BDW's store and load send instructions for 64 bits require the 
bti to be surfaceless,
which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -981,7 +979,7 @@ namespace gbe
 }
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->UNTYPED_WRITE(addr, data, bti, elemNum*2);
+  p->UNTYPED_WRITE(addr, data, bti, elemNum*2, insn.extra.splitSend);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(elemNum));
   const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
@@ -997,7 +995,7 @@ namespace gbe
   p->push();
 p->curr.predicate = GEN_PREDICATE_NORMAL;
 p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2);
+p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2, 
insn.extra.splitSend);
   p->pop();
   afterMessage(insn, bti, tmp, btiTmp, jip0);
 }
@@ -1358,7 +1356,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst, nextSrc);
 p->pop();
-p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1);
+p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
 
 p->push();
@@ -1374,7 +1372,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst, nextSrc);
 p->pop();
-p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1);
+p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
   }
 
@@ -1801,7 +1799,7 @@ namespace gbe
   p->curr.execWidth = 8;
   p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
   p->ADD(msgAddr, msgAddr, msgSlmOff);
-  p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 2);
+  p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 2, false);
 }
 else
 {
@@ -1809,7 +1807,7 @@ namespace gbe
   p->MOV(msgData, threadData);
   p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
   p->ADD(msgAddr, msgAddr, msgSlmOff);
-  p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 1);
+  p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 1, false);
 }
 
 /* init partialData register, it will hold the final result */
diff --git a/backend/src/backend/gen8_encoder.cpp 
b/backend/src/backend/gen8_encoder.cpp
index 8f73346..2928943 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -268,7 +268,7 @@ namespace gbe
 return insn->bits3.ud;
   }
 
-  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, 
GenRegister bti, uint32_t elemNum) {
+  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, 
GenRegister bti, uint32_t elemNum, bool useSends) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 assert(elemNum >= 1 || elemNum <= 4);
 this->setHeader(insn);
diff --git a/backend/src/backend/gen8_encoder.hpp 
b/backend/src/backend/gen8_encoder.hpp
index f6a91a0..4afec0c 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -47,7 +47,7 @@ namespace gbe
 virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
 virtual void ATOMICA64(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
-virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
+virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum, bool useSends);
 virtual vo

Re: [Beignet] [PATCH V2] Backend: Initialize the extra value for selection instruction

2016-12-08 Thread Guo, Yejun
good catch, looks fine to me, thanks.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli 
Pan
Sent: Friday, December 09, 2016 11:09 AM
To: beignet@lists.freedesktop.org
Cc: Pan, Xiuli
Subject: [Beignet] [PATCH V2] Backend: Initialize the extra value for selection 
instruction

From: Pan Xiuli 

If we do not initialize extra, we may get random results when only some bits of
extra are used, e.g. splitSend.
V2: Refine the value to be uint64_t to make sure all bits are set.

Signed-off-by: Pan Xiuli 
---
 backend/src/backend/gen_insn_selection.cpp | 2 +-  
backend/src/backend/gen_insn_selection.hpp | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 6624337..7fc22fa 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -169,7 +169,7 @@ namespace gbe
   SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, 
uint32_t src) :
 parent(NULL), opcode(op), dstNum(dst), srcNum(src)
   {
-extra.function = 0;
+extra.value = 0ul;
   }
 
   void SelectionInstruction::prepend(SelectionInstruction ) { diff --git 
a/backend/src/backend/gen_insn_selection.hpp 
b/backend/src/backend/gen_insn_selection.hpp
index 7ce2b94..1ba5253 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -157,6 +157,7 @@ namespace gbe
 uint16_t printfSize;
   };
   uint32_t workgroupOp;
+  uint64_t value;
 } extra;
 /*! Gen opcode */
 uint8_t opcode;
--
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 3/3] enable sends for typed write

2016-12-08 Thread Guo, Yejun
hi,

please ignore all my un-pushed patches.

I got a new idea after discussing with Xiuli: I'll add a function parameter
'useSends' for the encoder, so we can switch between the sends and send
instructions inside the encoder function even if all the message payloads are contiguous.

thanks
yejun

-Original Message-
From: Guo, Yejun 
Sent: Wednesday, December 07, 2016 7:10 PM
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: [PATCH 3/3] enable sends for typed write

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen9_encoder.cpp   | 20 +++
 backend/src/backend/gen9_encoder.hpp   |  1 +
 backend/src/backend/gen_context.cpp|  5 -
 backend/src/backend/gen_insn_selection.cpp | 31 -- 
 backend/src/backend/gen_insn_selection.hpp |  1 +
 5 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index b5be852..35fbcb9 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -144,6 +144,26 @@ namespace gbe
 }
   }
 
+  void Gen9Encoder::TYPED_WRITE(GenRegister header, GenRegister data, 
+ bool header_present, unsigned char bti)  {
+if (header.reg() == data.reg())
+  Gen8Encoder::TYPED_WRITE(header, data, header_present, bti);
+else {
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+  assert(header_present);
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), header, data);
+  gen9_insn->bits2.sends.src1_length = 4;   //src0_length: 
5(header+u+v+w+lod), src1_length: 4(data)
+
+  gen9_insn->bits2.sends.sel_reg32_desc = 0;
+  setTypedWriteMessage(insn, bti, GEN_TYPED_WRITE, 5, header_present);
+}
+  }
+
   unsigned Gen9Encoder::setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize)
   {
 uint32_t msg_length = 0;
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index 1c40b92..20f269f 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -49,6 +49,7 @@ namespace gbe
 bool isUniform);
 void setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, 
GenRegister src0, GenRegister src1);
 virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemNum);
+virtual void TYPED_WRITE(GenRegister header, GenRegister data, bool 
+ header_present, unsigned char bti);
 virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemNum);
 virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize);
 virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize); diff --git 
a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 302a65b..090470f 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2461,8 +2461,11 @@ namespace gbe
 
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction ) 
{
 const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+GenRegister data = ra->genReg(insn.src(5));
+if (!insn.extra.typedWriteSplitSend)
+  data = header;
 const uint32_t bti = insn.getbti();
-p->TYPED_WRITE(header, header, true, bti);
+p->TYPED_WRITE(header, data, true, bti);
   }
 
   static void calcGID(GenRegister& reg, GenRegister& tmp, int flag, int 
subFlag, int dim, GenContext *gc) diff --git 
a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 94c5e9e..44d7fbc 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2759,7 +2759,6 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
 uint32_t elemID = 0;
 uint32_t i;
 SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, 
msgNum);
-SelectionVector *msgVector = this->appendVector();;
 
 for( i = 0; i < msgNum; ++i, ++elemID)
   insn->src(elemID) = msgs[i];
@@ -2767,11 +2766,31 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
 insn->setbti(bti);
 insn->extra.msglen = msgNum;
 insn->extra.is3DWrite = is3D;
-// Sends require contiguous allocation
-msgVector->regNum = msgNum;
-msgVector->isSrc = 1;
-msgVector->offsetID = 0;
-msgVector->reg = >src(0);
+
+if (hasSends()) {
+  assert(msgNum == 9);
+  insn->extra.typedWriteSplitSend = 1;
+  //header + coords
+  SelectionVecto

[Beignet] [PATCH 2/3] change interface for TYPED_WRITE, preparing for sends

2016-12-07 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_context.cpp | 2 +-
 backend/src/backend/gen_encoder.cpp | 2 +-
 backend/src/backend/gen_encoder.hpp | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 798fac8..302a65b 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2462,7 +2462,7 @@ namespace gbe
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction ) 
{
 const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
 const uint32_t bti = insn.getbti();
-p->TYPED_WRITE(header, true, bti);
+p->TYPED_WRITE(header, header, true, bti);
   }
 
   static void calcGID(GenRegister& reg, GenRegister& tmp, int flag, int 
subFlag, int dim, GenContext *gc)
diff --git a/backend/src/backend/gen_encoder.cpp 
b/backend/src/backend/gen_encoder.cpp
index 49d93e8..3a4b936 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1257,7 +1257,7 @@ namespace gbe
   msg_type, vme_search_path_lut, lut_sub);
   }
 
-  void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned 
char bti)
+  void GenEncoder::TYPED_WRITE(GenRegister msg, GenRegister data, bool 
header_present, unsigned char bti)
   {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 uint32_t msg_type = GEN_TYPED_WRITE;
diff --git a/backend/src/backend/gen_encoder.hpp 
b/backend/src/backend/gen_encoder.hpp
index e5eb2e2..3e0a650 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -234,6 +234,7 @@ namespace gbe
 
 /*! TypedWrite instruction for texture */
 virtual void TYPED_WRITE(GenRegister header,
+ GenRegister data,
  bool header_present,
  unsigned char bti);
 /*! Extended math function (2 sources) */
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 1/3] refine code starting from header in typedwrite

2016-12-07 Thread Guo, Yejun
With this refinement, the virtual regs and physical regs will logically map
1:1, which helps the later work on the sends instruction.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 145 -
 1 file changed, 78 insertions(+), 67 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 6624337..94c5e9e 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -6734,86 +6734,97 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   {
 INLINE bool emitOne(Selection::Opaque , const 
ir::TypedWriteInstruction , bool ) const
 {
-  using namespace ir;
-  const uint32_t simdWidth = sel.ctx.getSimdWidth();
-  GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
-  const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
-  const uint32_t dim = insn.getSrcNum() - 4;
-
-  if (simdWidth == 16) {
-for(uint32_t i = 0; i < msgNum; i++)
-  msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-  } else {
-uint32_t valueID = 0;
-uint32_t msgID = 0;
-msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-for(; msgID < 1 + dim; msgID++, valueID++)
-  msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), 
insn.getCoordType());
-
-// fake v.
-if (dim < 2)
-  msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-// fake w.
-if (dim < 3)
-  msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-// LOD.
-msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-for(; valueID < insn.getSrcNum(); msgID++, valueID++)
-  msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
-  }
-
+  const GenRegister header = GenRegister::ud8grf(sel.reg(ir::FAMILY_REG));
   sel.push();
   sel.curr.predicate = GEN_PREDICATE_NONE;
   sel.curr.noMask = 1;
-  sel.MOV(msgs[0], GenRegister::immud(0));
+  sel.MOV(header, GenRegister::immud(0));
   sel.curr.execWidth = 1;
-
-  GenRegister channelEn = sel.getOffsetReg(msgs[0], 0, 7*4);
+  GenRegister channelEn = sel.getOffsetReg(header, 0, 7*4);
   // Enable all channels.
   sel.MOV(channelEn, GenRegister::immud(0x));
-  sel.curr.execWidth = 8;
-  // Set zero LOD.
-  if (simdWidth == 8)
-sel.MOV(msgs[4], GenRegister::immud(0));
-  else
-sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0));
   sel.pop();
 
+  const uint32_t simdWidth = sel.ctx.getSimdWidth();
+  if (simdWidth == 16)
+emitWithSimd16(sel, insn, markChildren, header);
+  else if (simdWidth == 8)
+emitWithSimd8(sel, insn, markChildren, header);
+  else
+assert(!"not supported");
+  return true;
+}
+
+INLINE bool emitWithSimd16(Selection::Opaque , const 
ir::TypedWriteInstruction , bool , const GenRegister& header) 
const
+{
+  using namespace ir;
+
+  GenRegister msgs[9]; // (header + U + V + W + LOD + 4)
+  msgs[0] = header;
+  for (uint32_t i = 1; i < 9; ++i) {
+//SIMD16 will be split into two SIMD8,
+//each virtual reg in msgs requires one physical reg with 8 DWORDs (32 
bytes),
+//so, declare with FAMILY_WORD, and the allocated size will be 
sizeof(WORD)*SIMD16 = 32 bytes
+msgs[i] = sel.selReg(sel.reg(FAMILY_WORD), TYPE_U32);
+  }
+
+  const uint32_t dims = insn.getSrcNum() - 4;
   uint32_t bti = insn.getImageIndex();
-  if (simdWidth == 8)
-sel.TYPED_WRITE(msgs, msgNum, bti, dim == 3);
-  else {
-sel.push();
-sel.curr.execWidth = 8;
-for( uint32_t quarter = 0; quarter < 2; quarter++)
-{
-  #define QUARTER_MOV0(msgs, msgid, src) \
-sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], 
GEN_TYPE_UD), msgid % 2), \
-GenRegister::Qn(src, quarter))
-
-  #define QUARTER_MOV1(msgs, msgid, src) \
-  sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], 
src.type), msgid % 2), \
-  GenRegister::Qn(src, quarter))
-  sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : 
GEN_COMPRESSION_Q2;
-  // Set U,V,W
-  QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), 
insn.getCoordType()));
-  if (dim > 1)
-QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), 
insn.getCoordType()));
-  if (dim > 2)
-QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), 
insn.getCoordType()));
-  // Set R, G, B, A
-  QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(dim), 
insn.getSrcType()));
-  QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(dim + 1), 
insn.getSrcType()));
-   

[Beignet] [PATCH] enable sends for skl

2016-12-06 Thread Guo, Yejun
The previous code that was expected to enable sends for SKL actually enabled it for KBL.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 27f3059..6624337 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2728,6 +2728,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
 this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
 this->opaque->setSlowByteGather(false);
 this->opaque->setHasHalfType(true);
+this->opaque->setHasSends(true);
 opt_features = SIOF_LOGICAL_SRCMOD;
   }
 
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/2] Backend: Add RegisterFamily for ir

2016-12-05 Thread Guo, Yejun
This [PATCH 1/2] looks good; it helps the logical 1:1 mapping between virtual
registers and physical registers.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli 
Pan
Sent: Thursday, November 24, 2016 5:53 PM
To: beignet@lists.freedesktop.org
Cc: Pan, Xiuli
Subject: [Beignet] [PATCH 1/2] Backend: Add RegisterFamily for ir

From: Pan Xiuli 

We may need some bigger family like OWORD or HWORD and 32 word will be a
reg. This can be used for tmp and header registers.

Signed-off-by: Pan Xiuli 
---
 backend/src/backend/gen_reg_allocation.cpp | 8 ++--
 backend/src/ir/register.cpp| 3 +++
 backend/src/ir/register.hpp| 8 ++--
 backend/src/ir/type.hpp| 4 ++--
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/backend/src/backend/gen_reg_allocation.cpp 
b/backend/src/backend/gen_reg_allocation.cpp
index 495d830..522d8fd 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -92,8 +92,12 @@ namespace gbe
   const bool isScalar = ctx.sel->isScalarReg(reg);
   const RegisterData regData = ctx.sel->getRegisterData(reg);
   const RegisterFamily family = regData.family;
-  const uint32_t typeSize = isScalar ? familyScalarSize[family] : 
familyVectorSize[family];
-  regSize = isScalar ? typeSize : ctx.getSimdWidth() * typeSize;
+  if (family == ir::FAMILY_REG)
+regSize = 32;
+  else {
+const uint32_t typeSize = isScalar ? familyScalarSize[family] : 
familyVectorSize[family];
+regSize = isScalar ? typeSize : ctx.getSimdWidth() * typeSize;
+  }
   if (regFamily != NULL)
 *regFamily = family;
 }
diff --git a/backend/src/ir/register.cpp b/backend/src/ir/register.cpp
index 8200c31..1e78722 100644
--- a/backend/src/ir/register.cpp
+++ b/backend/src/ir/register.cpp
@@ -35,6 +35,9 @@ namespace ir {
   case FAMILY_WORD: return out << "word";
   case FAMILY_DWORD: return out << "dword";
   case FAMILY_QWORD: return out << "qword";
+  case FAMILY_OWORD: return out << "oword";
+  case FAMILY_HWORD: return out << "hword";
+  case FAMILY_REG: return out << "reg";
 };
 return out;
   }
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index 11ab756..09af24e 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -45,11 +45,14 @@ namespace ir {
 FAMILY_BYTE  = 1,
 FAMILY_WORD  = 2,
 FAMILY_DWORD = 3,
-FAMILY_QWORD = 4
+FAMILY_QWORD = 4,
+FAMILY_OWORD = 5,
+FAMILY_HWORD = 6,
+FAMILY_REG   = 7
   };
 
   INLINE char getFamilyName(RegisterFamily family) {
-static char registerFamilyName[] = {'b', 'B', 'W', 'D', 'Q'};
+static char registerFamilyName[] = {'b', 'B', 'W', 'D', 'Q', 'O', 'H', 
'R'};
 return registerFamilyName[family];
   }
 
@@ -59,6 +62,7 @@ namespace ir {
   case FAMILY_WORD: return 2;
   case FAMILY_DWORD: return 4;
   case FAMILY_QWORD: return 8;
+  case FAMILY_REG: return 32;
   default: NOT_SUPPORTED;
 };
 return 0;
diff --git a/backend/src/ir/type.hpp b/backend/src/ir/type.hpp
index d528859..3ac758f 100644
--- a/backend/src/ir/type.hpp
+++ b/backend/src/ir/type.hpp
@@ -86,8 +86,8 @@ namespace ir {
   case FAMILY_WORD: return TYPE_U16;
   case FAMILY_DWORD: return TYPE_U32;
   case FAMILY_QWORD: return TYPE_U64;
-};
-return TYPE_U32;
+  default: return TYPE_U32;
+}
   }
 
 } /* namespace ir */
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] save host_ptr when create sub buffer from CL_MEM_ALLOC_HOST_PTR

2016-11-30 Thread Guo, Yejun
it fixes issue at https://bugs.freedesktop.org/show_bug.cgi?id=98490

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 src/cl_mem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index 798daaf..afce315 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -636,7 +636,7 @@ cl_mem_new_sub_buffer(cl_mem buffer,
   mem->bo = buffer->bo;
   mem->size = info->size;
   sub_buf->sub_offset = info->origin;
-  if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & 
CL_MEM_COPY_HOST_PTR) {
+  if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & 
CL_MEM_COPY_HOST_PTR || buffer->flags & CL_MEM_ALLOC_HOST_PTR) {
 mem->host_ptr = buffer->host_ptr;
   }
 
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH V2] disable CMRT as default, since no real case reported

2016-11-30 Thread Guo, Yejun
and this feature also sometimes introduces build issue.

v2: add option INVOKE_CMRT to enable CMRT if necessary
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 CMakeLists.txt | 4 
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 713cfa9..503c609 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -176,10 +176,14 @@ ELSE(DRM_INTEL_FOUND)
 ENDIF(DRM_INTEL_FOUND)
 
 # CMRT
+#disable CMRT as default, since we do not see real case,
+#but see build issue of this feature
+IF(INVOKE_CMRT)
 pkg_check_modules(CMRT libcmrt)
 IF(CMRT_FOUND)
 INCLUDE_DIRECTORIES(${CMRT_INCLUDE_DIRS})
 ENDIF(CMRT_FOUND)
+ENDIF(INVOKE_CMRT)
 
 # Threads
 Find_Package(Threads)
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] disable CMRT as default, since no real case reported

2016-11-30 Thread Guo, Yejun
sure, let me send a v2 patch to add a new cmake option

-Original Message-
From: Yang, Rong R 
Sent: Wednesday, November 30, 2016 4:33 PM
To: Guo, Yejun; beignet@lists.freedesktop.org
Subject: RE: [Beignet] [PATCH] disable CMRT as default, since no real case 
reported

If you disable it, can you add an explanation to the README or add a CMake option?

> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf 
> Of Guo, Yejun
> Sent: Monday, November 21, 2016 10:55
> To: beignet@lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH] disable CMRT as default, since no real 
> case reported
> 
> ping for review, thanks.
> 
> -----Original Message-
> From: Guo, Yejun
> Sent: Tuesday, October 25, 2016 3:33 PM
> To: beignet@lists.freedesktop.org
> Cc: Guo, Yejun
> Subject: [PATCH] disable CMRT as default, since no real case reported
> 
> and this feature also sometimes introduces build issue.
> 
> Signed-off-by: Guo, Yejun <yejun@intel.com>
> ---
>  CMakeLists.txt | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/CMakeLists.txt b/CMakeLists.txt index d839f3f..039f9cd 
> 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -170,7 +170,9 @@ ELSE(DRM_INTEL_FOUND)
>  ENDIF(DRM_INTEL_FOUND)
> 
>  # CMRT
> -pkg_check_modules(CMRT libcmrt)
> +#disable CMRT as default, since we do not see real case, #while see 
> +build issue of this feature #pkg_check_modules(CMRT libcmrt)
>  IF(CMRT_FOUND)
>  INCLUDE_DIRECTORIES(${CMRT_INCLUDE_DIRS})
>  ENDIF(CMRT_FOUND)
> --
> 1.9.1
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] add sends for atomic operation, only for ocl 1.2

2016-11-29 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen8_encoder.cpp   |  2 +-
 backend/src/backend/gen8_encoder.hpp   |  2 +-
 backend/src/backend/gen9_encoder.cpp   | 26 ++
 backend/src/backend/gen9_encoder.hpp   |  1 +
 backend/src/backend/gen_context.cpp| 20 ++--
 backend/src/backend/gen_encoder.cpp|  4 ++--
 backend/src/backend/gen_encoder.hpp|  2 +-
 backend/src/backend/gen_insn_selection.cpp | 26 +-
 8 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/backend/src/backend/gen8_encoder.cpp 
b/backend/src/backend/gen8_encoder.cpp
index 8f73346..a29e59d 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -153,7 +153,7 @@ namespace gbe
 return gen8_insn->bits3.ud;
   }
 
-  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum) {
+  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister data, GenRegister bti, uint32_t srcNum) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 
 this->setHeader(insn);
diff --git a/backend/src/backend/gen8_encoder.hpp 
b/backend/src/backend/gen8_encoder.hpp
index f6a91a0..1e8ce36 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -44,7 +44,7 @@ namespace gbe
 virtual void F16TO32(GenRegister dest, GenRegister src0);
 virtual void F32TO16(GenRegister dest, GenRegister src0);
 virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
-virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
+virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, 
GenRegister data, GenRegister bti, uint32_t srcNum);
 virtual void ATOMICA64(GenRegister dst, uint32_t function, GenRegister 
src, GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
 virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index b5be852..2d7e572 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -190,4 +190,30 @@ namespace gbe
 gen9_insn->bits2.sends.sel_reg32_desc = 1;
 }
   }
+
+  void Gen9Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister 
addr, GenRegister data, GenRegister bti, uint32_t srcNum)
+  {
+if (addr.reg() == data.reg())
+  Gen8Encoder::ATOMIC(dst, function, addr, data, bti, srcNum);
+else {
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+  setSendsOperands(gen9_insn, dst, addr, data);
+  if (this->curr.execWidth == 8)
+gen9_insn->bits2.sends.src1_length = srcNum - 1;
+  else if (this->curr.execWidth == 16)
+gen9_insn->bits2.sends.src1_length = 2 * (srcNum - 1);
+  else
+assert(!"unsupported");
+
+  if (bti.file == GEN_IMMEDIATE_VALUE) {
+gen9_insn->bits2.sends.sel_reg32_desc = 0;
+setAtomicMessageDesc(insn, function, bti.value.ud, 1);
+  } else
+gen9_insn->bits2.sends.sel_reg32_desc = 1;
+}
+  }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index 1c40b92..8b738cc 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -52,6 +52,7 @@ namespace gbe
 virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemNum);
 virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize);
 virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize);
+virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, 
GenRegister data, GenRegister bti, uint32_t srcNum);
   };
 }
 #endif /* __GBE_GEN9_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 9505592..0a76209 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1893,26 +1893,34 @@ namespace gbe
   }
 
   void GenContext::emitAtomicInstruction(const SelectionInstruction ) {
-const GenRegister src = ra->genReg(insn.src(0));
+const GenRegister addr = ra->genReg(insn.src(0));
 const GenRegister dst = ra->genReg(insn.dst(0));
 const uint32_t function = insn.extra.function;
 unsigned srcNum = insn.extra.elem;
 
+GenRegister da

[Beignet] [PATCH] support sends for long write

2016-11-28 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 28 +++-
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 8090250..9722423 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -1594,7 +1594,6 @@ namespace gbe
   // dst: srcNum, (flagTemp)
   // src: srcNum, addr, srcNum, bti.
   insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum*2 + 2);
-  vector = this->appendVector();
 
   for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
 insn->src(elemID) = src[elemID];
@@ -1615,10 +1614,29 @@ namespace gbe
   }
   insn->extra.elem = srcNum;
 
-  vector->regNum = srcNum + 1;
-  vector->offsetID = srcNum;
-  vector->reg = >src(srcNum);
-  vector->isSrc = 1;
+  if (hasSends()) {
+insn->extra.splitSend = 1;
+
+//addr regs
+vector = this->appendVector();
+vector->regNum = 1;
+vector->offsetID = srcNum;
+vector->reg = >src(srcNum);
+vector->isSrc = 1;
+
+//data regs
+vector = this->appendVector();
+vector->regNum = srcNum;
+vector->offsetID = srcNum+1;
+vector->reg = >src(srcNum+1);
+vector->isSrc = 1;
+  } else {
+vector = this->appendVector();
+vector->regNum = srcNum + 1;
+vector->offsetID = srcNum;
+vector->reg = >src(srcNum);
+vector->isSrc = 1;
+  }
 }
 
 if (bti.file != GEN_IMMEDIATE_VALUE) {
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 1/2] move function setDPByteScatterGather into class GenEncoder

2016-11-28 Thread Guo, Yejun
setDPByteScatterGather will be reused by gen9 sends. As for the
same function in gen8encoder, just leave it until the reuse case
appears (for now, just change the function name to pass the build).

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen8_encoder.cpp |  6 +++---
 backend/src/backend/gen_encoder.cpp  | 15 ++-
 backend/src/backend/gen_encoder.hpp  |  2 ++
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/backend/src/backend/gen8_encoder.cpp 
b/backend/src/backend/gen8_encoder.cpp
index 4239e84..8f73346 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -84,7 +84,7 @@ namespace gbe
   NOT_SUPPORTED;
   }
 
-  static void setDPByteScatterGather(GenEncoder *p,
+  static void setDPByteScatterGatherA64(GenEncoder *p,
  GenNativeInstruction *insn,
  uint32_t bti,
  uint32_t block_size,
@@ -350,7 +350,7 @@ namespace gbe
 GBE_ASSERT(this->curr.execWidth == 8);
 const uint32_t msg_length = 2;
 const uint32_t response_length = 1;
-setDPByteScatterGather(this,
+setDPByteScatterGatherA64(this,
insn,
0xff,
0x0,
@@ -375,7 +375,7 @@ namespace gbe
 this->setSrc1(insn, GenRegister::immud(0));
 const uint32_t msg_length = 3;
 const uint32_t response_length = 0;
-setDPByteScatterGather(this,
+setDPByteScatterGatherA64(this,
insn,
0xff,
0x0,
diff --git a/backend/src/backend/gen_encoder.cpp 
b/backend/src/backend/gen_encoder.cpp
index b379419..637403c 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -237,8 +237,7 @@ namespace gbe
   NOT_SUPPORTED;
   }
 
-  static void setDPByteScatterGather(GenEncoder *p,
- GenNativeInstruction *insn,
+  void GenEncoder::setDPByteScatterGather(GenNativeInstruction *insn,
  uint32_t bti,
  uint32_t elem_size,
  uint32_t msg_type,
@@ -246,13 +245,13 @@ namespace gbe
  uint32_t response_length)
   {
 const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
-p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+setMessageDescriptor(insn, sfid, msg_length, response_length);
 insn->bits3.gen7_byte_rw.msg_type = msg_type;
 insn->bits3.gen7_byte_rw.bti = bti;
 insn->bits3.gen7_byte_rw.data_size = elem_size;
-if (p->curr.execWidth == 8)
+if (curr.execWidth == 8)
   insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD8;
-else if (p->curr.execWidth == 16)
+else if (curr.execWidth == 16)
   insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD16;
 else
   NOT_SUPPORTED;
@@ -472,8 +471,7 @@ namespace gbe
   response_length = 2;
 } else
   NOT_IMPLEMENTED;
-setDPByteScatterGather(this,
-   insn,
+setDPByteScatterGather(insn,
bti,
elemSize,
GEN7_BYTE_GATHER,
@@ -515,8 +513,7 @@ namespace gbe
 } else
   NOT_IMPLEMENTED;
 
-setDPByteScatterGather(this,
-   insn,
+setDPByteScatterGather(insn,
bti,
elemSize,
GEN7_BYTE_SCATTER,
diff --git a/backend/src/backend/gen_encoder.hpp 
b/backend/src/backend/gen_encoder.hpp
index e6f362b..b9446e6 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -247,6 +247,8 @@ namespace gbe
 
 // Helper functions to encode
 
+void setDPByteScatterGather(GenNativeInstruction *insn, uint32_t bti, 
uint32_t elem_size,
+ uint32_t msg_type, uint32_t msg_length, 
uint32_t response_length);
 virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, 
uint32_t rgba,
 uint32_t msg_type, uint32_t msg_length,
 uint32_t response_length);
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 2/2] add sends support for byte write

2016-11-28 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen9_encoder.cpp   | 47 ++
 backend/src/backend/gen9_encoder.hpp   |  2 ++
 backend/src/backend/gen_context.cpp| 15 +++---
 backend/src/backend/gen_encoder.cpp| 14 -
 backend/src/backend/gen_encoder.hpp|  4 ++-
 backend/src/backend/gen_insn_selection.cpp | 26 +
 6 files changed, 96 insertions(+), 12 deletions(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 68ab7ae..b5be852 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -143,4 +143,51 @@ namespace gbe
 gen9_insn->bits2.sends.sel_reg32_desc = 1;
 }
   }
+
+  unsigned Gen9Encoder::setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize)
+  {
+uint32_t msg_length = 0;
+uint32_t response_length = 0;
+if (this->curr.execWidth == 8) {
+  msg_length = 1;
+} else if (this->curr.execWidth == 16) {
+  msg_length = 2;
+} else
+  NOT_IMPLEMENTED;
+
+setDPByteScatterGather(insn,
+   bti,
+   elemSize,
+   GEN7_BYTE_SCATTER,
+   msg_length,
+   response_length);
+return insn->bits3.ud;
+  }
+
+  void Gen9Encoder::BYTE_SCATTER(GenRegister addr, GenRegister data, 
GenRegister bti, uint32_t elemSize)
+  {
+if (addr.reg() == data.reg())
+  Gen8Encoder::BYTE_SCATTER(addr, data, bti, elemSize);
+else {
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), addr, data);
+  if (this->curr.execWidth == 8)
+gen9_insn->bits2.sends.src1_length = 1;
+  else if (this->curr.execWidth == 16)
+gen9_insn->bits2.sends.src1_length = 2;
+  else
+assert(!"unsupported");
+
+  if (bti.file == GEN_IMMEDIATE_VALUE) {
+gen9_insn->bits2.sends.sel_reg32_desc = 0;
+setByteScatterSendsMessageDesc(insn, bti.value.ud, elemSize);
+  } else
+gen9_insn->bits2.sends.sel_reg32_desc = 1;
+}
+  }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index 5b6328d..1c40b92 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -50,6 +50,8 @@ namespace gbe
 void setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, 
GenRegister src0, GenRegister src1);
 virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemNum);
+virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize);
+virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize);
   };
 }
 #endif /* __GBE_GEN9_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 848933e..9505592 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2220,16 +2220,23 @@ namespace gbe
   }
 
   void GenContext::emitByteScatterInstruction(const SelectionInstruction 
) {
-const GenRegister src = ra->genReg(insn.src(0));
+const GenRegister addr = ra->genReg(insn.src(0));
+GenRegister data = ra->genReg(insn.src(1));
+if (!insn.extra.splitSend)
+  data = addr;
 const uint32_t elemSize = insn.extra.elem;
 const GenRegister bti = ra->genReg(insn.src(2));
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->BYTE_SCATTER(src, bti, elemSize);
+  p->BYTE_SCATTER(addr, data, bti, elemSize);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(0));
   const GenRegister btiTmp = ra->genReg(insn.dst(1));
-  unsigned desc = p->generateByteScatterMessageDesc(0, elemSize);
+  unsigned desc = 0;
+  if (insn.extra.splitSend)
+desc = p->generateByteScatterSendsMessageDesc(0, elemSize);
+  else
+desc = p->generateByteScatterMessageDesc(0, elemSize);
 
   unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
 
@@ -2237,7 +2244,7 @@ namespace gbe
   p->push();
 p->curr.predicate = GEN_PREDICATE_NORMAL;
 p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-p->BYTE_SCATTER(src, GenRegister::addr1(0), elemSize);
+p->BYTE_SCATTER(addr, data, GenRegister::addr1(0), elemSize);
   p->pop();
   afterMessage(insn, bti, tmp, btiTmp

[Beignet] [PATCH V3 2/3] prepare gen9 sends binary format and enable the ASM dump for sends

2016-11-28 Thread Guo, Yejun
v2: output dst register for sends
v3: check dst reg file when output dst register
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c | 31 ++--
 backend/src/backend/gen9_instruction.hpp  | 84 +++
 backend/src/backend/gen_defs.hpp  |  3 ++
 3 files changed, 114 insertions(+), 4 deletions(-)
 create mode 100644 backend/src/backend/gen9_instruction.hpp

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c 
b/backend/src/backend/gen/gen_mesa_disasm.c
index c30f168..56fda89 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -50,6 +50,7 @@
 
 #include "backend/gen_defs.hpp"
 #include "backend/gen7_instruction.hpp"
+#include "backend/gen9_instruction.hpp"
 #include "src/cl_device_data.h"
 
 static const struct {
@@ -104,6 +105,7 @@ static const struct {
 
   [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SENDS] = { .name = "sends", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
@@ -1411,7 +1413,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
 }
 
   } else if (OPCODE(inst) != GEN_OPCODE_SEND &&
- OPCODE(inst) != GEN_OPCODE_SENDC) {
+ OPCODE(inst) != GEN_OPCODE_SENDC &&
+ OPCODE(inst) != GEN_OPCODE_SENDS) {
 err |= control(file, "conditional modifier", conditional_modifier,
COND_DST_OR_MODIFIER(inst), NULL);
 if (COND_DST_OR_MODIFIER(inst))
@@ -1426,7 +1429,20 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
 string(file, ")");
   }
 
-  if (opcode[OPCODE(inst)].nsrc == 3) {
+  if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+const union Gen9NativeInstruction *gen9_insn = (const union 
Gen9NativeInstruction *)inst;
+pad(file, 16);
+if (gen9_insn->bits1.sends.dest_reg_file_0 == 0)
+  reg(file, GEN_ARCHITECTURE_REGISTER_FILE, 
gen9_insn->bits1.sends.dest_reg_nr);
+else
+  format(file, "g%d", gen9_insn->bits1.sends.dest_reg_nr);
+pad(file, 32);
+format(file, "g%d(addLen:%d)", gen9_insn->bits2.sends.src0_reg_nr, 
GENERIC_MSG_LENGTH(inst));
+pad(file, 48);
+format(file, "g%d(dataLen:%d)", gen9_insn->bits1.sends.src1_reg_nr, 
gen9_insn->bits2.sends.src1_length);
+pad(file, 64);
+format(file, "0x%08x", gen9_insn->bits3.ud);
+  } else if (opcode[OPCODE(inst)].nsrc == 3) {
 pad(file, 16);
 err |= dest_3src(file, inst);
 
@@ -1469,7 +1485,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
   }
 
   if (OPCODE(inst) == GEN_OPCODE_SEND ||
-  OPCODE(inst) == GEN_OPCODE_SENDC) {
+  OPCODE(inst) == GEN_OPCODE_SENDC ||
+  OPCODE(inst) == GEN_OPCODE_SENDS) {
 enum GenMessageTarget target = COND_DST_OR_MODIFIER(inst);
 
 newline(file);
@@ -1484,7 +1501,13 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
  target, );
 }
 
-if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
+int immbti = 0;
+if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+  const union Gen9NativeInstruction *gen9_insn = (const union 
Gen9NativeInstruction *)inst;
+  immbti = !(gen9_insn->bits2.sends.sel_reg32_desc);
+} else
+  immbti = (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE);
+if (immbti) {
   switch (target) {
 case GEN_SFID_VIDEO_MOTION_EST:
   format(file, " (bti: %d, msg_type: %d)",
diff --git a/backend/src/backend/gen9_instruction.hpp 
b/backend/src/backend/gen9_instruction.hpp
new file mode 100644
index 000..16114ca
--- /dev/null
+++ b/backend/src/backend/gen9_instruction.hpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see

[Beignet] [PATCH V3 3/3] support sends (split send) for untyped write

2016-11-28 Thread Guo, Yejun
sends is a new instruction, available starting from gen9, that splits the
address and data registers for a write. Register pressure can be reduced
since they no longer need to be contiguous.

More patches for sends will be sent out.

We can choose send or sends based on hasSends() in the selection stage;
it is only enabled by default for skylake for now.

v2: add function setSendsOperands
v3: reuse function setDPUntypedRW
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen75_encoder.cpp  |  2 +-
 backend/src/backend/gen75_encoder.hpp  |  2 +-
 backend/src/backend/gen8_context.cpp   | 21 +---
 backend/src/backend/gen8_encoder.cpp   |  2 +-
 backend/src/backend/gen8_encoder.hpp   |  2 +-
 backend/src/backend/gen9_encoder.cpp   | 77 ++
 backend/src/backend/gen9_encoder.hpp   |  4 +-
 backend/src/backend/gen_context.cpp| 41 +---
 backend/src/backend/gen_encoder.cpp| 14 +-
 backend/src/backend/gen_encoder.hpp|  4 +-
 backend/src/backend/gen_insn_selection.cpp | 22 -
 backend/src/backend/gen_insn_selection.hpp |  1 +
 12 files changed, 159 insertions(+), 33 deletions(-)

diff --git a/backend/src/backend/gen75_encoder.cpp 
b/backend/src/backend/gen75_encoder.cpp
index fc37991..9cafaa7 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -199,7 +199,7 @@ namespace gbe
 return insn->bits3.ud;
   }
 
-  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t 
elemNum) {
+  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, 
GenRegister bti, uint32_t elemNum) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 assert(elemNum >= 1 || elemNum <= 4);
 this->setHeader(insn);
diff --git a/backend/src/backend/gen75_encoder.hpp 
b/backend/src/backend/gen75_encoder.hpp
index d06f393..517afff 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -44,7 +44,7 @@ namespace gbe
 virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
 virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
-virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t 
elemNum);
+virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual void setHeader(GenNativeInstruction *insn);
 virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, 
uint32_t rgba,
uint32_t msg_type, uint32_t msg_length, uint32_t 
response_length);
diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index 71c54fb..95b1013 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -968,6 +968,9 @@ namespace gbe
 GBE_ASSERT(elemNum == 1);
 const GenRegister addr = ra->genReg(insn.src(elemNum));
 const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
+GenRegister data = ra->genReg(insn.src(elemNum+1));
+if (!insn.extra.splitSend)
+  data = addr;
 
 /* Because BDW's store and load send instructions for 64 bits require the 
bti to be surfaceless,
which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -978,11 +981,15 @@ namespace gbe
 }
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->UNTYPED_WRITE(addr, bti, elemNum*2);
+  p->UNTYPED_WRITE(addr, data, bti, elemNum*2);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(elemNum));
   const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
-  unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
+  unsigned desc = 0;
+  if (insn.extra.splitSend)
+desc = p->generateUntypedWriteSendsMessageDesc(0, elemNum*2);
+  else
+desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
 
   unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
 
@@ -990,7 +997,7 @@ namespace gbe
   p->push();
 p->curr.predicate = GEN_PREDICATE_NORMAL;
 p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
+p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2);
   p->pop();
   afterMessage(insn, bti, tmp, btiTmp, jip0);
 }
@@ -1351,7 +1358,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst, nextSrc);
 p->pop();
-p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
 
 p->push();
@@ -1367,7 +1374,7 @@ namespace gbe
   nextDs

[Beignet] [PATCH 1/3] do not touch src1 when setting instruction header

2016-11-28 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen9_encoder.cpp | 1 +
 backend/src/backend/gen_encoder.cpp  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 80df50d..e66ae08 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -60,6 +60,7 @@ namespace gbe
  this->setHeader(insn);
  this->setDst(insn, dest);
  this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
  setSamplerMessage(insn, bti, sampler, msg_type,
response_length, msg_length,
header_present,
diff --git a/backend/src/backend/gen_encoder.cpp 
b/backend/src/backend/gen_encoder.cpp
index a69adc7..060d65f 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -203,7 +203,6 @@ namespace gbe
 unsigned msg_length, unsigned 
response_length,
 bool header_present, bool 
end_of_thread)
   {
- setSrc1(inst, GenRegister::immud(0));
  inst->bits3.generic_gen5.header_present = header_present;
  inst->bits3.generic_gen5.response_length = response_length;
  inst->bits3.generic_gen5.msg_length = msg_length;
@@ -1178,6 +1177,7 @@ namespace gbe
  this->setHeader(insn);
  this->setDst(insn, dest);
  this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
  setSamplerMessage(insn, bti, sampler, msg_type,
response_length, msg_length,
header_present,
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] revert clCreateCommandQueue* from ocl2.0 back to 1.2 in utests

2016-11-27 Thread Guo, Yejun
Since utests is designed to be a general stand-alone application,
it is better to use the ocl1.2 version of the API; otherwise, there are
link errors on some platforms that only provide ocl1.2.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 utests/profiling_exec.cpp | 3 +--
 utests/utest_helper.cpp   | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/utests/profiling_exec.cpp b/utests/profiling_exec.cpp
index 1859134..437a628 100644
--- a/utests/profiling_exec.cpp
+++ b/utests/profiling_exec.cpp
@@ -52,8 +52,7 @@ static void profiling_exec(void)
 
 
 /* Because the profiling prop, we can not use default queue. */
-const cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, 
CL_QUEUE_PROFILING_ENABLE, 0};
-profiling_queue = clCreateCommandQueueWithProperties(ctx, device, 
properties, );
+profiling_queue = clCreateCommandQueue(ctx, device, 
CL_QUEUE_PROFILING_ENABLE, );
 OCL_ASSERT(status == CL_SUCCESS);
 
 OCL_CREATE_KERNEL("compiler_fabs");
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index b713b79..f1a4bdd 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -511,9 +511,9 @@ cl_ocl_init(void)
 cl_test_channel_type_string(fmt[i].image_channel_data_type));
 
   /* We are going to push NDRange kernels here */
-  queue = clCreateCommandQueueWithProperties(ctx, device, 0, );
+  queue = clCreateCommandQueue(ctx, device, 0, );
   if (status != CL_SUCCESS) {
-fprintf(stderr, "error calling clCreateCommandQueueWithProperties\n");
+fprintf(stderr, "error calling clCreateCommandQueue\n");
 goto error;
   }
 
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 3/3] support sends (split send) for untyped write

2016-11-27 Thread Guo, Yejun
sends is a new instruction, available starting from gen9, that splits the
address and data registers for a write. Register pressure can be reduced
since they no longer need to be contiguous.

More patches for sends will be sent out.

We can choose send or sends based on hasSends() in the selection stage;
it is only enabled by default for skylake for now.

v2: add function setSendsOperands
v3: reuse function setDPUntypedRW
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen75_encoder.cpp  |  2 +-
 backend/src/backend/gen75_encoder.hpp  |  2 +-
 backend/src/backend/gen8_context.cpp   | 21 +---
 backend/src/backend/gen8_encoder.cpp   |  2 +-
 backend/src/backend/gen8_encoder.hpp   |  2 +-
 backend/src/backend/gen9_encoder.cpp   | 77 ++
 backend/src/backend/gen9_encoder.hpp   |  4 +-
 backend/src/backend/gen_context.cpp| 41 +---
 backend/src/backend/gen_encoder.cpp| 14 +-
 backend/src/backend/gen_encoder.hpp|  4 +-
 backend/src/backend/gen_insn_selection.cpp | 22 -
 backend/src/backend/gen_insn_selection.hpp |  1 +
 12 files changed, 159 insertions(+), 33 deletions(-)

diff --git a/backend/src/backend/gen75_encoder.cpp 
b/backend/src/backend/gen75_encoder.cpp
index fc37991..9cafaa7 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -199,7 +199,7 @@ namespace gbe
 return insn->bits3.ud;
   }
 
-  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t 
elemNum) {
+  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, 
GenRegister bti, uint32_t elemNum) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 assert(elemNum >= 1 || elemNum <= 4);
 this->setHeader(insn);
diff --git a/backend/src/backend/gen75_encoder.hpp 
b/backend/src/backend/gen75_encoder.hpp
index d06f393..517afff 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -44,7 +44,7 @@ namespace gbe
 virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
 virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
-virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t 
elemNum);
+virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual void setHeader(GenNativeInstruction *insn);
 virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, 
uint32_t rgba,
uint32_t msg_type, uint32_t msg_length, uint32_t 
response_length);
diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index 71c54fb..95b1013 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -968,6 +968,9 @@ namespace gbe
 GBE_ASSERT(elemNum == 1);
 const GenRegister addr = ra->genReg(insn.src(elemNum));
 const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
+GenRegister data = ra->genReg(insn.src(elemNum+1));
+if (!insn.extra.splitSend)
+  data = addr;
 
 /* Because BDW's store and load send instructions for 64 bits require the 
bti to be surfaceless,
which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -978,11 +981,15 @@ namespace gbe
 }
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->UNTYPED_WRITE(addr, bti, elemNum*2);
+  p->UNTYPED_WRITE(addr, data, bti, elemNum*2);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(elemNum));
   const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
-  unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
+  unsigned desc = 0;
+  if (insn.extra.splitSend)
+desc = p->generateUntypedWriteSendsMessageDesc(0, elemNum*2);
+  else
+desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
 
   unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
 
@@ -990,7 +997,7 @@ namespace gbe
   p->push();
 p->curr.predicate = GEN_PREDICATE_NORMAL;
 p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
+p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2);
   p->pop();
   afterMessage(insn, bti, tmp, btiTmp, jip0);
 }
@@ -1351,7 +1358,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst, nextSrc);
 p->pop();
-p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
 
 p->push();
@@ -1367,7 +1374,7 @@ namespace gbe
   nextDs

[Beignet] [PATCH 2/3] prepare gen9 sends binary format and enable the ASM dump for sends

2016-11-27 Thread Guo, Yejun
v2: output dst register for sends
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c | 31 ++--
 backend/src/backend/gen9_instruction.hpp  | 84 +++
 backend/src/backend/gen_defs.hpp  |  3 ++
 3 files changed, 114 insertions(+), 4 deletions(-)
 create mode 100644 backend/src/backend/gen9_instruction.hpp

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c 
b/backend/src/backend/gen/gen_mesa_disasm.c
index c30f168..ebb1a57 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -50,6 +50,7 @@
 
 #include "backend/gen_defs.hpp"
 #include "backend/gen7_instruction.hpp"
+#include "backend/gen9_instruction.hpp"
 #include "src/cl_device_data.h"
 
 static const struct {
@@ -104,6 +105,7 @@ static const struct {
 
   [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SENDS] = { .name = "sends", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
@@ -1411,7 +1413,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
 }
 
   } else if (OPCODE(inst) != GEN_OPCODE_SEND &&
- OPCODE(inst) != GEN_OPCODE_SENDC) {
+ OPCODE(inst) != GEN_OPCODE_SENDC &&
+ OPCODE(inst) != GEN_OPCODE_SENDS) {
 err |= control(file, "conditional modifier", conditional_modifier,
COND_DST_OR_MODIFIER(inst), NULL);
 if (COND_DST_OR_MODIFIER(inst))
@@ -1426,7 +1429,20 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
 string(file, ")");
   }
 
-  if (opcode[OPCODE(inst)].nsrc == 3) {
+  if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+const union Gen9NativeInstruction *gen9_insn = (const union 
Gen9NativeInstruction *)inst;
+pad(file, 16);
+if (gen9_insn->bits1.sends.dest_reg_nr == 0)
+  string(file, "null");
+else
+  format(file, "g%d", gen9_insn->bits1.sends.dest_reg_nr);
+pad(file, 32);
+format(file, "g%d(addLen:%d)", gen9_insn->bits2.sends.src0_reg_nr, 
GENERIC_MSG_LENGTH(inst));
+pad(file, 48);
+format(file, "g%d(dataLen:%d)", gen9_insn->bits1.sends.src1_reg_nr, 
gen9_insn->bits2.sends.src1_length);
+pad(file, 64);
+format(file, "0x%08x", gen9_insn->bits3.ud);
+  } else if (opcode[OPCODE(inst)].nsrc == 3) {
 pad(file, 16);
 err |= dest_3src(file, inst);
 
@@ -1469,7 +1485,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
   }
 
   if (OPCODE(inst) == GEN_OPCODE_SEND ||
-  OPCODE(inst) == GEN_OPCODE_SENDC) {
+  OPCODE(inst) == GEN_OPCODE_SENDC ||
+  OPCODE(inst) == GEN_OPCODE_SENDS) {
 enum GenMessageTarget target = COND_DST_OR_MODIFIER(inst);
 
 newline(file);
@@ -1484,7 +1501,13 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
  target, );
 }
 
-if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
+int immbti = 0;
+if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+  const union Gen9NativeInstruction *gen9_insn = (const union 
Gen9NativeInstruction *)inst;
+  immbti = !(gen9_insn->bits2.sends.sel_reg32_desc);
+} else
+  immbti = (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE);
+if (immbti) {
   switch (target) {
 case GEN_SFID_VIDEO_MOTION_EST:
   format(file, " (bti: %d, msg_type: %d)",
diff --git a/backend/src/backend/gen9_instruction.hpp 
b/backend/src/backend/gen9_instruction.hpp
new file mode 100644
index 000..16114ca
--- /dev/null
+++ b/backend/src/backend/gen9_instruction.hpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Guo, Yejun <yejun@intel.com>
+ */
+
+
+#ifndef 

[Beignet] [PATCH 1/3] do not touch src1 when setting instruction header

2016-11-27 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen9_encoder.cpp | 1 +
 backend/src/backend/gen_encoder.cpp  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 80df50d..e66ae08 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -60,6 +60,7 @@ namespace gbe
  this->setHeader(insn);
  this->setDst(insn, dest);
  this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
  setSamplerMessage(insn, bti, sampler, msg_type,
response_length, msg_length,
header_present,
diff --git a/backend/src/backend/gen_encoder.cpp 
b/backend/src/backend/gen_encoder.cpp
index a69adc7..060d65f 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -203,7 +203,6 @@ namespace gbe
 unsigned msg_length, unsigned 
response_length,
 bool header_present, bool 
end_of_thread)
   {
- setSrc1(inst, GenRegister::immud(0));
  inst->bits3.generic_gen5.header_present = header_present;
  inst->bits3.generic_gen5.response_length = response_length;
  inst->bits3.generic_gen5.msg_length = msg_length;
@@ -1178,6 +1177,7 @@ namespace gbe
  this->setHeader(insn);
  this->setDst(insn, dest);
  this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
  setSamplerMessage(insn, bti, sampler, msg_type,
response_length, msg_length,
header_present,
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] fix build issue when HAS_BO_SET_SOFTPIN is false

2016-11-24 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 src/cl_mem.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index 2b783b9..6a4729d 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -801,9 +801,10 @@ void* cl_mem_svm_allocate(cl_context ctx, cl_svm_mem_flags 
flags,
 
   /* Append the svm in the context buffer list */
   cl_context_add_mem(ctx, mem);
-#endif
-
   return ptr;
+#else
+  return NULL;
+#endif
 }
 
 void
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 2/4] support sends (split send) for untyped write

2016-11-24 Thread Guo, Yejun
please hold on my v2 patch.

let me first try to refine code starting from GenEncoder::setMessageDescriptor. 

-Original Message-
From: Song, Ruiling 
Sent: Thursday, November 24, 2016 8:15 PM
To: Guo, Yejun; beignet@lists.freedesktop.org
Subject: RE: [Beignet] [PATCH 2/4] support sends (split send) for untyped write



> -Original Message-
> From: Guo, Yejun
> Sent: Thursday, November 24, 2016 10:55 AM
> To: Song, Ruiling <ruiling.s...@intel.com>; 
> beignet@lists.freedesktop.org
> Subject: RE: [Beignet] [PATCH 2/4] support sends (split send) for 
> untyped write
> 
> > +  unsigned
> > Gen9Encoder::setUntypedWriteSendsMessageDesc(GenNativeInstruction
> > *insn, unsigned bti, unsigned elemNum)
> The message desc encoding is same for send and sends, what about 
> calling existing function?
> 66   void Gen8Encoder::setDPUntypedRW(GenNativeInstruction *insn,
>  67 uint32_t bti,
>  68 uint32_t rgba,
>  69 uint32_t msg_type,
>  70 uint32_t msg_length,
>  71 uint32_t response_length)
> [yejun] there is a change, 
> gen9_insn->bits3.sends_untyped_rw.src0_length is 1 for SIMD8 and 2 for SIMD16, 
> not the msg_length.
>I don't want to reuse the Gen8Encoder function with 
> msg_length=1 since it is different at concept level.
If you look at it from another perspective, the message descriptor describes src0_length 
for both send and sends.
They are unified at the concept level.
>   Another reason is that setDPUntypedRW calls 
> setMessageDescriptor which changes other bits, it is not a pure function to 
> set the msg desc.
I agree with you that setDPUntypedRW is badly designed. But the correct approach 
is to refine the function and reuse it.
Just move the sfid setting out of the function; then things will be clearer.
setMessageDescriptor() also calls setSrc1(); this should be moved out as well.
> 
> 
> > +  gen9_insn->bits1.sends.dest_reg_file_0 = 1;//01 for GRF
> Generally, we should set sends destination to null register, so it is ARF.
> [yejun] I'm open to use GRF or ARF, but does this bit really matter, 
> is there a description mentions null register is ARF in hw spec?
Check the chapter architecture register file.
> 
> 
> > +  gen9_insn->bits2.sends.src0_subreg_nr = addr.subnr;
> Setting src0_subreg_nr here is meaningless, only the src0_subreg_nr[4] 
> bit left, I am not sure whether hw use it correctly.
> Generally the message payload register subnr should be 0. You can 
> remove above line, add an assert(addr.subnr == 0); [Yejun] Actually, 
> there are subreg_nr for src0 and dst, I'm not sure if 
> src0_subreg_nr[4] is a typo in spec. Anyway, add an assert will be a 
> good choice for unclear things.
The src0_subreg_nr[3:0] bits, i.e. bits[67:64], are already taken by ExDesc[9:6].
So you cannot set src0_subreg_nr as above.
> 
> And I would also suggest you define below functions to implement sends 
> encoding logic as sends has very different encoding.
> setSendsDst(nullreg);
> setSendsSrc0(src0);
> setSendsSrc1(src1);
> so that untyped_write() byte_scatter() typed_write can call these 
> functions instead of repeating same logic at every place.
> [Yejun] Actually, I'm refining the code locally with 
> setSendsOperand(dst, src0, src1). What's your opinion?
Separate would be better. I noticed you already sent new version, setting all 
operands together seems acceptable.

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Backend: Add help function to get a reg in selection

2016-11-23 Thread Guo, Yejun
It is tricky that the IR register is UD but is allocated as UW.

For this special case, header/tmp should be allocated with a fixed physical 
register size, regardless of the SIMD width. How about adding a new FAMILY at 
the IR level for such special cases?

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli 
Pan
Sent: Friday, November 18, 2016 12:56 PM
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun; Pan, Xiuli
Subject: [Beignet] [PATCH] Backend: Add help function to get a reg in selection

From: Pan Xiuli <xiuli@intel.com>

We can now get a register to use as the header or tmp register for send 
instructions and others. Also fix some older, incorrect attempts to get such a register.

Signed-off-by: Pan Xiuli <xiuli@intel.com>
---
 backend/src/backend/gen_context.cpp|  4 ++--
 backend/src/backend/gen_insn_selection.cpp | 28 
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index c38b7af..d872a70 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3697,7 +3697,7 @@ namespace gbe
 uint32_t type = ra->genReg(insn.src(1)).type;
 uint32_t typesize = typeSize(type);
 const uint32_t vec_size = insn.extra.elem;
-const GenRegister tmp = GenRegister::offset(header, 1);
+const GenRegister tmp = 
+ GenRegister::retype(ra->genReg(insn.dst(1)), GEN_TYPE_UD);
 const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
 GenRegister headeraddr;
 bool isA64 = insn.getbti() == 255;
@@ -3921,7 +3921,7 @@ namespace gbe
 const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), 
GEN_TYPE_D);
 const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), 
GEN_TYPE_D);
 const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), 
GEN_TYPE_UD);
-const GenRegister tmp = GenRegister::offset(header, 1);
+const GenRegister tmp = 
+ GenRegister::retype(ra->genReg(insn.dst(1)), GEN_TYPE_UD);
 GenRegister offsetx, offsety, blocksizereg;
 size_t vec_size = insn.extra.elem;
 uint32_t type = ra->genReg(insn.src(2)).type; diff --git 
a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index c14e0bc..ae523b0 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -432,6 +432,8 @@ namespace gbe
 SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t 
srcNum);
 /*! Return the selection register from the GenIR one */
 GenRegister selReg(ir::Register, ir::Type type = ir::TYPE_FLOAT) const;
+/*! Return one REG for tmp and header. */
+GenRegister selReg(void);
 /*! Compute the nth register part when using SIMD8 with Qn (n in 2,3,4) */
 GenRegister selRegQn(ir::Register, uint32_t quarter, ir::Type type = 
ir::TYPE_FLOAT) const;
 /*! Size of the stack (should be large enough) */ @@ -1201,6 +1203,16 @@ 
namespace gbe
 
 #undef SEL_REG
 
+  GenRegister Selection::Opaque::selReg(void) {
+const uint32_t simdWidth = ctx.getSimdWidth();
+if (simdWidth == 8)
+  return GenRegister::retype(GenRegister::f8grf(reg(ir::FAMILY_DWORD)), 
GEN_TYPE_UD);
+else if (simdWidth == 16)
+  return GenRegister::retype(GenRegister::f8grf(reg(ir::FAMILY_WORD)), 
GEN_TYPE_UD);
+GBE_ASSERT(false);
+return GenRegister();
+  }
+
   GenRegister Selection::Opaque::selRegQn(ir::Register reg, uint32_t q, 
ir::Type type) const {
 GenRegister sreg = this->selReg(reg, type);
 sreg.quarter = q;
@@ -4675,7 +4687,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   const uint32_t simdWidth = sel.ctx.getSimdWidth();
   const Type type = insn.getValueType();
   const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
-  const GenRegister header = 
GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+  const GenRegister header = sel.selReg();
   vector valuesVec;
   for(uint32_t i = 0; i < vec_size; i++)
 valuesVec.push_back(sel.selReg(insn.getValue(i), type)); @@ -4685,7 
+4697,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in 
program.cpp
   tmp_size = tmp_size > 4 ? 4 : tmp_size;
   vector tmpVec;
   for(uint32_t i = 0; i < tmp_size; i++)
-
tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), 
GEN_TYPE_UD));
+tmpVec.push_back(sel.selReg());
   sel.OBREAD([0], vec_size, address, header, SI, [0], 
tmp_size);
 }
 
@@ -5121,7 +5133,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   const uint32_t simdWidth = sel.ctx.getSimdWidth();
   const Type type = insn.getValueType();
   const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
-  const GenRegister header = 
G

[Beignet] [PATCH V2 4/4] add sends support for byte write

2016-11-23 Thread Guo, Yejun
v2: use function setSendsOperands
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen9_encoder.cpp   | 48 ++
 backend/src/backend/gen9_encoder.hpp   |  2 ++
 backend/src/backend/gen_context.cpp| 15 +++---
 backend/src/backend/gen_encoder.cpp| 14 -
 backend/src/backend/gen_encoder.hpp|  4 ++-
 backend/src/backend/gen_insn_selection.cpp | 26 
 6 files changed, 97 insertions(+), 12 deletions(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 867f3f1..f8338ef 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -141,4 +141,52 @@ namespace gbe
 gen9_insn->bits2.sends.sel_reg32_desc = 1;
 }
   }
+
+  unsigned Gen9Encoder::setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize)
+  {
+Gen9NativeInstruction *gen9_insn = >gen9_insn;
+gen9_insn->bits3.sends_byte_rw.header_present = 0;
+gen9_insn->bits3.sends_byte_rw.response_length = 0;
+gen9_insn->bits3.sends_byte_rw.end_of_thread = 0;
+gen9_insn->bits3.sends_byte_rw.msg_type = GEN7_BYTE_SCATTER;
+gen9_insn->bits3.sends_byte_rw.bti = bti;
+gen9_insn->bits3.sends_byte_rw.data_size = elemSize;
+
+if (this->curr.execWidth == 8) {
+  gen9_insn->bits3.sends_byte_rw.src0_length = 1;
+  gen9_insn->bits3.sends_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD8;
+} else if (this->curr.execWidth == 16) {
+  gen9_insn->bits3.sends_byte_rw.src0_length = 2;
+  gen9_insn->bits3.sends_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD16;
+}
+
+return gen9_insn->bits3.ud;
+  }
+
+  void Gen9Encoder::BYTE_SCATTER(GenRegister addr, GenRegister data, 
GenRegister bti, uint32_t elemSize)
+  {
+if (addr.reg() == data.reg())
+  Gen8Encoder::BYTE_SCATTER(addr, data, bti, elemSize);
+else {
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+  setSendsOperands(gen9_insn, GenRegister::null(), addr, data);
+  if (this->curr.execWidth == 8)
+gen9_insn->bits2.sends.src1_length = 1;
+  else if (this->curr.execWidth == 16)
+gen9_insn->bits2.sends.src1_length = 2;
+  else
+assert(!"unsupported");
+
+  if (bti.file == GEN_IMMEDIATE_VALUE) {
+gen9_insn->bits2.sends.sel_reg32_desc = 0;
+setByteScatterSendsMessageDesc(insn, bti.value.ud, elemSize);
+  } else
+gen9_insn->bits2.sends.sel_reg32_desc = 1;
+}
+  }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index 5b6328d..1c40b92 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -50,6 +50,8 @@ namespace gbe
 void setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, 
GenRegister src0, GenRegister src1);
 virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemNum);
+virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize);
+virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize);
   };
 }
 #endif /* __GBE_GEN9_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 848933e..9505592 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2220,16 +2220,23 @@ namespace gbe
   }
 
   void GenContext::emitByteScatterInstruction(const SelectionInstruction 
) {
-const GenRegister src = ra->genReg(insn.src(0));
+const GenRegister addr = ra->genReg(insn.src(0));
+GenRegister data = ra->genReg(insn.src(1));
+if (!insn.extra.splitSend)
+  data = addr;
 const uint32_t elemSize = insn.extra.elem;
 const GenRegister bti = ra->genReg(insn.src(2));
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->BYTE_SCATTER(src, bti, elemSize);
+  p->BYTE_SCATTER(addr, data, bti, elemSize);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(0));
   const GenRegister btiTmp = ra->genReg(insn.dst(1));
-  unsigned desc = p->generateByteScatterMessageDesc(0, elemSize);
+  unsigned desc = 0;
+  if (insn.extra.splitSend)
+desc = p->generateByteScatterSendsMessageDesc(0, elemSize);
+  else
+desc = p->generateByteScatterMessageDesc(0, elemSize);
 
   unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
 
@@ -2237,7 +2244,7 @@

[Beignet] [PATCH V2 2/4] support sends (split send) for untyped write

2016-11-23 Thread Guo, Yejun
sends is a new instruction, available starting from gen9, that splits the
address and data registers for writes; register pressure can be relaxed
since the two no longer need to be contiguous.

More patches for sends will be sent out.

We can choose send or sends based on hasSends() in the selection stage;
it is only enabled by default for Skylake for now.

v2: add function setSendsOperands()
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen75_encoder.cpp  |  2 +-
 backend/src/backend/gen75_encoder.hpp  |  2 +-
 backend/src/backend/gen8_context.cpp   | 21 ++---
 backend/src/backend/gen8_encoder.cpp   |  2 +-
 backend/src/backend/gen8_encoder.hpp   |  2 +-
 backend/src/backend/gen9_encoder.cpp   | 76 ++
 backend/src/backend/gen9_encoder.hpp   |  4 +-
 backend/src/backend/gen_context.cpp| 41 +---
 backend/src/backend/gen_encoder.cpp| 12 -
 backend/src/backend/gen_encoder.hpp|  4 +-
 backend/src/backend/gen_insn_selection.cpp | 22 -
 backend/src/backend/gen_insn_selection.hpp |  1 +
 12 files changed, 156 insertions(+), 33 deletions(-)

diff --git a/backend/src/backend/gen75_encoder.cpp 
b/backend/src/backend/gen75_encoder.cpp
index fc37991..9cafaa7 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -199,7 +199,7 @@ namespace gbe
 return insn->bits3.ud;
   }
 
-  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t 
elemNum) {
+  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, 
GenRegister bti, uint32_t elemNum) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 assert(elemNum >= 1 || elemNum <= 4);
 this->setHeader(insn);
diff --git a/backend/src/backend/gen75_encoder.hpp 
b/backend/src/backend/gen75_encoder.hpp
index d06f393..517afff 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -44,7 +44,7 @@ namespace gbe
 virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
 virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
-virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t 
elemNum);
+virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual void setHeader(GenNativeInstruction *insn);
 virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, 
uint32_t rgba,
uint32_t msg_type, uint32_t msg_length, uint32_t 
response_length);
diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index 71c54fb..95b1013 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -968,6 +968,9 @@ namespace gbe
 GBE_ASSERT(elemNum == 1);
 const GenRegister addr = ra->genReg(insn.src(elemNum));
 const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
+GenRegister data = ra->genReg(insn.src(elemNum+1));
+if (!insn.extra.splitSend)
+  data = addr;
 
 /* Because BDW's store and load send instructions for 64 bits require the 
bti to be surfaceless,
which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -978,11 +981,15 @@ namespace gbe
 }
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->UNTYPED_WRITE(addr, bti, elemNum*2);
+  p->UNTYPED_WRITE(addr, data, bti, elemNum*2);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(elemNum));
   const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
-  unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
+  unsigned desc = 0;
+  if (insn.extra.splitSend)
+desc = p->generateUntypedWriteSendsMessageDesc(0, elemNum*2);
+  else
+desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
 
   unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
 
@@ -990,7 +997,7 @@ namespace gbe
   p->push();
 p->curr.predicate = GEN_PREDICATE_NORMAL;
 p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
+p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2);
   p->pop();
   afterMessage(insn, bti, tmp, btiTmp, jip0);
 }
@@ -1351,7 +1358,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst, nextSrc);
 p->pop();
-p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
 
 p->push();
@@ -1367,7 +1374,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);

[Beignet] [PATCH V2 1/4] prepare gen9 sends binary format and enable the ASM dump for sends

2016-11-23 Thread Guo, Yejun
v2: output dst register for sends
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c |  31 +++--
 backend/src/backend/gen9_instruction.hpp  | 112 ++
 backend/src/backend/gen_defs.hpp  |   3 +
 3 files changed, 142 insertions(+), 4 deletions(-)
 create mode 100644 backend/src/backend/gen9_instruction.hpp

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c 
b/backend/src/backend/gen/gen_mesa_disasm.c
index c30f168..db63574 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -50,6 +50,7 @@
 
 #include "backend/gen_defs.hpp"
 #include "backend/gen7_instruction.hpp"
+#include "backend/gen9_instruction.hpp"
 #include "src/cl_device_data.h"
 
 static const struct {
@@ -104,6 +105,7 @@ static const struct {
 
   [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SENDS] = { .name = "sends", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
@@ -1411,7 +1413,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
 }
 
   } else if (OPCODE(inst) != GEN_OPCODE_SEND &&
- OPCODE(inst) != GEN_OPCODE_SENDC) {
+ OPCODE(inst) != GEN_OPCODE_SENDC &&
+ OPCODE(inst) != GEN_OPCODE_SENDS) {
 err |= control(file, "conditional modifier", conditional_modifier,
COND_DST_OR_MODIFIER(inst), NULL);
 if (COND_DST_OR_MODIFIER(inst))
@@ -1426,7 +1429,20 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
 string(file, ")");
   }
 
-  if (opcode[OPCODE(inst)].nsrc == 3) {
+  if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+const union Gen9NativeInstruction *gen9_insn = (const union 
Gen9NativeInstruction *)inst;
+pad(file, 16);
+if (gen9_insn->bits1.sends.dest_reg_nr == 0)
+  string(file, "null");
+else
+  format(file, "g%d", gen9_insn->bits1.sends.dest_reg_nr);
+pad(file, 32);
+format(file, "g%d(addLen:%d)", gen9_insn->bits2.sends.src0_reg_nr, 
gen9_insn->bits3.sends_untyped_rw.src0_length);
+pad(file, 48);
+format(file, "g%d(dataLen:%d)", gen9_insn->bits1.sends.src1_reg_nr, 
gen9_insn->bits2.sends.src1_length);
+pad(file, 64);
+format(file, "0x%08x", gen9_insn->bits3.ud);
+  } else if (opcode[OPCODE(inst)].nsrc == 3) {
 pad(file, 16);
 err |= dest_3src(file, inst);
 
@@ -1469,7 +1485,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
   }
 
   if (OPCODE(inst) == GEN_OPCODE_SEND ||
-  OPCODE(inst) == GEN_OPCODE_SENDC) {
+  OPCODE(inst) == GEN_OPCODE_SENDC ||
+  OPCODE(inst) == GEN_OPCODE_SENDS) {
 enum GenMessageTarget target = COND_DST_OR_MODIFIER(inst);
 
 newline(file);
@@ -1484,7 +1501,13 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
  target, );
 }
 
-if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
+int immbti = 0;
+if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+  const union Gen9NativeInstruction *gen9_insn = (const union 
Gen9NativeInstruction *)inst;
+  immbti = !(gen9_insn->bits2.sends.sel_reg32_desc);
+} else
+  immbti = (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE);
+if (immbti) {
   switch (target) {
 case GEN_SFID_VIDEO_MOTION_EST:
   format(file, " (bti: %d, msg_type: %d)",
diff --git a/backend/src/backend/gen9_instruction.hpp 
b/backend/src/backend/gen9_instruction.hpp
new file mode 100644
index 000..9d57f08
--- /dev/null
+++ b/backend/src/backend/gen9_instruction.hpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Guo, Yejun <yejun@intel.co

Re: [Beignet] [PATCH 2/4] support sends (split send) for untyped write

2016-11-23 Thread Guo, Yejun
> +  unsigned
> Gen9Encoder::setUntypedWriteSendsMessageDesc(GenNativeInstruction 
> *insn, unsigned bti, unsigned elemNum)
The message desc encoding is same for send and sends, what about calling 
existing function?
66   void Gen8Encoder::setDPUntypedRW(GenNativeInstruction *insn,
 67 uint32_t bti,
 68 uint32_t rgba,
 69 uint32_t msg_type,
 70 uint32_t msg_length,
 71 uint32_t response_length)
[yejun] There is a difference: gen9_insn->bits3.sends_untyped_rw.src0_length is 1 
for SIMD8 and 2 for SIMD16; it is not the msg_length.
   I don't want to reuse the Gen8Encoder function with msg_length=1, 
since it is different at the conceptual level.
  Another reason is that setDPUntypedRW calls setMessageDescriptor, 
which changes other bits; it is not a pure function that only sets the msg desc.


> +  gen9_insn->bits1.sends.dest_reg_file_0 = 1;//01 for GRF
Generally, we should set sends destination to null register, so it is ARF.
[yejun] I'm open to using either GRF or ARF, but does this bit really matter? Is 
there a description in the hw spec stating that the null register is ARF?


> +  gen9_insn->bits2.sends.src0_subreg_nr = addr.subnr;
Setting src0_subreg_nr here is meaningless, only the src0_subreg_nr[4] bit 
left, I am not sure whether hw use it correctly.
Generally the message payload register subnr should be 0. You can remove above 
line, add an assert(addr.subnr == 0);
[Yejun] Actually, there are subreg_nr fields for both src0 and dst; I'm not sure 
whether src0_subreg_nr[4] is a typo in the spec. Anyway, adding an assert is a 
good choice when things are unclear.

And I would also suggest you define below functions to implement sends encoding 
logic as sends has very different encoding.
setSendsDst(nullreg);
setSendsSrc0(src0);
setSendsSrc1(src1);
so that untyped_write() byte_scatter() typed_write can call these functions 
instead of repeating same logic at every place.
[Yejun] Actually, I'm refining the code locally with setSendsOperand(dst, src0, 
src1). What's your opinion?


-Original Message-
From: Song, Ruiling 
Sent: Wednesday, November 23, 2016 10:15 PM
To: Guo, Yejun; beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: RE: [Beignet] [PATCH 2/4] support sends (split send) for untyped write



> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf 
> Of Guo, Yejun
> Sent: Tuesday, November 22, 2016 2:43 PM
> To: beignet@lists.freedesktop.org
> Cc: Guo, Yejun <yejun@intel.com>
> Subject: [Beignet] [PATCH 2/4] support sends (split send) for untyped 
> write
> 
> sends is a new instruction, starting from gen9, that splits the address 
> and data registers of a write. Register pressure can be relaxed 
> since the two payloads no longer need to be contiguous.
> 
> More patches for sends will be sent out.
> 
> We can choose send or sends based on hasSends() in the selection stage; 
> it is only enabled by default for Skylake now.
> 
> Signed-off-by: Guo, Yejun <yejun@intel.com>
> ---
>  backend/src/backend/gen75_encoder.cpp  |  2 +-
>  backend/src/backend/gen75_encoder.hpp  |  2 +-
>  backend/src/backend/gen8_context.cpp   | 21 +++
>  backend/src/backend/gen8_encoder.cpp   |  2 +-
>  backend/src/backend/gen8_encoder.hpp   |  2 +-
>  backend/src/backend/gen9_encoder.cpp   | 58
> ++
>  backend/src/backend/gen9_encoder.hpp   |  3 +-
>  backend/src/backend/gen_context.cpp| 41 -
>  backend/src/backend/gen_encoder.cpp| 12 ++-
>  backend/src/backend/gen_encoder.hpp|  4 ++-
>  backend/src/backend/gen_insn_selection.cpp | 22 ++--  
> backend/src/backend/gen_insn_selection.hpp |  1 +
>  12 files changed, 137 insertions(+), 33 deletions(-)
> 
> diff --git a/backend/src/backend/gen75_encoder.cpp
> b/backend/src/backend/gen75_encoder.cpp
> index fc37991..9cafaa7 100644
> --- a/backend/src/backend/gen75_encoder.cpp
> +++ b/backend/src/backend/gen75_encoder.cpp
> @@ -199,7 +199,7 @@ namespace gbe
>  return insn->bits3.ud;
>}
> 
> -  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, 
> uint32_t elemNum) {
> +  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data,
> GenRegister bti, uint32_t elemNum) {
>  GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
>  assert(elemNum >= 1 || elemNum <= 4);
>  this->setHeader(insn);
> diff --git a/backend/src/backend/gen75_encoder.hpp
> b/backend/src/backend/gen75_encoder.hpp
> index d06f393..517afff 100644
> --- a/backend/src/backend/gen75_encoder.hpp
> +++ b/backend/src/backend/gen75

Re: [Beignet] [PATCH 1/4] prepare gen9 sends binary format and enable the ASM dump for sends

2016-11-23 Thread Guo, Yejun
ok

-Original Message-
From: Song, Ruiling 
Sent: Thursday, November 24, 2016 10:33 AM
To: Guo, Yejun; beignet@lists.freedesktop.org
Subject: RE: [Beignet] [PATCH 1/4] prepare gen9 sends binary format and enable 
the ASM dump for sends



> -Original Message-
> From: Guo, Yejun
> Sent: Thursday, November 24, 2016 10:21 AM
> To: Song, Ruiling <ruiling.s...@intel.com>; 
> beignet@lists.freedesktop.org
> Subject: RE: [Beignet] [PATCH 1/4] prepare gen9 sends binary format 
> and enable the ASM dump for sends
> 
> thanks, and two inline comments, right?
yes
> 
> > +string(file, "null");
> This is not a good idea. Please parse the instruction bits to know 
> whether it is the null register.
> [yejun] yes, when I did something like atomic which has return value, 
> I found it is not always null. So, I fixed this in another patch, locally now.
> 
> > +format(file, "0x%x", gen9_insn->bits3.ud);
> "0x%08x" would be better.
> [yejun] agree, should I send a v2 patch, or fix it in the latter patch 
> together with the above issue?
It's better to send your V2 to fix both issues.
> 

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/4] prepare gen9 sends binary format and enable the ASM dump for sends

2016-11-23 Thread Guo, Yejun
thanks, and two inline comments, right?

> +string(file, "null");
This is not a good idea. Please parse the instruction bits to know whether it 
is the null register.
[yejun] yes, when I did something like atomic which has return value, I found 
it is not always null. So, I fixed this in another patch, locally now. 

> +format(file, "0x%x", gen9_insn->bits3.ud);
"0x%08x" would be better.
[yejun] Agreed. Should I send a v2 patch, or fix it in a later patch together 
with the above issue?

-Original Message-
From: Song, Ruiling 
Sent: Wednesday, November 23, 2016 10:15 PM
To: Guo, Yejun; beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: RE: [Beignet] [PATCH 1/4] prepare gen9 sends binary format and enable 
the ASM dump for sends



> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf 
> Of Guo, Yejun
> Sent: Tuesday, November 22, 2016 2:42 PM
> To: beignet@lists.freedesktop.org
> Cc: Guo, Yejun <yejun@intel.com>
> Subject: [Beignet] [PATCH 1/4] prepare gen9 sends binary format and 
> enable the ASM dump for sends
> 
> Signed-off-by: Guo, Yejun <yejun@intel.com>
> ---
>  backend/src/backend/gen/gen_mesa_disasm.c |  28 ++--  
> backend/src/backend/gen9_instruction.hpp  | 112
> ++
>  backend/src/backend/gen_defs.hpp  |   3 +
>  3 files changed, 139 insertions(+), 4 deletions(-)  create mode 
> 100644 backend/src/backend/gen9_instruction.hpp
> 
> diff --git a/backend/src/backend/gen/gen_mesa_disasm.c
> b/backend/src/backend/gen/gen_mesa_disasm.c
> index c30f168..4f6c35d 100644
> --- a/backend/src/backend/gen/gen_mesa_disasm.c
> +++ b/backend/src/backend/gen/gen_mesa_disasm.c
> @@ -50,6 +50,7 @@
> 
>  #include "backend/gen_defs.hpp"
>  #include "backend/gen7_instruction.hpp"
> +#include "backend/gen9_instruction.hpp"
>  #include "src/cl_device_data.h"
> 
>  static const struct {
> @@ -104,6 +105,7 @@ static const struct {
> 
>[GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
>[GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
> +  [GEN_OPCODE_SENDS] = { .name = "sends", .nsrc = 2, .ndst = 1 },
>[GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
>[GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
>[GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 }, @@ 
> -1411,7 +1413,8 @@ int gen_disasm (FILE *file, const void *inst, 
> uint32_t deviceID, uint32_t compac
>  }
> 
>} else if (OPCODE(inst) != GEN_OPCODE_SEND &&
> - OPCODE(inst) != GEN_OPCODE_SENDC) {
> + OPCODE(inst) != GEN_OPCODE_SENDC &&
> + OPCODE(inst) != GEN_OPCODE_SENDS) {
>  err |= control(file, "conditional modifier", conditional_modifier,
> COND_DST_OR_MODIFIER(inst), NULL);
>  if (COND_DST_OR_MODIFIER(inst))
> @@ -1426,7 +1429,17 @@ int gen_disasm (FILE *file, const void *inst, 
> uint32_t deviceID, uint32_t compac
>  string(file, ")");
>}
> 
> -  if (opcode[OPCODE(inst)].nsrc == 3) {
> +  if (OPCODE(inst) == GEN_OPCODE_SENDS) {
> +const union Gen9NativeInstruction *gen9_insn = (const union
> Gen9NativeInstruction *)inst;
> +pad(file, 16);
> +string(file, "null");
This is not a good idea. Please parse the instruction bits to know whether it 
is the null register.
> +pad(file, 32);
> +format(file, "g%d(addLen:%d)", 
> + gen9_insn->bits2.sends.src0_reg_nr,
> gen9_insn->bits3.sends_untyped_rw.src0_length);
> +pad(file, 48);
> +format(file, "g%d(dataLen:%d)", 
> + gen9_insn->bits1.sends.src1_reg_nr,
> gen9_insn->bits2.sends.src1_length);
> +pad(file, 64);
> +format(file, "0x%x", gen9_insn->bits3.ud);
"0x%08x" would be better.
 
> +  } else if (opcode[OPCODE(inst)].nsrc == 3) {
>  pad(file, 16);
>  err |= dest_3src(file, inst);
> 
> @@ -1469,7 +1482,8 @@ int gen_disasm (FILE *file, const void *inst, 
> uint32_t deviceID, uint32_t compac
>}
> 
>if (OPCODE(inst) == GEN_OPCODE_SEND ||
> -  OPCODE(inst) == GEN_OPCODE_SENDC) {
> +  OPCODE(inst) == GEN_OPCODE_SENDC ||
> +  OPCODE(inst) == GEN_OPCODE_SENDS) {
>  enum GenMessageTarget target = COND_DST_OR_MODIFIER(inst);
> 
>  newline(file);
> @@ -1484,7 +1498,13 @@ int gen_disasm (FILE *file, const void *inst, 
> uint32_t deviceID, uint32_t compac
>   target, );
>  }
> 
> -if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
> 

[Beignet] [PATCH 4/4] add sends support for byte write

2016-11-21 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen9_encoder.cpp   | 58 ++
 backend/src/backend/gen9_encoder.hpp   |  2 ++
 backend/src/backend/gen_context.cpp| 15 +---
 backend/src/backend/gen_encoder.cpp| 14 +++-
 backend/src/backend/gen_encoder.hpp|  4 ++-
 backend/src/backend/gen_insn_selection.cpp | 26 ++
 6 files changed, 107 insertions(+), 12 deletions(-)

diff --git a/backend/src/backend/gen9_encoder.cpp 
b/backend/src/backend/gen9_encoder.cpp
index 351788c..45e8551 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -123,4 +123,62 @@ namespace gbe
 gen9_insn->bits2.sends.sel_reg32_desc = 1;
 }
   }
+
+  unsigned Gen9Encoder::setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize)
+  {
+Gen9NativeInstruction *gen9_insn = >gen9_insn;
+gen9_insn->bits3.sends_byte_rw.header_present = 0;
+gen9_insn->bits3.sends_byte_rw.response_length = 0;
+gen9_insn->bits3.sends_byte_rw.end_of_thread = 0;
+gen9_insn->bits3.sends_byte_rw.msg_type = GEN7_BYTE_SCATTER;
+gen9_insn->bits3.sends_byte_rw.bti = bti;
+gen9_insn->bits3.sends_byte_rw.data_size = elemSize;
+
+if (this->curr.execWidth == 8) {
+  gen9_insn->bits3.sends_byte_rw.src0_length = 1;
+  gen9_insn->bits3.sends_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD8;
+} else if (this->curr.execWidth == 16) {
+  gen9_insn->bits3.sends_byte_rw.src0_length = 2;
+  gen9_insn->bits3.sends_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD16;
+}
+
+return gen9_insn->bits3.ud;
+  }
+
+  void Gen9Encoder::BYTE_SCATTER(GenRegister addr, GenRegister data, 
GenRegister bti, uint32_t elemSize)
+  {
+if (addr.reg() == data.reg())
+  Gen8Encoder::BYTE_SCATTER(addr, data, bti, elemSize);
+else {
+  GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+  Gen9NativeInstruction *gen9_insn = >gen9_insn;
+  this->setHeader(insn);
+  insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+  gen9_insn->bits1.sends.dest_reg_file_0 = 1;//01 for GRF
+  gen9_insn->bits1.sends.src1_reg_file_0 = 1;
+
+  gen9_insn->bits1.sends.src1_reg_nr = data.nr;
+  gen9_insn->bits1.sends.dest_subreg_nr = 0;
+  gen9_insn->bits1.sends.dest_reg_nr = 0;
+  gen9_insn->bits1.sends.dest_address_mode = 0;  //direct mode
+
+  gen9_insn->bits2.sends.src0_subreg_nr = addr.subnr;
+  gen9_insn->bits2.sends.src0_reg_nr = addr.nr;
+  gen9_insn->bits2.sends.src0_address_mode = 0;
+
+  if (this->curr.execWidth == 8)
+gen9_insn->bits2.sends.src1_length = 1;
+  else if (this->curr.execWidth == 16)
+gen9_insn->bits2.sends.src1_length = 2;
+  else
+assert(!"unsupported");
+
+  if (bti.file == GEN_IMMEDIATE_VALUE) {
+gen9_insn->bits2.sends.sel_reg32_desc = 0;
+setByteScatterSendsMessageDesc(insn, bti.value.ud, elemSize);
+  } else
+gen9_insn->bits2.sends.sel_reg32_desc = 1;
+}
+  }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp 
b/backend/src/backend/gen9_encoder.hpp
index 7b9f0df..d78b029 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -49,6 +49,8 @@ namespace gbe
 bool isUniform);
 virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemNum);
+virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister 
bti, uint32_t elemSize);
+virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction 
*insn, unsigned bti, unsigned elemSize);
   };
 }
 #endif /* __GBE_GEN9_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 848933e..9505592 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2220,16 +2220,23 @@ namespace gbe
   }
 
   void GenContext::emitByteScatterInstruction(const SelectionInstruction 
) {
-const GenRegister src = ra->genReg(insn.src(0));
+const GenRegister addr = ra->genReg(insn.src(0));
+GenRegister data = ra->genReg(insn.src(1));
+if (!insn.extra.splitSend)
+  data = addr;
 const uint32_t elemSize = insn.extra.elem;
 const GenRegister bti = ra->genReg(insn.src(2));
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->BYTE_SCATTER(src, bti, elemSize);
+  p->BYTE_SCATTER(addr, data, bti, elemSize);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(0));
   const GenRegister btiTmp = ra->genReg(insn.dst(1));
-  unsigned desc = p->generateByteSca

[Beignet] [PATCH 2/4] support sends (split send) for untyped write

2016-11-21 Thread Guo, Yejun
sends is a new instruction, starting from gen9, that splits the address
and data registers of a write. Register pressure can be relaxed
since the two payloads no longer need to be contiguous.

More patches for sends will be sent out.

We can choose send or sends based on hasSends() in the selection stage;
it is only enabled by default for Skylake now.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen75_encoder.cpp  |  2 +-
 backend/src/backend/gen75_encoder.hpp  |  2 +-
 backend/src/backend/gen8_context.cpp   | 21 +++
 backend/src/backend/gen8_encoder.cpp   |  2 +-
 backend/src/backend/gen8_encoder.hpp   |  2 +-
 backend/src/backend/gen9_encoder.cpp   | 58 ++
 backend/src/backend/gen9_encoder.hpp   |  3 +-
 backend/src/backend/gen_context.cpp| 41 -
 backend/src/backend/gen_encoder.cpp| 12 ++-
 backend/src/backend/gen_encoder.hpp|  4 ++-
 backend/src/backend/gen_insn_selection.cpp | 22 ++--
 backend/src/backend/gen_insn_selection.hpp |  1 +
 12 files changed, 137 insertions(+), 33 deletions(-)

diff --git a/backend/src/backend/gen75_encoder.cpp 
b/backend/src/backend/gen75_encoder.cpp
index fc37991..9cafaa7 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -199,7 +199,7 @@ namespace gbe
 return insn->bits3.ud;
   }
 
-  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t 
elemNum) {
+  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, 
GenRegister bti, uint32_t elemNum) {
 GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
 assert(elemNum >= 1 || elemNum <= 4);
 this->setHeader(insn);
diff --git a/backend/src/backend/gen75_encoder.hpp 
b/backend/src/backend/gen75_encoder.hpp
index d06f393..517afff 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -44,7 +44,7 @@ namespace gbe
 virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
 virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, 
GenRegister bti, uint32_t srcNum);
 virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister 
bti, uint32_t elemNum);
-virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t 
elemNum);
+virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister 
bti, uint32_t elemNum);
 virtual void setHeader(GenNativeInstruction *insn);
 virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, 
uint32_t rgba,
uint32_t msg_type, uint32_t msg_length, uint32_t 
response_length);
diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index 71c54fb..95b1013 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -968,6 +968,9 @@ namespace gbe
 GBE_ASSERT(elemNum == 1);
 const GenRegister addr = ra->genReg(insn.src(elemNum));
 const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
+GenRegister data = ra->genReg(insn.src(elemNum+1));
+if (!insn.extra.splitSend)
+  data = addr;
 
 /* Because BDW's store and load send instructions for 64 bits require the 
bti to be surfaceless,
which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -978,11 +981,15 @@ namespace gbe
 }
 
 if (bti.file == GEN_IMMEDIATE_VALUE) {
-  p->UNTYPED_WRITE(addr, bti, elemNum*2);
+  p->UNTYPED_WRITE(addr, data, bti, elemNum*2);
 } else {
   const GenRegister tmp = ra->genReg(insn.dst(elemNum));
   const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
-  unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
+  unsigned desc = 0;
+  if (insn.extra.splitSend)
+desc = p->generateUntypedWriteSendsMessageDesc(0, elemNum*2);
+  else
+desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
 
   unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
 
@@ -990,7 +997,7 @@ namespace gbe
   p->push();
 p->curr.predicate = GEN_PREDICATE_NORMAL;
 p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
+p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2);
   p->pop();
   afterMessage(insn, bti, tmp, btiTmp, jip0);
 }
@@ -1351,7 +1358,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst, nextSrc);
 p->pop();
-p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1);
 p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
 
 p->push();
@@ -1367,7 +1374,7 @@ namespace gbe
   nextDst = GenRegister::Qn(tempDst, 1);
   p->MOV(nextDst

[Beignet] [PATCH 1/4] prepare gen9 sends binary format and enable the ASM dump for sends

2016-11-21 Thread Guo, Yejun
Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c |  28 ++--
 backend/src/backend/gen9_instruction.hpp  | 112 ++
 backend/src/backend/gen_defs.hpp  |   3 +
 3 files changed, 139 insertions(+), 4 deletions(-)
 create mode 100644 backend/src/backend/gen9_instruction.hpp

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c 
b/backend/src/backend/gen/gen_mesa_disasm.c
index c30f168..4f6c35d 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -50,6 +50,7 @@
 
 #include "backend/gen_defs.hpp"
 #include "backend/gen7_instruction.hpp"
+#include "backend/gen9_instruction.hpp"
 #include "src/cl_device_data.h"
 
 static const struct {
@@ -104,6 +105,7 @@ static const struct {
 
   [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SENDS] = { .name = "sends", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
@@ -1411,7 +1413,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
 }
 
   } else if (OPCODE(inst) != GEN_OPCODE_SEND &&
- OPCODE(inst) != GEN_OPCODE_SENDC) {
+ OPCODE(inst) != GEN_OPCODE_SENDC &&
+ OPCODE(inst) != GEN_OPCODE_SENDS) {
 err |= control(file, "conditional modifier", conditional_modifier,
COND_DST_OR_MODIFIER(inst), NULL);
 if (COND_DST_OR_MODIFIER(inst))
@@ -1426,7 +1429,17 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
 string(file, ")");
   }
 
-  if (opcode[OPCODE(inst)].nsrc == 3) {
+  if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+const union Gen9NativeInstruction *gen9_insn = (const union 
Gen9NativeInstruction *)inst;
+pad(file, 16);
+string(file, "null");
+pad(file, 32);
+format(file, "g%d(addLen:%d)", gen9_insn->bits2.sends.src0_reg_nr, 
gen9_insn->bits3.sends_untyped_rw.src0_length);
+pad(file, 48);
+format(file, "g%d(dataLen:%d)", gen9_insn->bits1.sends.src1_reg_nr, 
gen9_insn->bits2.sends.src1_length);
+pad(file, 64);
+format(file, "0x%x", gen9_insn->bits3.ud);
+  } else if (opcode[OPCODE(inst)].nsrc == 3) {
 pad(file, 16);
 err |= dest_3src(file, inst);
 
@@ -1469,7 +1482,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
   }
 
   if (OPCODE(inst) == GEN_OPCODE_SEND ||
-  OPCODE(inst) == GEN_OPCODE_SENDC) {
+  OPCODE(inst) == GEN_OPCODE_SENDC ||
+  OPCODE(inst) == GEN_OPCODE_SENDS) {
 enum GenMessageTarget target = COND_DST_OR_MODIFIER(inst);
 
 newline(file);
@@ -1484,7 +1498,13 @@ int gen_disasm (FILE *file, const void *inst, uint32_t 
deviceID, uint32_t compac
  target, );
 }
 
-if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
+int immbti = 0;
+if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+  const union Gen9NativeInstruction *gen9_insn = (const union 
Gen9NativeInstruction *)inst;
+  immbti = !(gen9_insn->bits2.sends.sel_reg32_desc);
+} else
+  immbti = (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, 
bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE);
+if (immbti) {
   switch (target) {
 case GEN_SFID_VIDEO_MOTION_EST:
   format(file, " (bti: %d, msg_type: %d)",
diff --git a/backend/src/backend/gen9_instruction.hpp 
b/backend/src/backend/gen9_instruction.hpp
new file mode 100644
index 000..9d57f08
--- /dev/null
+++ b/backend/src/backend/gen9_instruction.hpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Guo, Yejun <yejun@intel.com>
+ */
+
+
+#ifndef __GEN9_INSTRUCTION_HPP__
+#define __GEN9_INSTRUCTION_HPP__
+
+union Gen9NativeInstruction
+{
+  struct {
+struct {
+  uint32_t opcode:7;
+ 

[Beignet] [PATCH 2/2] do not care dst for printf

2016-11-21 Thread Guo, Yejun
Actually, the dst of printf means nothing, so there is no need to touch it.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_context.cpp| 14 ++
 backend/src/backend/gen_insn_selection.cpp | 20 +---
 2 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 186c8d9..a73ccb6 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3469,9 +3469,8 @@ namespace gbe
   }
 
   void GenContext::emitPrintfInstruction(const SelectionInstruction ) {
-const GenRegister dst = ra->genReg(insn.dst(0));
-const GenRegister tmp0 = ra->genReg(insn.dst(1));
-const GenRegister tmp1 = ra->genReg(insn.dst(2));
+const GenRegister tmp0 = ra->genReg(insn.dst(0));
+const GenRegister tmp1 = ra->genReg(insn.dst(1));
 GenRegister src;
 uint32_t srcNum = insn.srcNum;
 
@@ -3518,15 +3517,6 @@ namespace gbe
 emitPrintfLongInstruction(addr, data, src, insn.extra.printfBTI);
   }
 }
-
-if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
-  p->push();
-  p->curr.execWidth = 1;
-}
-p->MOV(dst, GenRegister::immd(0));
-if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
-  p->pop();
-}
   }
 
   void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int 
sz) {
diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 1808c7b..88fe1a6 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -705,7 +705,7 @@ namespace gbe
 /*! Store the profiling info */
 void STORE_PROFILING(uint32_t profilingType, uint32_t bti, GenRegister 
tmp0, GenRegister tmp1, GenRegister ts[5], int tsNum);
 /*! Printf */
-void PRINTF(GenRegister dst, uint8_t bti, GenRegister tmp0, GenRegister 
tmp1, GenRegister src[8],
+void PRINTF(uint8_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister 
src[8],
 int srcNum, uint16_t num, bool isContinue, uint32_t totalSize);
 /*! Multiply 64-bit integers */
 void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool 
native_long);
@@ -2129,20 +2129,19 @@ namespace gbe
 }
   }
 
-  void Selection::Opaque::PRINTF(GenRegister dst, uint8_t bti, GenRegister 
tmp0, GenRegister tmp1,
+  void Selection::Opaque::PRINTF(uint8_t bti, GenRegister tmp0, GenRegister 
tmp1,
GenRegister src[8], int srcNum, uint16_t num, bool isContinue, 
uint32_t totalSize) {
-SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum);
+SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 2, srcNum);
 SelectionVector *vector = this->appendVector();
 
 for (int i = 0; i < srcNum; i++)
   insn->src(i) = src[i];
 
-insn->dst(0) = dst;
-insn->dst(1) = tmp0;
-insn->dst(2) = tmp1;
+insn->dst(0) = tmp0;
+insn->dst(1) = tmp1;
 
 vector->regNum = 2;
-vector->reg = >dst(1);
+vector->reg = >dst(0);
 vector->offsetID = 0;
 vector->isSrc = 0;
 
@@ -7041,8 +7040,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   uint8_t BTI = insn.getBti();
   GenRegister tmp0, tmp1;
   uint32_t srcNum = insn.getSrcNum();
-  GenRegister dst = sel.selReg(insn.getDst(0), TYPE_S32);
-  //GBE_ASSERT(srcNum);
+
   uint32_t i = 0;
   uint32_t totalSize = 0;
   bool isContinue = false;
@@ -7063,14 +7061,14 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   i = 0;
   GenRegister regs[8];
   if (srcNum == 0) {
-  sel.PRINTF(dst, BTI, tmp0, tmp1, regs, srcNum, num, isContinue, 
totalSize);
+  sel.PRINTF(BTI, tmp0, tmp1, regs, srcNum, num, isContinue, 
totalSize);
   } else {
 do {
   uint32_t s = srcNum < 8 ? srcNum : 8;
   for (uint32_t j = 0; j < s; j++) {
 regs[j] = sel.selReg(insn.getSrc(i + j), insn.getType(i + j));
   }
-  sel.PRINTF(dst, BTI, tmp0, tmp1, regs, s, num, isContinue, 
totalSize);
+  sel.PRINTF(BTI, tmp0, tmp1, regs, s, num, isContinue, totalSize);
 
   if (srcNum > 8) {
 srcNum -= 8;
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 1/2] remove some redundant code for printf

2016-11-21 Thread Guo, Yejun
tmp0 is added to src in the selection stage and then simply ignored in the
context stage, so it is redundant.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 backend/src/backend/gen_context.cpp|  2 --
 backend/src/backend/gen_insn_selection.cpp | 54 +-
 2 files changed, 15 insertions(+), 41 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index c38b7af..186c8d9 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3474,8 +3474,6 @@ namespace gbe
 const GenRegister tmp1 = ra->genReg(insn.dst(2));
 GenRegister src;
 uint32_t srcNum = insn.srcNum;
-if (insn.extra.continueFlag)
-  srcNum--;
 
 GenRegister addr = GenRegister::retype(tmp0, GEN_TYPE_UD);
 GenRegister data = GenRegister::retype(tmp1, GEN_TYPE_UD);
diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index c14e0bc..1808c7b 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2131,49 +2131,25 @@ namespace gbe
 
   void Selection::Opaque::PRINTF(GenRegister dst, uint8_t bti, GenRegister 
tmp0, GenRegister tmp1,
GenRegister src[8], int srcNum, uint16_t num, bool isContinue, 
uint32_t totalSize) {
-if (isContinue) {
-  SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum + 
1);
-  SelectionVector *vector = this->appendVector();
-
-  for (int i = 0; i < srcNum; i++)
-insn->src(i) = src[i];
-
-  insn->src(srcNum) = tmp0;
-
-  insn->dst(0) = dst;
-  insn->dst(1) = tmp0;
-  insn->dst(2) = tmp1;
-
-  vector->regNum = 2;
-  vector->reg = >dst(1);
-  vector->offsetID = 0;
-  vector->isSrc = 0;
-
-  insn->extra.printfSize = static_cast(totalSize);
-  insn->extra.continueFlag = isContinue;
-  insn->extra.printfBTI = bti;
-  insn->extra.printfNum = num;
-} else {
-  SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum);
-  SelectionVector *vector = this->appendVector();
+SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum);
+SelectionVector *vector = this->appendVector();
 
-  for (int i = 0; i < srcNum; i++)
-insn->src(i) = src[i];
+for (int i = 0; i < srcNum; i++)
+  insn->src(i) = src[i];
 
-  insn->dst(0) = dst;
-  insn->dst(1) = tmp0;
-  insn->dst(2) = tmp1;
+insn->dst(0) = dst;
+insn->dst(1) = tmp0;
+insn->dst(2) = tmp1;
 
-  vector->regNum = 2;
-  vector->reg = >dst(1);
-  vector->offsetID = 0;
-  vector->isSrc = 0;
+vector->regNum = 2;
+vector->reg = >dst(1);
+vector->offsetID = 0;
+vector->isSrc = 0;
 
-  insn->extra.printfSize = static_cast(totalSize);
-  insn->extra.continueFlag = isContinue;
-  insn->extra.printfBTI = bti;
-  insn->extra.printfNum = num;
-}
+insn->extra.printfSize = static_cast(totalSize);
+insn->extra.continueFlag = isContinue;
+insn->extra.printfBTI = bti;
+insn->extra.printfNum = num;
   }
 
   void Selection::Opaque::WORKGROUP_OP(uint32_t wg_op,
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] disable CMRT as default, since no real case reported

2016-11-20 Thread Guo, Yejun
ping for review, thanks.

-Original Message-
From: Guo, Yejun 
Sent: Tuesday, October 25, 2016 3:33 PM
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: [PATCH] disable CMRT as default, since no real case reported

and this feature also sometimes introduces build issue.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt index d839f3f..039f9cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -170,7 +170,9 @@ ELSE(DRM_INTEL_FOUND)
 ENDIF(DRM_INTEL_FOUND)
 
 # CMRT
-pkg_check_modules(CMRT libcmrt)
+#disable CMRT as default, since we do not see real case,
+#while see build issue of this feature
+#pkg_check_modules(CMRT libcmrt)
 IF(CMRT_FOUND)
 INCLUDE_DIRECTORIES(${CMRT_INCLUDE_DIRS})
 ENDIF(CMRT_FOUND)
--
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] disable CMRT as default, since no real case reported

2016-10-25 Thread Guo, Yejun
and this feature also sometimes introduces build issue.

Signed-off-by: Guo, Yejun <yejun@intel.com>
---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d839f3f..039f9cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -170,7 +170,9 @@ ELSE(DRM_INTEL_FOUND)
 ENDIF(DRM_INTEL_FOUND)
 
 # CMRT
-pkg_check_modules(CMRT libcmrt)
+#disable CMRT as default, since we do not see real case,
+#while see build issue of this feature
+#pkg_check_modules(CMRT libcmrt)
 IF(CMRT_FOUND)
 INCLUDE_DIRECTORIES(${CMRT_INCLUDE_DIRS})
 ENDIF(CMRT_FOUND)
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Fix build failure with CMRT enabled

2016-10-12 Thread Guo, Yejun
LGTM, thanks.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of 
Rebecca N. Palmer
Sent: Thursday, October 13, 2016 6:15 AM
To: beignet@lists.freedesktop.org
Subject: [Beignet] [PATCH] Fix build failure with CMRT enabled

2baff9c moved mem->magic to cl_base_object.
---
(Or should this be CL_OBJECT_IS_MEM(mem), i.e. also checking the reference 
count?)

--- a/src/cl_cmrt.cpp
+++ b/src/cl_cmrt.cpp
@@ -256,7 +256,7 @@ cl_int cmrt_set_kernel_arg(cl_kernel k,
 result = cmrt_kernel->SetKernelArg(index, sz, value);
   else {
 cl_mem mem = *(cl_mem*)value;
-if (mem->magic == CL_MAGIC_MEM_HEADER) {
+if (((cl_base_object)mem)->magic == CL_MAGIC_MEM_HEADER) {
   if (!CreateCmrtMemory(mem))
 return CL_INVALID_ARG_VALUE;
 

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] buildsys: Use CMRT_LIBDIR instead of CMRT_LIBRARY_DIRS

2016-10-09 Thread Guo, Yejun
That's great, thanks.

-Original Message-
From: Armin K. [mailto:kre...@email.com] 
Sent: Monday, October 10, 2016 12:57 AM
To: Guo, Yejun; beignet@lists.freedesktop.org
Subject: Re: [Beignet] [PATCH] buildsys: Use CMRT_LIBDIR instead of 
CMRT_LIBRARY_DIRS

On 09.10.2016 04:34, Guo, Yejun wrote:
> thanks Armin, the patch looks good to me.
> 
> could you also send a patch to fix CMRT_INCLUDE_DIRS? thanks.
> 
> 
> thanks
> yejun
> 

Usage of CMRT_INCLUDE_DIRS is correct. Nothing to fix there.


___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] buildsys: Use CMRT_LIBDIR instead of CMRT_LIBRARY_DIRS

2016-10-08 Thread Guo, Yejun
thanks Armin, the patch looks good to me.

could you also send a patch to fix CMRT_INCLUDE_DIRS? thanks.


thanks
yejun

-Original Message-
From: Armin K. [mailto:kre...@email.com] 
Sent: Saturday, October 08, 2016 4:50 PM
To: Guo, Yejun; beignet@lists.freedesktop.org
Subject: Re: [Beignet] [PATCH] buildsys: Use CMRT_LIBDIR instead of 
CMRT_LIBRARY_DIRS

On 8.10.2016 4:18, Guo, Yejun wrote:
> Hi Armin,
>
> thanks for your patch.
>
> We use pkg_check_modules(CMRT libcmrt) in path_of_beignet/CMakeList.txt Line 
> 173 to detect libcmrt.  CMRT_INCLUDE_DIRS and CMRT_LIBRARY_DIRS are the 
> expected macro to use, see 
> https://cmake.org/cmake/help/v3.0/module/FindPkgConfig.html or 
> https://cmake.org/cmake/help/v2.8.12/cmake.html. Could you explain a bit more 
> of your environment? thanks.
>
> btw, do you use the feature to "make Beignet as intermedia layer of CMRT"? 
> Want to know if any real cases from the community.
>
> thanks
> yejun
>

Hi Yejun,

As noted in the patch reply, CMRT_LIBRARY_DIRS is empty here.

Per cmake docs:

"_LIBRARY_DIRS   ... the paths of the libraries (w/o the '-L')"

There is no "-L..." in Libs: section of the libcmrt.pc file, because the file 
is installed in /usr/lib/pkgconfig, and autotools/pkgconfig will strip /usr/lib 
from library paths and /usr/include from include paths (per pkg-config default 
configuration).

CMRT_LIBDIR is however always set, as it reads libdir variable from the .pc 
file, and that one is expected to be always set.

And I don't use cmrt feature at all, I just happened to have libcmrt installed 
because I have intel-hybrid-driver for Skylake VP9 hybrid decoder.

Without this patch, the utest testing the cmrt code will fail. See also:

https://bugs.freedesktop.org/show_bug.cgi?id=94636

> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf 
> Of Armin K
> Sent: Wednesday, October 05, 2016 10:33 PM
> To: beignet@lists.freedesktop.org
> Cc: Armin K
> Subject: [Beignet] [PATCH] buildsys: Use CMRT_LIBDIR instead of 
> CMRT_LIBRARY_DIRS
>
> CMRT_LIBRARY_DIRS doesn't include any library paths when the library 
> is installed system-wide, such as /usr.
>
> Also dlopen versioned library, as distros tend to split non-versioned 
> sonames into -devel packages.
>
> Signed-off-by: Armin K <kre...@email.com>
> ---
>  src/CMakeLists.txt | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
> a002865..82be7ff 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -103,7 +103,7 @@ endif (X11_FOUND)
>
>  if (CMRT_FOUND)
>set(CMAKE_CXX_FLAGS "-DHAS_CMRT ${CMAKE_CXX_FLAGS}")
> -  set(CMAKE_CXX_FLAGS "-DCMRT_PATH=${CMRT_LIBRARY_DIRS}/libcmrt.so 
> ${CMAKE_CXX_FLAGS}")
> +  set(CMAKE_CXX_FLAGS "-DCMRT_PATH=${CMRT_LIBDIR}/libcmrt.so.1 
> + ${CMAKE_CXX_FLAGS}")
>set(CMAKE_C_FLAGS "-DHAS_CMRT ${CMAKE_C_FLAGS}")
>set(OPENCL_SRC ${OPENCL_SRC} cl_cmrt.cpp)  endif (CMRT_FOUND)
>
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] buildsys: Use CMRT_LIBDIR instead of CMRT_LIBRARY_DIRS

2016-10-07 Thread Guo, Yejun
Hi Armin,

thanks for your patch. 

We use pkg_check_modules(CMRT libcmrt) in path_of_beignet/CMakeLists.txt, line 
173, to detect libcmrt. CMRT_INCLUDE_DIRS and CMRT_LIBRARY_DIRS are the 
expected variables to use; see 
https://cmake.org/cmake/help/v3.0/module/FindPkgConfig.html or 
https://cmake.org/cmake/help/v2.8.12/cmake.html. Could you explain a bit more 
about your environment? Thanks.

btw, do you use the feature to "make Beignet as intermedia layer of CMRT"? I 
want to know if there are any real use cases from the community.

thanks
yejun

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Armin 
K
Sent: Wednesday, October 05, 2016 10:33 PM
To: beignet@lists.freedesktop.org
Cc: Armin K
Subject: [Beignet] [PATCH] buildsys: Use CMRT_LIBDIR instead of 
CMRT_LIBRARY_DIRS

CMRT_LIBRARY_DIRS doesn't include any library paths
when the library is installed system-wide, such as
/usr.

Also dlopen versioned library, as distros tend to
split non-versioned sonames into -devel packages.

Signed-off-by: Armin K 
---
 src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a002865..82be7ff 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -103,7 +103,7 @@ endif (X11_FOUND)
 
 if (CMRT_FOUND)
   set(CMAKE_CXX_FLAGS "-DHAS_CMRT ${CMAKE_CXX_FLAGS}")
-  set(CMAKE_CXX_FLAGS "-DCMRT_PATH=${CMRT_LIBRARY_DIRS}/libcmrt.so 
${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "-DCMRT_PATH=${CMRT_LIBDIR}/libcmrt.so.1 
${CMAKE_CXX_FLAGS}")
   set(CMAKE_C_FLAGS "-DHAS_CMRT ${CMAKE_C_FLAGS}")
   set(OPENCL_SRC ${OPENCL_SRC} cl_cmrt.cpp)
 endif (CMRT_FOUND)
-- 
2.9.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 2/2] change PCI_CHIP_BROXTON_P to PCI_CHIP_BROXTON_0 to unify the naming

2016-09-29 Thread Guo Yejun
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 src/cl_device_data.h | 4 ++--
 src/cl_device_id.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index 3e6ac91..4ee4ca3 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -297,13 +297,13 @@
 #define IS_SKYLAKE(devid) (IS_SKL_GT1(devid) || IS_SKL_GT2(devid) || 
IS_SKL_GT3(devid) || IS_SKL_GT4(devid))
 
 /* BXT */
-#define PCI_CHIP_BROXTON_P 0x5A84   /* Intel(R) BXT-P for mobile desktop */
+#define PCI_CHIP_BROXTON_0 0x5A84
 #define PCI_CHIP_BROXTON_1 0x5A85
 #define PCI_CHIP_BROXTON_2 0x1A84
 #define PCI_CHIP_BROXTON_3 0x1A85
 
 #define IS_BROXTON(devid)   \
-  (devid == PCI_CHIP_BROXTON_P ||   \
+  (devid == PCI_CHIP_BROXTON_0 ||   \
devid == PCI_CHIP_BROXTON_1 ||   \
devid == PCI_CHIP_BROXTON_2 ||   \
devid == PCI_CHIP_BROXTON_3)
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 19f984f..232082b 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -615,8 +615,8 @@ skl_gt4_break:
   cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
   break;
 
-case PCI_CHIP_BROXTON_P:
-  DECL_INFO_STRING(bxt18eu_break, intel_bxt18eu_device, name, "Intel(R) HD 
Graphics Broxton-P");
+case PCI_CHIP_BROXTON_0:
+  DECL_INFO_STRING(bxt18eu_break, intel_bxt18eu_device, name, "Intel(R) HD 
Graphics Broxton 0");
 case PCI_CHIP_BROXTON_2:
   DECL_INFO_STRING(bxt18eu_break, intel_bxt18eu_device, name, "Intel(R) HD 
Graphics Broxton 2");
 bxt18eu_break:
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 1/2] add bxt with pciid 0x1A85

2016-09-29 Thread Guo Yejun
contributor: Curfman, Matthew C <matthew.c.curf...@intel.com>
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 src/cl_device_data.h |  4 +++-
 src/cl_device_id.c   | 34 ++
 src/intel/intel_driver.c |  3 ++-
 src/intel/intel_gpgpu.c  |  3 ++-
 4 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index a237b0e..3e6ac91 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -300,11 +300,13 @@
 #define PCI_CHIP_BROXTON_P 0x5A84   /* Intel(R) BXT-P for mobile desktop */
 #define PCI_CHIP_BROXTON_1 0x5A85
 #define PCI_CHIP_BROXTON_2 0x1A84
+#define PCI_CHIP_BROXTON_3 0x1A85
 
 #define IS_BROXTON(devid)   \
   (devid == PCI_CHIP_BROXTON_P ||   \
devid == PCI_CHIP_BROXTON_1 ||   \
-   devid == PCI_CHIP_BROXTON_2)
+   devid == PCI_CHIP_BROXTON_2 ||   \
+   devid == PCI_CHIP_BROXTON_3)
 
 #define PCI_CHIP_KABYLAKE_ULT_GT1 0x5906
 #define PCI_CHIP_KABYLAKE_ULT_GT2 0x5916
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 02aad96..19f984f 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -185,7 +185,7 @@ static struct _cl_device_id intel_skl_gt4_device = {
 #include "cl_gen9_device.h"
 };
 
-static struct _cl_device_id intel_bxt_device = {
+static struct _cl_device_id intel_bxt18eu_device = {
   .max_compute_unit = 18,
   .max_thread_per_unit = 6,
   .sub_slice_count = 3,
@@ -195,7 +195,7 @@ static struct _cl_device_id intel_bxt_device = {
 #include "cl_gen9_device.h"
 };
 
-static struct _cl_device_id intel_bxt1_device = {
+static struct _cl_device_id intel_bxt12eu_device = {
   .max_compute_unit = 12,
   .max_thread_per_unit = 6,
   .sub_slice_count = 2,
@@ -616,23 +616,25 @@ skl_gt4_break:
   break;
 
 case PCI_CHIP_BROXTON_P:
-  DECL_INFO_STRING(bxt_break, intel_bxt_device, name, "Intel(R) HD 
Graphics Broxton-P");
+  DECL_INFO_STRING(bxt18eu_break, intel_bxt18eu_device, name, "Intel(R) HD 
Graphics Broxton-P");
 case PCI_CHIP_BROXTON_2:
-  DECL_INFO_STRING(bxt_break, intel_bxt_device, name, "Intel(R) HD 
Graphics Broxton 2");
-bxt_break:
-  intel_bxt_device.device_id = device_id;
-  intel_bxt_device.platform = cl_get_platform_default();
-  ret = _bxt_device;
+  DECL_INFO_STRING(bxt18eu_break, intel_bxt18eu_device, name, "Intel(R) HD 
Graphics Broxton 2");
+bxt18eu_break:
+  intel_bxt18eu_device.device_id = device_id;
+  intel_bxt18eu_device.platform = cl_get_platform_default();
+  ret = _bxt18eu_device;
   cl_intel_platform_get_default_extension(ret);
   cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
   break;
 
 case PCI_CHIP_BROXTON_1:
-  DECL_INFO_STRING(bxt1_break, intel_bxt1_device, name, "Intel(R) HD 
Graphics Broxton 1");
-bxt1_break:
-  intel_bxt1_device.device_id = device_id;
-  intel_bxt1_device.platform = cl_get_platform_default();
-  ret = _bxt1_device;
+  DECL_INFO_STRING(bxt12eu_break, intel_bxt12eu_device, name, "Intel(R) HD 
Graphics Broxton 1");
+case PCI_CHIP_BROXTON_3:
+  DECL_INFO_STRING(bxt12eu_break, intel_bxt12eu_device, name, "Intel(R) HD 
Graphics Broxton 3");
+bxt12eu_break:
+  intel_bxt12eu_device.device_id = device_id;
+  intel_bxt12eu_device.platform = cl_get_platform_default();
+  ret = _bxt12eu_device;
   cl_intel_platform_get_default_extension(ret);
   cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
   break;
@@ -952,8 +954,8 @@ LOCAL cl_bool is_gen_device(cl_device_id device) {
  device == _skl_gt2_device ||
  device == _skl_gt3_device ||
  device == _skl_gt4_device ||
- device == _bxt_device ||
- device == _bxt1_device||
+ device == _bxt18eu_device ||
+ device == _bxt12eu_device ||
  device == _kbl_gt1_device ||
  device == _kbl_gt15_device ||
  device == _kbl_gt2_device ||
@@ -1097,7 +1099,7 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
 *ver = 8;
   } else if (device == _skl_gt1_device || device == _skl_gt2_device
 || device == _skl_gt3_device || device == _skl_gt4_device
-|| device == _bxt_device || device == _bxt1_device || 
device == _kbl_gt1_device
+|| device == _bxt18eu_device || device == _bxt12eu_device 
|| device == _kbl_gt1_device
 || device == _kbl_gt2_device || device == _kbl_gt3_device
 || device == _kbl_gt4_device || device == 
_kbl_gt15_device) {
 *ver = 9;
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index f503b55..5f4afda 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -469,7 +469,8 @@ intel_driver_enlarge_stack_size(struct intel_driver *drv, 
int32_t *stack_size)
 {
 if (drv->gen_ver == 75)
   *stack_size = *stack_size * 4;
-else

[Beignet] [PATCH] add bxt with pciid 0x1A84

2016-09-28 Thread Guo Yejun
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 src/cl_device_data.h | 4 +++-
 src/cl_device_id.c   | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index 30366ea..a237b0e 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -299,10 +299,12 @@
 /* BXT */
 #define PCI_CHIP_BROXTON_P 0x5A84   /* Intel(R) BXT-P for mobile desktop */
 #define PCI_CHIP_BROXTON_1 0x5A85
+#define PCI_CHIP_BROXTON_2 0x1A84
 
 #define IS_BROXTON(devid)   \
   (devid == PCI_CHIP_BROXTON_P ||   \
-   devid == PCI_CHIP_BROXTON_1)
+   devid == PCI_CHIP_BROXTON_1 ||   \
+   devid == PCI_CHIP_BROXTON_2)
 
 #define PCI_CHIP_KABYLAKE_ULT_GT1 0x5906
 #define PCI_CHIP_KABYLAKE_ULT_GT2 0x5916
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index df3355c..02aad96 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -617,6 +617,8 @@ skl_gt4_break:
 
 case PCI_CHIP_BROXTON_P:
   DECL_INFO_STRING(bxt_break, intel_bxt_device, name, "Intel(R) HD 
Graphics Broxton-P");
+case PCI_CHIP_BROXTON_2:
+  DECL_INFO_STRING(bxt_break, intel_bxt_device, name, "Intel(R) HD 
Graphics Broxton 2");
 bxt_break:
   intel_bxt_device.device_id = device_id;
   intel_bxt_device.platform = cl_get_platform_default();
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] enlarge scratch size for bxt 0x5a85

2016-09-20 Thread Guo Yejun
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 src/intel/intel_gpgpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 3314ab4..f8eac56 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1537,8 +1537,8 @@ intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t 
per_thread_size)
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
   drm_intel_bo* old = gpgpu->scratch_b.bo;
   uint32_t total = per_thread_size * gpgpu->max_threads;
-  /* Per Bspec, scratch should 2X the desired size, otherwise luxmark may hang 
*/
-  if (IS_HASWELL(gpgpu->drv->device_id) || 
IS_CHERRYVIEW(gpgpu->drv->device_id))
+  /* Per Bspec, scratch should 2X the desired size when EU index is not 
continuous */
+  if (IS_HASWELL(gpgpu->drv->device_id) || 
IS_CHERRYVIEW(gpgpu->drv->device_id) || PCI_CHIP_BROXTON_1 == 
gpgpu->drv->device_id)
   total *= 2;
 
   gpgpu->per_thread_scratch = per_thread_size;
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] enlarge stack size for chv since its EU might be masked

2016-09-12 Thread Guo Yejun
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 src/intel/intel_driver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index ec2fb31..0766ca3 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -468,7 +468,7 @@ intel_driver_enlarge_stack_size(struct intel_driver *drv, 
int32_t *stack_size)
 {
 if (drv->gen_ver == 75)
   *stack_size = *stack_size * 4;
-else if (drv->device_id == PCI_CHIP_BROXTON_1)
+else if (drv->device_id == PCI_CHIP_BROXTON_1 || 
IS_CHERRYVIEW(drv->device_id))
   *stack_size = *stack_size * 2;
 }
 
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] add another broxton pciid 0x5A85

2016-09-11 Thread Guo, Yejun
to be precise, I'll separate the patch into two, one for bxt, and another one 
for chv.

-Original Message-
From: Pan, Xiuli 
Sent: Monday, September 12, 2016 10:57 AM
To: Guo, Yejun; beignet@lists.freedesktop.org
Subject: RE: [Beignet] [PATCH] add another broxton pciid 0x5A85

I think the pciid patch can go first and then the stack size one, for the stack 
size patch will influence not only BXT but also CHV.

-Original Message-
From: Guo, Yejun
Sent: Monday, September 12, 2016 10:39 AM
To: Pan, Xiuli <xiuli@intel.com>; beignet@lists.freedesktop.org
Subject: RE: [Beignet] [PATCH] add another broxton pciid 0x5A85

thanks, and the stack size bug needs to be fixed to get a 100% pass rate of utest 
for this pciid; that's the reason I merged them into one patch.

-Original Message-
From: Pan, Xiuli
Sent: Monday, September 12, 2016 10:36 AM
To: Guo, Yejun; beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: RE: [Beignet] [PATCH] add another broxton pciid 0x5A85

I think this patch can be split into two patches.
One adds this pciid, and the other is for the stack size bug.
The rest LGTM.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Guo 
Yejun
Sent: Saturday, September 10, 2016 8:49 AM
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun <yejun@intel.com>
Subject: [Beignet] [PATCH] add another broxton pciid 0x5A85

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 src/cl_command_queue_gen7.c | 11 ++-
 src/cl_device_data.h|  4 +++-
 src/cl_device_id.c  | 23 ++-
 src/cl_driver.h |  4 
 src/cl_driver_defs.c|  1 +
 src/intel/intel_driver.c| 10 ++
 6 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 
6a9cf1f..b6a5920 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -272,12 +272,13 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   assert(offset >= 0);
   stack_sz *= interp_kernel_get_simd_width(ker->opaque);
   stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
-  /* Because HSW calc stack offset per thread is relative with half slice, when
- thread schedule in half slice is not balance, would out of bound. Because
- the max half slice is 4 in GT4, multiply stack size with 4 for safe.
+
+  /* for some hardware, part of EUs are disabled with EU id reserved,
+   * it makes the active EU id larger than count of EUs within a subslice,
+   * need to enlarge stack size for such case to avoid out of range.
*/
-  if(cl_driver_get_ver(ctx->drv) == 75)
-stack_sz *= 4;
+  cl_driver_enlarge_stack_size(ctx->drv, _sz);
+
   cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);  }
 
diff --git a/src/cl_device_data.h b/src/cl_device_data.h index f680219..30366ea 
100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -298,9 +298,11 @@
 
 /* BXT */
 #define PCI_CHIP_BROXTON_P 0x5A84   /* Intel(R) BXT-P for mobile desktop */
+#define PCI_CHIP_BROXTON_1 0x5A85
 
 #define IS_BROXTON(devid)   \
-  (devid == PCI_CHIP_BROXTON_P)
+  (devid == PCI_CHIP_BROXTON_P ||   \
+   devid == PCI_CHIP_BROXTON_1)
 
 #define PCI_CHIP_KABYLAKE_ULT_GT1 0x5906
 #define PCI_CHIP_KABYLAKE_ULT_GT2 0x5916
diff --git a/src/cl_device_id.c b/src/cl_device_id.c index 34c182c..ce340c1 
100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -195,6 +195,16 @@ static struct _cl_device_id intel_bxt_device = {  #include 
"cl_gen9_device.h"
 };
 
+static struct _cl_device_id intel_bxt1_device = {
+  .max_compute_unit = 12,
+  .max_thread_per_unit = 6,
+  .sub_slice_count = 2,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
 static struct _cl_device_id intel_kbl_gt1_device = {
   .max_compute_unit = 12,
   .max_thread_per_unit = 7,
@@ -615,6 +625,16 @@ bxt_break:
   cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
   break;
 
+case PCI_CHIP_BROXTON_1:
+  DECL_INFO_STRING(bxt1_break, intel_bxt1_device, name, "Intel(R) 
+HD Graphics Broxton 1");
+bxt1_break:
+  intel_bxt1_device.device_id = device_id;
+  intel_bxt1_device.platform = cl_get_platform_default();
+  ret = _bxt1_device;
+  cl_intel_platform_get_default_extension(ret);
+  cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+  break;
+
 case PCI_CHIP_KABYLAKE_ULT_GT1:
   DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD 
Graphics Kabylake ULT GT1");
 case PCI_CHIP_KABYLAKE_DT_GT1:
@@ -931,6 +951,7 @@ LOCAL cl_bool is_gen_device(cl_device_id device) {
  device == _skl_gt3_device ||
  device == _skl_gt4_device ||
  device == _bxt_device ||
+ device == _bxt1_device   

[Beignet] [PATCH] add another broxton pciid 0x5A85

2016-09-09 Thread Guo Yejun
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 src/cl_command_queue_gen7.c | 11 ++-
 src/cl_device_data.h|  4 +++-
 src/cl_device_id.c  | 23 ++-
 src/cl_driver.h |  4 
 src/cl_driver_defs.c|  1 +
 src/intel/intel_driver.c| 10 ++
 6 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 6a9cf1f..b6a5920 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -272,12 +272,13 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   assert(offset >= 0);
   stack_sz *= interp_kernel_get_simd_width(ker->opaque);
   stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
-  /* Because HSW calc stack offset per thread is relative with half slice, when
- thread schedule in half slice is not balance, would out of bound. Because
- the max half slice is 4 in GT4, multiply stack size with 4 for safe.
+
+  /* for some hardware, part of EUs are disabled with EU id reserved,
+   * it makes the active EU id larger than count of EUs within a subslice,
+   * need to enlarge stack size for such case to avoid out of range.
*/
-  if(cl_driver_get_ver(ctx->drv) == 75)
-stack_sz *= 4;
+  cl_driver_enlarge_stack_size(ctx->drv, _sz);
+
   cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
 }
 
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index f680219..30366ea 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -298,9 +298,11 @@
 
 /* BXT */
 #define PCI_CHIP_BROXTON_P 0x5A84   /* Intel(R) BXT-P for mobile desktop */
+#define PCI_CHIP_BROXTON_1 0x5A85
 
 #define IS_BROXTON(devid)   \
-  (devid == PCI_CHIP_BROXTON_P)
+  (devid == PCI_CHIP_BROXTON_P ||   \
+   devid == PCI_CHIP_BROXTON_1)
 
 #define PCI_CHIP_KABYLAKE_ULT_GT1 0x5906
 #define PCI_CHIP_KABYLAKE_ULT_GT2 0x5916
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 34c182c..ce340c1 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -195,6 +195,16 @@ static struct _cl_device_id intel_bxt_device = {
 #include "cl_gen9_device.h"
 };
 
+static struct _cl_device_id intel_bxt1_device = {
+  .max_compute_unit = 12,
+  .max_thread_per_unit = 6,
+  .sub_slice_count = 2,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
 static struct _cl_device_id intel_kbl_gt1_device = {
   .max_compute_unit = 12,
   .max_thread_per_unit = 7,
@@ -615,6 +625,16 @@ bxt_break:
   cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
   break;
 
+case PCI_CHIP_BROXTON_1:
+  DECL_INFO_STRING(bxt1_break, intel_bxt1_device, name, "Intel(R) HD 
Graphics Broxton 1");
+bxt1_break:
+  intel_bxt1_device.device_id = device_id;
+  intel_bxt1_device.platform = cl_get_platform_default();
+  ret = _bxt1_device;
+  cl_intel_platform_get_default_extension(ret);
+  cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+  break;
+
 case PCI_CHIP_KABYLAKE_ULT_GT1:
   DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD 
Graphics Kabylake ULT GT1");
 case PCI_CHIP_KABYLAKE_DT_GT1:
@@ -931,6 +951,7 @@ LOCAL cl_bool is_gen_device(cl_device_id device) {
  device == _skl_gt3_device ||
  device == _skl_gt4_device ||
  device == _bxt_device ||
+ device == _bxt1_device||
  device == _kbl_gt1_device ||
  device == _kbl_gt15_device ||
  device == _kbl_gt2_device ||
@@ -1074,7 +1095,7 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
 *ver = 8;
   } else if (device == _skl_gt1_device || device == _skl_gt2_device
 || device == _skl_gt3_device || device == _skl_gt4_device
-|| device == _bxt_device || device == _kbl_gt1_device
+|| device == _bxt_device || device == _bxt1_device || 
device == _kbl_gt1_device
 || device == _kbl_gt2_device || device == _kbl_gt3_device
 || device == _kbl_gt4_device || device == 
_kbl_gt15_device) {
 *ver = 9;
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 16730db..584be9d 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -51,6 +51,10 @@ extern cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr;
 typedef uint32_t (cl_driver_get_ver_cb)(cl_driver);
 extern cl_driver_get_ver_cb *cl_driver_get_ver;
 
+/* enlarge stack size from the driver */
+typedef void (cl_driver_enlarge_stack_size_cb)(cl_driver, int32_t*);
+extern cl_driver_enlarge_stack_size_cb *cl_driver_enlarge_stack_size;
+
 typedef enum cl_self_test_res{
   SELF_TEST_PASS = 0,
   SELF_TEST_SLM_FAIL  = 1,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 31176a4..ea4e90a 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -25,6 +25,7 @@ LOCAL cl_driver_new_cb *cl_driver_new 

Re: [Beignet] [PATCH] Backend: Increase stack size for CHV device

2016-09-09 Thread Guo, Yejun
After syncing with Xiuli, I'll merge it into another patch and send it out later.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli 
Pan
Sent: Friday, September 09, 2016 1:38 PM
To: beignet@lists.freedesktop.org
Cc: Pan, Xiuli
Subject: [Beignet] [PATCH] Backend: Increase stack size for CHV device

From: Pan Xiuli 

CHV devices have two kinds of EU numbers, which makes the stack pointer calculation 
wrong. Double the stack size to be safe.

Signed-off-by: Pan Xiuli 
---
 src/cl_command_queue_gen7.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 
6a9cf1f..d1186c2 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -25,6 +25,7 @@
 #include "cl_mem.h"
 #include "cl_utils.h"
 #include "cl_alloc.h"
+#include "cl_device_data.h"
 
 #include 
 #include 
@@ -278,6 +279,9 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
*/
   if(cl_driver_get_ver(ctx->drv) == 75)
 stack_sz *= 4;
+  /* Because CHV has two kind of EU numbers, multiply stack size with 2 
+ for safe. */
+  if(IS_CHERRYVIEW(device->device_id))
+stack_sz *= 2;
   cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);  }
 
--
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH V2] fix the condition to check if there are built-in kernels

2016-08-21 Thread Guo Yejun
an empty string is returned if no built-in kernels are supported
by the device, and so the returned size is 1, not 0.

v2: output "Skip!" to make the result clear
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/builtin_kernel_max_global_size.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/utests/builtin_kernel_max_global_size.cpp 
b/utests/builtin_kernel_max_global_size.cpp
index d3e8373..ad9c028 100644
--- a/utests/builtin_kernel_max_global_size.cpp
+++ b/utests/builtin_kernel_max_global_size.cpp
@@ -10,8 +10,10 @@ void builtin_kernel_max_global_size(void)
 
 
   OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, 
_in_kernels_size);
-  if(built_in_kernels_size == 0)
+  if(built_in_kernels_size <= 1) { //the size of empty string is 1
+printf(" no built in kernel, Skip!");
 return;
+  }
 
   built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) 
);
   OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 
built_in_kernels_size, (void*)built_in_kernel_names, _sz);
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] use OCL_MAP_BUFFER_GTT to map climage

2016-08-19 Thread Guo Yejun
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/runtime_cmrt.cpp | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/utests/runtime_cmrt.cpp b/utests/runtime_cmrt.cpp
index 837f09a..92bd368 100644
--- a/utests/runtime_cmrt.cpp
+++ b/utests/runtime_cmrt.cpp
@@ -236,8 +236,8 @@ void runtime_cmrt(void)
   OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
   OCL_CREATE_IMAGE(buf[1], 0, , , NULL);
 
-  OCL_MAP_BUFFER(0);
-  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
   uint8_t* src = (uint8_t*)buf_data[0];
   uint8_t* dst = (uint8_t*)buf_data[1];
   for (uint32_t j = 0; j < h; ++j)
@@ -245,8 +245,8 @@ void runtime_cmrt(void)
   src[j * w * 4 + i] = i;
   dst[j * w * 4 + i] = 0;
 }
-  OCL_UNMAP_BUFFER(0);
-  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
 
   unsigned int d = 3;
   OCL_SET_ARG(0, sizeof(cl_mem), [0]);
@@ -259,16 +259,16 @@ void runtime_cmrt(void)
   //if kernel uses cm_linear_global_id, locals must be not NULL to invoke 
pCmQueue->EnqueueWithGroup
   OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, globals, NULL, 0, 
NULL, NULL);
 
-  OCL_MAP_BUFFER(0);
-  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
   src = (uint8_t*)buf_data[0];
   dst = (uint8_t*)buf_data[1];
   for (uint32_t j = 0; j < h; ++j)
 for (uint32_t i = 0; i < w*4; i++) {
   OCL_ASSERT(src[j * w * 4 + i] / d == dst[j * w * 4 + i]);
 }
-  OCL_UNMAP_BUFFER(0);
-  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
 }
 
 MAKE_UTEST_FROM_FUNCTION(runtime_cmrt);
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] fix the condition to check if there are built-in kernels

2016-08-19 Thread Guo Yejun
an empty string is returned if no built-in kernels are supported
by the device, and so the returned size is 1, not 0.

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/builtin_kernel_max_global_size.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utests/builtin_kernel_max_global_size.cpp 
b/utests/builtin_kernel_max_global_size.cpp
index d3e8373..7580572 100644
--- a/utests/builtin_kernel_max_global_size.cpp
+++ b/utests/builtin_kernel_max_global_size.cpp
@@ -10,7 +10,7 @@ void builtin_kernel_max_global_size(void)
 
 
   OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, 
_in_kernels_size);
-  if(built_in_kernels_size == 0)
+  if(built_in_kernels_size <= 1)  //the size of empty string is 1
 return;
 
   built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) 
);
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] add help for 'make package'

2016-08-09 Thread Guo Yejun
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 docs/Beignet.mdwn | 4 
 1 file changed, 4 insertions(+)

diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index 9345bfa..914cbce 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -104,6 +104,10 @@ your library installation directory.
 It installs the OCL icd vendor files to /etc/OpenCL/vendors, if the system 
support ICD.
 - intel-beignet.icd
 
+`> make package`
+
+It packages the driver binaries, you may copy the package to another 
machine with simillar system.
+
 How to run
 --
 
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] only check beignet special test cases on beignet

2016-08-02 Thread Guo Yejun
with other implementations, do not check the results of the
beignet-specific test cases

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/get_cl_info.cpp | 85 --
 1 file changed, 47 insertions(+), 38 deletions(-)

diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
index 801d0c2..42edf1a 100644
--- a/utests/get_cl_info.cpp
+++ b/utests/get_cl_info.cpp
@@ -413,13 +413,15 @@ void get_build_llvm_info(void)
 }
 }
 
-//Test is successful if the backend created the file
-if( (fp = fopen(llvm_file, "r")) == NULL) {
-std::cout << "LLVM file creation.. FAILED";
-OCL_ASSERT(0);
-} else {
-fclose(fp);
-std::cout << "LLVM file created.. SUCCESS";
+if (cl_check_beignet()) {
+   //Test is successful if the backend created the file
+   if( (fp = fopen(llvm_file, "r")) == NULL) {
+   std::cout << "LLVM file creation.. FAILED";
+   OCL_ASSERT(0);
+   } else {
+   fclose(fp);
+   std::cout << "LLVM file created.. SUCCESS";
+   }
 }
 }
 
@@ -466,8 +468,8 @@ void compile_spir_binary(void)
 }
 }
 
-//Test is successful if the backend created the file
 if (cl_check_beignet()) {
+  //Test is successful if the backend created the file
   if( (fp = fopen(spir_file, "r")) == NULL) {
 std::cout << "SPIR file creation.. FAILED";
 OCL_ASSERT(0);
@@ -517,13 +519,15 @@ void build_spir_binary(void)
 }
 }
 
-//Test is successful if the backend created the file
-if( (fp = fopen(spir_file, "r")) == NULL) {
-std::cout << "SPIR file creation.. FAILED";
-OCL_ASSERT(0);
-} else {
-fclose(fp);
-std::cout << "SPIR file created.. SUCCESS";
+if (cl_check_beignet()) {
+  //Test is successful if the backend created the file
+  if( (fp = fopen(spir_file, "r")) == NULL) {
+  std::cout << "SPIR file creation.. FAILED";
+  OCL_ASSERT(0);
+  } else {
+  fclose(fp);
+  std::cout << "SPIR file created.. SUCCESS";
+  }
 }
 }
 MAKE_UTEST_FROM_FUNCTION(build_spir_binary);
@@ -567,13 +571,15 @@ void get_build_asm_info(void)
 }
 }
 
-//Test is successful if the backend created the file
-if( (fp = fopen(asm_file, "r")) == NULL) {
-std::cout << "ASM file creation.. FAILED";
-OCL_ASSERT(0);
-} else {
-fclose(fp);
-std::cout << "ASM file created.. SUCCESS";
+if (cl_check_beignet()) {
+  //Test is successful if the backend created the file
+  if( (fp = fopen(asm_file, "r")) == NULL) {
+  std::cout << "ASM file creation.. FAILED";
+  OCL_ASSERT(0);
+  } else {
+  fclose(fp);
+  std::cout << "ASM file created.. SUCCESS";
+  }
 }
 }
 
@@ -614,15 +620,16 @@ void get_compile_llvm_info(void)
 }
 }
 
-//Test is successful if the backend created the file
-if( (fp = fopen(llvm_file, "r")) == NULL) {
-std::cout << "LLVM file creation.. FAILED";
-OCL_ASSERT(0);
-} else {
-fclose(fp);
-std::cout << "LLVM file created.. SUCCESS";
+if (cl_check_beignet()) {
+  //Test is successful if the backend created the file
+  if( (fp = fopen(llvm_file, "r")) == NULL) {
+  std::cout << "LLVM file creation.. FAILED";
+  OCL_ASSERT(0);
+  } else {
+  fclose(fp);
+  std::cout << "LLVM file created.. SUCCESS";
+  }
 }
-
 }
 
 MAKE_UTEST_FROM_FUNCTION(get_compile_llvm_info);
@@ -662,14 +669,16 @@ void get_link_asm_info(void)
 }
 }
 
-//Test is successful if the backend created the file
-if( (fp = fopen(asm_file, "r")) == NULL) {
-std::cout << "ASM file creation.. FAILED";
-OCL_ASSERT(0);
-} else {
-fclose(fp);
-std::cout << "ASM file created.. SUCCESS";
-} 
+if (cl_check_beignet()) {
+  //Test is successful if the backend created the file
+  if( (fp = fopen(asm_file, "r")) == NULL) {
+  std::cout << "ASM file creation.. FAILED";
+  OCL_ASSERT(0);
+  } else {
+  fclose(fp);
+  std::cout << "ASM file created.. SUCCESS";
+  }
+}
 }
 
 MAKE_UTEST_FROM_FUNCTION(get_link_asm_info);
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] use different pointer alignment for different implementation

2016-08-01 Thread Guo Yejun
beignet only requires 64-byte alignment while other implementations
might require 4096-byte alignment.

and also change function cl_check_beignet for better output message.

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/compiler_time_stamp.cpp|  4 +++-
 utests/runtime_use_host_ptr_image.cpp |  8 ++--
 utests/utest_helper.cpp   | 10 ++
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/utests/compiler_time_stamp.cpp b/utests/compiler_time_stamp.cpp
index e376522..43165c1 100644
--- a/utests/compiler_time_stamp.cpp
+++ b/utests/compiler_time_stamp.cpp
@@ -16,8 +16,10 @@ static void cpu(int global_id, int *src, int *dst) {
 
 void compiler_time_stamp(void)
 {
-  if (!cl_check_beignet())
+  if (!cl_check_beignet()) {
+printf("Not beignet device , Skip!");
 return;
+  }
 
   const size_t n = 16;
   int cpu_dst[16], cpu_src[16];
diff --git a/utests/runtime_use_host_ptr_image.cpp 
b/utests/runtime_use_host_ptr_image.cpp
index 2de9194..4a30e89 100644
--- a/utests/runtime_use_host_ptr_image.cpp
+++ b/utests/runtime_use_host_ptr_image.cpp
@@ -18,8 +18,12 @@ static void runtime_use_host_ptr_image(void)
   desc.image_width = w;
   desc.image_height = h;
 
+  size_t alignment = 4096;  //page size
+  if (cl_check_beignet())
+alignment = 64; //cacheline size, beignet has loose limitaiont to 
enable userptr
+
   //src image
-  int ret = posix_memalign(_data[0], 64, sizeof(uint32_t) * w * h);
+  int ret = posix_memalign(_data[0], alignment, sizeof(uint32_t) * w * h);
   OCL_ASSERT(ret == 0);
   for (size_t i = 0; i < w*h; ++i)
 ((uint32_t*)buf_data[0])[i] = i;
@@ -27,7 +31,7 @@ static void runtime_use_host_ptr_image(void)
   OCL_CREATE_IMAGE(buf[0], CL_MEM_USE_HOST_PTR, , , buf_data[0]);
 
   //dst image
-  ret = posix_memalign(_data[1], 64, sizeof(uint32_t) * w * h);
+  ret = posix_memalign(_data[1], alignment, sizeof(uint32_t) * w * h);
   OCL_ASSERT(ret == 0);
   for (size_t i = 0; i < w*h; ++i)
 ((uint32_t*)buf_data[1])[i] = 0;
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index da4cfbf..3388d9f 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -860,7 +860,6 @@ int cl_check_beignet(void)
   size_t ret_sz;
   OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_VERSION, 0, 0, 
_value_size);
   if(param_value_size == 0) {
-printf("Not beignet device , Skip!");
 return 0;
   }
   char* device_version_str = (char* )malloc(param_value_size * sizeof(char) );
@@ -869,7 +868,6 @@ int cl_check_beignet(void)
 
   if(!strstr(device_version_str, "beignet")) {
 free(device_version_str);
-printf("Not beignet device , Skip!");
 return 0;
   }
   free(device_version_str);
@@ -906,8 +904,10 @@ int cl_check_ocl20(void)
 if(cl_check_beignet()) {
   printf("Beignet extension test!");
   return 1;
+} else {
+  printf("Not beignet device , Skip!");
+  return 0;
 }
-return 0;
   }
   char* device_version_str = (char* )malloc(param_value_size * sizeof(char) );
   OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_OPENCL_C_VERSION, 
param_value_size, (void*)device_version_str, _sz);
@@ -919,8 +919,10 @@ int cl_check_ocl20(void)
 if(cl_check_beignet()) {
   printf("Beignet extension test!");
   return 1;
+} else {
+  printf("Not beignet device , Skip!");
+  return 0;
 }
-return 0;
   }
   free(device_version_str);
   return 1;
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] remove "\n" in output message when test is failed

2016-08-01 Thread Guo Yejun
otherwise, "[FAILED]" and the test name are not on the same line.

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/builtin_lgamma.cpp   | 2 +-
 utests/builtin_lgamma_r.cpp | 2 +-
 utests/builtin_tgamma.cpp   | 2 +-
 utests/image_1D_buffer.cpp  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/utests/builtin_lgamma.cpp b/utests/builtin_lgamma.cpp
index 876699a..57945de 100644
--- a/utests/builtin_lgamma.cpp
+++ b/utests/builtin_lgamma.cpp
@@ -29,7 +29,7 @@ void builtin_lgamma(void) {
float cpu = lgamma(src[i]);
float gpu = dst[i];
if (fabsf(cpu - gpu) >= 1e-3) {
-   printf("%f %f %f\n", src[i], cpu, gpu);
+   printf("%f %f %f", src[i], cpu, gpu);
OCL_ASSERT(0);
}
}
diff --git a/utests/builtin_lgamma_r.cpp b/utests/builtin_lgamma_r.cpp
index b6e5d0e..0258767 100644
--- a/utests/builtin_lgamma_r.cpp
+++ b/utests/builtin_lgamma_r.cpp
@@ -34,7 +34,7 @@ void builtin_lgamma_r(void) {
int gpu_signp = ((int*)buf_data[2])[i];
float gpu = dst[i];
if (cpu_signp != gpu_signp || fabsf(cpu - gpu) >= 1e-3) 
{
-   printf("%f %f %f\n", src[i], cpu, gpu);
+   printf("%f %f %f", src[i], cpu, gpu);
OCL_ASSERT(0);
}
}
diff --git a/utests/builtin_tgamma.cpp b/utests/builtin_tgamma.cpp
index 204f49e..eb6bdd7 100644
--- a/utests/builtin_tgamma.cpp
+++ b/utests/builtin_tgamma.cpp
@@ -46,7 +46,7 @@ void builtin_tgamma(void)
   if (std::isinf(cpu)) {
 OCL_ASSERT(std::isinf(dst[i]));
   } else if (fabsf(cpu - dst[i]) >= cl_FLT_ULP(cpu) * ULPSIZE_FACTOR) {
-printf("%f %f %f\n", src[i], cpu, dst[i]);
+printf("%f %f %f", src[i], cpu, dst[i]);
 OCL_ASSERT(0);
   }
 }
diff --git a/utests/image_1D_buffer.cpp b/utests/image_1D_buffer.cpp
index f2eb7a3..66eb6e7 100644
--- a/utests/image_1D_buffer.cpp
+++ b/utests/image_1D_buffer.cpp
@@ -55,7 +55,7 @@ void image_1D_buffer(void)
   OCL_MAP_BUFFER(1);
   for (uint32_t i = 0; i < buffer_sz; i++) {
 if (((uint32_t*)buf_data[1])[i] != ((uint32_t*)buf_data[0])[i])
-  printf("i %d expected %x got %x \n", i, ((uint32_t*)buf_data[0])[i], 
((uint32_t*)buf_data[1])[i]);
+  printf("i %d expected %x got %x", i, ((uint32_t*)buf_data[0])[i], 
((uint32_t*)buf_data[1])[i]);
 OCL_ASSERT(((uint32_t*)buf_data[1])[i] == ((uint32_t*)buf_data[0])[i]);
   }
   OCL_UNMAP_BUFFER(0);
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] utests: only check -dump-spir-binary on beignet implementation

2016-07-27 Thread Guo Yejun
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/get_cl_info.cpp | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
index bdd7e0c..bd557c3 100644
--- a/utests/get_cl_info.cpp
+++ b/utests/get_cl_info.cpp
@@ -465,12 +465,14 @@ void compile_spir_binary(void)
 }
 
 //Test is successful if the backend created the file
-if( (fp = fopen(spir_file, "r")) == NULL) {
+if (cl_check_beignet()) {
+  if( (fp = fopen(spir_file, "r")) == NULL) {
 std::cout << "SPIR file creation.. FAILED";
 OCL_ASSERT(0);
-} else {
+  } else {
 fclose(fp);
 std::cout << "SPIR file created.. SUCCESS";
+  }
 }
 }
 MAKE_UTEST_FROM_FUNCTION(compile_spir_binary);
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] utests: change tolerance check for lgamma

2016-07-27 Thread Guo Yejun
according to spec section 7.4: The ULP values for built-in math
functions lgamma and lgamma_r is currently undefined, let's use
16*ULP for lgamma result.

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/utest_math_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utests/utest_math_gen.py b/utests/utest_math_gen.py
index ac70979..a4bfd51 100755
--- a/utests/utest_math_gen.py
+++ b/utests/utest_math_gen.py
@@ -360,7 +360,7 @@ static float ldexp(float x, int y){
   lgamma_input_values = base_input_values
   lgamma_input_type = ['float','float2','float4','float8','float16']
   lgamma_output_type = ['float','float2','float4','float8','float16']
-  lgammaUtests = 
func('lgamma','lgamma',[lgamma_input_type],lgamma_output_type,[lgamma_input_values],'4
 * FLT_ULP')
+  lgammaUtests = 
func('lgamma','lgamma',[lgamma_input_type],lgamma_output_type,[lgamma_input_values],'16
 * FLT_ULP')
 
   # gentype log(gentype)
   log_input_values = base_input_values
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH V3] Utset: Add check for workgroup tests

2016-07-26 Thread Guo, Yejun
looks good to me, thanks.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli 
Pan
Sent: Wednesday, July 27, 2016 10:11 AM
To: beignet@lists.freedesktop.org
Cc: Pan, Xiuli
Subject: [Beignet] [PATCH V3] Utset: Add check for workgroup tests

From: Pan Xiuli 

Workgroup is also an OpenCL 2.0 feature, add check for these cases.
V2: Move check before kernel build
V3: Remove useless check

Signed-off-by: Pan Xiuli 
---
 utests/compiler_workgroup_broadcast.cpp  | 12 ++
 utests/compiler_workgroup_reduce.cpp | 34 
 utests/compiler_workgroup_scan_exclusive.cpp | 30 
 utests/compiler_workgroup_scan_inclusive.cpp | 30 
 4 files changed, 106 insertions(+)

diff --git a/utests/compiler_workgroup_broadcast.cpp 
b/utests/compiler_workgroup_broadcast.cpp
index fd2228c..a323fb6 100644
--- a/utests/compiler_workgroup_broadcast.cpp
+++ b/utests/compiler_workgroup_broadcast.cpp
@@ -242,6 +242,8 @@ static void workgroup_generic(WG_BROADCAST wg_broadcast,
  */
 void compiler_workgroup_broadcast_1D_int(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_int *input = NULL;
   cl_int *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
@@ -252,6 +254,8 @@ 
MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_broadcast_1D_int);
 
 void compiler_workgroup_broadcast_1D_long(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_long *input = NULL;
   cl_long *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
@@ -265,6 +269,8 @@ 
MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_broadcast_1D_long);
  */
 void compiler_workgroup_broadcast_2D_int(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_int *input = NULL;
   cl_int *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
@@ -275,6 +281,8 @@ 
MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_broadcast_2D_int);
 
 void compiler_workgroup_broadcast_2D_long(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_long *input = NULL;
   cl_long *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
@@ -289,6 +297,8 @@ 
MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_broadcast_2D_long);
  */
 void compiler_workgroup_broadcast_3D_int(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_int *input = NULL;
   cl_int *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
@@ -299,6 +309,8 @@ 
MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_broadcast_3D_int);
 
 void compiler_workgroup_broadcast_3D_long(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_long *input = NULL;
   cl_long *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
diff --git a/utests/compiler_workgroup_reduce.cpp 
b/utests/compiler_workgroup_reduce.cpp
index 21bcfa2..1cf4b08 100644
--- a/utests/compiler_workgroup_reduce.cpp
+++ b/utests/compiler_workgroup_reduce.cpp
@@ -219,6 +219,8 @@ static void workgroup_generic(WG_FUNCTION wg_func,
  */
 void compiler_workgroup_any(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_int *input = NULL;
   cl_int *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
@@ -228,6 +230,8 @@ void compiler_workgroup_any(void)
 MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_any);
 void compiler_workgroup_all(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_int *input = NULL;
   cl_int *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
@@ -240,6 +244,8 @@ MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_all);
  */
 void compiler_workgroup_reduce_add_int(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_int *input = NULL;
   cl_int *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
@@ -249,6 +255,8 @@ void compiler_workgroup_reduce_add_int(void)
 MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_int);
 void compiler_workgroup_reduce_add_uint(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
@@ -258,6 +266,8 @@ void compiler_workgroup_reduce_add_uint(void)
 MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_uint);
 void compiler_workgroup_reduce_add_long(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_long *input = NULL;
   cl_long *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
@@ -267,6 +277,8 @@ void compiler_workgroup_reduce_add_long(void)
 MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_reduce_add_long);
 void compiler_workgroup_reduce_add_ulong(void)
 {
+  if (!cl_check_ocl20())
+return;
   cl_ulong *input = NULL;
   cl_ulong *expected = NULL;
   OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
@@ -276,6 +288,8 @@ void compiler_workgroup_reduce_add_ulong(void)
 

Re: [Beignet] [PATCH] Runtime: fix a userptr bug.

2016-07-26 Thread Guo, Yejun
LGTM, thanks.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Yang 
Rong
Sent: Tuesday, July 26, 2016 4:50 PM
To: beignet@lists.freedesktop.org
Cc: Yang, Rong R
Subject: [Beignet] [PATCH] Runtime: fix a userptr bug.

Userptr also requires the size to be cacheline-aligned; otherwise, the
remaining part of the last cacheline may be allocated on the CPU side, and
when the GPU flushes that cacheline to memory it will overwrite the value
changed by the CPU.

Signed-off-by: Yang Rong 
---
 src/cl_mem.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index 229bc0a..9e796ef 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -295,7 +295,8 @@ cl_mem_allocate(enum cl_mem_type type,
   assert(host_ptr != NULL);
   /* userptr not support tiling */
   if (!is_tiled) {
-if (ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned 
long)host_ptr) {
+if ((ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned 
long)host_ptr) &&
+(ALIGN((unsigned long)sz, cacheline_size) == (unsigned 
long)sz)) {
   void* aligned_host_ptr = (void*)(((unsigned long)host_ptr) & 
(~(page_size - 1)));
   mem->offset = host_ptr - aligned_host_ptr;
   mem->is_userptr = 1;
@@ -851,6 +852,7 @@ _cl_mem_new_image(cl_context ctx,
 cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, 
sizeof(cacheline_size), _size, NULL);
 if (ALIGN((unsigned long)data, cacheline_size) == (unsigned long)data &&
 ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1)) == h &&
+ALIGN(h * pitch * depth, cacheline_size) == h * pitch * depth && //h 
and pitch should same as aligned_h and aligned_pitch if enable userptr
 ((image_type != CL_MEM_OBJECT_IMAGE3D && image_type != 
CL_MEM_OBJECT_IMAGE1D_ARRAY && image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) || 
pitch * h == slice_pitch)) {
   tiling = CL_NO_TILE;
   enableUserptr = 1;
-- 
2.1.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] utests: fix issue of CL_PROGRAM_BINARY_SIZES query

2016-07-26 Thread Guo Yejun
the return type of CL_PROGRAM_BINARY_SIZES query is unsigned char*[],
and param_value_size must be >= size of the return type, see spec 1.2
section 5.6.7 (P151)

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/get_cl_info.cpp | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
index bdd7e0c..afdf8ca 100644
--- a/utests/get_cl_info.cpp
+++ b/utests/get_cl_info.cpp
@@ -78,9 +78,11 @@ struct Info_Result {
 int *elt_size;
 int size;
 typedef char** type_value;
+int array_size;
 
 Info_Result(char **other, int *sz, int elt_num) {
-size = elt_num;
+array_size = elt_num;
+size = elt_num * sizeof(char**);
 
 ret = (char **)malloc(elt_num * sizeof(char *));
 memset(ret, 0, (elt_num * sizeof(char *)));
@@ -106,7 +108,7 @@ struct Info_Result {
 
 ~Info_Result(void) {
 int i = 0;
-for (; i < size; i++) {
+for (; i < array_size; i++) {
 if (refer[i])
 free(refer[i]);
 free(ret[i]);
@@ -122,7 +124,7 @@ struct Info_Result {
 
 bool check_result (void) {
 int i = 0;
-for (; i < size; i++) {
+for (; i < array_size; i++) {
 if (refer[i] && ::memcmp(ret[i], refer[i], elt_size[i]))
 return false;
 }
@@ -222,7 +224,7 @@ void get_program_info(void)
 expect_value = NO_STANDARD_REF;
 maps.insert(make_pair(CL_PROGRAM_BINARY_SIZES,
   (void *)(new 
Info_Result((size_t)expect_value;
-sz = 4096; //big enough?
+sz = 8192; //big enough?
 expect_source = NULL;
 maps.insert(make_pair(CL_PROGRAM_BINARIES,
   (void *)(new Info_Result(_source, 
, 1;
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] utest: do not check the padding componenet for 3-component vector data types

2016-07-24 Thread Guo Yejun
Per OpenCL 1.2 spec 6.1.5: For 3-component vector data types, the
size of the data type is 4 * sizeof(component). The spec does not
explicitly say what the padding component will be, so it should be
implementation-dependent; therefore, do not check the padding component
in the unit test.

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/compiler_abs.cpp  | 18 +-
 utests/compiler_abs_diff.cpp | 17 -
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
index 3f477a8..49b381d 100644
--- a/utests/compiler_abs.cpp
+++ b/utests/compiler_abs.cpp
@@ -119,6 +119,18 @@ template  static void dump_data 
(T* src, U* dst, int n)
 }
 }
 
+template 
+static void check_result(T* actual, T* expected)
+{
+OCL_ASSERT(*actual == *expected);
+}
+
+template 
+static void check_result(cl_vec<T, N>* actual, cl_vec<T, N>* expected)
+{
+OCL_ASSERT(!memcmp(actual, expected, sizeof(T)*N));
+}
+
 template  static void compiler_abs_with_type(void)
 {
 const size_t n = 16;
@@ -160,7 +172,11 @@ template  static void 
compiler_abs_with_type(void)
 
 //  dump_data(cpu_src, cpu_dst, n);
 
-OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+U* actual = (U*)buf_data[1];
+U* expected = cpu_dst;
+for (size_t i = 0; i < n; ++i)
+check_result([i], [i]);
+
 OCL_UNMAP_BUFFER(1);
 OCL_UNMAP_BUFFER(0);
 }
diff --git a/utests/compiler_abs_diff.cpp b/utests/compiler_abs_diff.cpp
index 15a1f90..1df7d47 100644
--- a/utests/compiler_abs_diff.cpp
+++ b/utests/compiler_abs_diff.cpp
@@ -127,6 +127,18 @@ template  static void dump_data 
(T* x, T* y, U* diff, in
 }
 }
 
+template 
+static void check_result(T* actual, T* expected)
+{
+OCL_ASSERT(*actual == *expected);
+}
+
+template 
+static void check_result(cl_vec<T, N>* actual, cl_vec<T, N>* expected)
+{
+OCL_ASSERT(!memcmp(actual, expected, sizeof(T)*N));
+}
+
 template  static void compiler_abs_diff_with_type(void)
 {
 const size_t n = 16;
@@ -174,7 +186,10 @@ template  static void 
compiler_abs_diff_with_type(void)
 
 //  dump_data(cpu_x, cpu_y, cpu_diff, n);
 
-OCL_ASSERT(!memcmp(buf_data[2], cpu_diff, sizeof(T) * n));
+U* actual = (U*)buf_data[2];
+U* expected = cpu_diff;
+for (size_t i = 0; i < n; ++i)
+check_result([i], [i]);
 
 OCL_UNMAP_BUFFER(0);
 OCL_UNMAP_BUFFER(1);
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] change the bahavior when writing to 3-component vector data types

2016-07-22 Thread Guo Yejun
Per OpenCL 1.2 spec 6.1.5: For 3-component vector data types,
the size of the data type is 4 * sizeof(component). The spec does
not explicitly say what the padding component will be, so it should
be implementation-dependent. So, let's fill the padding component
with zero instead of handling only the 3 components, and the number
of sends instructions will be decreased (from 4 sends to 1 sends for
the short3* case).

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 backend/src/llvm/llvm_gen_backend.cpp | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index 4122bdb..5135950 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -5180,13 +5180,6 @@ namespace gbe
   uint32_t elemNum = vectorType->getNumElements();
   GBE_ASSERTM(elemNum == 2 || elemNum == 3 || elemNum == 4 || elemNum == 8 
|| elemNum == 16,
   "Only vectors of 2,3,4,8 or 16 elements are supported");
-  // Per OPenCL 1.2 spec 6.1.5:
-  //   For 3-component vector data types, the size of the data type is 4 * 
sizeof(component).
-  // And the llvm does cast a type3 data to type4 for load/store 
instruction,
-  // so a 4 elements vector may only have 3 valid elements. We need to fix 
it to correct element
-  // count here.
-  if (elemNum == 4 && writer->regTranslator.isUndefConst(llvmValues, 3))
-  elemNum = 3;
 
   // The code is going to be fairly different from types to types (based on
   // size of each vector element)
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] do not use const pointer

2016-07-18 Thread Guo Yejun
Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/builtin_acos_asin.cpp | 8 ++--
 utests/builtin_exp.cpp   | 8 ++--
 utests/builtin_pow.cpp   | 7 ++-
 utests/utest_generator.py| 8 ++--
 4 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/utests/builtin_acos_asin.cpp b/utests/builtin_acos_asin.cpp
index 395460b..21fe461 100644
--- a/utests/builtin_acos_asin.cpp
+++ b/utests/builtin_acos_asin.cpp
@@ -10,7 +10,9 @@
   printf("\033[0m");\
 }
 
-const float input_data[] = {-30, -1, -0.92, -0.5, -0.09, 0, 0.09, 0.5, 0.92, 
1, 30};
+namespace {
+
+float input_data[] = {-30, -1, -0.92, -0.5, -0.09, 0, 0.09, 0.5, 0.92, 1, 30};
 const int count_input = sizeof(input_data) / sizeof(input_data[0]);
 const int max_function = 5;
 
@@ -44,7 +46,8 @@ static void builtin_acos_asin(void)
   locals[0] = 1;
 
   clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * 
sizeof(float), input_data, 0, NULL, NULL);
-  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), _function 
, 0, NULL, NULL);
+  int maxfunc = max_function;
+  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), , 0, 
NULL, NULL);
 
// Run the kernel
   OCL_NDRANGE( 1 );
@@ -85,3 +88,4 @@ static void builtin_acos_asin(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(builtin_acos_asin)
+}
diff --git a/utests/builtin_exp.cpp b/utests/builtin_exp.cpp
index 6d51c33..2c214bd 100644
--- a/utests/builtin_exp.cpp
+++ b/utests/builtin_exp.cpp
@@ -15,7 +15,9 @@
   printf("\033[0m");\
 }
 
-const float input_data[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, 80, -80, 
3.14, -3.14, -0.5, 0.5, 1, -1, 0.0 };
+namespace{
+
+float input_data[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, 80, -80, 3.14, 
-3.14, -0.5, 0.5, 1, -1, 0.0 };
 const int count_input = sizeof(input_data) / sizeof(input_data[0]);
 const int max_function = 5;
 
@@ -51,7 +53,8 @@ static void builtin_exp(void)
   locals[0] = 1;
 
   clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * 
sizeof(float), input_data, 0, NULL, NULL);
-  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), _function 
, 0, NULL, NULL);
+  int maxfunc = max_function;
+  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), , 0, 
NULL, NULL);
 
// Run the kernel
   OCL_NDRANGE( 1 );
@@ -100,3 +103,4 @@ static void builtin_exp(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(builtin_exp)
+}
diff --git a/utests/builtin_pow.cpp b/utests/builtin_pow.cpp
index 21fa895..1f6af0e 100644
--- a/utests/builtin_pow.cpp
+++ b/utests/builtin_pow.cpp
@@ -10,6 +10,9 @@
   printf( __VA_ARGS__ );\
   printf("\033[0m");\
 }
+
+namespace {
+
 const float ori_data[] = {-20.5, -1, -0.9, -0.01, 0, 0.01, 0.9, 1.0, 20.5};
 const int count_input_ori = sizeof(ori_data) / sizeof(ori_data[0]);
 const int count_input = count_input_ori * count_input_ori;
@@ -59,7 +62,8 @@ static void builtin_pow(void)
 
   clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * 
sizeof(float), input_data1, 0, NULL, NULL);
   clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, count_input * 
sizeof(float), input_data2, 0, NULL, NULL);
-  clEnqueueWriteBuffer( queue, buf[3], CL_TRUE, 0, sizeof(int), _function, 
0, NULL, NULL);
+  int maxfunc = max_function;
+  clEnqueueWriteBuffer( queue, buf[3], CL_TRUE, 0, sizeof(int), , 0, 
NULL, NULL);
 
// Run the kernel
   OCL_NDRANGE( 1 );
@@ -100,3 +104,4 @@ static void builtin_pow(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(builtin_pow)
+}
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
index bcb9ac4..2c02ad6 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -280,9 +280,9 @@ which can print more values and information to assist 
debuging the issue.
   vals = vals[0:128]
   break
 vals += self.values[i]
-  self.cpplines += [ "const %s input_data%d[] = {%s};" 
%(self.argtype(i,index),i+1,str(vals).strip('[]').replace('\'','')) ]
+  self.cpplines += [ "%s input_data%d[] = {%s};" 
%(self.argtype(i,index),i+1,str(vals).strip('[]').replace('\'','')) ]
 self.cpplines += [ "const int count_input = sizeof(input_data1) / 
sizeof(input_data1[0]);" ]
-self.cpplines += [ "const int vector = %s;\n"%(vlen) ]
+self.cpplines += [ "int vector = %s;\n"%(vlen) ]
 
 #Cpu Function
   def GenCpuCompilerMath(self,index):
@@ -457,6 +457,8 @@ static void %s_%s(void)
   #The head:
   self.cpplines += [self.Head]
 
+  self.cpplines += ["namespace {\n"]
+
   #Parameters:
   self.GenInputValues(i)
 
@@ -469,6 +471,8 @@ static void %s_%s(void)
   #utest function
   self.utestFunc(i)
 
+  self.cpplines += ["}\n"]
+
   #kernel cl
   self.genCL(i)
 
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] enlarge buf size to avoid memory out of range written by GPU (kernel)

2016-07-03 Thread Guo, Yejun
ping for review, thanks.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Guo, 
Yejun
Sent: Tuesday, June 28, 2016 3:25 PM
To: beignet@lists.freedesktop.org
Subject: Re: [Beignet] [PATCH] enlarge buf size to avoid memory out of range 
written by GPU (kernel)

ping for review, thanks.

-Original Message-
From: Guo, Yejun
Sent: Wednesday, June 15, 2016 10:36 AM
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: [PATCH] enlarge buf size to avoid memory out of range written by GPU 
(kernel)

pseudocode:
float input[] = {...};  -->  float input[] = { ...  ... more}
global_size = input_len -->  global_size = input_len / vector
the value of vector is 1,2,... or 16.

ocl kernel looks like (for the case of vector=8):
  int i = get_global_id(0);
  dst[i * (*vector) + 0] = ret[0];
  dst[i * (*vector) + 1] = ret[1];
  dst[i * (*vector) + 2] = ret[2];
  dst[i * (*vector) + 3] = ret[3];
  dst[i * (*vector) + 4] = ret[4];
  dst[i * (*vector) + 5] = ret[5];
  dst[i * (*vector) + 6] = ret[6];
  dst[i * (*vector) + 7] = ret[7];

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/utest_generator.py | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/utests/utest_generator.py b/utests/utest_generator.py index 
cde2dbe..3591095 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 from __future__ import print_function
-import os,sys,re
+import os,sys,re,string
 
 FLT_MAX_POSI='0x1.fep127f'
 FLT_MIN_NEGA='-0x1.fep127f'
@@ -247,7 +247,7 @@ which can print more values and information to assist 
debuging the issue.
   def argvector(self,paraN,index):
 vector=re.findall(r"[0-9]+",self.inputtype[paraN][index])
 if vector:
-  vector=vector[0]
+  vector=string.atoi(vector[0])
 else:
   vector=1
 return vector
@@ -272,10 +272,17 @@ which can print more values and information to assist 
debuging the issue.
 #Cpu values analyse
   def GenInputValues(self,index):
 #namesuffix=self.inputtype[0][index]
+vlen = self.argvector(self.inputtype.__len__()-1,index)
 for i in range(0,self.values.__len__()):
-  self.cpplines += [ "const %s input_data%d[] = {%s};" 
%(self.argtype(i,index),i+1,str(self.values[i]).strip('[]').replace('\'','')) ]
+vals = []
+for j in range(0, vlen):
+if (len(vals) >= 128): #avoid too many data
+vals = vals[0:128]
+break
+vals += self.values[i]
+self.cpplines += [ "const %s input_data%d[] = {%s};" 
+ %(self.argtype(i,index),i+1,str(vals).strip('[]').replace('\'','')) ]
 self.cpplines += [ "const int count_input = sizeof(input_data1) / 
sizeof(input_data1[0]);" ]
-self.cpplines += [ "const int vector = 
%s;\n"%(self.argvector(self.inputtype.__len__()-1,index)) ]
+self.cpplines += [ "const int vector = %s;\n"%(vlen) ]
 
 #Cpu Function
   def GenCpuCompilerMath(self,index):
@@ -340,7 +347,7 @@ static void %s_%s(void)
   OCL_CREATE_KERNEL(\"%s_%s\");
   OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * sizeof(%s), 
NULL); 
 
-  globals[0] = count_input;
+  globals[0] = count_input / vector;
   locals[0] = 1;
  '''%(self.fileName,namesuffix,\
  self.retType(index),\
--
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 4/6] Backend: Refine sel ir optimization

2016-07-03 Thread Guo, Yejun
this patch looks fine to me, thanks

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli 
Pan
Sent: Thursday, June 16, 2016 6:11 AM
To: beignet@lists.freedesktop.org
Cc: Pan, Xiuli
Subject: [Beignet] [PATCH 4/6] Backend: Refine sel ir optimization

From: Pan Xiuli 

We may have some inst like:
mov %30, %3
(-f0.1) mov %30, %4
(+f0.1) mov %30, %5
to mask some value out with the flag.
Currently only the last mov will be left after optimization.
This patch adds a check of whether the intermedia can be replaced even if they are in
the same predicate state and inverse predicate state.
Also refine the check of predicate state to properly handle the situation:
(-f0.1) infoinst %30, %4
inst %30, %8

Signed-off-by: Pan Xiuli 
---
 backend/src/backend/gen_insn_selection_optimize.cpp | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection_optimize.cpp 
b/backend/src/backend/gen_insn_selection_optimize.cpp
index 35da404..b8aa776 100644
--- a/backend/src/backend/gen_insn_selection_optimize.cpp
+++ b/backend/src/backend/gen_insn_selection_optimize.cpp
@@ -97,7 +97,7 @@ namespace gbe
 void doLocalCopyPropagation();
 void addToReplaceInfoMap(SelectionInstruction& insn);
 void changeInsideReplaceInfoMap(const SelectionInstruction& insn, 
GenRegister& var);
-void removeFromReplaceInfoMap(const GenRegister& var);
+void removeFromReplaceInfoMap(const SelectionInstruction& insn, 
+ const GenRegister& var);
 void doReplacement(ReplaceInfo* info);
 bool CanBeReplaced(const ReplaceInfo* info, const SelectionInstruction& 
insn, const GenRegister& var);
 void cleanReplaceInfoMap();
@@ -127,14 +127,15 @@ namespace gbe
 replaceInfoMap.clear();
   }
 
-  void SelBasicBlockOptimizer::removeFromReplaceInfoMap(const GenRegister& var)
+  void SelBasicBlockOptimizer::removeFromReplaceInfoMap(const
+ SelectionInstruction& insn, const GenRegister& var)
   {
 for (ReplaceInfoMap::iterator pos = replaceInfoMap.begin(); pos != 
replaceInfoMap.end(); ++pos) {
   ReplaceInfo* info = pos->second;
   if (info->intermedia.reg() == var.reg()) {   //intermedia is overwritten
 if (info->intermedia.quarter == var.quarter && info->intermedia.subnr 
== var.subnr) {
-  //the whole intermedia is overwritten, so, do replacement for the 
scanned IRs
-  doReplacement(info);
+  // We need to check the if intermedia is fully overwritten, they may 
be in some prediction state.
+  if (CanBeReplaced(info, insn, var))
+doReplacement(info);
 }
 replaceInfoMap.erase(pos);
 delete info;
@@ -199,7 +200,11 @@ namespace gbe
 if (info->insn.state.noMask == 0 && insn.state.noMask == 1)
   return false;
 
-if (info->insn.state.predicate != insn.state.predicate && 
info->insn.state.predicate != GEN_PREDICATE_NONE)
+// If insn is in no prediction state, it will overwrite the info insn.
+if (info->insn.state.predicate != insn.state.predicate && 
insn.state.predicate != GEN_PREDICATE_NONE)
+  return false;
+
+if (info->insn.state.inversePredicate !=
+ insn.state.inversePredicate)
   return false;
 
 if (info->intermedia.type == var.type && info->intermedia.quarter == 
var.quarter && info->intermedia.subnr == var.subnr) { @@ -235,7 +240,7 @@ 
namespace gbe
 changeInsideReplaceInfoMap(insn, insn.src(i));
 
   for (uint8_t i = 0; i < insn.dstNum; ++i)
-removeFromReplaceInfoMap(insn.dst(i));
+removeFromReplaceInfoMap(insn, insn.dst(i));
 
   if (insn.opcode == SEL_OP_MOV)
 addToReplaceInfoMap(insn);
--
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] enlarge buf size to avoid memory out of range written by GPU (kernel)

2016-06-28 Thread Guo, Yejun
ping for review, thanks.

-Original Message-
From: Guo, Yejun 
Sent: Wednesday, June 15, 2016 10:36 AM
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: [PATCH] enlarge buf size to avoid memory out of range written by GPU 
(kernel)

pseudocode:
float input[] = {...};  -->  float input[] = { ...  ... more}
global_size = input_len -->  global_size = input_len / vector
the value of vector is 1,2,... or 16.

ocl kernel looks like (for the case of vector=8):
  int i = get_global_id(0);
  dst[i * (*vector) + 0] = ret[0];
  dst[i * (*vector) + 1] = ret[1];
  dst[i * (*vector) + 2] = ret[2];
  dst[i * (*vector) + 3] = ret[3];
  dst[i * (*vector) + 4] = ret[4];
  dst[i * (*vector) + 5] = ret[5];
  dst[i * (*vector) + 6] = ret[6];
  dst[i * (*vector) + 7] = ret[7];

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/utest_generator.py | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/utests/utest_generator.py b/utests/utest_generator.py
index cde2dbe..3591095 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 from __future__ import print_function
-import os,sys,re
+import os,sys,re,string
 
 FLT_MAX_POSI='0x1.fep127f'
 FLT_MIN_NEGA='-0x1.fep127f'
@@ -247,7 +247,7 @@ which can print more values and information to assist 
debuging the issue.
   def argvector(self,paraN,index):
 vector=re.findall(r"[0-9]+",self.inputtype[paraN][index])
 if vector:
-  vector=vector[0]
+  vector=string.atoi(vector[0])
 else:
   vector=1
 return vector
@@ -272,10 +272,17 @@ which can print more values and information to assist 
debuging the issue.
 #Cpu values analyse
   def GenInputValues(self,index):
 #namesuffix=self.inputtype[0][index]
+vlen = self.argvector(self.inputtype.__len__()-1,index)
 for i in range(0,self.values.__len__()):
-  self.cpplines += [ "const %s input_data%d[] = {%s};" 
%(self.argtype(i,index),i+1,str(self.values[i]).strip('[]').replace('\'','')) ]
+vals = []
+for j in range(0, vlen):
+if (len(vals) >= 128): #avoid too many data
+vals = vals[0:128]
+break
+vals += self.values[i]
+self.cpplines += [ "const %s input_data%d[] = {%s};" 
%(self.argtype(i,index),i+1,str(vals).strip('[]').replace('\'','')) ]
 self.cpplines += [ "const int count_input = sizeof(input_data1) / 
sizeof(input_data1[0]);" ]
-self.cpplines += [ "const int vector = 
%s;\n"%(self.argvector(self.inputtype.__len__()-1,index)) ]
+self.cpplines += [ "const int vector = %s;\n"%(vlen) ]
 
 #Cpu Function
   def GenCpuCompilerMath(self,index):
@@ -340,7 +347,7 @@ static void %s_%s(void)
   OCL_CREATE_KERNEL(\"%s_%s\");
   OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * sizeof(%s), 
NULL); 
 
-  globals[0] = count_input;
+  globals[0] = count_input / vector;
   locals[0] = 1;
  '''%(self.fileName,namesuffix,\
  self.retType(index),\
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] enlarge buf size to avoid memory out of range written by GPU (kernel)

2016-06-15 Thread Guo Yejun
pseudocode:
float input[] = {...};  -->  float input[] = { ...  ... more}
global_size = input_len -->  global_size = input_len / vector
the value of vector is 1,2,... or 16.

ocl kernel looks like (for the case of vector=8):
  int i = get_global_id(0);
  dst[i * (*vector) + 0] = ret[0];
  dst[i * (*vector) + 1] = ret[1];
  dst[i * (*vector) + 2] = ret[2];
  dst[i * (*vector) + 3] = ret[3];
  dst[i * (*vector) + 4] = ret[4];
  dst[i * (*vector) + 5] = ret[5];
  dst[i * (*vector) + 6] = ret[6];
  dst[i * (*vector) + 7] = ret[7];

Signed-off-by: Guo Yejun <yejun@intel.com>
---
 utests/utest_generator.py | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/utests/utest_generator.py b/utests/utest_generator.py
index cde2dbe..3591095 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 from __future__ import print_function
-import os,sys,re
+import os,sys,re,string
 
 FLT_MAX_POSI='0x1.fep127f'
 FLT_MIN_NEGA='-0x1.fep127f'
@@ -247,7 +247,7 @@ which can print more values and information to assist 
debuging the issue.
   def argvector(self,paraN,index):
 vector=re.findall(r"[0-9]+",self.inputtype[paraN][index])
 if vector:
-  vector=vector[0]
+  vector=string.atoi(vector[0])
 else:
   vector=1
 return vector
@@ -272,10 +272,17 @@ which can print more values and information to assist 
debuging the issue.
 #Cpu values analyse
   def GenInputValues(self,index):
 #namesuffix=self.inputtype[0][index]
+vlen = self.argvector(self.inputtype.__len__()-1,index)
 for i in range(0,self.values.__len__()):
-  self.cpplines += [ "const %s input_data%d[] = {%s};" 
%(self.argtype(i,index),i+1,str(self.values[i]).strip('[]').replace('\'','')) ]
+vals = []
+for j in range(0, vlen):
+if (len(vals) >= 128): #avoid too many data
+vals = vals[0:128]
+break
+vals += self.values[i]
+self.cpplines += [ "const %s input_data%d[] = {%s};" 
%(self.argtype(i,index),i+1,str(vals).strip('[]').replace('\'','')) ]
 self.cpplines += [ "const int count_input = sizeof(input_data1) / 
sizeof(input_data1[0]);" ]
-self.cpplines += [ "const int vector = 
%s;\n"%(self.argvector(self.inputtype.__len__()-1,index)) ]
+self.cpplines += [ "const int vector = %s;\n"%(vlen) ]
 
 #Cpu Function
   def GenCpuCompilerMath(self,index):
@@ -340,7 +347,7 @@ static void %s_%s(void)
   OCL_CREATE_KERNEL(\"%s_%s\");
   OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * sizeof(%s), 
NULL); 
 
-  globals[0] = count_input;
+  globals[0] = count_input / vector;
   locals[0] = 1;
  '''%(self.fileName,namesuffix,\
  self.retType(index),\
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 2/2] Ensure paths to beignet.bc and beignet.pch include a / before the filename

2016-05-24 Thread Guo, Yejun
this patch set looks good, thanks.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Ross 
Burton
Sent: Tuesday, May 24, 2016 11:14 PM
To: beignet@lists.freedesktop.org
Subject: [Beignet] [PATCH 2/2] Ensure paths to beignet.bc and beignet.pch 
include a / before the filename

Otherwise it's possible for the paths to be something like
/usr/lib/beignetbeignet.bc.

Signed-off-by: Ross Burton 
---
 backend/src/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index a4e909a..a21ca75 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -1,6 +1,6 @@
-set (OCL_BITCODE_BIN "${BEIGNET_INSTALL_DIR}beignet.bc")
+set (OCL_BITCODE_BIN "${BEIGNET_INSTALL_DIR}/beignet.bc")
 set (OCL_HEADER_DIR "${BEIGNET_INSTALL_DIR}/include")
-set (OCL_PCH_OBJECT "${BEIGNET_INSTALL_DIR}beignet.pch")
+set (OCL_PCH_OBJECT "${BEIGNET_INSTALL_DIR}/beignet.pch")
 set (GBE_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbe.so")
 set (INTERP_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbeinterp.so")
 
-- 
2.8.0.rc3

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


  1   2   3   4   >