Re: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating temporary cl source file.

2015-09-08 Thread Luo, Xionghu
This patch LGTM except some questions.

How did you decide on the name "stringInput.cl"?
And since this method works, we could also remap all the input headers in the 
clCompileProgram API to avoid creating temp files under /tmp. Anyway, that 
could be handled in another patch.

Luo Xionghu
Best Regards

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of 
Zhigang Gong
Sent: Monday, August 31, 2015 2:30 PM
To: beignet@lists.freedesktop.org
Cc: Gong, Zhigang
Subject: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating temporary 
cl source file.

LLVM provides a powerful string-remapping feature which can be used to map a 
string to an input file name, so we no longer need to create a temporary cl 
source file.

This patch not only makes things much clearer and avoids the unnecessary file 
creation, it also fixes some weird directory-related problems.
Because beignet creates the temporary file in the /tmp directory, clang 
searches for include files in that directory by default, whereas the developer 
expects it to search the working directory first. This causes two weird 
things:
1. If a .cl file includes a .h file in the current directory, beignet
   will not find it.

2. Even if the program adds a "-I." option manually, beignet will search /tmp
   first, and if there is a .h file in /tmp/ with the exact same file
   name, beignet will use the file located in /tmp.

Signed-off-by: Zhigang Gong 
---
 backend/src/backend/program.cpp | 40 ++--
 1 file changed, 10 insertions(+), 30 deletions(-)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp 
index d9e6416..330bead 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -518,7 +518,7 @@ namespace gbe {
 #ifdef GBE_COMPILER_AVAILABLE
   BVAR(OCL_OUTPUT_BUILD_LOG, false);
 
-  static bool buildModuleFromSource(const char* input, llvm::Module** 
out_module, llvm::LLVMContext* llvm_ctx,
+  static bool buildModuleFromSource(const char *source, llvm::Module** 
+ out_module, llvm::LLVMContext* llvm_ctx,
 std::string dumpLLVMFileName, 
std::vector& options, size_t stringSize, char *err,
 size_t *errSize) {
 // Arguments to pass to the clang frontend @@ -551,8 +551,7 @@ namespace 
gbe {
 args.push_back("-triple");
 args.push_back("spir");
 #endif /* LLVM_VERSION_MINOR <= 2 */
-args.push_back(input);
-
+args.push_back("stringInput.cl");
 args.push_back("-ffp-contract=off");
 
 // The compiler invocation needs a DiagnosticsEngine so it can report 
problems @@ -574,6 +573,9 @@ namespace gbe {
   [0],
   [0] + args.size(),
   Diags);
+llvm::StringRef srcString(source);
+(*CI).getPreprocessorOpts().addRemappedFile("stringInput.cl",
+llvm::MemoryBuffer::getMemBuffer(srcString).release());
 
 // Create the compiler instance
 clang::CompilerInstance Clang;
@@ -670,7 +672,6 @@ namespace gbe {
  std::vector& clOpt,
  std::string& dumpLLVMFileName,
  std::string& dumpASMFileName,
- std::string& clName,
  int& optLevel,
  size_t stringSize,
  char *err, @@ -781,21 +782,6 @@ namespace 
gbe {
   }
 }
 
-char clStr[] = "/tmp/XX.cl";
-int clFd = mkstemps(clStr, 3);
-clName = std::string(clStr);
-
-FILE *clFile = fdopen(clFd, "w");
-FATAL_IF(clFile == NULL, "Failed to open temporary file");
-// XXX enable cl_khr_fp64 may cause some potential bugs.
-// we may need to revisit here latter when we want to support fp64 
completely.
-// For now, as we don't support fp64 actually, just disable it by default.
-#if 0
-#define ENABLE_CL_KHR_FP64_STR "#pragma OPENCL EXTENSION cl_khr_fp64 : 
enable\n"
-if (options && !strstr(const_cast(options), "-cl-std=CL1.1"))
-  fwrite(ENABLE_CL_KHR_FP64_STR, strlen(ENABLE_CL_KHR_FP64_STR), 1, 
clFile);
-#endif
-
 if (!findPCH || invalidPCH) {
   clOpt.push_back("-include");
   clOpt.push_back("ocl.h");
@@ -805,9 +791,6 @@ namespace gbe {
   clOpt.push_back(pchFileName);
 }
 
-// Write the source to the cl file
-fwrite(source, strlen(source), 1, clFile);
-fclose(clFile);
 return true;
   }
 
@@ -820,11 +803,10 @@ namespace gbe {
   {
 int optLevel = 1;
 std::vector clOpt;
-std::string clName;
 std::string dumpLLVMFileName, dumpASMFileName;
 if (!processSourceAndOption(source, options, NULL, clOpt,
 dumpLLVMFileName, 

Re: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating temporary cl source file.

2015-09-08 Thread Gong, Zhigang
> -Original Message-
> From: Luo, Xionghu
> Sent: Tuesday, September 8, 2015 2:09 PM
> To: Gong, Zhigang; beignet@lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: RE: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating
> temporary cl source file.
> 
> This patch LGTM except some questions.
> 
> How didn't decide the name "stringInput.cl"?
I'm not sure what you mean here.

> And since this method works, we could also remap all the input headers in API
> clCompileProgram to avoid create temp files under /tmp, anyway, this could be
> processed in another patch.
Right, the header file processing in clCompileProgram could be refined by 
using the same method.


> 
> Luo Xionghu
> Best Regards
> 
> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> Zhigang Gong
> Sent: Monday, August 31, 2015 2:30 PM
> To: beignet@lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating
> temporary cl source file.
> 
> LLVM provides a powerful string-remapping feature which can be used to map a
> string to an input file name, so we no longer need to create a temporary cl
> source file.
> 
> This patch not only makes things much clearer and avoids the unnecessary file
> creation, it also fixes some weird directory-related problems.
> Because beignet creates the temporary file in the /tmp directory, clang
> searches for include files in that directory by default, whereas the
> developer expects it to search the working directory first. This causes two
> weird things:
> 1. If a .cl file includes a .h file in the current directory, beignet
>    will not find it.
> 
> 2. Even if the program adds a "-I." option manually, beignet will search /tmp
>    first, and if there is a .h file in /tmp/ with the exact same file
>    name, beignet will use the file located in /tmp.
> 
> Signed-off-by: Zhigang Gong 
> ---
>  backend/src/backend/program.cpp | 40 ++--
>  1 file changed, 10 insertions(+), 30 deletions(-)
> 
> diff --git a/backend/src/backend/program.cpp
> b/backend/src/backend/program.cpp index d9e6416..330bead 100644
> --- a/backend/src/backend/program.cpp
> +++ b/backend/src/backend/program.cpp
> @@ -518,7 +518,7 @@ namespace gbe {
>  #ifdef GBE_COMPILER_AVAILABLE
>BVAR(OCL_OUTPUT_BUILD_LOG, false);
> 
> -  static bool buildModuleFromSource(const char* input, llvm::Module**
> out_module, llvm::LLVMContext* llvm_ctx,
> +  static bool buildModuleFromSource(const char *source, llvm::Module**
> + out_module, llvm::LLVMContext* llvm_ctx,
>  std::string dumpLLVMFileName,
> std::vector& options, size_t stringSize, char *err,
>  size_t *errSize) {
>  // Arguments to pass to the clang frontend @@ -551,8 +551,7 @@
> namespace gbe {
>  args.push_back("-triple");
>  args.push_back("spir");
>  #endif /* LLVM_VERSION_MINOR <= 2 */
> -args.push_back(input);
> -
> +args.push_back("stringInput.cl");
>  args.push_back("-ffp-contract=off");
> 
>  // The compiler invocation needs a DiagnosticsEngine so it can report
> problems @@ -574,6 +573,9 @@ namespace gbe {
>[0],
>[0] +
> args.size(),
>Diags);
> +llvm::StringRef srcString(source);
> +(*CI).getPreprocessorOpts().addRemappedFile("stringInput.cl",
> +llvm::MemoryBuffer::getMemBuffer(srcString).release());
> 
>  // Create the compiler instance
>  clang::CompilerInstance Clang;
> @@ -670,7 +672,6 @@ namespace gbe {
>   std::vector& clOpt,
>   std::string&
> dumpLLVMFileName,
>   std::string&
> dumpASMFileName,
> - std::string& clName,
>   int& optLevel,
>   size_t stringSize,
>   char *err, @@ -781,21 +782,6
> @@ namespace gbe {
>}
>  }
> 
> -char clStr[] = "/tmp/XX.cl";
> -int clFd = mkstemps(clStr, 3);
> -clName = std::string(clStr);
> -
> -FILE *clFile = fdopen(clFd, "w");
> -FATAL_IF(clFile == NULL, "Failed to open temporary file");
> -// XXX enable cl_khr_fp64 may cause some potential bugs.
> -// we may need to revisit here latter when we want to support fp64
> completely.
> -// For now, as we don't support fp64 actually, just disable it by 
> default.
> -#if 0
> -#define ENABLE_CL_KHR_FP64_STR "#pragma OPENCL EXTENSION
> cl_khr_fp64 : enable\n"
> -if (options && !strstr(const_cast(options), "-cl-std=CL1.1"))
> -  fwrite(ENABLE_CL_KHR_FP64_STR,
> strlen(ENABLE_CL_KHR_FP64_STR), 1, clFile);
> -#endif
> -

Re: [Beignet] [PATCH 2/3] add bswap64 for gen7/gen75 and gen8 seperately.

2015-09-08 Thread Yang, Rong R
It seems you don't handle the simd == 1 long/ulong case.

> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> xionghu@intel.com
> Sent: Thursday, August 13, 2015 14:28
> To: beignet@lists.freedesktop.org
> Cc: Luo, Xionghu
> Subject: [Beignet] [PATCH 2/3] add bswap64 for gen7/gen75 and gen8
> seperately.
> 
> From: Luo Xionghu 
> 
> As the long-type data layout is not contiguous on the gen7/gen75 platforms, the
> indirect address access pattern is a bit different from gen8.
> 
> Signed-off-by: Luo Xionghu 
> ---
>  backend/src/backend/gen8_context.cpp |  64 
> backend/src/backend/gen_context.cpp  | 110
> +++
>  2 files changed, 174 insertions(+)
> 
> diff --git a/backend/src/backend/gen8_context.cpp
> b/backend/src/backend/gen8_context.cpp
> index eca8eeb..a283194 100644
> --- a/backend/src/backend/gen8_context.cpp
> +++ b/backend/src/backend/gen8_context.cpp
> @@ -245,6 +245,70 @@ namespace gbe
>p->pop();
> 
>p->MOV(dst, tmp);
> +  }else if (src.type == GEN_TYPE_UL || src.type == GEN_TYPE_L) {
> +  bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
> +  GBE_ASSERT(uniform_src || src.subnr == 0);
> +  GBE_ASSERT(dst.subnr == 0);
> +  GBE_ASSERT(tmp.subnr == 0);
> +  GBE_ASSERT(start_addr >= 0);
> +  new_a0[0] = start_addr + 7;
> +  new_a0[1] = start_addr + 6;
> +  new_a0[2] = start_addr + 5;
> +  new_a0[3] = start_addr + 4;
> +  new_a0[4] = start_addr + 3;
> +  new_a0[5] = start_addr + 2;
> +  new_a0[6] = start_addr + 1;
> +  new_a0[7] = start_addr;
> +  if(!uniform_src) {
> +new_a0[8] = start_addr + 15;
> +new_a0[9] = start_addr + 14;
> +new_a0[10] = start_addr + 13;
> +new_a0[11] = start_addr + 12;
> +new_a0[12] = start_addr + 11;
> +new_a0[13] = start_addr + 10;
> +new_a0[14] = start_addr + 9;
> +new_a0[15] = start_addr + 8;
> +  } else {
> +new_a0[8] = start_addr + 7;
> +new_a0[9] = start_addr + 6;
> +new_a0[10] = start_addr + 5;
> +new_a0[11] = start_addr + 4;
> +new_a0[12] = start_addr + 3;
> +new_a0[13] = start_addr + 2;
> +new_a0[14] = start_addr + 1;
> +new_a0[15] = start_addr;
> +  }
> +  this->setA0Content(new_a0, 56);
> +
> +  p->push();
> +  p->curr.execWidth = 16;
> +  p->curr.predicate = GEN_PREDICATE_NONE;
> +  p->curr.noMask = 1;
> +  GenRegister ind_src =
> GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
> new_a0[0], 0);
> +  p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
> +  if(!uniform_src)
> +ind_src.addr_imm += 16;
> +  p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 0, 16), ind_src);
> +  for (int i = 0; i < 2; i++) {
> +if(!uniform_src)
> +  ind_src.addr_imm += 16;
> +p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 1, 16*i), ind_src);
> +  }
> +  if (simd == 16) {
> +for (int i = 0; i < 2; i++) {
> +  if(!uniform_src)
> +ind_src.addr_imm += 16;
> +  p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 2, 16*i), ind_src);
> +}
> +for (int i = 0; i < 2; i++) {
> +  if(!uniform_src)
> +ind_src.addr_imm += 16;
> +  p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 3, 16*i), ind_src);
> +}
> +  }
> +  p->pop();
> +
> +  p->MOV(dst, tmp);
>  } else {
>GBE_ASSERT(0);
>  }
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 8ee65ee..7fd43bb 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -437,6 +437,116 @@ namespace gbe
>  p->pop();
> 
>  p->MOV(dst, tmp);
> +  }else if (src.type == GEN_TYPE_UL || src.type == GEN_TYPE_L) {
> +bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
> +GBE_ASSERT(uniform_src || src.subnr == 0);
> +GBE_ASSERT(dst.subnr == 0);
> +GBE_ASSERT(tmp.subnr == 0);
> +GBE_ASSERT(start_addr >= 0);
> +if (!uniform_src) {
> +  new_a0[0] = start_addr + 3;
> +  

Re: [Beignet] [PATCH] Use __attribute__((destructor)), not atexit(3).

2015-09-08 Thread Yang, Rong R
It seems gcc/clang/icc all support __attribute__((destructor)), but I still have 
two comments. Thanks for your contribution.

> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> Koop Mast
> Sent: Thursday, August 27, 2015 18:45
> To: beignet@lists.freedesktop.org
> Cc: Koop Mast
> Subject: [Beignet] [PATCH] Use __attribute__((destructor)), not atexit(3).
> 
> On Linux, atexit(3) registered functions are called at program exit or during
> module unload. The latter is a Glibc extension not supported by FreeBSD.
> This means that, on FreeBSD, the registered function could be called after
> the module was unloaded, causing the application to crash.
> ---
>  backend/src/backend/gen_insn_selection.cpp | 2 +-
>  backend/src/sys/alloc.cpp  | 4 ++--
>  src/performance.c  | 7 +--
>  utests/utest.cpp   | 2 +-
>  4 files changed, 5 insertions(+), 10 deletions(-)
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index b84bb4b..e90fd8d 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -1776,11 +1776,11 @@ namespace gbe
> 
>// Boiler plate to initialize the selection library at c++ pre-main
>static SelectionLibrary *selLib = NULL;
> +  __attribute__((destructor))
>static void destroySelectionLibrary(void) { GBE_DELETE(selLib); }
Previously atexit was only registered inside SelectionLibraryInitializer, but now 
destroySelectionLibrary will always be called on unload or exit, so a selLib 
check must be added here.


>static struct SelectionLibraryInitializer {
>  SelectionLibraryInitializer(void) {
>selLib = GBE_NEW_NO_ARG(SelectionLibrary);
> -  atexit(destroySelectionLibrary);
>  }
>} selectionLibraryInitializer;
> 
> diff --git a/backend/src/sys/alloc.cpp b/backend/src/sys/alloc.cpp index
> 08dc7b1..30dc887 100644
> --- a/backend/src/sys/alloc.cpp
> +++ b/backend/src/sys/alloc.cpp
> @@ -140,16 +140,17 @@ namespace gbe
>static bool isMutexInitializing = true;
>static size_t memDebuggerCurrSize(0u);
>static size_t memDebuggerMaxSize(0u);
> +  __attribute__((destructor))
>static void SizeMutexDeallocate(void) { if (sizeMutex) delete sizeMutex; }
>static void SizeMutexAllocate(void) {
>  if (sizeMutex == NULL && isMutexInitializing == false) {
>isMutexInitializing = true;
>sizeMutex = new MutexSys;
> -  atexit(SizeMutexDeallocate);
>  }
>}
> 
>/*! Stop the memory debugger */
> +  __attribute__((destructor))
>static void MemDebuggerEnd(void) {
>  MemDebugger *_debug = memDebugger;
>  memDebugger = NULL;
A memDebugger check also needs to be added here.


> @@ -172,7 +173,6 @@ namespace gbe
>/*! Start the memory debugger */
>static void MemDebuggerStart(void) {
>  if (memDebugger == NULL) {
> -  atexit(MemDebuggerEnd);
>memDebugger = new MemDebugger;
>  }
>}
> diff --git a/src/performance.c b/src/performance.c index 85cd481..15acded
> 100644
> --- a/src/performance.c
> +++ b/src/performance.c
> @@ -37,7 +37,6 @@ typedef struct storage
> 
> 
>  static storage record;
> -static int atexit_registered = 0;
> 
> 
>  static context_storage_node * prev_context_pointer = NULL; @@ -170,6
> +169,7 @@ static int cmp(const void *a, const void *b)
>  return 0;
>  }
> 
> +__attribute__((destructor))
>  static void print_time_info()
>  {
>context_storage_node *p_context = record.context_storage; @@ -273,11
> +273,6 @@ static void print_time_info()
> 
>  static void insert(cl_context context, const char *kernel_name, const char
> *build_opt, float time)  {
> -  if(!atexit_registered)
> -  {
> -atexit_registered = 1;
> -atexit(print_time_info);
> -  }
>context_storage_node *p_context = find_context(context);
>kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name,
> build_opt);
>prev_context_pointer = p_context;
> diff --git a/utests/utest.cpp b/utests/utest.cpp index 0a03d8b..3d6e001
> 100644
> --- a/utests/utest.cpp
> +++ b/utests/utest.cpp
> @@ -44,6 +44,7 @@ vector *UTest::utestList = NULL;  RStatistics
> UTest::retStatistics;
> 
>  void releaseUTestList(void) { delete UTest::utestList; }
> +__attribute__((destructor))
>  void runSummaryAtExit(void) {
>// If case crashes, count it as fail, and accumulate finishrun
>if(UTest::retStatistics.finishrun != UTest::utestList->size()) { @@ -113,7
> +114,6 @@ UTest::UTest(Function fn, const char *name, bool isBenchMark,
> bool haveIssue, bo
>  utestList = new vector;
> 
>  catch_signal();
> -atexit(runSummaryAtExit);
>}
>utestList->push_back(*this);
>  }
> --
> 2.4.6
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list

Re: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating temporary cl source file.

2015-09-08 Thread Yang, Rong R
Ok, pushed. Thanks.

> -Original Message-
> From: Gong, Zhigang
> Sent: Tuesday, September 8, 2015 16:28
> To: Yang, Rong R; Luo, Xionghu; beignet@lists.freedesktop.org
> Subject: RE: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating
> temporary cl source file.
> 
> 
> 
> > -Original Message-
> > From: Yang, Rong R
> > Sent: Tuesday, September 8, 2015 4:11 PM
> > To: Gong, Zhigang; Luo, Xionghu; beignet@lists.freedesktop.org
> > Subject: RE: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid
> > creating temporary cl source file.
> >
> > Is this remapped file virtual file? If it is not a virtual file, I am
> > afraid it is not thread/process safe.
> 
> It's not a file, just a map that belongs to the clang::CompilerInvocation object.
> 
> >
> > > -Original Message-
> > > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On
> > > Behalf Of Gong, Zhigang
> > > Sent: Tuesday, September 8, 2015 14:33
> > > To: Luo, Xionghu; beignet@lists.freedesktop.org
> > > Subject: Re: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid
> > > creating temporary cl source file.
> > >
> > > > -Original Message-
> > > > From: Luo, Xionghu
> > > > Sent: Tuesday, September 8, 2015 2:09 PM
> > > > To: Gong, Zhigang; beignet@lists.freedesktop.org
> > > > Cc: Gong, Zhigang
> > > > Subject: RE: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid
> > > > creating temporary cl source file.
> > > >
> > > > This patch LGTM except some questions.
> > > >
> > > > How didn't decide the name "stringInput.cl"?
> > > Not sure what's your meaning here?
> > >
> > > > And since this method works, we could also remap all the input
> > > > headers in API clCompileProgram to avoid create temp files under
> > > > /tmp, anyway, this could be processed in another patch.
> > > Right, the header files's processing in clCompileProgram could be
> > > refined by using the same method.
> > >
> > >
> > > >
> > > > Luo Xionghu
> > > > Best Regards
> > > >
> > > > -Original Message-
> > > > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On
> > > > Behalf Of Zhigang Gong
> > > > Sent: Monday, August 31, 2015 2:30 PM
> > > > To: beignet@lists.freedesktop.org
> > > > Cc: Gong, Zhigang
> > > > Subject: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid
> > > > creating temporary cl source file.
> > > >
> > > > LLVM provides a powerful string-remapping feature which can be used
> > > > to map a string to an input file name, so we no longer need to
> > > > create a temporary cl source file.
> > > >
> > > > This patch not only makes things much clearer and avoids the
> > > > unnecessary file creation, it also fixes some weird directory-related
> > > > problems.
> > > > Because beignet creates the temporary file in the /tmp directory,
> > > > clang searches for include files in that directory by
> > > > default, whereas the developer expects it to search the working
> > > > directory first. This causes two weird things:
> > > > 1. If a .cl file includes a .h file in the current directory,
> > > >    beignet will not find it.
> > > >
> > > > 2. Even if the program adds a "-I." option manually, beignet will
> > > >    search /tmp first, and if there is a .h file in /tmp/ with the
> > > >    exact same file name, beignet will use the file located in /tmp.
> > > >
> > > > Signed-off-by: Zhigang Gong 
> > > > ---
> > > >  backend/src/backend/program.cpp | 40
> > > > ++--
> > > >  1 file changed, 10 insertions(+), 30 deletions(-)
> > > >
> > > > diff --git a/backend/src/backend/program.cpp
> > > > b/backend/src/backend/program.cpp index d9e6416..330bead 100644
> > > > --- a/backend/src/backend/program.cpp
> > > > +++ b/backend/src/backend/program.cpp
> > > > @@ -518,7 +518,7 @@ namespace gbe {  #ifdef
> GBE_COMPILER_AVAILABLE
> > > >BVAR(OCL_OUTPUT_BUILD_LOG, false);
> > > >
> > > > -  static bool buildModuleFromSource(const char* input,
> > > > llvm::Module** out_module, llvm::LLVMContext* llvm_ctx,
> > > > +  static bool buildModuleFromSource(const char *source,
> > > > + llvm::Module** out_module, llvm::LLVMContext* llvm_ctx,
> > > >  std::string
> > dumpLLVMFileName,
> > > > std::vector& options, size_t stringSize, char *err,
> > > >  size_t *errSize) {
> > > >  // Arguments to pass to the clang frontend @@ -551,8 +551,7
> > > > @@ namespace gbe {
> > > >  args.push_back("-triple");
> > > >  args.push_back("spir");
> > > >  #endif /* LLVM_VERSION_MINOR <= 2 */
> > > > -args.push_back(input);
> > > > -
> > > > +args.push_back("stringInput.cl");
> > > >  args.push_back("-ffp-contract=off");
> > > >
> > > >  // The compiler invocation needs a DiagnosticsEngine so it
> > > > can report problems @@ -574,6 +573,9 @@ namespace gbe {
> > > >[0],
> > > > 

Re: [Beignet] [PATCH] utests: Added unit tests to test LLVM and ASM dump generation.

2015-09-08 Thread Yang, Rong R
Pushed, thanks.

> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> Song, Ruiling
> Sent: Sunday, September 6, 2015 15:05
> To: Gandikota, Sirisha; Zou, Nanhai; beignet@lists.freedesktop.org
> Cc: Gandikota, Sirisha
> Subject: Re: [Beignet] [PATCH] utests: Added unit tests to test LLVM and
> ASM dump generation.
> 
> LGTM
> 
> Thanks!
> Ruiling
> > -Original Message-
> > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf
> > Of Sirisha Gandikota
> > Sent: Wednesday, September 2, 2015 4:44 PM
> > To: Zou, Nanhai; beignet@lists.freedesktop.org
> > Cc: Gandikota, Sirisha
> > Subject: [Beignet] [PATCH] utests: Added unit tests to test LLVM and
> > ASM dump generation.
> >
> > This patch adds 2 new tests to the unit tests. It uses the existing
> > framework and data structures and tests the llvm/asm dump generation
> > when these flags (- dump-opt-llvm, -dump-opt-asm) are passed as build
> > options along with the dump file names.
> >
> > Methods added:
> > 1) get_build_llvm_info() tests LLVM dump generation
> > 2) get_build_asm_info() tests ASM dump generation
> >
> > Signed-off-by: Sirisha Gandikota 
> > ---
> >  utests/get_cl_info.cpp | 107
> > +
> >  1 file changed, 107 insertions(+)
> >
> > diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp index
> > e2dc0d7..7c03d95 100644
> > --- a/utests/get_cl_info.cpp
> > +++ b/utests/get_cl_info.cpp
> > @@ -364,6 +364,113 @@ void get_program_build_info(void)
> >
> >  MAKE_UTEST_FROM_FUNCTION(get_program_build_info);
> >
> > +
> > +// This method uses clGetProgramBuildInfo to check the llvm dump
> > +build options sent // and verifies that the llvm dump file is
> > +actually generated in the
> > backend.
> > +void get_build_llvm_info(void)
> > +{
> > +map maps;
> > +cl_build_status expect_status;
> > +char llvm_file[] = "test_llvm_dump.txt";
> > +char build_opt[] = "-dump-opt-llvm=test_llvm_dump.txt";
> > +FILE *fp = NULL;
> > +int sz;
> > +
> > +//Remove any pre-existing file
> > +if( (fp = fopen(llvm_file, "r")) != NULL) {
> > +fclose(fp);
> > +std::remove(llvm_file);
> > +}
> > +
> > +OCL_CALL (cl_kernel_init, "compiler_if_else.cl",
> > + "compiler_if_else", SOURCE, build_opt);
> > +
> > +/* Do our test.*/
> > +expect_status = CL_BUILD_SUCCESS;
> > +maps.insert(make_pair(CL_PROGRAM_BUILD_STATUS,
> > +  (void *)(new 
> > Info_Result(expect_status;
> > +sz = strlen(build_opt) + 1;
> > +maps.insert(make_pair(CL_PROGRAM_BUILD_OPTIONS,
> > +  (void *)(new Info_Result(build_opt,
> > + sz;
> > +
> > +for (map::iterator x = maps.begin(); x
> > + !=
> > maps.end(); ++x) {
> > +switch (x->first) {
> > +case CL_PROGRAM_BUILD_STATUS:
> > +CALL_PROG_BUILD_INFO_AND_RET(cl_build_status);
> > +break;
> > +case CL_PROGRAM_BUILD_OPTIONS:
> > +CALL_PROG_BUILD_INFO_AND_RET(char *);
> > +break;
> > +default:
> > +break;
> > +}
> > +}
> > +
> > +//Test is successful if the backend created the file
> > +if( (fp = fopen(llvm_file, "r")) == NULL) {
> > +std::cout << "LLVM file creation.. FAILED";
> > +OCL_ASSERT(0);
> > +} else {
> > +fclose(fp);
> > +std::cout << "LLVM file created.. SUCCESS";
> > +}
> > +}
> > +
> > +MAKE_UTEST_FROM_FUNCTION(get_build_llvm_info);
> > +
> > +
> > +// This method uses clGetProgramBuildInfo to check the asm dump build
> > +options sent // And verifies that the asm dump file is actually
> > +generated in the
> > backend.
> > +void get_build_asm_info(void)
> > +{
> > +map maps;
> > +cl_build_status expect_status;
> > +char asm_file[] = "test_asm_dump.txt";
> > +char build_opt[] ="-dump-opt-asm=test_asm_dump.txt";
> > +FILE *fp = NULL;
> > +int sz;
> > +
> > +//Remove any pre-existing file
> > +if( (fp = fopen(asm_file, "r")) != NULL) {
> > +fclose(fp);
> > +std::remove(asm_file);
> > +}
> > +
> > +OCL_CALL (cl_kernel_init, "compiler_if_else.cl",
> > + "compiler_if_else", SOURCE, build_opt);
> > +
> > +/* Do our test.*/
> > +expect_status = CL_BUILD_SUCCESS;
> > +maps.insert(make_pair(CL_PROGRAM_BUILD_STATUS,
> > +  (void *)(new 
> > Info_Result(expect_status;
> > +sz = strlen(build_opt) + 1;
> > +maps.insert(make_pair(CL_PROGRAM_BUILD_OPTIONS,
> > +  (void *)(new Info_Result(build_opt,
> > + sz;
> > +
> > +for (map::iterator x = maps.begin(); x
> > + !=
> > maps.end(); ++x) {
> > +switch (x->first) {
> > +case CL_PROGRAM_BUILD_STATUS:
> > +

Re: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating temporary cl source file.

2015-09-08 Thread Yang, Rong R
Is this remapped file a virtual file? If it is not a virtual file, I am afraid it 
is not thread/process safe.

> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> Gong, Zhigang
> Sent: Tuesday, September 8, 2015 14:33
> To: Luo, Xionghu; beignet@lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating
> temporary cl source file.
> 
> > -Original Message-
> > From: Luo, Xionghu
> > Sent: Tuesday, September 8, 2015 2:09 PM
> > To: Gong, Zhigang; beignet@lists.freedesktop.org
> > Cc: Gong, Zhigang
> > Subject: RE: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid
> > creating temporary cl source file.
> >
> > This patch LGTM except some questions.
> >
> > How didn't decide the name "stringInput.cl"?
> Not sure what's your meaning here?
> 
> > And since this method works, we could also remap all the input headers
> > in API clCompileProgram to avoid create temp files under /tmp, anyway,
> > this could be processed in another patch.
> Right, the header files's processing in clCompileProgram could be refined by
> using the same method.
> 
> 
> >
> > Luo Xionghu
> > Best Regards
> >
> > -Original Message-
> > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf
> > Of Zhigang Gong
> > Sent: Monday, August 31, 2015 2:30 PM
> > To: beignet@lists.freedesktop.org
> > Cc: Gong, Zhigang
> > Subject: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating
> > temporary cl source file.
> >
> > LLVM provides a powerful string-remapping feature which can be used to
> > map a string to an input file name, so we no longer need to create a
> > temporary cl source file.
> >
> > This patch not only makes things much clearer and avoids the unnecessary
> > file creation, it also fixes some weird directory-related problems.
> > Because beignet creates the temporary file in the /tmp directory,
> > clang searches for include files in that directory by
> > default, whereas the developer expects it to search the working directory
> > first. This causes two weird things:
> > 1. If a .cl file includes a .h file in the current directory, beignet
> >    will not find it.
> >
> > 2. Even if the program adds a "-I." option manually, beignet will search /tmp
> >    first, and if there is a .h file in /tmp/ with the exact same file
> >    name, beignet will use the file located in /tmp.
> >
> > Signed-off-by: Zhigang Gong 
> > ---
> >  backend/src/backend/program.cpp | 40
> > ++--
> >  1 file changed, 10 insertions(+), 30 deletions(-)
> >
> > diff --git a/backend/src/backend/program.cpp
> > b/backend/src/backend/program.cpp index d9e6416..330bead 100644
> > --- a/backend/src/backend/program.cpp
> > +++ b/backend/src/backend/program.cpp
> > @@ -518,7 +518,7 @@ namespace gbe {
> >  #ifdef GBE_COMPILER_AVAILABLE
> >BVAR(OCL_OUTPUT_BUILD_LOG, false);
> >
> > -  static bool buildModuleFromSource(const char* input, llvm::Module**
> > out_module, llvm::LLVMContext* llvm_ctx,
> > +  static bool buildModuleFromSource(const char *source,
> > + llvm::Module** out_module, llvm::LLVMContext* llvm_ctx,
> >  std::string dumpLLVMFileName,
> > std::vector& options, size_t stringSize, char *err,
> >  size_t *errSize) {
> >  // Arguments to pass to the clang frontend @@ -551,8 +551,7 @@
> > namespace gbe {
> >  args.push_back("-triple");
> >  args.push_back("spir");
> >  #endif /* LLVM_VERSION_MINOR <= 2 */
> > -args.push_back(input);
> > -
> > +args.push_back("stringInput.cl");
> >  args.push_back("-ffp-contract=off");
> >
> >  // The compiler invocation needs a DiagnosticsEngine so it can
> > report problems @@ -574,6 +573,9 @@ namespace gbe {
> >[0],
> >[0] + args.size(),
> >Diags);
> > +llvm::StringRef srcString(source);
> > +(*CI).getPreprocessorOpts().addRemappedFile("stringInput.cl",
> > +
> > + llvm::MemoryBuffer::getMemBuffer(srcString).release());
> >
> >  // Create the compiler instance
> >  clang::CompilerInstance Clang;
> > @@ -670,7 +672,6 @@ namespace gbe {
> >   std::vector& clOpt,
> >   std::string& dumpLLVMFileName,
> >   std::string& dumpASMFileName,
> > - std::string& clName,
> >   int& optLevel,
> >   size_t stringSize,
> >   char *err, @@ -781,21 +782,6 @@
> > namespace gbe {
> >}
> >  }
> >
> > -char clStr[] = "/tmp/XX.cl";
> > -int clFd = mkstemps(clStr, 3);
> > -clName = std::string(clStr);
> > -

Re: [Beignet] [PATCH 1/3] fix bswap bug.

2015-09-08 Thread Yang, Rong R


> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> xionghu@intel.com
> Sent: Thursday, August 13, 2015 14:28
> To: beignet@lists.freedesktop.org
> Cc: Luo, Xionghu
> Subject: [Beignet] [PATCH 1/3] fix bswap bug.
> 
> From: Luo Xionghu 
> 
> If the source is uniform and dst is non-uniform, there is no need to increment
> the indirect address index.
> 
> Signed-off-by: Luo Xionghu 
> ---
>  backend/src/backend/gen8_context.cpp | 6 --
> backend/src/backend/gen_context.cpp  | 9 ++---
>  2 files changed, 10 insertions(+), 5 deletions(-)
> 
> diff --git a/backend/src/backend/gen8_context.cpp
> b/backend/src/backend/gen8_context.cpp
> index b497ee5..eca8eeb 100644
> --- a/backend/src/backend/gen8_context.cpp
> +++ b/backend/src/backend/gen8_context.cpp
> @@ -182,7 +182,8 @@ namespace gbe
>p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 0, 16), ind_src);
There is another "ind_src.addr_imm += 16;" before this line that also needs the 
uniform_src check.


>if (simd == 16) {
>  for (int i = 0; i < 2; i++) {
> -  ind_src.addr_imm += 16;
> +  if(!uniform_src)
> +ind_src.addr_imm += 16;
>p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 1, 16*i), ind_src);
>  }
>}
> @@ -237,7 +238,8 @@ namespace gbe
>GenRegister ind_src =
> GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
> new_a0[0], 0);
>p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
>if (simd == 16) {
> -ind_src.addr_imm += 16;
> +if(!uniform_src)
> +  ind_src.addr_imm += 16;
>  p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 0, 16), ind_src);
>}
>p->pop();
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index e16b0a9..8ee65ee 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -384,12 +384,14 @@ namespace gbe
>  GenRegister ind_src =
> GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
> new_a0[0], 0);
>  p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
>  for (int i = 1; i < 4; i++) {
> -  ind_src.addr_imm += 8;
> +  if (!uniform_src)
> +ind_src.addr_imm += 8;
>p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 0, 8*i), ind_src);
>  }
>  if (simd == 16) {
>for (int i = 0; i < 4; i++) {
> -ind_src.addr_imm += 8;
> +if (!uniform_src)
> +  ind_src.addr_imm += 8;
>  p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 1, 8*i), ind_src);
>}
>  }
> @@ -428,7 +430,8 @@ namespace gbe
>  GenRegister ind_src =
> GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
> new_a0[0], 0);
>  p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
>  for (int i = 1; i < (simd == 8 ? 2 : 4); i++) {
> -  ind_src.addr_imm += 8;
> +  if (!uniform_src)
> +ind_src.addr_imm += 8;
>p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 0, 8*i), ind_src);
>  }
>  p->pop();
> --
> 1.9.1
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] generate MOV instruction at selection stage when do simd_shuffle with imm value.

2015-09-08 Thread Guo, Yejun
Ping for review, thanks.

-Original Message-
From: Guo, Yejun 
Sent: Friday, August 28, 2015 7:06 AM
To: beignet@lists.freedesktop.org
Cc: Guo, Yejun
Subject: [PATCH] generate MOV instruction at selection stage when do 
simd_shuffle with imm value.

The earlier the instruction is generated, the more optimizations can potentially 
be applied.

Signed-off-by: Guo Yejun 
---
 backend/src/backend/gen8_context.cpp   | 22 ++
 backend/src/backend/gen_context.cpp| 46 +-
 backend/src/backend/gen_insn_selection.cpp | 10 +--
 backend/src/backend/gen_register.hpp   |  8 ++
 4 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index b497ee5..f02786c 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -259,20 +259,14 @@ namespace gbe
 const GenRegister src0 = ra->genReg(insn.src(0));
 const GenRegister src1 = ra->genReg(insn.src(1));
 assert(insn.opcode == SEL_OP_SIMD_SHUFFLE);
-
-uint32_t simd = p->curr.execWidth;
-if (src1.file == GEN_IMMEDIATE_VALUE) {
-  uint32_t offset = src1.value.ud % simd;
-  GenRegister reg = GenRegister::suboffset(src0, offset);
-  p->MOV(dst, GenRegister::retype(GenRegister::ud1grf(reg.nr, reg.subnr / 
typeSize(reg.type)), reg.type));
-} else {
-  uint32_t base = src0.nr * 32 + src0.subnr * 4;
-  GenRegister baseReg = GenRegister::immuw(base);
-  const GenRegister a0 = GenRegister::addr8(0);
-  p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / 
typeSize(GEN_TYPE_UW)), baseReg);
-  GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
-  p->MOV(dst, indirect);
-}
+assert (src1.file != GEN_IMMEDIATE_VALUE);
+
+uint32_t base = src0.nr * 32 + src0.subnr * 4;
+GenRegister baseReg = GenRegister::immuw(base);
+const GenRegister a0 = GenRegister::addr8(0);
+p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / 
typeSize(GEN_TYPE_UW)), baseReg);
+GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+p->MOV(dst, indirect);
   }
 
   void Gen8Context::emitBinaryInstruction(const SelectionInstruction ) { 
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 25fdf08..c2be7aa 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -585,35 +585,29 @@ namespace gbe
 const GenRegister src0 = ra->genReg(insn.src(0));
 const GenRegister src1 = ra->genReg(insn.src(1));
 assert(insn.opcode == SEL_OP_SIMD_SHUFFLE);
+assert (src1.file != GEN_IMMEDIATE_VALUE);
 
+uint32_t base = src0.nr * 32 + src0.subnr * 4;
+GenRegister baseReg = GenRegister::immuw(base);
+const GenRegister a0 = GenRegister::addr8(0);
 uint32_t simd = p->curr.execWidth;
-if (src1.file == GEN_IMMEDIATE_VALUE) {
-  uint32_t offset = src1.value.ud % simd;
-  GenRegister reg = GenRegister::suboffset(src0, offset);
-  p->MOV(dst, GenRegister::retype(GenRegister::ud1grf(reg.nr, reg.subnr / 
typeSize(reg.type)), reg.type));
-} else {
-  uint32_t base = src0.nr * 32 + src0.subnr * 4;
-  GenRegister baseReg = GenRegister::immuw(base);
-  const GenRegister a0 = GenRegister::addr8(0);
+p->push();
+  if (simd == 8) {
+p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / 
typeSize(GEN_TYPE_UW)), baseReg);
+GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+p->MOV(dst, indirect);
+  } else if (simd == 16) {
+p->curr.execWidth = 8;
+p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / 
typeSize(GEN_TYPE_UW)), baseReg);
+GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+p->MOV(dst, indirect);
 
-  p->push();
-if (simd == 8) {
-  p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / 
typeSize(GEN_TYPE_UW)), baseReg);
-  GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
-  p->MOV(dst, indirect);
-} else if (simd == 16) {
-  p->curr.execWidth = 8;
-  p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / 
typeSize(GEN_TYPE_UW)), baseReg);
-  GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
-  p->MOV(dst, indirect);
-
-  p->curr.quarterControl = 1;
-  p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / 
typeSize(GEN_TYPE_UW)), baseReg);
-  p->MOV(GenRegister::offset(dst, 1, 0), indirect);
-} else
-  NOT_IMPLEMENTED;
-  p->pop();
-}
+p->curr.quarterControl = 1;
+p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / 
typeSize(GEN_TYPE_UW)), baseReg);
+p->MOV(GenRegister::offset(dst, 1, 0), indirect);
+  } else
+NOT_IMPLEMENTED;
+p->pop();
   }
 
   void 

Re: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating temporary cl source file.

2015-09-08 Thread Gong, Zhigang


> -Original Message-
> From: Yang, Rong R
> Sent: Tuesday, September 8, 2015 4:11 PM
> To: Gong, Zhigang; Luo, Xionghu; beignet@lists.freedesktop.org
> Subject: RE: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid creating
> temporary cl source file.
> 
> Is this remapped file a virtual file? If it is not a virtual file, I am afraid
> it is not thread/process safe.

It's not a file, just a map that belongs to the clang::CompilerInvocation object.

> 
> > -Original Message-
> > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf
> > Of Gong, Zhigang
> > Sent: Tuesday, September 8, 2015 14:33
> > To: Luo, Xionghu; beignet@lists.freedesktop.org
> > Subject: Re: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid
> > creating temporary cl source file.
> >
> > > -Original Message-
> > > From: Luo, Xionghu
> > > Sent: Tuesday, September 8, 2015 2:09 PM
> > > To: Gong, Zhigang; beignet@lists.freedesktop.org
> > > Cc: Gong, Zhigang
> > > Subject: RE: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid
> > > creating temporary cl source file.
> > >
> > > This patch LGTM except some questions.
> > >
> > > How didn't decide the name "stringInput.cl"?
> > Not sure what's your meaning here?
> >
> > > And since this method works, we could also remap all the input
> > > headers in API clCompileProgram to avoid create temp files under
> > > /tmp, anyway, this could be processed in another patch.
> > Right, the header files's processing in clCompileProgram could be
> > refined by using the same method.
> >
> >
> > >
> > > Luo Xionghu
> > > Best Regards
> > >
> > > -Original Message-
> > > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On
> > > Behalf Of Zhigang Gong
> > > Sent: Monday, August 31, 2015 2:30 PM
> > > To: beignet@lists.freedesktop.org
> > > Cc: Gong, Zhigang
> > > Subject: [Beignet] [PATCH] GBE: Use addRemappedFile to avoid
> > > creating temporary cl source file.
> > >
> > > LLVM provides powerful string-remapped feature which could be used
> > > to map a string to an input file name, thus we don't need to create
> > > a temporary cl source file any more.
> > >
> > > This patch not only makes things much clearer and avoids the unnecessary
> > > file creation; it also fixes some weird directory-related problems,
> > > because beignet creates the temporary file in the /tmp directory.
> > > Clang then searches for include files in that directory by
> > > default, but the developer expects it to search the working
> > > directory first. This causes two weird things:
> > > 1. If a .cl file is including a .h file in the current directory, beignet
> > >will not find it.
> > >
> > > 2. Even if the program adds a "-I." option manually, beignet will search
> > >    /tmp first, and if there is a .h file in /tmp/ with the exact same file
> > >    name, beignet will use the file located in /tmp.
> > >
> > > Signed-off-by: Zhigang Gong 
> > > ---
> > >  backend/src/backend/program.cpp | 40
> > > ++--
> > >  1 file changed, 10 insertions(+), 30 deletions(-)
> > >
> > > diff --git a/backend/src/backend/program.cpp
> > > b/backend/src/backend/program.cpp index d9e6416..330bead 100644
> > > --- a/backend/src/backend/program.cpp
> > > +++ b/backend/src/backend/program.cpp
> > > @@ -518,7 +518,7 @@ namespace gbe {
> > >  #ifdef GBE_COMPILER_AVAILABLE
> > >BVAR(OCL_OUTPUT_BUILD_LOG, false);
> > >
> > > -  static bool buildModuleFromSource(const char* input,
> > > llvm::Module** out_module, llvm::LLVMContext* llvm_ctx,
> > > +  static bool buildModuleFromSource(const char *source,
> > > + llvm::Module** out_module, llvm::LLVMContext* llvm_ctx,
> > >  std::string
> dumpLLVMFileName,
> > > std::vector& options, size_t stringSize, char *err,
> > >  size_t *errSize) {
> > >  // Arguments to pass to the clang frontend @@ -551,8 +551,7 @@
> > > namespace gbe {
> > >  args.push_back("-triple");
> > >  args.push_back("spir");
> > >  #endif /* LLVM_VERSION_MINOR <= 2 */
> > > -args.push_back(input);
> > > -
> > > +args.push_back("stringInput.cl");
> > >  args.push_back("-ffp-contract=off");
> > >
> > >  // The compiler invocation needs a DiagnosticsEngine so it can
> > > report problems @@ -574,6 +573,9 @@ namespace gbe {
> > >[0],
> > >[0] +
> args.size(),
> > >Diags);
> > > +llvm::StringRef srcString(source);
> > > +(*CI).getPreprocessorOpts().addRemappedFile("stringInput.cl",
> > > +
> > > + llvm::MemoryBuffer::getMemBuffer(srcString).release());
> > >
> > >  // Create the compiler instance
> > >  clang::CompilerInstance Clang;
> > > @@ -670,7 +672,6 @@ namespace gbe {
> > >   std::vector&
> clOpt,
> > > 

[Beignet] [PATCH 17/19] Backend: Add ADD_ and SUB_ timestamps help functions.

2015-09-08 Thread junyan . he
From: Junyan He 

The timestamps are calculated using the Long type. Before BDW,
there is no Long type support, so we use i32 operations
to implement them.

Signed-off-by: Junyan He 
---
 backend/src/backend/gen8_context.cpp |   24 +++
 backend/src/backend/gen8_context.hpp |2 ++
 backend/src/backend/gen_context.cpp  |   53 ++
 backend/src/backend/gen_context.hpp  |2 ++
 4 files changed, 81 insertions(+)

diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index b497ee5..9b2fc97 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -943,6 +943,30 @@ namespace gbe
 p->pop();
   }
 
+  void Gen8Context::subTimestamps(GenRegister& t0, GenRegister& t1, 
GenRegister& tmp)
+  {
+p->push(); {
+  p->curr.execWidth = 1;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  p->ADD(GenRegister::retype(t0, GEN_TYPE_UL), GenRegister::retype(t0, 
GEN_TYPE_UL),
+  GenRegister::negate(GenRegister::retype(t1, GEN_TYPE_UL)));
+  p->MOV(GenRegister::retype(tmp, GEN_TYPE_UL), 
GenRegister::immuint64(0x));
+  p->ADD(GenRegister::retype(t0, GEN_TYPE_UL), GenRegister::retype(t0, 
GEN_TYPE_UL),
+  GenRegister::retype(tmp, GEN_TYPE_UL));
+} p->pop();
+  }
+
+  void Gen8Context::addTimestamps(GenRegister& t0, GenRegister& t1, 
GenRegister& tmp) {
+p->push(); {
+  p->curr.execWidth = 1;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  p->ADD(GenRegister::retype(t0, GEN_TYPE_UL), GenRegister::retype(t0, 
GEN_TYPE_UL),
+  GenRegister::retype(t1, GEN_TYPE_UL));
+} p->pop();
+  }
+
   void ChvContext::newSelection(void) {
 this->sel = GBE_NEW(SelectionChv, *this);
   }
diff --git a/backend/src/backend/gen8_context.hpp 
b/backend/src/backend/gen8_context.hpp
index 84508e9..aab1fd0 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -76,6 +76,8 @@ namespace gbe
 
   protected:
 virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, 
int sz = 0);
+virtual void subTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& 
tmp);
+virtual void addTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& 
tmp);
 virtual GenEncoder* generateEncoder(void) {
   return GBE_NEW(Gen8Encoder, this->simdWidth, 8, deviceID);
 }
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index a12d056..7789fe7 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2242,6 +2242,59 @@ namespace gbe
 #undef CALC_GID
   }
 
+  void GenContext::subTimestamps(GenRegister& t0, GenRegister& t1, 
GenRegister& tmp)
+  {
+p->push(); {
+  p->curr.execWidth = 1;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  p->SUBB(GenRegister::retype(t0, GEN_TYPE_UD),
+  GenRegister::retype(t0, GEN_TYPE_UD), GenRegister::retype(t1, 
GEN_TYPE_UD));
+  /* FIXME We can not get the acc register's value correctly by set simd = 
1. */
+  p->curr.execWidth = 8;
+  p->MOV(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD));
+  p->curr.execWidth = 1;
+  p->ADD(GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+  GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+  GenRegister::negate(GenRegister::toUniform(tmp, GEN_TYPE_UD)));
+  p->ADD(GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+  GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+  GenRegister::negate(GenRegister::retype(GenRegister::offset(t1, 0, 
sizeof(uint32_t)), GEN_TYPE_UD)));
+  // Mod 0x 
+  p->ADDC(GenRegister::retype(t0, GEN_TYPE_UD),
+  GenRegister::retype(t0, GEN_TYPE_UD), 
GenRegister::immud(0x));
+  p->curr.execWidth = 8;
+  p->MOV(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD));
+  p->curr.execWidth = 1;
+  p->ADD(GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+  GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+  GenRegister::toUniform(tmp, GEN_TYPE_UD));
+  p->ADD(GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+  GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+  GenRegister::immud(0x));
+} p->pop();
+  }
+
+  void GenContext::addTimestamps(GenRegister& t0, GenRegister& t1, 
GenRegister& tmp)
+  {
+p->push(); {
+  p->curr.execWidth = 1;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  p->ADDC(GenRegister::retype(t0, GEN_TYPE_UD),
+   

[Beignet] [PATCH 19/19] Backend: Implement StoreProfilingInstruction in GenContext.

2015-09-08 Thread junyan . he
From: Junyan He 

The offset 0 of the profiling buffer contains the log number.
We will use atomic instruction to inc it every time a log
is generated.
We will generate one log for each HW gpu thread. The log
contains the XYZ range of global work items which are executed
on this thread, the EU id, the Sub Slice id,  thread number,
and 20 points' timestamp which we are interested in.

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_context.cpp |  173 +++
 1 file changed, 173 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 26af4cd..df36e9a 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2469,6 +2469,179 @@ namespace gbe
   }
 
   void GenContext::emitStoreProfilingInstruction(const SelectionInstruction 
) {
+uint32_t simdType;
+if (this->simdWidth == 16) {
+  simdType = ir::ProfilingInfo::ProfilingSimdType16;
+} else if (this->simdWidth == 8) {
+  simdType = ir::ProfilingInfo::ProfilingSimdType8;
+} else {
+  simdType = ir::ProfilingInfo::ProfilingSimdType1;
+  GBE_ASSERT(0);
+}
+
+p->NOP();
+p->NOP();
+
+GenRegister tmArf = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+0xc0,
+0,
+GEN_TYPE_UW,
+GEN_VERTICAL_STRIDE_4,
+GEN_WIDTH_4,
+GEN_HORIZONTAL_STRIDE_1);
+GenRegister profilingReg[5];
+if (p->curr.execWidth == 16) {
+  profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+  profilingReg[1] = GenRegister::offset(profilingReg[0], 1);
+  profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(1)), 
GEN_TYPE_UD);
+  profilingReg[3] = GenRegister::offset(profilingReg[2], 1);
+  profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(2)), 
GEN_TYPE_UD);
+} else {
+  GBE_ASSERT(p->curr.execWidth == 8);
+  profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+  profilingReg[1] = GenRegister::retype(ra->genReg(insn.src(1)), 
GEN_TYPE_UD);
+  profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(2)), 
GEN_TYPE_UD);
+  profilingReg[3] = GenRegister::retype(ra->genReg(insn.src(3)), 
GEN_TYPE_UD);
+  profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(4)), 
GEN_TYPE_UD);
+}
+GenRegister tmp = ra->genReg(insn.dst(0));
+uint32_t profilingType = insn.extra.profilingType;
+uint32_t bti = insn.extra.profilingBTI;
+GBE_ASSERT(profilingType == 1);
+GenRegister flagReg = GenRegister::flag(insn.state.flag, 
insn.state.subFlag);
+GenRegister lastTsReg = GenRegister::toUniform(profilingReg[3], 
GEN_TYPE_UL);
+lastTsReg = GenRegister::offset(lastTsReg, 0, 2*sizeof(uint64_t));
+GenRegister realClock = GenRegister::offset(lastTsReg, 0, 
sizeof(uint64_t));
+GenRegister tmp0 = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL);
+
+/* MOV(4)   tmp0<1>:UW  arf_tm<4,4,1>:UW  */
+p->push(); {
+  p->curr.execWidth = 4;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  GenRegister _tmp0 = tmp0;
+  _tmp0.type = GEN_TYPE_UW;
+  _tmp0.hstride = GEN_HORIZONTAL_STRIDE_1;
+  _tmp0.vstride = GEN_VERTICAL_STRIDE_4;
+  _tmp0.width = GEN_WIDTH_4;
+  p->MOV(_tmp0, tmArf);
+} p->pop();
+
+/* Calc the time elapsed. */
+subTimestamps(tmp0, lastTsReg, tmp);
+/* Update the real clock */
+addTimestamps(realClock, tmp0, tmp);
+
+//the epilog, record the last timestamp and return.
+/* MOV(1)   epilog<1>:UL   realclock<0,1,0>:UL  */
+/* ADD(1)   epilog<1>:UL   prolog<0,1,0>:UL  */
+GenRegister prolog = GenRegister::toUniform(profilingReg[2], GEN_TYPE_UD);
+prolog = GenRegister::offset(prolog, 0, 4*sizeof(uint32_t));
+GenRegister epilog = GenRegister::offset(prolog, 0, 2*sizeof(uint32_t));
+p->push(); {
+  p->curr.execWidth = 1;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  p->MOV(epilog, GenRegister::retype(realClock, GEN_TYPE_UD));
+  p->MOV(GenRegister::offset(epilog, 0, sizeof(uint32_t)),
+  GenRegister::offset(GenRegister::retype(realClock, GEN_TYPE_UD), 0, 
sizeof(uint32_t)));
+  addTimestamps(epilog, prolog, tmp);
+} p->pop();
+
+/* Now, begin to write the results out. */
+// Inc the log items number.
+p->push(); {
+  //ptr[0] is the total count of the log items.
+  GenRegister sndMsg = GenRegister::retype(tmp, GEN_TYPE_UD);
+  sndMsg.width = GEN_WIDTH_8;
+  sndMsg.hstride = GEN_HORIZONTAL_STRIDE_1;
+  sndMsg.vstride = GEN_VERTICAL_STRIDE_8;
+  p->curr.execWidth = 8;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  p->MOV(sndMsg, GenRegister::immud(0x0));
+
+  GenRegister incRes = GenRegister::offset(sndMsg, 1);
+  p->push();
+  {
+

[Beignet] [PATCH 18/19] Backend; Implement emitCalcTimestampInstruction in GenContext.

2015-09-08 Thread junyan . he
From: Junyan He 

We will maintain a real clock to record the real execution time
of the original code. We do not want to introduce overhead
because of adding the profiling instructions, so every time
we enter a profiling instructions block, we will calculate the
real time clock value and update the real clock, and when we leave
the profiling instructions block, we will record the time
stamp of that leave point.

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_context.cpp |  115 ++-
 1 file changed, 114 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 7789fe7..26af4cd 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2351,8 +2351,121 @@ namespace gbe
 return;
   }
 
-  void GenContext::emitCalcTimestampInstruction(const SelectionInstruction 
) {
+  /* We will record at most 20 timestamps, each one is 16bits. We also will 
record the
+ prolog and epilog timestamps in 64 bits. So the format of the curbe 
timestamp reg is:
+ -
+ | ts0  | ts1  | ts2  | ts3  | ts4  | ts5  | ts6  | ts7  |  profilingReg0
+ | ts8  | ts9  | ts10 | ts11 | ts12 | ts13 | ts14 | ts15 |  profilingReg1
+ | ts16 | ts17 | ts18 | ts19 |prolog   |epilog   |  profilingReg2
+ -
+ |tmp0 |tmp1 |lasttimestamp|  real clock |  profilingReg3
+ -
+ | | gX s | gX e | gY s | gY e | gZ s | gZ e |  profilingReg4
+ -
+ */
+  void GenContext::emitCalcTimestampInstruction(const SelectionInstruction 
)
+  {
+uint32_t pointNum = insn.extra.pointNum;
+uint32_t tsType = insn.extra.timestampType;
+GenRegister flagReg = GenRegister::flag(insn.state.flag, 
insn.state.subFlag);
 
+GBE_ASSERT(tsType == 1);
+GenRegister tmArf = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+0xc0,
+0,
+GEN_TYPE_UW,
+GEN_VERTICAL_STRIDE_4,
+GEN_WIDTH_4,
+GEN_HORIZONTAL_STRIDE_1);
+GenRegister profilingReg[5];
+GenRegister tmp;
+if (p->curr.execWidth == 16) {
+  profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+  profilingReg[1] = GenRegister::offset(profilingReg[0], 1);
+  profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(1)), 
GEN_TYPE_UD);
+  profilingReg[3] = GenRegister::offset(profilingReg[2], 1);
+  profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(2)), 
GEN_TYPE_UD);
+  if (insn.dstNum == 4) {
+tmp = GenRegister::retype(ra->genReg(insn.dst(3)), GEN_TYPE_UD);
+  } else {
+GBE_ASSERT(insn.dstNum == 3);
+tmp = GenRegister::toUniform(profilingReg[4], GEN_TYPE_UL);
+  }
+} else {
+  GBE_ASSERT(p->curr.execWidth == 8);
+  profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+  profilingReg[1] = GenRegister::retype(ra->genReg(insn.src(1)), 
GEN_TYPE_UD);
+  profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(2)), 
GEN_TYPE_UD);
+  profilingReg[3] = GenRegister::retype(ra->genReg(insn.src(3)), 
GEN_TYPE_UD);
+  profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(4)), 
GEN_TYPE_UD);
+  if (insn.dstNum == 6) {
+tmp = GenRegister::retype(ra->genReg(insn.dst(5)), GEN_TYPE_UD);
+  } else {
+GBE_ASSERT(insn.dstNum == 5);
+tmp = GenRegister::toUniform(profilingReg[4], GEN_TYPE_UL);
+  }
+}
+GenRegister tmp0 = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL);
+GenRegister lastTsReg = GenRegister::toUniform(profilingReg[3], 
GEN_TYPE_UL);
+lastTsReg = GenRegister::offset(lastTsReg, 0, 2*sizeof(uint64_t));
+GenRegister realClock = GenRegister::offset(lastTsReg, 0, 
sizeof(uint64_t));
+
+/* MOV(4)   tmp0<1>:UW  arf_tm<4,4,1>:UW  */
+p->push(); {
+  p->curr.execWidth = 4;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  GenRegister _tmp0 = tmp0;
+  _tmp0.type = GEN_TYPE_UW;
+  _tmp0.hstride = GEN_HORIZONTAL_STRIDE_1;
+  _tmp0.vstride = GEN_VERTICAL_STRIDE_4;
+  _tmp0.width = GEN_WIDTH_4;
+  p->MOV(_tmp0, tmArf);
+} p->pop();
+
+/* Calc the time elapsed. */
+// SUB(1)  tmp0<1>:UL  tmp0<1>:UL   lastTS<0,1,0>
+// ADD(1)  tmp0<1>:UL  tmp0<1>:UL   0x//Mod OP */
+subTimestamps(tmp0, lastTsReg, tmp);
+
+/* Update the real clock
+   ADD(1)   realclock<1>:UL  realclock<1>:UL  tmp0<1>:UL */
+addTimestamps(realClock, tmp0, tmp);
+
+/* We just record timestamp of the first time this point is reached. If 
the this point is
+   in loop, it can be reached many times. We will 

[Beignet] [PATCH 13/19] Add profiling info APIs to runtime.

2015-09-08 Thread junyan . he
From: Junyan He 

Signed-off-by: Junyan He 
---
 backend/src/backend/program.cpp |   26 +-
 backend/src/backend/program.h   |   11 +++
 backend/src/backend/program.hpp |   22 ++
 backend/src/gbe_bin_interpreter.cpp |4 
 src/cl_gbe_loader.cpp   |   15 +++
 src/cl_gbe_loader.h |3 +++
 6 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 9408621..2c28e6d 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -88,12 +88,14 @@ namespace gbe {
 
   Kernel::Kernel(const std::string ) :
 name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), 
useSLM(false),
-slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL), 
printfSet(NULL) {}
+slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL), 
printfSet(NULL),
+profilingInfo(NULL) {}
   Kernel::~Kernel(void) {
 if(ctx) GBE_DELETE(ctx);
 if(samplerSet) GBE_DELETE(samplerSet);
 if(imageSet) GBE_DELETE(imageSet);
 if(printfSet) GBE_DELETE(printfSet);
+if(profilingInfo) GBE_DELETE(profilingInfo);
 GBE_SAFE_DELETE_ARRAY(args);
   }
   int32_t Kernel::getCurbeOffset(gbe_curbe_type type, uint32_t subType) const {
@@ -159,6 +161,7 @@ namespace gbe {
 for (const auto  : set) {
   const std::string  = pair.first;
   Kernel *kernel = this->compileKernel(unit, name, 
!OCL_STRICT_CONFORMANCE, OCL_PROFILING_LOG);
+  kernel->setProfilingInfo(new 
ir::ProfilingInfo(*unit.getProfilingInfo()));
   kernel->setSamplerSet(pair.second->getSamplerSet());
   kernel->setImageSet(pair.second->getImageSet());
   kernel->setPrintfSet(pair.second->getPrintfSet());
@@ -1092,6 +1095,21 @@ namespace gbe {
 kernel->getSamplerData(samplers);
   }
 
+  static void* kernelDupProfiling(gbe_kernel gbeKernel) {
+if (gbeKernel == NULL) return NULL;
+const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+return kernel->dupProfilingInfo();
+  }
+  static uint32_t kernelGetProfilingBTI(gbe_kernel gbeKernel) {
+if (gbeKernel == NULL) return 0;
+const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+return kernel->getProfilingBTI();
+  }
+  static void kernelOutputProfiling(void *profiling_info, void* buf) {
+if (profiling_info == NULL) return;
+ir::ProfilingInfo *pi = (ir::ProfilingInfo *)profiling_info;
+return pi->outputProfilingInfo(buf);
+  }
   static uint32_t kernelGetPrintfNum(void * printf_info) {
 if (printf_info == NULL) return 0;
 const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
@@ -1213,6 +1231,9 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb 
*gbe_kernel_get_sampler_data =
 GBE_EXPORT_SYMBOL gbe_kernel_get_compile_wg_size_cb 
*gbe_kernel_get_compile_wg_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = 
NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = 
NULL;
+GBE_EXPORT_SYMBOL gbe_output_profiling_cb *gbe_output_profiling = NULL;
+GBE_EXPORT_SYMBOL gbe_dup_profiling_cb *gbe_dup_profiling = NULL;
+GBE_EXPORT_SYMBOL gbe_get_profiling_bti_cb *gbe_get_profiling_bti = NULL;
 GBE_EXPORT_SYMBOL gbe_get_printf_num_cb *gbe_get_printf_num = NULL;
 GBE_EXPORT_SYMBOL gbe_dup_printfset_cb *gbe_dup_printfset = NULL;
 GBE_EXPORT_SYMBOL gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti = NULL;
@@ -1261,7 +1282,10 @@ namespace gbe
   gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
   gbe_kernel_get_image_size = gbe::kernelGetImageSize;
   gbe_kernel_get_image_data = gbe::kernelGetImageData;
+  gbe_get_profiling_bti = gbe::kernelGetProfilingBTI;
   gbe_get_printf_num = gbe::kernelGetPrintfNum;
+  gbe_dup_profiling = gbe::kernelDupProfiling;
+  gbe_output_profiling = gbe::kernelOutputProfiling;
   gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
   gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
   gbe_dup_printfset = gbe::kernelDupPrintfSet;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 4402956..7de8fd6 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -141,6 +141,17 @@ extern gbe_kernel_get_image_size_cb 
*gbe_kernel_get_image_size;
 typedef void (gbe_kernel_get_image_data_cb)(gbe_kernel gbeKernel, ImageInfo 
*images);
 extern gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data;
 
+/*! Get whether we are in the code profiling mode */
+typedef void (gbe_output_profiling_cb)(void* profiling_info, void* buf);
+extern gbe_output_profiling_cb *gbe_output_profiling;
+
+/*! Get the profiling bti */
+typedef uint32_t (gbe_get_profiling_bti_cb)(gbe_kernel gbeKernel);
+extern gbe_get_profiling_bti_cb *gbe_get_profiling_bti;
+
+typedef void* 

[Beignet] [PATCH 04/19] Backend: Add profiling registers to curbe.

2015-09-08 Thread junyan . he
From: Junyan He 

1. Add five timestamp registers and one pointer register
   into curbe. The five timestamp registers will hold
   all the information of profiling timestamps, including
   20 uint timestamps for each point, 1 ulong prolog holding
   the start time and 1 ulong epilog holding the
   end time of that kernel. The pointer register will hold
   the log buffer address.
2. Delete the unused laneid string in the specialRegMean.

Signed-off-by: Junyan He 
---
 backend/src/ir/profile.cpp |   15 +--
 backend/src/ir/profile.hpp |8 +++-
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 37f2d3d..ad343d5 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -44,9 +44,14 @@ namespace ir {
 "retVal", "slm_offset",
 "printf_buffer_pointer", "printf_index_buffer_pointer",
 "dwblockip",
-"lane_id",
 "invalid",
-"bti_utility"
+"bti_utility",
+"profiling_buffer_pointer",
+"profiling_timestamps0",
+"profiling_timestamps1",
+"profiling_timestamps2",
+"profiling_timestamps3",
+"profiling_timestamps4"
 };
 
 #if GBE_DEBUG
@@ -92,6 +97,12 @@ namespace ir {
   DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0);
   DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
   DECL_NEW_REG(FAMILY_DWORD, btiUtil, 1);
+  DECL_NEW_REG(FAMILY_DWORD, profilingbptr, 1);
+  DECL_NEW_REG(FAMILY_DWORD, profilingts0, 0);
+  DECL_NEW_REG(FAMILY_DWORD, profilingts1, 0);
+  DECL_NEW_REG(FAMILY_DWORD, profilingts2, 0);
+  DECL_NEW_REG(FAMILY_DWORD, profilingts3, 0);
+  DECL_NEW_REG(FAMILY_DWORD, profilingts4, 0);
 }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index bf909be..e15a457 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -74,7 +74,13 @@ namespace ir {
 static const Register dwblockip = Register(30);  // blockip
 static const Register invalid = Register(31);  // used for valid 
comparation.
 static const Register btiUtil = Register(32);  // used for mixed pointer 
as bti utility.
-static const uint32_t regNum = 33; // number of special 
registers
+static const Register profilingbptr = Register(33); // buffer addr for 
profiling.
+static const Register profilingts0 = Register(34); // timestamp for 
profiling.
+static const Register profilingts1 = Register(35); // timestamp for 
profiling.
+static const Register profilingts2 = Register(36); // timestamp for 
profiling.
+static const Register profilingts3 = Register(37); // timestamp for 
profiling.
+static const Register profilingts4 = Register(38); // timestamp for 
profiling.
+static const uint32_t regNum = 39; // number of special 
registers
 extern const char *specialRegMean[];   // special register name.
   } /* namespace ocl */
 
-- 
1.7.9.5



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 09/19] Backend: Add CalcTimestamp and StoreProfiling to insn selection.

2015-09-08 Thread junyan . he
From: Junyan He 

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_context.cpp|9 ++
 backend/src/backend/gen_context.hpp|2 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx|2 +
 backend/src/backend/gen_insn_selection.cpp |  140 
 backend/src/backend/gen_insn_selection.hpp |8 ++
 backend/src/backend/gen_insn_selection.hxx |2 +
 6 files changed, 163 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 25fdf08..435b224 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -32,6 +32,7 @@
 #include "backend/gen/gen_mesa_disasm.h"
 #include "ir/function.hpp"
 #include "ir/value.hpp"
+#include "ir/profiling.hpp"
 #include "sys/cvar.hpp"
 #include 
 #include 
@@ -52,6 +53,7 @@ namespace gbe
 this->asmFileName = NULL;
 this->ifEndifFix = false;
 this->regSpillTick = 0;
+this->inProfilingMode = false;
   }
 
   GenContext::~GenContext(void) {
@@ -2183,6 +2185,13 @@ namespace gbe
 p->TYPED_WRITE(header, true, bti);
   }
 
+  void GenContext::emitCalcTimestampInstruction(const SelectionInstruction 
) {
+
+  }
+
+  void GenContext::emitStoreProfilingInstruction(const SelectionInstruction 
) {
+  }
+
   void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int 
sz) {
 if (sz == 0)
   sz = 8;
diff --git a/backend/src/backend/gen_context.hpp 
b/backend/src/backend/gen_context.hpp
index 8c7b821..bbd48cf 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -173,6 +173,8 @@ namespace gbe
 void emitGetImageInfoInstruction(const SelectionInstruction );
 virtual void emitI64MULInstruction(const SelectionInstruction );
 virtual void emitI64DIVREMInstruction(const SelectionInstruction );
+void emitCalcTimestampInstruction(const SelectionInstruction );
+void emitStoreProfilingInstruction(const SelectionInstruction );
 void scratchWrite(const GenRegister header, uint32_t offset, uint32_t 
reg_num, uint32_t reg_type, uint32_t channel_mode);
 void scratchRead(const GenRegister dst, const GenRegister header, uint32_t 
offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
 unsigned beforeMessage(const SelectionInstruction , GenRegister bti, 
GenRegister flagTemp, unsigned desc);
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx 
b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index d073770..9022d5d 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -43,3 +43,5 @@ DECL_GEN7_SCHEDULE(Atomic,  80,1,1)
 DECL_GEN7_SCHEDULE(I64MUL,  20,40,  20)
 DECL_GEN7_SCHEDULE(I64SATADD,   20,40,  20)
 DECL_GEN7_SCHEDULE(I64SATSUB,   20,40,  20)
+DECL_GEN7_SCHEDULE(CalcTimestamp,   80,1,1)
+DECL_GEN7_SCHEDULE(StoreProfiling,   80,1,1)
diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index ab00269..e9a4b61 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -663,6 +663,10 @@ namespace gbe
 void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool 
is3D);
 /*! Get image information */
 void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, 
uint32_t bti);
+/*! Calculate the timestamp */
+void CALC_TIMESTAMP(GenRegister ts[4], int tsN, GenRegister tmp, uint32_t 
pointNum, uint32_t tsType);
+/*! Store the profiling info */
+void STORE_PROFILING(uint32_t profilingType, uint32_t bti, GenRegister 
tmp0, GenRegister tmp1, GenRegister ts[4], int tsNum);
 /*! Multiply 64-bit integers */
 void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool 
native_long);
 /*! 64-bit integer division */
@@ -1774,6 +1778,55 @@ namespace gbe
   insn->dst(i + 1) = tmp[i];
   }
 
+  void Selection::Opaque::CALC_TIMESTAMP(GenRegister ts[4], int tsN, 
GenRegister tmp, uint32_t pointNum, uint32_t tsType) {
+SelectionInstruction *insn = NULL;
+if (!this->hasLongType()) {
+  insn = this->appendInsn(SEL_OP_CALC_TIMESTAMP, tsN + 1, tsN);
+} else {// No need for tmp
+  insn = this->appendInsn(SEL_OP_CALC_TIMESTAMP, tsN, tsN);
+}
+
+for (int i = 0; i < tsN; i++) {
+  insn->src(i) = ts[i];
+  insn->dst(i) = ts[i];
+}
+
+if (!this->hasLongType())
+  insn->dst(tsN) = tmp;
+
+insn->extra.pointNum = static_cast(pointNum);
+insn->extra.timestampType = static_cast(tsType);
+  }
+
+  void Selection::Opaque::STORE_PROFILING(uint32_t profilingType, uint32_t bti,
+GenRegister tmp0, GenRegister tmp1, GenRegister ts[4], int 
tsNum) {
+if (tsNum == 3) { // 

[Beignet] [PATCH 14/19] Runtime: Bind the profiling buffer when profiling enabled.

2015-09-08 Thread junyan . he
From: Junyan He 

Signed-off-by: Junyan He 
---
 src/cl_command_queue.c  |8 ++
 src/cl_command_queue_gen7.c |   37 +++
 src/cl_driver.h |   16 
 src/cl_driver_defs.c|5 
 src/intel/intel_gpgpu.c |   58 +++
 src/intel/intel_gpgpu.h |3 ++-
 6 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4b92311..a345eb9 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -223,6 +223,7 @@ cl_command_queue_flush_gpgpu(cl_command_queue queue, 
cl_gpgpu gpgpu)
   size_t global_wk_sz[3];
   size_t outbuf_sz = 0;
   void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz, 
_sz);
+  void* profiling_info;
 
   if (cl_gpgpu_flush(gpgpu) < 0)
 return CL_OUT_OF_RESOURCES;
@@ -246,6 +247,13 @@ cl_command_queue_flush_gpgpu(cl_command_queue queue, 
cl_gpgpu gpgpu)
 global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0;
 cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz);
   }
+
+  /* If have profiling info, output it. */
+  profiling_info = cl_gpgpu_get_profiling_info(gpgpu);
+  if (profiling_info) {
+interp_output_profiling(profiling_info, 
cl_gpgpu_map_profiling_buffer(gpgpu));
+cl_gpgpu_unmap_profiling_buffer(gpgpu);
+  }
   return CL_SUCCESS;
 }
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 0e60528..6b32a7e 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -262,6 +262,36 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
 }
 
 static int
+cl_bind_profiling(cl_gpgpu gpgpu, uint32_t simd_sz, cl_kernel ker, size_t 
global_sz, size_t local_sz, uint32_t bti) {
+  int32_t offset;
+  int i = 0;
+  int thread_num;
+  if (simd_sz == 16) {
+for(i = 0; i < 3; i++) {
+  offset = interp_kernel_get_curbe_offset(ker->opaque, 
GBE_CURBE_PROFILING_TIMESTAMP0 + i, 0);
+  assert(offset >= 0);
+  memset(ker->curbe + offset, 0x0, sizeof(uint32_t)*8*2);
+  thread_num = (local_sz + 15)/16;
+}
+  } else {
+assert(simd_sz == 8);
+for(i = 0; i < 5; i++) {
+  offset = interp_kernel_get_curbe_offset(ker->opaque, 
GBE_CURBE_PROFILING_TIMESTAMP0 + i, 0);
+  assert(offset >= 0);
+  memset(ker->curbe + offset, 0x0, sizeof(uint32_t)*8);
+  thread_num = (local_sz + 7)/8;
+}
+  }
+
+  offset = interp_kernel_get_curbe_offset(ker->opaque, 
GBE_CURBE_PROFILING_BUF_POINTER, 0);
+  thread_num = thread_num*(global_sz/local_sz);
+  if (cl_gpgpu_set_profiling_buffer(gpgpu, thread_num*128 + 4, offset, bti))
+return -1;
+
+  return 0;
+}
+
+static int
 cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int 
printf_num, size_t global_sz) {
   int32_t value = GBE_CURBE_PRINTF_INDEX_POINTER;
   int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
@@ -355,6 +385,13 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
 if (cl_bind_printf(gpgpu, ker, printf_info, printf_num, global_size) != 0)
   goto error;
   }
+  if (interp_get_profiling_bti(ker->opaque) != 0) {
+if (cl_bind_profiling(gpgpu, simd_sz, ker, global_size, local_sz, 
interp_get_profiling_bti(ker->opaque)))
+  goto error;
+cl_gpgpu_set_profiling_info(gpgpu, interp_dup_profiling(ker->opaque));
+  } else {
+   cl_gpgpu_set_profiling_info(gpgpu, NULL);
+  }
 
   /* Bind user buffers */
   cl_command_queue_bind_surface(queue, ker);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 1ab4dff..f130a8e 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -252,6 +252,22 @@ extern cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf;
 typedef void (cl_gpgpu_unref_batch_buf_cb)(void*);
 extern cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf;
 
+/* Set the profiling buffer */
+typedef int (cl_gpgpu_set_profiling_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, 
uint8_t);
+extern cl_gpgpu_set_profiling_buffer_cb *cl_gpgpu_set_profiling_buffer;
+
+typedef int (cl_gpgpu_set_profiling_info_cb)(cl_gpgpu, void *);
+extern cl_gpgpu_set_profiling_info_cb *cl_gpgpu_set_profiling_info;
+
+typedef void* (cl_gpgpu_get_profiling_info_cb)(cl_gpgpu);
+extern cl_gpgpu_get_profiling_info_cb *cl_gpgpu_get_profiling_info;
+
+typedef void* (cl_gpgpu_map_profiling_buffer_cb)(cl_gpgpu);
+extern cl_gpgpu_map_profiling_buffer_cb *cl_gpgpu_map_profiling_buffer;
+
+typedef void (cl_gpgpu_unmap_profiling_buffer_cb)(cl_gpgpu);
+extern cl_gpgpu_unmap_profiling_buffer_cb *cl_gpgpu_unmap_profiling_buffer;
+
 /* Set the printf buffer */
 typedef int (cl_gpgpu_set_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, 
uint32_t, uint8_t);
 extern cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index b77acdc..0d6fa9a 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -90,6 +90,11 @@ LOCAL cl_gpgpu_event_get_exec_timestamp_cb 

[Beignet] [PATCH 07/19] Backend: Insert store_profiling before lowed return.

2015-09-08 Thread junyan . he
From: Junyan He 

After the lowering return pass, a new block which contains just
one RET instruction will be generated, and all RET
INSTs in the middle will be replaced by BRA INSTs.
We want our store_profiling instruction to be inserted
just before that return instruction and outside of any
conditional blocks. So we postpone inserting the
STORE_PROFILING until this point.

Signed-off-by: Junyan He 
---
 backend/src/ir/lowering.cpp |7 +++
 1 file changed, 7 insertions(+)

diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 9fcdf74..b312131 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -52,6 +52,13 @@ namespace ir {
 const LabelIndex index = this->label();
 this->LABEL(index);
 const BasicBlock *lastBlock = this->bb;
+
+/* Append the STORE_PROFILING just before return. */
+if (unit.getInProfilingMode() == true) {
+  this->STORE_PROFILING(this->getUnit().getProfilingInfo()->getBTI(),
+
this->getUnit().getProfilingInfo()->getProfilingType());
+}
+
 this->RET();
 
 // Now traverse all instructions and replace all returns by GOTO index
-- 
1.7.9.5



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 15/19] Backend: Fix two bugs about curbe related pointer.

2015-09-08 Thread junyan . he
From: Junyan He 

1. rename __gen_ocl_timestamp_buf to __gen_ocl_profiling_buf
2. printfbptr, printfiptr and profilingbptr should be 64 bits
   on BDW and later platforms. So just set them to QWORD.

Signed-off-by: Junyan He 
---
 backend/src/ir/profile.cpp|6 +++---
 backend/src/llvm/llvm_gen_backend.cpp |8 +---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index ad343d5..3de1ef7 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -92,12 +92,12 @@ namespace ir {
   DECL_NEW_REG(FAMILY_DWORD, one, 1);
   DECL_NEW_REG(FAMILY_WORD, retVal, 1);
   DECL_NEW_REG(FAMILY_DWORD, slmoffset, 1);
-  DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
-  DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
+  DECL_NEW_REG(FAMILY_QWORD, printfbptr, 1);
+  DECL_NEW_REG(FAMILY_QWORD, printfiptr, 1);
   DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0);
   DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
   DECL_NEW_REG(FAMILY_DWORD, btiUtil, 1);
-  DECL_NEW_REG(FAMILY_DWORD, profilingbptr, 1);
+  DECL_NEW_REG(FAMILY_QWORD, profilingbptr, 1);
   DECL_NEW_REG(FAMILY_DWORD, profilingts0, 0);
   DECL_NEW_REG(FAMILY_DWORD, profilingts1, 0);
   DECL_NEW_REG(FAMILY_DWORD, profilingts2, 0);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index 5c2b590..233f7e2 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1112,7 +1112,7 @@ namespace gbe
 } else if 
(origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
   new_bti = btiBase;
   incBtiBase();
-} else if (origin->getName().equals(StringRef("__gen_ocl_timestamp_buf"))) 
{
+} else if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) 
{
   new_bti = btiBase;
   incBtiBase();
 }
@@ -2484,10 +2484,12 @@ namespace gbe
   } else {
 if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
   
ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast())->second);
-  regTranslator.newScalarProxy(ir::ocl::printfbptr, 
const_cast());
+  this->newRegister(const_cast(), NULL, true);
+  ctx.CVT(ir::TYPE_U32, ir::TYPE_U64, 
getRegister(const_cast()), ir::ocl::printfbptr);
 } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) 
{
   
ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast())->second);
-  regTranslator.newScalarProxy(ir::ocl::printfiptr, 
const_cast());
+  this->newRegister(const_cast(), NULL, true);
+  ctx.CVT(ir::TYPE_U32, ir::TYPE_U64, 
getRegister(const_cast()), ir::ocl::printfiptr);
 } else if(v.getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
   
ctx.getUnit().getProfilingInfo()->setBTI(BtiMap.find(const_cast())->second);
   regTranslator.newScalarProxy(ir::ocl::profilingbptr, 
const_cast());
-- 
1.7.9.5



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 10/19] Backend: Add a auxiliary function to convert GenReg to uniform.

2015-09-08 Thread junyan . he
From: Junyan He 

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_register.hpp |9 +
 1 file changed, 9 insertions(+)

diff --git a/backend/src/backend/gen_register.hpp 
b/backend/src/backend/gen_register.hpp
index 4f37e30..9e9e0e4 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -274,6 +274,15 @@ namespace gbe
   return r;
 }
 
+static INLINE GenRegister toUniform(GenRegister reg, uint32_t type) {
+  GenRegister r = reg;
+  r.type = type;
+  r.hstride = GEN_HORIZONTAL_STRIDE_0;
+  r.vstride = GEN_VERTICAL_STRIDE_0;
+  r.width = GEN_WIDTH_1;
+  return r;
+}
+
 static INLINE uint32_t grfOffset(GenRegister reg) {
   return reg.nr * GEN_REG_SIZE + reg.subnr;
 }
-- 
1.7.9.5



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 16/19] Backend: Avoid CALC_TIMESTAMP and STORE_PROFILING being scheduled.

2015-09-08 Thread junyan . he
From: Junyan He 

We do not want CALC_TIMESTAMP and STORE_PROFILING to be scheduled
with other instructions, because reordering them would record wrong timestamps.

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_insn_scheduling.cpp |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_insn_scheduling.cpp 
b/backend/src/backend/gen_insn_scheduling.cpp
index 358a2ce..43f67c9 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -589,7 +589,9 @@ namespace gbe
   || node->insn.opcode == SEL_OP_ENDIF
   || node->insn.opcode == SEL_OP_WHILE
   || node->insn.opcode == SEL_OP_READ_ARF
-  || node->insn.opcode == SEL_OP_BARRIER)
+  || node->insn.opcode == SEL_OP_BARRIER
+  || node->insn.opcode == SEL_OP_CALC_TIMESTAMP
+  || node->insn.opcode == SEL_OP_STORE_PROFILING)
 tracker.makeBarrier(insnID, insnNum);
 }
 
-- 
1.7.9.5



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 11/19] Backend: Add profilingProlog function for GenContext.

2015-09-08 Thread junyan . he
From: Junyan He 

The profilingProlog will collect useful information
for profiling, including XYZ global range and prolog
timestamp.

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_context.cpp |  116 +++
 backend/src/backend/gen_context.hpp |2 +
 2 files changed, 118 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 435b224..696d86a 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2185,6 +2185,119 @@ namespace gbe
 p->TYPED_WRITE(header, true, bti);
   }
 
+  void GenContext::calcGlobalXYZRange(GenRegister& reg, GenRegister& tmp, int 
flag, int subFlag)
+  {
+#define CALC_GID(dim)  do {\
+  GenRegister g##dim##start = GenRegister::offset(reg, 0, 8 + dim*8); \
+  GenRegister g##dim##end = GenRegister::offset(g##dim##start, 0, 4);  \
+  GenRegister id##dim = 
GenRegister::toUniform(ra->genReg(GenRegister::ud16grf(ir::ocl::lid##dim)), 
GEN_TYPE_UD); \
+  GenRegister localsz##dim = 
GenRegister::toUniform(ra->genReg(GenRegister::ud1grf(ir::ocl::lsize##dim)), 
GEN_TYPE_UD); \
+  GenRegister gid##dim = 
GenRegister::toUniform(ra->genReg(GenRegister::ud1grf(ir::ocl::groupid##dim)), 
GEN_TYPE_UD); \
+  GenRegister goffset##dim = 
GenRegister::toUniform(ra->genReg(GenRegister::ud1grf(ir::ocl::goffset##dim)), 
GEN_TYPE_UD); \
+  p->MUL(g##dim##start, localsz##dim, gid##dim); \
+  p->ADD(g##dim##start, g##dim##start, id##dim); \
+  p->ADD(g##dim##start, g##dim##start, goffset##dim); \
+  GenRegister ip; \
+  p->MOV(flagReg, GenRegister::immuw(0x0)); \
+  p->curr.useFlag(flag, subFlag); \
+  p->curr.predicate = GEN_PREDICATE_NONE; \
+  if (this->simdWidth == 16) \
+  p->curr.execWidth = 16; \
+  else \
+  p->curr.execWidth = 8; \
+  if (!isDWLabel()) { \
+ip = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip)); \
+p->CMP(GEN_CONDITIONAL_EQ, ip, GenRegister::immuw(0x)); \
+  } else { \
+ip = ra->genReg(GenRegister::ud16grf(ir::ocl::dwblockip)); \
+p->CMP(GEN_CONDITIONAL_EQ, ip, GenRegister::immud(0x)); \
+  } \
+  p->curr.execWidth = 1; \
+  p->MOV(GenRegister::retype(tmp, GEN_TYPE_UW), flagReg); \
+  if (this->simdWidth == 16) \
+  p->OR(tmp, tmp, GenRegister::immud(0x)); \
+  else \
+  p->OR(tmp, tmp, GenRegister::immud(0xff00)); \
+  p->FBL(tmp, tmp); \
+  p->ADD(tmp, tmp, GenRegister::negate(GenRegister::immud(0x1))); \
+  p->MUL(tmp, tmp, GenRegister::immud(4)); \
+  p->MOV(GenRegister::addr1(0), GenRegister::retype(tmp, GEN_TYPE_UW)); \
+  GenRegister dimEnd = GenRegister::to_indirect1xN(id##dim, 0); \
+  p->MOV(tmp, dimEnd); \
+  p->MUL(g##dim##end, localsz##dim, gid##dim); \
+  p->ADD(g##dim##end, g##dim##end, tmp); \
+  p->ADD(g##dim##end, g##dim##end, goffset##dim); \
+} while(0)
+
+GenRegister flagReg = GenRegister::flag(flag, subFlag);
+p->push(); {
+  p->curr.execWidth = 1;
+  p->curr.predicate = GEN_PREDICATE_NONE;
+  p->curr.noMask = 1;
+  CALC_GID(0);
+  CALC_GID(1);
+  CALC_GID(2);
+} p->pop();
+
+#undef CALC_GID
+  }
+
+  void GenContext::profilingProlog(void) {
+// record the prolog, globalXYZ and lasttimestamp at the very beginning.
+GenRegister profilingReg2, profilingReg3, profilingReg4;
+GenRegister tmArf = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+0xc0,
+0,
+GEN_TYPE_UW,
+GEN_VERTICAL_STRIDE_4,
+GEN_WIDTH_4,
+GEN_HORIZONTAL_STRIDE_1);
+if (this->simdWidth == 16) {
+  profilingReg2 = ra->genReg(GenRegister::ud16grf(ir::ocl::profilingts1));
+  profilingReg3 = GenRegister::offset(profilingReg2, 1);
+  profilingReg4 = ra->genReg(GenRegister::ud16grf(ir::ocl::profilingts2));
+} else {
+  GBE_ASSERT(this->simdWidth == 8);
+  profilingReg2 = ra->genReg(GenRegister::ud8grf(ir::ocl::profilingts2));
+  profilingReg3 = ra->genReg(GenRegister::ud8grf(ir::ocl::profilingts3));
+  profilingReg4 = ra->genReg(GenRegister::ud8grf(ir::ocl::profilingts4));
+}
+
+/* MOV(4)   prolog<1>:UW   arf_tm<4,4,1>:UW  */
+/* MOV(4)   lastTsReg<1>:UW  prolog<4,4,1>:UW  */
+GenRegister prolog = profilingReg2;
+prolog.type = GEN_TYPE_UW;
+prolog.hstride = GEN_HORIZONTAL_STRIDE_1;
+prolog.vstride = GEN_VERTICAL_STRIDE_4;
+prolog.width = GEN_WIDTH_4;
+prolog = GenRegister::offset(prolog, 0, 4*sizeof(uint32_t));
+
+GenRegister lastTsReg = GenRegister::toUniform(profilingReg3, GEN_TYPE_UL);
+lastTsReg = GenRegister::offset(lastTsReg, 0, 2*sizeof(uint64_t));
+lastTsReg.type = GEN_TYPE_UW;
+lastTsReg.hstride = GEN_HORIZONTAL_STRIDE_1;
+lastTsReg.vstride = GEN_VERTICAL_STRIDE_4;
+lastTsReg.width = GEN_WIDTH_4;
+
+GenRegister gids = GenRegister::toUniform(profilingReg4, GEN_TYPE_UD);
+GenRegister tmp = GenRegister::toUniform(profilingReg4, GEN_TYPE_UD);
+
+// X Y and Z
+

[Beignet] [PATCH 12/19] Backend: Add profiling registers into curbe.

2015-09-08 Thread junyan . he
From: Junyan He 

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_context.cpp |   17 +
 backend/src/backend/program.h   |6 ++
 2 files changed, 23 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 696d86a..a12d056 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2348,6 +2348,23 @@ namespace gbe
 allocCurbeReg(zero, GBE_CURBE_ZERO);
 allocCurbeReg(one, GBE_CURBE_ONE);
 allocCurbeReg(btiUtil, GBE_CURBE_BTI_UTIL);
+if (inProfilingMode) {
+  allocCurbeReg(profilingbptr, GBE_CURBE_PROFILING_BUF_POINTER);
+  allocCurbeReg(profilingts0, GBE_CURBE_PROFILING_TIMESTAMP0);
+  allocCurbeReg(profilingts1, GBE_CURBE_PROFILING_TIMESTAMP1);
+  allocCurbeReg(profilingts2, GBE_CURBE_PROFILING_TIMESTAMP2);
+  if (this->simdWidth == 8) {
+allocCurbeReg(profilingts3, GBE_CURBE_PROFILING_TIMESTAMP3);
+allocCurbeReg(profilingts4, GBE_CURBE_PROFILING_TIMESTAMP4);
+  }
+  allocCurbeReg(lsize0, GBE_CURBE_LOCAL_SIZE_X);
+  allocCurbeReg(lsize1, GBE_CURBE_LOCAL_SIZE_Y);
+  allocCurbeReg(lsize2, GBE_CURBE_LOCAL_SIZE_Z);
+  allocCurbeReg(goffset0, GBE_CURBE_GLOBAL_OFFSET_X);
+  allocCurbeReg(goffset1, GBE_CURBE_GLOBAL_OFFSET_Y);
+  allocCurbeReg(goffset2, GBE_CURBE_GLOBAL_OFFSET_Z);
+}
+
 if (stackUse.size() != 0)
   allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
 // Go over the arguments and find the related patch locations
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index af19732..4402956 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -102,6 +102,12 @@ enum gbe_curbe_type {
   GBE_CURBE_ONE,
   GBE_CURBE_SLM_OFFSET,
   GBE_CURBE_BTI_UTIL,
+  GBE_CURBE_PROFILING_BUF_POINTER,
+  GBE_CURBE_PROFILING_TIMESTAMP0,
+  GBE_CURBE_PROFILING_TIMESTAMP1,
+  GBE_CURBE_PROFILING_TIMESTAMP2,
+  GBE_CURBE_PROFILING_TIMESTAMP3,
+  GBE_CURBE_PROFILING_TIMESTAMP4,
 };
 
 /*! Extra arguments use the negative range of sub-values */
-- 
1.7.9.5



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 06/19] Backend: Add CalcTimestamp and StoreProfiling.

2015-09-08 Thread junyan . he
From: Junyan He 

When profiling is enabled, the profiling inserter function will
insert calc_timestamp for each point which we are interested
in. At the end of the kernel, just before return, we will
insert a store_profiling function call. The function will
hold a reference to the global value profiling_buf and prevent
it from being removed when optimization passes are run.

Signed-off-by: Junyan He 
---
 backend/src/llvm/llvm_gen_backend.cpp  |   43 
 backend/src/llvm/llvm_gen_ocl_function.hxx |5 
 2 files changed, 48 insertions(+)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index 4905415..5c2b590 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1112,6 +1112,9 @@ namespace gbe
 } else if 
(origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
   new_bti = btiBase;
   incBtiBase();
+} else if (origin->getName().equals(StringRef("__gen_ocl_timestamp_buf"))) 
{
+  new_bti = btiBase;
+  incBtiBase();
 }
 else if (isa(origin)
 && dyn_cast(origin)->isConstant()) {
@@ -2485,6 +2488,9 @@ namespace gbe
 } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) 
{
   
ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast())->second);
   regTranslator.newScalarProxy(ir::ocl::printfiptr, 
const_cast());
+} else if(v.getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
+  
ctx.getUnit().getProfilingInfo()->setBTI(BtiMap.find(const_cast())->second);
+  regTranslator.newScalarProxy(ir::ocl::profilingbptr, 
const_cast());
 } else if(v.getName().str().substr(0, 4) == ".str") {
   /* When there are multi printf statements in multi kernel fucntions 
within the same
  translate unit, if they have the same sting parameter, such as
@@ -3494,6 +3500,8 @@ namespace gbe
 this->newRegister();
 break;
   case GEN_OCL_PRINTF:
+  case GEN_OCL_CALC_TIMESTAMP:
+  case GEN_OCL_STORE_PROFILING:
 break;
   case GEN_OCL_NOT_FOUND:
   default:
@@ -4179,6 +4187,41 @@ namespace gbe
 assert(fmt);
 break;
   }
+  case GEN_OCL_CALC_TIMESTAMP:
+  {
+GBE_ASSERT(AI != AE);
+ConstantInt *CI = dyn_cast(*AI);
+GBE_ASSERT(CI);
+uint32_t pointNum = CI->getZExtValue();
+AI++;
+GBE_ASSERT(AI != AE);
+CI = dyn_cast(*AI);
+GBE_ASSERT(CI);
+uint32_t tsType = CI->getZExtValue();
+ctx.CALC_TIMESTAMP(pointNum, tsType);
+break;
+  }
+  case GEN_OCL_STORE_PROFILING:
+  {
+/* The profiling log always begin at 0 offset, so we
+   never need the buffer ptr value and ptrBase, and
+   no need for SUB to calculate the real address, neither.
+   We just pass down the BTI value to the instruction. */
+GBE_ASSERT(AI != AE);
+Value* llvmPtr = *AI;
+Value *bti = getBtiRegister(llvmPtr);
+GBE_ASSERT(isa(bti)); //Should never be mixed pointer.
+uint32_t index = cast(bti)->getZExtValue();
+GBE_ASSERT(btiToGen(index) == ir::MEM_GLOBAL);
+++AI;
+GBE_ASSERT(AI != AE);
+ConstantInt *CI = dyn_cast(*AI);
+GBE_ASSERT(CI);
+uint32_t ptype = CI->getZExtValue();
+ctx.getUnit().getProfilingInfo()->setProfilingType(ptype);
+//ctx.STORE_PROFILING(index, ptype);
+break;
+  }
   case GEN_OCL_SIMD_SIZE:
   {
 const ir::Register dst = this->getRegister();
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx 
b/backend/src/llvm/llvm_gen_ocl_function.hxx
index cabb225..0a6e1da 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -170,3 +170,8 @@ DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
 
 // printf function
 DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
+
+// store timestamp function
+DECL_LLVM_GEN_FUNCTION(CALC_TIMESTAMP, __gen_ocl_calc_timestamp)
+// store profiling info to the mem.
+DECL_LLVM_GEN_FUNCTION(STORE_PROFILING, __gen_ocl_store_profiling)
-- 
1.7.9.5



___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 03/19] Backend: Add ProfilingInserter and a new function pass.

2015-09-08 Thread junyan . he
From: Junyan He 

When the user enables the profiling feature, we need to insert
extra instructions to record and store the timestamps.
For now, the function pass will just insert the required
instructions at the head of the first 20 blocks. Later, we
will support inserting timestamps at any point in the code.

Signed-off-by: Junyan He 
Signed-off-by: Bai Yannan 
---
 backend/src/CMakeLists.txt  |1 +
 backend/src/llvm/llvm_profiling.cpp |  210 +++
 2 files changed, 211 insertions(+)
 create mode 100644 backend/src/llvm/llvm_profiling.cpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index daab320..e56df5e 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -87,6 +87,7 @@ set (GBE_SRC
 llvm/llvm_intrinsic_lowering.cpp
 llvm/llvm_barrier_nodup.cpp
 llvm/llvm_printf_parser.cpp
+llvm/llvm_profiling.cpp
 llvm/ExpandConstantExpr.cpp
 llvm/ExpandUtils.cpp
 llvm/PromoteIntegers.cpp
diff --git a/backend/src/llvm/llvm_profiling.cpp 
b/backend/src/llvm/llvm_profiling.cpp
new file mode 100644
index 000..c52e241
--- /dev/null
+++ b/backend/src/llvm/llvm_profiling.cpp
@@ -0,0 +1,210 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ *
+ */
+
+/**
+ * \file llvm_profiling.cpp
+ * This file will insert some instructions for each profiling point.
+ *
+ */
+
+#include 
+#include 
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#endif
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Attributes.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+#include 
+#include 
+
+
+using namespace llvm;
+using std::vector;
+
+
+namespace gbe
+{
+  using namespace ir;
+
+  class ProfilingInserter : public FunctionPass
+  {
+  public:
+static char ID;
+Module* module;
+IRBuilder<>* builder;
+Type* intTy;
+Type *ptrTy;
+int profilingType;
+
+ProfilingInserter(int profiling) : FunctionPass(ID), 
profilingType(profiling)
+{
+  module = NULL;
+  builder = NULL;
+  intTy = NULL;
+  ptrTy = NULL;
+}
+
+~ProfilingInserter(void)
+{
+}
+
+virtual const char *getPassName() const
+{
+  return "Timestamp Parser";
+}
+
+virtual bool runOnFunction(llvm::Function );
+  };
+
+  bool ProfilingInserter::runOnFunction(llvm::Function )
+  {
+bool changed = false;
+int pointNum = 0;
+
+switch (F.getCallingConv()) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+  case CallingConv::PTX_Device:
+return false;
+  case CallingConv::PTX_Kernel:
+#else
+  case CallingConv::C:
+  case CallingConv::Fast:
+  case CallingConv::SPIR_KERNEL:
+#endif
+break;
+  default:
+GBE_ASSERTM(false, "Unsupported calling convention");
+}
+
+// As we inline all function calls, so skip non-kernel functions
+bool bKernel = isKernelFunction(F);
+if (!bKernel) return changed;
+
+module = F.getParent();
+intTy = IntegerType::get(module->getContext(), 32);
+ptrTy = Type::getInt32PtrTy(module->getContext(), 1);
+builder = new IRBuilder<>(module->getContext());
+
+/* alloc a new buffer ptr to collect the timestamps. */
+builder->SetInsertPoint(F.begin()->begin());
+llvm::Constant *profilingBuf = 
module->getGlobalVariable("__gen_ocl_profiling_buf");
+if (!profilingBuf) {
+  profilingBuf = new GlobalVariable(*module, intTy, false,
+  

[Beignet] [PATCH 02/19] Backend: Add StoreProfiling and CalcTimestamp instructions

2015-09-08 Thread junyan . he
From: Junyan He 

Add two instructions for profiling usage. CalcTimestamp will
calculate the timestamps and update the timestamp in the
corresponding slot. StoreProfiling will store the information
to the buffer and generate logs.

Signed-off-by: Junyan He 
---
 backend/src/ir/instruction.cpp |   96 +++-
 backend/src/ir/instruction.hpp |   26 +++
 backend/src/ir/instruction.hxx |2 +
 3 files changed, 123 insertions(+), 1 deletion(-)

diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index f93c528..280c60d 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -677,6 +677,58 @@ namespace ir {
   static const uint32_t dstNum = 1;
 };
 
+class ALIGNED_INSTRUCTION CalcTimestampInstruction :
+  public BasePolicy,
+  public NSrcPolicy,
+  public NDstPolicy
+{
+public:
+  CalcTimestampInstruction(uint32_t pointNum, uint32_t timestampType) {
+this->opcode = OP_CALC_TIMESTAMP;
+this->timestampType = static_cast(timestampType);
+this->pointNum = static_cast(pointNum);
+  }
+
+  INLINE bool wellFormed(const Function , std::string ) const;
+  INLINE void out(std::ostream , const Function ) const {
+this->outOpcode(out);
+out << "TimeStamp pointer " << static_cast(pointNum)
+  << " (Type " << static_cast(timestampType) << ")";
+  }
+  uint32_t getPointNum(void) const { return this->pointNum; }
+  uint32_t getTimestamptType(void) const { return this->timestampType; }
+  uint8_t timestampType;   //!< Type of the time stamp, 16bits or 
32bits, eg.
+  uint8_t pointNum;//!< The insert point number.
+  Register dst[0], src[0];
+};
+
+class ALIGNED_INSTRUCTION StoreProfilingInstruction :
+  public BasePolicy,
+  public NSrcPolicy,
+  public NDstPolicy
+{
+public:
+  StoreProfilingInstruction(uint32_t bti, uint32_t profilingType) {
+this->opcode = OP_STORE_PROFILING;
+this->profilingType = static_cast(profilingType);
+this->bti = static_cast(bti);
+  }
+
+  INLINE bool wellFormed(const Function , std::string ) const;
+  INLINE void out(std::ostream , const Function ) const {
+this->outOpcode(out);
+out << " BTI " << static_cast(this->bti)
+  << " (Type " << static_cast(this->profilingType) << ")";
+  }
+
+  uint32_t getProfilingType(void) const { return this->profilingType; }
+  uint32_t getBTI(void) const { return this->bti; }
+  uint8_t profilingType; //!< Type format of profiling, 16bits or 
32bits, eg.
+  uint8_t bti;
+  Register src[0];
+  Register dst[0];
+};
+
 class ALIGNED_INSTRUCTION LoadImmInstruction :
   public BasePolicy,
   public NSrcPolicy,
@@ -1226,6 +1278,26 @@ namespace ir {
   return true;
 }
 
+INLINE bool CalcTimestampInstruction::wellFormed(const Function , 
std::string ) const {
+  if (UNLIKELY(this->timestampType != 1)) {
+whyNot = "Wrong time stamp type";
+return false;
+  }
+  if (UNLIKELY(this->pointNum >= 20 && this->pointNum != 0xff && 
this->pointNum != 0xfe)) {
+whyNot = "To much Insert pointer";
+return false;
+  }
+  return true;
+}
+
+INLINE bool StoreProfilingInstruction::wellFormed(const Function , 
std::string ) const {
+  if (UNLIKELY(this->profilingType != 1)) {
+whyNot = "Wrong profiling format";
+return false;
+  }
+  return true;
+}
+
 #undef CHECK_TYPE
 
 /
@@ -1466,6 +1538,14 @@ START_INTROSPECTION(GetImageInfoInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(GetImageInfoInstruction)
 
+START_INTROSPECTION(CalcTimestampInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(CalcTimestampInstruction)
+
+START_INTROSPECTION(StoreProfilingInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(StoreProfilingInstruction)
+
 START_INTROSPECTION(LoadImmInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(LoadImmInstruction)
@@ -1645,7 +1725,9 @@ END_FUNCTION(Instruction, Register)
 return opcode == OP_STORE ||
opcode == OP_TYPED_WRITE ||
opcode == OP_SYNC ||
-   opcode == OP_ATOMIC;
+   opcode == OP_ATOMIC ||
+   opcode == OP_CALC_TIMESTAMP ||
+   opcode == OP_STORE_PROFILING;
   }
 
 #define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
@@ -1699,6 +1781,10 @@ DECL_MEM_FN(TypedWriteInstruction, Type, 
getCoordType(void), getCoordType())
 DECL_MEM_FN(TypedWriteInstruction, uint8_t, getImageIndex(void), 
getImageIndex())
 DECL_MEM_FN(GetImageInfoInstruction, 

[Beignet] [PATCH 08/19] Backend: Add IVAR OCL_PROFILING_LOG to control profiling log.

2015-09-08 Thread junyan . he
From: Junyan He 

We add OCL_PROFILING_LOG as an int type, because there may be
different types of profiling formats in the future.

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_context.hpp   |3 +++
 backend/src/backend/gen_program.cpp   |9 -
 backend/src/backend/gen_program.hpp   |2 +-
 backend/src/backend/program.cpp   |9 +
 backend/src/backend/program.hpp   |3 ++-
 backend/src/llvm/llvm_gen_backend.hpp |3 +++
 backend/src/llvm/llvm_to_gen.cpp  |7 ++-
 backend/src/llvm/llvm_to_gen.hpp  |3 ++-
 8 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/backend/src/backend/gen_context.hpp 
b/backend/src/backend/gen_context.hpp
index 34f9293..8c7b821 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -207,6 +207,8 @@ namespace gbe
 bool relaxMath;
 bool getIFENDIFFix(void) const { return ifEndifFix; }
 void setIFENDIFFix(bool fix) { ifEndifFix = fix; }
+bool getProfilingMode(void) const { return inProfilingMode; }
+void setProfilingMode(bool b) { inProfilingMode = b; }
 CompileErrorCode getErrCode() { return errCode; }
 
   protected:
@@ -221,6 +223,7 @@ namespace gbe
   private:
 CompileErrorCode errCode;
 bool ifEndifFix;
+bool inProfilingMode;
 uint32_t regSpillTick;
 const char* asmFileName;
 /*! Build the curbe patch list for the given kernel */
diff --git a/backend/src/backend/gen_program.cpp 
b/backend/src/backend/gen_program.cpp
index 3c4983e..8f1fe8e 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -140,7 +140,8 @@ namespace gbe {
 {8, 16, false},
   };
 
-  Kernel *GenProgram::compileKernel(const ir::Unit , const std::string 
, bool relaxMath) {
+  Kernel *GenProgram::compileKernel(const ir::Unit , const std::string 
,
+bool relaxMath, int profiling) {
 #ifdef GBE_COMPILER_AVAILABLE
 // Be careful when the simdWidth is forced by the programmer. We can see it
 // when the function already provides the simd width we need to use (i.e.
@@ -172,6 +173,12 @@ namespace gbe {
   ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath);
 }
 GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
+
+if (profiling) {
+  ctx->setProfilingMode(true);
+  unit.getProfilingInfo()->setDeviceID(deviceID);
+}
+
 ctx->setASMFileName(this->asm_file_name);
 
 for (; codeGen < codeGenNum; ++codeGen) {
diff --git a/backend/src/backend/gen_program.hpp 
b/backend/src/backend/gen_program.hpp
index 75d77ba..cc1d526 100644
--- a/backend/src/backend/gen_program.hpp
+++ b/backend/src/backend/gen_program.hpp
@@ -69,7 +69,7 @@ namespace gbe
 /*! Clean LLVM resource */
 virtual void CleanLlvmResource(void);
 /*! Implements base class */
-virtual Kernel *compileKernel(const ir::Unit , const std::string 
, bool relaxMath);
+virtual Kernel *compileKernel(const ir::Unit , const std::string 
, bool relaxMath, int profiling);
 /*! Allocate an empty kernel. */
 virtual Kernel *allocateKernel(const std::string ) {
   return GBE_NEW(GenKernel, name, deviceID);
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index d9e6416..9408621 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -114,6 +114,7 @@ namespace gbe {
 #ifdef GBE_COMPILER_AVAILABLE
   BVAR(OCL_OUTPUT_GEN_IR, false);
   BVAR(OCL_STRICT_CONFORMANCE, true);
+  IVAR(OCL_PROFILING_LOG, 0, 0, 1); // Int for different profiling types.
 
   bool Program::buildFromLLVMFile(const char *fileName, const void* module, 
std::string , int optLevel) {
 ir::Unit *unit = new ir::Unit();
@@ -121,7 +122,7 @@ namespace gbe {
 if(module){
   cloned_module = llvm::CloneModule((llvm::Module*)module);
 }
-if (llvmToGen(*unit, fileName, module, optLevel, OCL_STRICT_CONFORMANCE) 
== false) {
+if (llvmToGen(*unit, fileName, module, optLevel, OCL_STRICT_CONFORMANCE, 
OCL_PROFILING_LOG) == false) {
   if (fileName)
 error = std::string(fileName) + " not found";
   delete unit;
@@ -134,10 +135,10 @@ namespace gbe {
   unit = new ir::Unit();
   if(cloned_module){
 //suppose file exists and llvmToGen will not return false.
-llvmToGen(*unit, fileName, cloned_module, 0, OCL_STRICT_CONFORMANCE);
+llvmToGen(*unit, fileName, cloned_module, 0, OCL_STRICT_CONFORMANCE, 
OCL_PROFILING_LOG);
   }else{
 //suppose file exists and llvmToGen will not return false.
-llvmToGen(*unit, fileName, module, 0, OCL_STRICT_CONFORMANCE);
+llvmToGen(*unit, fileName, module, 0, OCL_STRICT_CONFORMANCE, 
OCL_PROFILING_LOG);
   }
 }
 assert(unit->getValid());
@@ -157,7 +158,7 @@ namespace gbe {
 if (kernelNum == 0) return true;
 for (const auto  : set) {
   const 

[Beignet] [PATCH 01/19] Backend: Add ProfilingInfo class to ir.

2015-09-08 Thread junyan . he
From: Junyan He 

ProfilingInfo will play an important role in outputting
the profiling log. It will record the profiling
information and generate the logs after clFinish.

Signed-off-by: Junyan He 
---
 backend/src/CMakeLists.txt   |2 +
 backend/src/ir/profiling.cpp |   70 ++
 backend/src/ir/profiling.hpp |  132 ++
 3 files changed, 204 insertions(+)
 create mode 100644 backend/src/ir/profiling.cpp
 create mode 100644 backend/src/ir/profiling.hpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index c0d0c23..daab320 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -66,6 +66,8 @@ set (GBE_SRC
 ir/value.hpp
 ir/lowering.cpp
 ir/lowering.hpp
+ir/profiling.cpp
+ir/profiling.hpp
 ir/printf.cpp
 ir/printf.hpp
 ir/immediate.hpp
diff --git a/backend/src/ir/profiling.cpp b/backend/src/ir/profiling.cpp
new file mode 100644
index 000..fe449e6
--- /dev/null
+++ b/backend/src/ir/profiling.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ *
+ */
+/**
+ * \file profiling.cpp
+ *
+ */
+
+#include 
+#include 
+#include "ir/profiling.hpp"
+#include "src/cl_device_data.h"
+
+namespace gbe
+{
+namespace ir
+{
+  pthread_mutex_t ProfilingInfo::lock = PTHREAD_MUTEX_INITIALIZER;
+
+  void ProfilingInfo::outputProfilingInfo(void * logBuf)
+  {
+LockOutput lock;
+uint32_t logNum = *reinterpret_cast(logBuf);
+printf("Total log number is %u\n", logNum);
+ProfilingReportItem* log = 
reinterpret_cast((char*)logBuf + 4);
+for (int i = 0; i < (int)logNum; i++) {
+  GBE_ASSERT(log->simdType == ProfilingSimdType8 || log->simdType == 
ProfilingSimdType16);
+  uint32_t simd = log->simdType == ProfilingSimdType16 ? 16 : 8;
+  printf("  Log %-6d ---\n", 
i);
+  printf(" | fix functions id:%4d simd: %4d   kernel id: %4d  |\n", 
log->fixedFunctionID,
+  simd, log->kernelID);
+  if (IS_IVYBRIDGE(deviceID)) {
+printf(" | thread id:   %4d EU id:%4d   half slice id:%2d 
|\n", log->genInfo.gen7.thread_id,
+log->genInfo.gen7.eu_id, log->genInfo.gen7.half_slice_id);
+  } else if (IS_HASWELL(deviceID)) {
+printf(" | thread id: %4d  EU id:%4d half slice id:%2d slice id%2d 
|\n", log->genInfo.gen7.thread_id,
+log->genInfo.gen7.eu_id, log->genInfo.gen7.half_slice_id, 
log->genInfo.gen7.slice_id);
+  } else if (IS_BROADWELL(deviceID)) {
+printf(" | thread id: %4d  EU id:%4d  sub slice id:%2d slice id%2d 
|\n", log->genInfo.gen8.thread_id,
+log->genInfo.gen8.eu_id, log->genInfo.gen8.subslice_id, 
log->genInfo.gen8.slice_id);
+  }
+  printf(" | dispatch Mask:%4x prolog:%10lu  epilog:%10lu |\n", 
log->dispatchMask,
+  *reinterpret_cast(&(log->timestampPrologLo)),
+  *reinterpret_cast(&(log->timestampEpilogLo)));
+  printf(" | globalX:%4d~%4d  globalY:%4d~%4d  globalZ:%4d~%4d |\n", 
log->gidXStart, log->gidXEnd,
+  log->gidYStart, log->gidYEnd, log->gidZStart, log->gidZEnd);
+  for (uint32_t i = 0; i < MaxTimestampProfilingPoints - 2; i += 3) {
+printf(" |  ts%-2d:%10u  | ts%-2d:%10u  | ts%-2d:%10u  |\n", i, 
log->userTimestamp[i],
+i + 1, log->userTimestamp[i + 1], i + 2, log->userTimestamp[i + 
2]);
+  }
+  printf(" |  ts18:%10u  | ts19:%10u  |  |\n", 
log->userTimestamp[18], log->userTimestamp[19]);
+  log++;
+}
+  }
+}
+}
diff --git a/backend/src/ir/profiling.hpp b/backend/src/ir/profiling.hpp
new file mode 100644
index 000..ce9866f
--- /dev/null
+++ b/backend/src/ir/profiling.hpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * 

[Beignet] [PATCH 00/19 V2] Add Profiling support in beignet.

2015-09-08 Thread junyan . he
From: Junyan He 

The profiling support is enabled by this patch set.
The profiling information is as following:
-- Log 0 --
| fix functions id:   7 simd:   16   kernel id:0  |
| thread id:  0 EU id:   1   half slice id: 0 |
| dispatch Mask:   1 prolog:   197  epilog:  6699 |
| globalX:   4~   4  globalY:   0~   0  globalZ:   0~   0 |
|  ts0 :64  | ts1 : 0  | ts2 :   930  |
|  ts3 : 0  | ts4 :  1046  | ts5 :  1170  |
|  ts6 : 0  | ts7 : 0  | ts8 : 0  |
|  ts9 :  1624  | ts10:  1838  | ts11: 0  |
|  ts12:  2032  | ts13: 0  | ts14:  2312  |
|  ts15:  2560  | ts16: 0  | ts17: 0  |
|  ts18: 0  | ts19:  2972  |  |

Each hw thread will create one such log item.
Prolog is the timestamp when we enter this kernel, while
epilog is the timestamp when we finish and leave it.
ts0~ts19 record the time offsets from the prolog, but
the base is 0.
For now we record timestamps for only the first 20 blocks. Later, after
we fully support SourceToBinary, we can set a profiling point
at any location.

V2:
1. Fix GLOBAL XYZ wrong value.
   Some curbe registers such as lid0, lid1 may have already expired
   when we reach the bottom block and cause the wrong global values.
2. Fix the problem of wrong device id in profiling info.
3. Fix the pointer size problems on BDW.
   The pointers are 8-byte values and dri_bo_emit_reloc will
   write 8 bytes. The buffer pointers for printf and profiling were
   declared as 4 bytes, so the value next to the pointer in the
   curbe was overwritten, causing wrong results.
4. Place the prolog and epilog logic in the head and tail blocks.
   The old version places the prolog at the beginning of the first block
   and places the epilog in the second-to-last block, which is just before the
   return block. This causes the prolog and epilog to run under predication,
   but they should be executed unconditionally.
5. Improve the sub and add functions for timestamp calculation.
   From BDW, the native long type is supported, use it to make calculation
   more efficient.

Some known issues:
On BDW, some logs look like this:
 Log 5  ---
| fix functions id:   7 simd:   16   kernel id:0  |
| thread id:0  EU id:   8  sub slice id: 1 slice id 0 |
| dispatch Mask:   1 prolog: 28578  epilog: 15445 |
| globalX:   4~   4  globalY:   0~   0  globalZ:   0~   0 |
|  ts0 :   186  | ts1 : 0  | ts2 :  1504  |
|  ts3 : 0  | ts4 :4294946425  | ts5 :4294946637  |
|  ts6 : 0  | ts7 : 0  | ts8 : 0  |
|  ts9 :4294947235  | ts10:4294947491  | ts11: 0  |
|  ts12:4294947645  | ts13: 0  | ts14:4294947819  |
|  ts15:4294947999  | ts16: 0  | ts17: 0  |
|  ts18: 0  | ts19:4294948561  |  |

The huge timestamps are really strange and invalid.
They can only be found when running many cases together; when
we switch to running a single case, we can never reproduce them.
This may be related to the HW and will not cause any
regressions, so I choose to fix it later.


Signed-off-by: Junyan He 
---
backend/src/CMakeLists.txt |3 +
backend/src/backend/gen8_context.cpp   |   24 +
backend/src/backend/gen8_context.hpp   |2 +
backend/src/backend/gen_context.cpp|  481 
backend/src/backend/gen_context.hpp|9 +
.../src/backend/gen_insn_gen7_schedule_info.hxx|2 +
backend/src/backend/gen_insn_scheduling.cpp|4 +-
backend/src/backend/gen_insn_selection.cpp |  140 ++
backend/src/backend/gen_insn_selection.hpp |8 +
backend/src/backend/gen_insn_selection.hxx |2 +
backend/src/backend/gen_program.cpp|9 +-
backend/src/backend/gen_program.hpp|2 +-
backend/src/backend/gen_register.hpp   |9 +
backend/src/backend/program.cpp|   35 +-
backend/src/backend/program.h  |   17 +
backend/src/backend/program.hpp|   25 +-
backend/src/gbe_bin_interpreter.cpp|4 +
backend/src/ir/instruction.cpp |   96 +++-
backend/src/ir/instruction.hpp |   26 ++
backend/src/ir/instruction.hxx |2 +
backend/src/ir/lowering.cpp|7 +
backend/src/ir/profile.cpp |   19 +-
backend/src/ir/profile.hpp |8 +-
backend/src/ir/profiling.cpp   |   70 +++
backend/src/ir/profiling.hpp   |  132 ++
backend/src/ir/unit.cpp|6 +-
backend/src/ir/unit.hpp|   10 +

[Beignet] [PATCH] GBE: fix build error with LLVM 3.5 and previous version.

2015-09-08 Thread Zhigang Gong
Signed-off-by: Zhigang Gong 
---
 backend/src/backend/program.cpp | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 330bead..57a5037 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -575,7 +575,12 @@ namespace gbe {
   Diags);
 llvm::StringRef srcString(source);
 (*CI).getPreprocessorOpts().addRemappedFile("stringInput.cl",
-llvm::MemoryBuffer::getMemBuffer(srcString).release());
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+llvm::MemoryBuffer::getMemBuffer(srcString)
+#else
+llvm::MemoryBuffer::getMemBuffer(srcString).release()
+#endif
+);
 
 // Create the compiler instance
 clang::CompilerInstance Clang;
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 2/3] add bswap64 for gen7/gen75 and gen8 seperately.

2015-09-08 Thread Luo, Xionghu
Since a LONG type variable is not a uniform register, there is no need to add the simd == 1
logic; the uniform variable case is already handled in it.

Luo Xionghu
Best Regards

-Original Message-
From: Yang, Rong R 
Sent: Tuesday, September 8, 2015 3:00 PM
To: Luo, Xionghu; beignet@lists.freedesktop.org
Cc: Luo, Xionghu
Subject: RE: [Beignet] [PATCH 2/3] add bswap64 for gen7/gen75 and gen8 
seperately.

It seems you don't handle simd == 1 long/ulong case.

> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf 
> Of xionghu@intel.com
> Sent: Thursday, August 13, 2015 14:28
> To: beignet@lists.freedesktop.org
> Cc: Luo, Xionghu
> Subject: [Beignet] [PATCH 2/3] add bswap64 for gen7/gen75 and gen8 
> seperately.
> 
> From: Luo Xionghu 
> 
> as the long type data layout is not continuous on platform gen7/gen75,
> the indirect address access pattern is a bit different from gen8.
> 
> Signed-off-by: Luo Xionghu 
> ---
>  backend/src/backend/gen8_context.cpp |  64  
> backend/src/backend/gen_context.cpp  | 110
> +++
>  2 files changed, 174 insertions(+)
> 
> diff --git a/backend/src/backend/gen8_context.cpp
> b/backend/src/backend/gen8_context.cpp
> index eca8eeb..a283194 100644
> --- a/backend/src/backend/gen8_context.cpp
> +++ b/backend/src/backend/gen8_context.cpp
> @@ -245,6 +245,70 @@ namespace gbe
>p->pop();
> 
>p->MOV(dst, tmp);
> +  }else if (src.type == GEN_TYPE_UL || src.type == GEN_TYPE_L) {
> +  bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
> +  GBE_ASSERT(uniform_src || src.subnr == 0);
> +  GBE_ASSERT(dst.subnr == 0);
> +  GBE_ASSERT(tmp.subnr == 0);
> +  GBE_ASSERT(start_addr >= 0);
> +  new_a0[0] = start_addr + 7;
> +  new_a0[1] = start_addr + 6;
> +  new_a0[2] = start_addr + 5;
> +  new_a0[3] = start_addr + 4;
> +  new_a0[4] = start_addr + 3;
> +  new_a0[5] = start_addr + 2;
> +  new_a0[6] = start_addr + 1;
> +  new_a0[7] = start_addr;
> +  if(!uniform_src) {
> +new_a0[8] = start_addr + 15;
> +new_a0[9] = start_addr + 14;
> +new_a0[10] = start_addr + 13;
> +new_a0[11] = start_addr + 12;
> +new_a0[12] = start_addr + 11;
> +new_a0[13] = start_addr + 10;
> +new_a0[14] = start_addr + 9;
> +new_a0[15] = start_addr + 8;
> +  } else {
> +new_a0[8] = start_addr + 7;
> +new_a0[9] = start_addr + 6;
> +new_a0[10] = start_addr + 5;
> +new_a0[11] = start_addr + 4;
> +new_a0[12] = start_addr + 3;
> +new_a0[13] = start_addr + 2;
> +new_a0[14] = start_addr + 1;
> +new_a0[15] = start_addr;
> +  }
> +  this->setA0Content(new_a0, 56);
> +
> +  p->push();
> +  p->curr.execWidth = 16;
> +  p->curr.predicate = GEN_PREDICATE_NONE;
> +  p->curr.noMask = 1;
> +  GenRegister ind_src =
> GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), 
> new_a0[0], 0);
> +  p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
> +  if(!uniform_src)
> +ind_src.addr_imm += 16;
> +  p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 0, 16), ind_src);
> +  for (int i = 0; i < 2; i++) {
> +if(!uniform_src)
> +  ind_src.addr_imm += 16;
> +p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 1, 16*i), ind_src);
> +  }
> +  if (simd == 16) {
> +for (int i = 0; i < 2; i++) {
> +  if(!uniform_src)
> +ind_src.addr_imm += 16;
> +  p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 2, 16*i), ind_src);
> +}
> +for (int i = 0; i < 2; i++) {
> +  if(!uniform_src)
> +ind_src.addr_imm += 16;
> +  p->MOV(GenRegister::offset(GenRegister::retype(tmp,
> GEN_TYPE_UB), 3, 16*i), ind_src);
> +}
> +  }
> +  p->pop();
> +
> +  p->MOV(dst, tmp);
>  } else {
>GBE_ASSERT(0);
>  }
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 8ee65ee..7fd43bb 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -437,6 +437,116 @@ namespace gbe
>  p->pop();
> 
>  p->MOV(dst, tmp);
> +  }else if (src.type == 

Re: [Beignet] [PATCH] GBE: fix build error with LLVM 3.5 and previous version.

2015-09-08 Thread Yang, Rong R
LGTM, pushed, thanks.

> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> Zhigang Gong
> Sent: Wednesday, September 9, 2015 09:08
> To: beignet@lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH] GBE: fix build error with LLVM 3.5 and previous
> version.
> 
> Signed-off-by: Zhigang Gong 
> ---
>  backend/src/backend/program.cpp | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/backend/src/backend/program.cpp
> b/backend/src/backend/program.cpp index 330bead..57a5037 100644
> --- a/backend/src/backend/program.cpp
> +++ b/backend/src/backend/program.cpp
> @@ -575,7 +575,12 @@ namespace gbe {
>Diags);
>  llvm::StringRef srcString(source);
>  (*CI).getPreprocessorOpts().addRemappedFile("stringInput.cl",
> -llvm::MemoryBuffer::getMemBuffer(srcString).release());
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> +llvm::MemoryBuffer::getMemBuffer(srcString)
> +#else
> +llvm::MemoryBuffer::getMemBuffer(srcString).release()
> +#endif
> +);
> 
>  // Create the compiler instance
>  clang::CompilerInstance Clang;
> --
> 1.9.1
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 1/3] return 32 could gain 0.2% performance on opencv optical flow case.

2015-09-08 Thread xionghu . luo
From: Luo Xionghu 

Signed-off-by: Luo Xionghu 
---
 src/cl_gt_device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index bd87cc4..a51843d 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -39,7 +39,7 @@
 .native_vector_width_float = 4,
 .native_vector_width_double = 2,
 .native_vector_width_half = 8,
-.preferred_wg_sz_mul = 16,
+.preferred_wg_sz_mul = 32,
 .address_bits = 32,
 .max_mem_alloc_size = 512 * 1024 * 1024,
 .image_support = CL_TRUE,
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 3/3] add utest for creating 2d image from buffer.

2015-09-08 Thread xionghu . luo
From: Luo Xionghu 

 v2: check cl_khr_image2d_from_buffer support first;
 use CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT to allocate memory.

Signed-off-by: Luo Xionghu 
---
 utests/CMakeLists.txt|  1 +
 utests/image_from_buffer.cpp | 83 
 2 files changed, 84 insertions(+)
 create mode 100644 utests/image_from_buffer.cpp

diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index e7a9e26..bfb902c 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -204,6 +204,7 @@ set (utests_sources
   enqueue_fill_buf.cpp
   builtin_kernel_max_global_size.cpp
   image_1D_buffer.cpp
+  image_from_buffer.cpp
   compare_image_2d_and_1d_array.cpp
   compiler_fill_image_1d_array.cpp
   compiler_fill_image_2d_array.cpp
diff --git a/utests/image_from_buffer.cpp b/utests/image_from_buffer.cpp
new file mode 100644
index 000..be5f5e6
--- /dev/null
+++ b/utests/image_from_buffer.cpp
@@ -0,0 +1,83 @@
+#include 
+#include "utest_helper.hpp"
+#include 
+#include 
+
+static void image_from_buffer(void)
+{
+  size_t param_value_size;
+  std::string extensionStr;
+  OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_EXTENSIONS, 0, 0, 
_value_size);
+  std::vector param_value(param_value_size);
+  OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_EXTENSIONS, 
param_value_size, param_value.empty() ? NULL : _value.front(), 
_value_size);
+  if (!param_value.empty())
+extensionStr = std::string(_value.front(), param_value_size-1);
+
+  if (!std::strstr(extensionStr.c_str(), "cl_khr_image2d_from_buffer")) {
+return;
+  }
+
+  OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, 
0, 0, _value_size);
+  size_t base_address_alignment = 0;
+  OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, 
param_value_size, _address_alignment, _value_size);
+  const size_t w = 512;
+  const size_t h = 512;
+  cl_image_format format;
+  cl_image_desc desc;
+  int error;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  size_t buffer_sz = sizeof(uint32_t) * w * h;
+  //buf_data[0] = (uint32_t*) malloc(buffer_sz);
+  buf_data[0] = (uint32_t*)memalign(base_address_alignment, buffer_sz);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  ((uint32_t*)buf_data[0])[j * w + i] = j * w + i;
+
+  cl_mem buff = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
buffer_sz, buf_data[0], );
+
+  OCL_ASSERT(error == CL_SUCCESS);
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+
+  desc.buffer = 0;
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, , , buf_data[0]);
+
+  desc.buffer = buff;
+  OCL_CREATE_IMAGE(buf[1], 0, , , NULL);
+
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Check result
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+{
+  //printf("%d,%d\n", ((uint32_t*)buf_data[0])[j * w + i], 
((uint32_t*)buf_data[1])[j * w + i]);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == 
((uint32_t*)buf_data[1])[j * w + i]);
+}
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
+
+  //spec didn't tell the sequence of release buffer of image. so release 
either buffer or image first is ok here.
+  //we follow the rule of destroy the bo at the last release, then the access 
of buffer after release image is legal
+  //and vice verse.
+#if 1
+  clReleaseMemObject(buf[1]);
+  clReleaseMemObject(buff);
+#else
+  clReleaseMemObject(buff);
+  clReleaseMemObject(buf[1]);
+#endif
+}
+
+MAKE_UTEST_FROM_FUNCTION(image_from_buffer);
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 2/3] enable create image 2d from buffer in clCreateImage.

2015-09-08 Thread xionghu . luo
From: Luo Xionghu 

This patch allows creating a 2D image from a CL buffer with zero copy.

v2: use reference counting to manage the release of the buffer and image.
After creation, the buffer reference count is 2 and the image reference
count is 1.
If the image is released first, decrease both the image reference count and
the buffer reference count, and release the bo when the buffer is finally
released;
if the buffer is released first, decrease only the buffer reference count, and
release the buffer when the image is released.
Add CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT in cl_device_info.

v3: move is_image_from_buffer to _cl_mem_image; return
CL_INVALID_IMAGE_SIZE if image size is larger than the buffer.

Signed-off-by: Luo Xionghu 
---
 src/cl_api.c|   3 +-
 src/cl_device_id.c  |   2 +
 src/cl_device_id.h  |   2 +
 src/cl_extensions.c |   2 +
 src/cl_gt_device.h  |   3 +-
 src/cl_mem.c| 115 
 src/cl_mem.h|   1 +
 7 files changed, 99 insertions(+), 29 deletions(-)

diff --git a/src/cl_api.c b/src/cl_api.c
index 5c9b250..0690af4 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -549,8 +549,9 @@ clCreateImage(cl_context context,
 goto error;
   }
   /* buffer refers to a valid buffer memory object if image_type is
- CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */
+ CL_MEM_OBJECT_IMAGE1D_BUFFER or CL_MEM_OBJECT_IMAGE2D. Otherwise it must 
be NULL. */
   if (image_desc->image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER &&
+  image_desc->image_type != CL_MEM_OBJECT_IMAGE2D &&
  image_desc->buffer) {
 err = CL_INVALID_IMAGE_DESCRIPTOR;
 goto error;
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 1778292..78d2cf4 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -810,6 +810,8 @@ cl_get_device_info(cl_device_id device,
 DECL_FIELD(PARTITION_AFFINITY_DOMAIN, affinity_domain)
 DECL_FIELD(PARTITION_TYPE, partition_type)
 DECL_FIELD(REFERENCE_COUNT, device_reference_count)
+DECL_FIELD(IMAGE_PITCH_ALIGNMENT, image_pitch_alignment)
+DECL_FIELD(IMAGE_BASE_ADDRESS_ALIGNMENT, image_base_address_alignment)
 
 case CL_DRIVER_VERSION:
   if (param_value_size_ret) {
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index b5db91c..02d1e0f 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -116,6 +116,8 @@ struct _cl_device_id {
   cl_device_partition_property partition_type[3];
   cl_uint  device_reference_count;
   uint32_t atomic_test_result;
+  uint32_t image_pitch_alignment;
+  uint32_t image_base_address_alignment;
 };
 
 /* Get a device from the given platform */
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 3eb303f..6cb1579 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -46,6 +46,8 @@ void check_opt1_extension(cl_extensions_t *extensions)
 if (id == EXT_ID(khr_spir))
   extensions->extensions[id].base.ext_enabled = 1;
 #endif
+if (id == EXT_ID(khr_image2d_from_buffer))
+  extensions->extensions[id].base.ext_enabled = 1;
   }
 }
 
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index a51843d..c2f9f56 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -126,4 +126,5 @@ DECL_INFO_STRING(driver_version, 
LIBCL_DRIVER_VERSION_STRING)
 .affinity_domain = 0,
 .partition_type = {0},
 .device_reference_count = 1,
-
+.image_pitch_alignment = 1,
+.image_base_address_alignment = 4096,
diff --git a/src/cl_mem.c b/src/cl_mem.c
index b5671bd..0358555 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -267,6 +267,9 @@ cl_mem_allocate(enum cl_mem_type type,
   mem->flags = flags;
   mem->is_userptr = 0;
   mem->offset = 0;
+  if (mem->type == CL_MEM_IMAGE_TYPE) {
+cl_mem_image(mem)->is_image_from_buffer = 0;
+  }
 
   if (sz != 0) {
 /* Pinning will require stricter alignment rules */
@@ -308,10 +311,19 @@ cl_mem_allocate(enum cl_mem_type type,
   }
 }
 
-if (!mem->is_userptr)
+if(type == CL_MEM_IMAGE_TYPE && host_ptr && ((cl_mem)host_ptr)->magic == 
CL_MAGIC_MEM_HEADER) {
+  // if the image if created from buffer, should use the bo directly to 
share same bo.
+  mem->bo = ((cl_mem)host_ptr)->bo;
+  cl_mem_image(mem)->is_image_from_buffer = 1;
+} else  if (!mem->is_userptr)
   mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
 #else
-mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
+if(type == CL_MEM_IMAGE_TYPE && host_ptr && ((cl_mem)host_ptr)->magic == 
CL_MAGIC_MEM_HEADER) {
+  // if the image if created from buffer, should use the bo directly to 
share same bo.
+  mem->bo = ((cl_mem)host_ptr)->bo;
+  cl_mem_image(mem)->is_image_from_buffer = 1;
+} else
+  mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
 #endif
 
 if (UNLIKELY(mem->bo == NULL)) {
@@ -756,6 +768,8 @@ _cl_mem_new_image(cl_context ctx,
   h = (w +