Re: [Beignet] [PATCH] Backend: for BDW and after, According to BSpec no need to split CMP when src is DW DF

2017-03-02 Thread Song, Ruiling


> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> rander
> Sent: Monday, February 27, 2017 12:25 PM
> To: beignet@lists.freedesktop.org
> Cc: Wang, Rander 
> Subject: [Beignet] [PATCH] Backend: for BDW and after, According to BSpec no
> need to split CMP when src is DW DF
> 
> Signed-off-by: rander 
> ---
>  backend/src/backend/gen8_encoder.cpp | 10 ++
>  backend/src/backend/gen8_encoder.hpp |  1 +
>  backend/src/backend/gen9_encoder.cpp |  5 +
>  backend/src/backend/gen9_encoder.hpp |  1 +
>  backend/src/backend/gen_encoder.cpp  |  7 ++-
>  backend/src/backend/gen_encoder.hpp  |  1 +
>  6 files changed, 24 insertions(+), 1 deletion(-)
> 
> diff --git a/backend/src/backend/gen8_encoder.cpp
> b/backend/src/backend/gen8_encoder.cpp
> index a33fbac..bb4fdb0 100644
> --- a/backend/src/backend/gen8_encoder.cpp
> +++ b/backend/src/backend/gen8_encoder.cpp
> @@ -38,6 +38,7 @@ static const uint32_t untypedRWMask[] = {
>  namespace gbe
>  {
>extern bool compactAlu3(GenEncoder *p, uint32_t opcode, GenRegister dst,
> GenRegister src0, GenRegister src1, GenRegister src2);
> +
>void Gen8Encoder::setHeader(GenNativeInstruction *insn) {
>  Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
>  if (this->curr.execWidth == 8)
> @@ -883,4 +884,13 @@ namespace gbe
> msg_length,
> response_length);
> }
> +
> +/* for BDW and after, no need to split CMP when src is DW*/
> +bool Gen8Encoder::needToSplitCmpBySrcType(GenEncoder *p, GenRegister
> src0, GenRegister src1) {
I am a little confused: in the comment you said there is no need to split if src is DW.
Why do you still return "true" for GEN_TYPE_F?

> +  if (src0.type == GEN_TYPE_F)
> +return true;
> +  if (src1.type == GEN_TYPE_F)
> +return true;
> +  return false;
> +}
>  } /* End of the name space. */
> diff --git a/backend/src/backend/gen8_encoder.hpp
> b/backend/src/backend/gen8_encoder.hpp
> index fa62a8d..51c079c 100644
> --- a/backend/src/backend/gen8_encoder.hpp
> +++ b/backend/src/backend/gen8_encoder.hpp
> @@ -83,6 +83,7 @@ namespace gbe
>  virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti,
> uint32_t elemSize);
>  /*! A64 OBlock write */
>  virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t
> elemSize);
> +virtual bool needToSplitCmpBySrcType(GenEncoder *p, GenRegister src0,
> GenRegister src1);
>};
>  }
>  #endif /* __GBE_GEN8_ENCODER_HPP__ */
> diff --git a/backend/src/backend/gen9_encoder.cpp
> b/backend/src/backend/gen9_encoder.cpp
> index b37fd98..f2b9274 100644
> --- a/backend/src/backend/gen9_encoder.cpp
> +++ b/backend/src/backend/gen9_encoder.cpp
> @@ -301,4 +301,9 @@ namespace gbe
>  response_length);
>  }
>}
> +
> +  bool Gen9Encoder::needToSplitCmpBySrcType(GenEncoder *p, GenRegister
> src0, GenRegister src1)
> +  {
> +return false;
> +  }
>  } /* End of the name space. */
> diff --git a/backend/src/backend/gen9_encoder.hpp
> b/backend/src/backend/gen9_encoder.hpp
> index 2eaa538..69b7490 100644
> --- a/backend/src/backend/gen9_encoder.hpp
> +++ b/backend/src/backend/gen9_encoder.hpp
> @@ -56,6 +56,7 @@ namespace gbe
>  virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr,
> GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
>  virtual void OBWRITE(GenRegister header, GenRegister data, uint32_t bti,
> uint32_t ow_size, bool useSends);
>  virtual void MBWRITE(GenRegister header, GenRegister data, uint32_t bti,
> uint32_t data_size, bool useSends);
> +virtual bool needToSplitCmpBySrcType(GenEncoder *p, GenRegister src0,
> GenRegister src1);
>};
>  }
>  #endif /* __GBE_GEN9_ENCODER_HPP__ */
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index 03ce0e2..296a0c5 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -192,6 +192,10 @@ namespace gbe
>  if (isSrcDstDiffSpan(dst, src0) == true) return true;
>  if (isSrcDstDiffSpan(dst, src1) == true) return true;


I would like to see needToSplitCmpBySrcType be called here.
That way needToSplitCmp() will do all kinds of checks, which conforms to its name.
> 
> +return false;
> +  }
> +
> +  bool GenEncoder::needToSplitCmpBySrcType(GenEncoder *p, GenRegister
> src0, GenRegister src1) {
>  if (src0.type == GEN_TYPE_D || src0.type == GEN_TYPE_UD || src0.type ==
> GEN_TYPE_F)
>return true;
>  if (src1.type == GEN_TYPE_D || src1.type == GEN_TYPE_UD || src1.type ==
> GEN_TYPE_F)
> @@ -199,6 +203,7 @@ namespace gbe
>  return false;
>}
> 
> +
>void GenEncoder::setMessageDescriptor(GenNativeInstruction *inst, enum
> GenMessageTarget sfid,
>  unsigned msg_length, unsigned 
> response_length,
>  bool header_pres

[Beignet] [PATCH] Refine command queue's enqueue ndrang.

2017-03-02 Thread junyan . he
From: Junyan He 

Delete all the obsolete code in command_queue_gen7.c.
Make the code logic cleaner and use the ELF info
to do the job. After that, we can totally split the GBE
backend from the runtime: we no longer need to get the
kernel info from the GBE backend at runtime.

Signed-off-by: Junyan He 
---
 src/gen/cl_command_queue_gen.c | 876 +
 src/gen/cl_gen.h   |   9 +
 2 files changed, 885 insertions(+)
 create mode 100644 src/gen/cl_command_queue_gen.c

diff --git a/src/gen/cl_command_queue_gen.c b/src/gen/cl_command_queue_gen.c
new file mode 100644
index 000..d12ced8
--- /dev/null
+++ b/src/gen/cl_command_queue_gen.c
@@ -0,0 +1,876 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ *
+ */
+
+#include "cl_gen.h"
+#include "gen_device_pci_id.h"
+
+#include "intel_defines.h"
+#include "intel_structs.h"
+#include "intel_batchbuffer.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* We can bind only a limited number of buffers */
+enum { max_buf_n = 128 };
+enum { max_img_n = 128 };
+enum { max_sampler_n = 16 };
+
+typedef struct gen_gpgpu {
+  drm_intel_bufmgr *bufmgr; // The drm buffer mgr
+  cl_device_id device;  // The device of this gpu
+  cl_kernel kernel; // The kernel we are executing
+  drm_intel_bo *kernel_bo;  // The buffer object holding kernel bitcode
+  uint32_t simd_size;   // The simd size we are executing.
+  uint32_t atomic_test_result;
+
+  struct intel_batchbuffer *batch; // The batch buffer holding GPU command
+
+  struct {
+drm_intel_bo *aux_bo; // Aux buffer needed by GPU command
+uint32_t surface_heap_offset;
+uint32_t curbe_offset;
+uint32_t idrt_offset;
+uint32_t sampler_state_offset;
+uint32_t sampler_border_color_state_offset;
+  } aux; // All aux setting info
+
+  struct {
+uint32_t local_mem_size; // The total local memory size
+
+uint32_t max_bti;  /* Max bti number */
+uint32_t binded_n; /* Number of buffers binded */
+drm_intel_bo *binded_buf[max_buf_n];   /* All buffers binded for the 
kernel, e.g. kernel's arg */
+uint32_t binded_offset[max_buf_n]; /* The offset in the curbe buffer */
+uint32_t target_buf_offset[max_buf_n]; /* The offset within the buffers to 
be binded */
+
+uint32_t per_thread_scratch_size;
+uint32_t total_scratch_size;
+drm_intel_bo *scratch_bo; /* Scratch buffer */
+
+drm_intel_bo *const_bo; /* Constant buffer */
+
+drm_intel_bo *stack_bo; /* stack buffer */
+
+drm_intel_bo *time_stamp_bo; /* The buffer to record exec timestamps */
+  } mem;
+
+  struct {
+uint64_t sampler_bitmap; /* sampler usage bitmap. */
+  } sampler;
+
+  struct {
+uint32_t barrier_slm_used;   /* Use barrier or slm */
+uint32_t thread_num; // Total thread number we need for this kernel
+uint32_t max_thread_num; // Max thread number we can run at same time
+uint32_t per_thread_scratch; // Scratch buffer size for each thread
+uint32_t num_cs_entries; /* Curbe entry number */
+uint32_t size_cs_entry;  /* size of one entry in 512bit elements */
+char *curbe; /* Curbe content */
+uint32_t curbe_size; /* Curbe size */
+  } thread;
+
+} gen_gpgpu;
+
+#define MAX_IF_DESC 32
+
+typedef struct surface_heap {
+  uint32_t binding_table[256];
+  char surface[256 * sizeof(gen_surface_state_t)];
+} surface_heap_t;
+
+#include "gen_gpgpu_func.c"
+
+static cl_int
+check_work_group_capability(cl_command_queue queue, cl_kernel kernel,
+const size_t *local_wk_sz, uint32_t wk_dim)
+{
+  size_t sz = 0;
+  int i;
+
+  sz = local_wk_sz[0];
+  for (i = 1; i < wk_dim; ++i)
+sz *= local_wk_sz[i];
+
+  if (sz > cl_kernel_get_max_workgroup_size_gen(kernel, queue->device))
+return CL_INVALID_WORK_ITEM_SIZE;
+
+  return CL_SUCCESS;
+}
+
+static cl_int
+gen_gpgpu_setup_curbe(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu 
*gpu,
+  const uint32_t work_dim, const size_t *global_wk_off,
+  const size_t *global_wk_sz, const size_t *local_wk_sz,
+  const size_t *enqueued_local_wk_sz)
+{
+  int curbe_size = 0;
+  char *curbe = NULL;
+  int i;
+  int sz;
+  uint32_t slm_off

[Beignet] [PATCH] Move compiler load/unload logic to gen specific file.

2017-03-02 Thread junyan . he
From: Junyan He 

Signed-off-by: Junyan He 
---
 src/cl_compiler.c | 77 +-
 src/gen/cl_compiler_gen.c | 95 +++
 2 files changed, 96 insertions(+), 76 deletions(-)
 create mode 100644 src/gen/cl_compiler_gen.c

diff --git a/src/cl_compiler.c b/src/cl_compiler.c
index d7eccb2..cc7860a 100644
--- a/src/cl_compiler.c
+++ b/src/cl_compiler.c
@@ -18,83 +18,8 @@
  */
 
 #include "cl_compiler.h"
-#include "cl_device_data.h"
-#include "backend/src/GBEConfig.h"
 #include "cl_device_id.h"
 
-#include 
-#include 
-
-LOCAL cl_int
-cl_compiler_unload_gen(cl_device_id device)
-{
-  assert(device->compiler.available);
-  assert(device->compiler.opaque);
-
-  dlclose(device->compiler.opaque);
-
-  device->compiler.available = CL_FALSE;
-  device->compiler.opaque = NULL;
-  device->compiler.compiler_name = NULL;
-  device->compiler.check_Compiler_option = NULL;
-  device->compiler.build_program = NULL;
-  device->compiler.compile_program = NULL;
-  device->compiler.link_program = NULL;
-  return CL_SUCCESS;
-}
-
-LOCAL cl_int
-cl_compiler_load_gen(cl_device_id device)
-{
-  const char *gbePath = NULL;
-  void *dlhCompiler = NULL;
-  void *genBuildProgram = NULL;
-  void *genLinkProgram = NULL;
-  void *genCompileProgram = NULL;
-  void *genCheckCompilerOption = NULL;
-
-  gbePath = getenv("OCL_GBE_PATH");
-  if (gbePath == NULL || !strcmp(gbePath, ""))
-gbePath = GBE_OBJECT_DIR;
-
-  dlhCompiler = dlopen(gbePath, RTLD_LAZY | RTLD_LOCAL);
-  if (dlhCompiler == NULL)
-return CL_COMPILER_NOT_AVAILABLE;
-
-  genBuildProgram = dlsym(dlhCompiler, "GenBuildProgram");
-  if (genBuildProgram == NULL) {
-dlclose(dlhCompiler);
-return CL_COMPILER_NOT_AVAILABLE;
-  }
-
-  genCompileProgram = dlsym(dlhCompiler, "GenCompileProgram");
-  if (genCompileProgram == NULL) {
-dlclose(dlhCompiler);
-return CL_COMPILER_NOT_AVAILABLE;
-  }
-
-  genLinkProgram = dlsym(dlhCompiler, "GenLinkProgram");
-  if (genLinkProgram == NULL) {
-dlclose(dlhCompiler);
-return CL_COMPILER_NOT_AVAILABLE;
-  }
-
-  genCheckCompilerOption = dlsym(dlhCompiler, "GenCheckCompilerOption");
-  if (genCheckCompilerOption == NULL) {
-dlclose(dlhCompiler);
-return CL_COMPILER_NOT_AVAILABLE;
-  }
-
-  device->compiler.opaque = dlhCompiler;
-  device->compiler.available = CL_TRUE;
-  device->compiler.compiler_name = "libgbe.so";
-  device->compiler.check_Compiler_option = genCheckCompilerOption;
-  device->compiler.build_program = genBuildProgram;
-  device->compiler.compile_program = genCompileProgram;
-  device->compiler.link_program = genLinkProgram;
-  return CL_SUCCESS;
-}
-
 LOCAL cl_int
 cl_compiler_check_available(cl_device_id device)
 {
@@ -110,5 +35,5 @@ cl_compiler_unload(cl_device_id device)
   if (device->compiler.available == CL_FALSE)
 return CL_SUCCESS;
 
-  return cl_compiler_unload_gen(device);
+  return device->api.compiler_unload(device);
 }
diff --git a/src/gen/cl_compiler_gen.c b/src/gen/cl_compiler_gen.c
new file mode 100644
index 000..aaff512
--- /dev/null
+++ b/src/gen/cl_compiler_gen.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ *
+ * Author: He Junyan 
+ */
+
+#include "cl_gen.h"
+#include "backend/src/GBEConfig.h"
+#include 
+
+LOCAL cl_int
+cl_compiler_load_gen(cl_device_id device)
+{
+  const char *gbePath = NULL;
+  void *dlhCompiler = NULL;
+  void *genBuildProgram = NULL;
+  void *genLinkProgram = NULL;
+  void *genCompileProgram = NULL;
+  void *genCheckCompilerOption = NULL;
+
+  if (device->compiler.available == CL_TRUE)
+return CL_SUCCESS;
+
+  gbePath = getenv("OCL_GBE_PATH");
+  if (gbePath == NULL || !strcmp(gbePath, ""))
+gbePath = GBE_OBJECT_DIR;
+
+  dlhCompiler = dlopen(gbePath, RTLD_LAZY | RTLD_LOCAL);
+  if (dlhCompiler == NULL)
+return CL_COMPILER_NOT_AVAILABLE;
+
+  genBuildProgram = dlsym(dlhCompiler, "GenBuildProgram");
+  if (genBuildProgram == NULL) {
+dlclose(dlhCompiler);
+return CL_COMPILER_NOT_AVAILABLE;
+  }
+
+  genCompileProgram = dlsym(dlhCompiler, "GenCompileProgram");
+  if (genCompileProgram == NULL) {
+dlclose(dlhCompiler);
+return CL_COMPILER_NOT_AVAILABLE;
+  }
+
+  genLinkProgram = dlsym(dlhCompiler, "GenLinkProgram");
+  if (genLinkProgram == NULL) {
+dlc

[Beignet] [PATCH] Move intel_gpgpu.c's functions to new file.

2017-03-02 Thread junyan . he
From: Junyan He 

The gpgpu struct will be redefined. All the global
function pointers are deleted, and the functions now need
to take the new gpgpu struct pointer as a parameter.

Signed-off-by: Junyan He 
---
 src/gen/gen_gpgpu_func.c | 1809 ++
 1 file changed, 1809 insertions(+)
 create mode 100644 src/gen/gen_gpgpu_func.c

diff --git a/src/gen/gen_gpgpu_func.c b/src/gen/gen_gpgpu_func.c
new file mode 100644
index 000..5b7ebed
--- /dev/null
+++ b/src/gen/gen_gpgpu_func.c
@@ -0,0 +1,1809 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ */
+
+#include 
+
+static uint32_t
+__gen_gpgpu_get_cache_ctrl_gen7(void)
+{
+  return cc_llc_l3;
+}
+
+static uint32_t
+__gen_gpgpu_get_cache_ctrl_gen75(void)
+{
+  return llccc_ec | l3cc_ec;
+}
+
+static uint32_t
+__gen_gpgpu_get_cache_ctrl_gen8(void)
+{
+  return tcc_llc_ec_l3 | mtllc_wb;
+}
+
+static uint32_t
+__gen_gpgpu_get_cache_ctrl_gen9(void)
+{
+  //Kernel-defined cache control registers 2:
+  //L3CC: WB; LeCC: WB; TC: LLC/eLLC;
+  int major = 0, minor = 0;
+  int mocs_index = 0x2;
+
+  struct utsname buf;
+  uname(&buf);
+  sscanf(buf.release, "%d.%d", &major, &minor);
+  //From linux 4.3, kernel redefined the mocs table's value,
+  //But before 4.3, still used the hw defautl value.
+  if (strcmp(buf.sysname, "Linux") == 0 &&
+  major == 4 && minor < 3) { /* linux kernel support skl from  4.x, so 
check from 4 */
+mocs_index = 0x9;
+  }
+
+  return (mocs_index << 1);
+}
+
+static uint32_t
+gen_gpgpu_get_cache_ctrl(gen_gpgpu *gpgpu)
+{
+  if (IS_BROADWELL(gpgpu->device->device_id) || 
IS_CHERRYVIEW(gpgpu->device->device_id))
+return __gen_gpgpu_get_cache_ctrl_gen8();
+
+  if (IS_GEN9(gpgpu->device->device_id))
+return __gen_gpgpu_get_cache_ctrl_gen9();
+
+  if (IS_HASWELL(gpgpu->device->device_id))
+return __gen_gpgpu_get_cache_ctrl_gen75();
+
+  if (IS_IVYBRIDGE(gpgpu->device->device_id))
+return __gen_gpgpu_get_cache_ctrl_gen7();
+
+  assert(0);
+  return 0;
+}
+
+static void
+__gen_gpgpu_setup_bti_gen7(gen_gpgpu *gpgpu, drm_intel_bo *buf, uint32_t 
internal_offset,
+   size_t size, unsigned char index, uint32_t format)
+{
+  assert(size <= (2ul << 30));
+  size_t s = size - 1;
+  surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + 
gpgpu->aux.surface_heap_offset;
+  gen7_surface_state_t *ss0 = (gen7_surface_state_t *)&heap->surface[index * 
sizeof(gen7_surface_state_t)];
+  memset(ss0, 0, sizeof(gen7_surface_state_t));
+  ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss0->ss0.surface_format = format;
+  ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+  // Per bspec, I965_SURFACE_BUFFER and RAW format, size must be a multiple of 
4 byte.
+  if (format == I965_SURFACEFORMAT_RAW)
+assert((ss0->ss2.width & 0x03) == 3);
+  ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+  ss0->ss3.depth = (s >> 21) & 0x3ff;  /* bits 30:21 of sz */
+  ss0->ss5.cache_control = gen_gpgpu_get_cache_ctrl(gpgpu);
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * 
sizeof(gen7_surface_state_t);
+
+  ss0->ss1.base_addr = buf->offset + internal_offset;
+  dri_bo_emit_reloc(gpgpu->aux.aux_bo,
+I915_GEM_DOMAIN_RENDER,
+I915_GEM_DOMAIN_RENDER,
+internal_offset,
+gpgpu->aux.surface_heap_offset +
+  heap->binding_table[index] +
+  offsetof(gen7_surface_state_t, ss1),
+buf);
+}
+
+static void
+__gen_gpgpu_setup_bti_gen75(gen_gpgpu *gpgpu, drm_intel_bo *buf, uint32_t 
internal_offset,
+size_t size, unsigned char index, uint32_t format)
+{
+  assert(size <= (2ul << 30));
+  size_t s = size - 1;
+  surface_heap_t *heap = gpgpu->aux.aux_bo->virtual + 
gpgpu->aux.surface_heap_offset;
+  gen7_surface_state_t *ss0 = (gen7_surface_state_t *)&heap->surface[index * 
sizeof(gen7_surface_state_t)];
+  memset(ss0, 0, sizeof(gen7_surface_state_t));
+  ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss0->ss0.surface_format = format;
+  if (format != I965_SURFACEFORMAT_RAW) {
+ss0->ss7.shader_r = I965_SURCHAN_SELECT_RED;
+ss0->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
+ss0->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
+ss0->ss7.shader_a = I9

[Beignet] [PATCH] Add Gen's kernel specific struct.

2017-03-02 Thread junyan . he
From: Junyan He 

Gen's kernel-related info should be parsed from
the program's ELF file and stored in kernel_gen.

Signed-off-by: Junyan He 
---
 src/cl_kernel.h |   7 +
 src/gen/cl_gen.h|  89 
 src/gen/cl_kernel_gen.c | 553 
 3 files changed, 649 insertions(+)
 create mode 100644 src/gen/cl_kernel_gen.c

diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 8acd82a..4690c0b 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -47,6 +47,13 @@ typedef struct cl_argument {
   uint32_t is_svm:1;/* Indicate this argument is SVMPointer */
 } cl_argument;
 
+typedef struct _cl_kernel_for_device {
+  cl_device_id device;
+  void *exec_code;  /* The binary for exec */
+  cl_uint exec_code_sz; /* The binary for exec size */
+} _cl_kernel_for_device;
+typedef _cl_kernel_for_device *cl_kernel_for_device;
+
 /* One OCL function */
 struct _cl_kernel {
   _cl_base_object base;
diff --git a/src/gen/cl_gen.h b/src/gen/cl_gen.h
index 2926bc7..f761652 100644
--- a/src/gen/cl_gen.h
+++ b/src/gen/cl_gen.h
@@ -35,6 +35,95 @@
 #include 
 #include 
 
+/*** Kernel 
*/
+/* Special virtual registers for OpenCL */
+typedef enum cl_gen_virt_reg {
+  CL_GEN_VIRT_REG_LOCAL_ID_X = 0,
+  CL_GEN_VIRT_REG_LOCAL_ID_Y,
+  CL_GEN_VIRT_REG_LOCAL_ID_Z,
+  CL_GEN_VIRT_REG_LOCAL_SIZE_X,
+  CL_GEN_VIRT_REG_LOCAL_SIZE_Y,
+  CL_GEN_VIRT_REG_LOCAL_SIZE_Z,
+  CL_GEN_VIRT_REG_ENQUEUED_LOCAL_SIZE_X,
+  CL_GEN_VIRT_REG_ENQUEUED_LOCAL_SIZE_Y,
+  CL_GEN_VIRT_REG_ENQUEUED_LOCAL_SIZE_Z,
+  CL_GEN_VIRT_REG_GLOBAL_SIZE_X,
+  CL_GEN_VIRT_REG_GLOBAL_SIZE_Y,
+  CL_GEN_VIRT_REG_GLOBAL_SIZE_Z,
+  CL_GEN_VIRT_REG_GLOBAL_OFFSET_X,
+  CL_GEN_VIRT_REG_GLOBAL_OFFSET_Y,
+  CL_GEN_VIRT_REG_GLOBAL_OFFSET_Z,
+  CL_GEN_VIRT_REG_GROUP_NUM_X,
+  CL_GEN_VIRT_REG_GROUP_NUM_Y,
+  CL_GEN_VIRT_REG_GROUP_NUM_Z,
+  CL_GEN_VIRT_REG_WORK_DIM,
+  CL_GEN_VIRT_REG_IMAGE_INFO,
+  CL_GEN_VIRT_REG_KERNEL_ARGUMENT,
+  CL_GEN_VIRT_REG_EXTRA_ARGUMENT,
+  CL_GEN_VIRT_REG_BLOCK_IP,
+  CL_GEN_VIRT_REG_DW_BLOCK_IP,
+  CL_GEN_VIRT_REG_THREAD_NUM,
+  CL_GEN_VIRT_REG_PROFILING_BUF_POINTER,
+  CL_GEN_VIRT_REG_PROFILING_TIMESTAMP0,
+  CL_GEN_VIRT_REG_PROFILING_TIMESTAMP1,
+  CL_GEN_VIRT_REG_PROFILING_TIMESTAMP2,
+  CL_GEN_VIRT_REG_PROFILING_TIMESTAMP3,
+  CL_GEN_VIRT_REG_PROFILING_TIMESTAMP4,
+  CL_GEN_VIRT_REG_THREAD_ID,
+  CL_GEN_VIRT_REG_CONSTANT_ADDRSPACE,
+  CL_GEN_VIRT_REG_STACK_SIZE,
+  CL_GEN_VIRT_REG_LAST, // Invalid
+} cl_gen_virt_reg;
+
+typedef struct _cl_gen_virt_phy_offset {
+  cl_int virt_reg;
+  cl_int phy_offset;
+  cl_uint size;
+} _cl_gen_virt_phy_offset;
+typedef _cl_gen_virt_phy_offset *cl_gen_virt_phy_offset;
+
+typedef struct _cl_gen_image_info_offset {
+  cl_int bti;
+  cl_int width;
+  cl_int height;
+  cl_int depth;
+  cl_int data_type;
+  cl_int channel_order;
+} _cl_gen_image_info_offset;
+typedef _cl_gen_image_info_offset *cl_gen_image_info_offset;
+
+typedef struct _cl_gen_arg_extra_info {
+  cl_int arg_offset;
+  cl_uint arg_align; // address align for ptr
+  cl_int arg_misc;   //bti, image index
+} _cl_gen_arg_extra_info;
+typedef _cl_gen_arg_extra_info *cl_gen_arg_extra_info;
+
+typedef struct _cl_kernel_gen {
+  _cl_kernel_for_device kern_base;
+  cl_uint local_mem_size;
+  cl_uint barrier_slm_used;
+  cl_uint simd_width;
+  cl_uint scratch_size;
+  cl_uint stack_size;
+  cl_uint samper_info_num;
+  cl_uint *samper_info;
+  cl_uint arg_extra_info_num;
+  cl_gen_arg_extra_info arg_extra_info;
+  cl_uint image_info_num;
+  cl_gen_image_info_offset image_info;
+  cl_uint virt_reg_phy_offset_num; // The mapping between virtual reg and phy 
offset
+  cl_gen_virt_phy_offset virt_reg_phy_offset;
+} _cl_kernel_gen;
+typedef _cl_kernel_gen *cl_kernel_gen;
+
+extern size_t cl_kernel_get_max_workgroup_size_gen(cl_kernel kernel, 
cl_device_id device);
+extern void *cl_kernel_new_gen(cl_device_id device, cl_kernel kernel);
+extern void cl_kernel_delete_gen(cl_device_id device, cl_kernel kernel);
+extern cl_int cl_kernel_get_info_gen(cl_device_id device, cl_kernel kernel,
+ cl_uint param_name, void *param_value);
+extern cl_int cl_kernel_create_gen(cl_device_id device, cl_kernel kernel);
+
 /*** Program 
*/
 typedef struct _cl_program_gen {
   _cl_program_for_device prog_base;
diff --git a/src/gen/cl_kernel_gen.c b/src/gen/cl_kernel_gen.c
new file mode 100644
index 000..f555212
--- /dev/null
+++ b/src/gen/cl_kernel_gen.c
@@ -0,0 +1,553 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ *

[Beignet] [PATCH] Add Gen's specific program struct.

2017-03-02 Thread junyan . he
From: Junyan He 

The cl_program_gen struct is for Gen devices. Every time
a program is created, a cl_program_gen will be generated
using libelf. The corresponding ELF-format binary will
be parsed and stored in cl_program_gen.

Signed-off-by: Junyan He 
---
 src/cl_program.c |   1 -
 src/cl_program.h |  17 +++
 src/gen/cl_gen.h |  71 +++
 src/gen/cl_program_gen.c | 320 +++
 4 files changed, 408 insertions(+), 1 deletion(-)
 create mode 100644 src/gen/cl_gen.h
 create mode 100644 src/gen/cl_program_gen.c

diff --git a/src/cl_program.c b/src/cl_program.c
index 363aed5..46f9d1f 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -117,7 +117,6 @@ cl_program_delete(cl_program p)
   cl_free(p);
 }
 
-#define BUILD_LOG_MAX_SIZE (1024*1024U)
 LOCAL cl_program
 cl_program_new(cl_context ctx)
 {
diff --git a/src/cl_program.h b/src/cl_program.h
index 6e8e84a..4afa553 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -28,6 +28,23 @@
 #include 
 #include 
 
+#define BUILD_LOG_MAX_SIZE (256 * 1024U)
+
+typedef struct _cl_program_for_device {
+  cl_device_id device;/* Point to the device it belong to */
+  char *binary;   /* Program binary. */
+  size_t binary_sz;   /* The binary size. */
+  cl_uint binary_type;/* binary type: COMPILED_OBJECT(LLVM IR),
+ LIBRARY(LLVM IR with option 
"-create-library"),
+ or EXECUTABLE(GEN binary). */
+  size_t build_log_max_sz;/* build log maximum size in byte.*/
+  char build_log[BUILD_LOG_MAX_SIZE]; /* The build log for this program. */
+  size_t build_log_sz;/* The actual build log size.*/
+  cl_uint kernel_num; /* Kernel number */
+  char **kernel_names;/* All kernel names of this program */
+} _cl_program_for_device;
+typedef _cl_program_for_device *cl_program_for_device;
+
 // This is the structure ouput by the compiler
 struct _gbe_program;
 
diff --git a/src/gen/cl_gen.h b/src/gen/cl_gen.h
new file mode 100644
index 000..2926bc7
--- /dev/null
+++ b/src/gen/cl_gen.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ *
+ */
+#ifndef __CL_GEN_H__
+#define __CL_GEN_H__
+
+#include "intel_driver.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_platform_id.h"
+#include "cl_mem.h"
+#include "cl_image.h"
+#include "cl_device_id.h"
+#include "cl_sampler.h"
+#include "cl_command_queue.h"
+#include "cl_event.h"
+
+#include 
+#include 
+#include 
+
+/*** Program 
*/
+typedef struct _cl_program_gen {
+  _cl_program_for_device prog_base;
+  Elf *elf;
+  size_t sec_num;
+  Elf_Scn *strtab;
+  cl_int strtab_sec_index;
+  Elf_Data *strtab_data;
+  Elf_Scn *text;
+  cl_int text_sec_index;
+  Elf_Data *text_data;
+  Elf_Scn *rodata;
+  cl_int rodata_sec_index;
+  Elf_Data *rodata_data;
+  Elf_Scn *symtab;
+  cl_int symtab_sec_index;
+  Elf_Data *symtab_data;
+  size_t symtab_entry_num;
+  Elf_Scn *func_gpu_info;
+  cl_int func_gpu_info_sec_index;
+  Elf_Data *func_gpu_info_data;
+  Elf_Scn *func_cl_info;
+  cl_int func_cl_info_sec_index;
+  Elf_Data *func_cl_info_data;
+} _cl_program_gen;
+typedef _cl_program_gen *cl_program_gen;
+
+extern void *cl_program_new_gen(cl_device_id device, cl_program p);
+extern void cl_program_delete_gen(cl_device_id device, cl_program p);
+extern cl_int cl_program_load_binary_gen(cl_device_id device, cl_program prog);
+extern cl_int cl_program_get_info_gen(cl_device_id device, cl_program program,
+  cl_uint param_name, void *param_value);
+
+#endif /* End of __CL_GEN_H__ */
diff --git a/src/gen/cl_program_gen.c b/src/gen/cl_program_gen.c
new file mode 100644
index 000..58be603
--- /dev/null
+++ b/src/gen/cl_program_gen.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any 

[Beignet] [PATCH] Add libelf check in CMakeList

2017-03-02 Thread junyan . he
From: Junyan He 

We need the libelf support to parse the binary files
generated by GBE backend from now on.

Signed-off-by: Junyan He 
---
 CMakeLists.txt | 8 
 src/CMakeLists.txt | 1 +
 2 files changed, 9 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a111fe2..e6babe4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,14 @@ ENDIF (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
 
 set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic 
-Wl,--no-undefined ${LLVM_LDFLAGS}")
 
+pkg_check_modules(ELF_LIB REQUIRED libelf)
+IF(ELF_LIB_FOUND)
+  MESSAGE(STATUS "Looking for libelf - found at ${ELF_LIB_PREFIX} 
${ELF_LIB_VERSION}")
+  INCLUDE_DIRECTORIES(${ELF_LIB_INCLUDE_DIRS})
+ELSE(ELF_LIB_FOUND)
+  MESSAGE(STATUS "Looking for libelf - not found")
+ENDIF(ELF_LIB_FOUND)
+
 # XLib
 Find_Package(X11)
 IF(X11_FOUND)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 94e97ba..bd1007a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -179,6 +179,7 @@ target_link_libraries(
   cl
   rt
   ${X11_LIBRARIES}
+  ${ELF_LIB_LIBRARIES}
   ${XEXT_LIBRARIES}
   ${XFIXES_LIBRARIES}
   ${DRM_INTEL_LIBRARIES}
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Add device API struct to define device behavior

2017-03-02 Thread junyan . he
From: Junyan He 

Each device has its own behavior with respect to the CL APIs.
For example, when clEnqueueNDRangeKernel is called, the actions
of GEN devices should be different from those of other kinds of
devices. After we handle all the common logic in the
clEnqueueNDRangeKernel API, we need to call device->ndrange_kernel()
to do the real work for the GEN device.

Signed-off-by: Junyan He 
---
 src/cl_device_api.h | 80 +
 src/cl_device_id.h  |  6 
 2 files changed, 86 insertions(+)
 create mode 100644 src/cl_device_api.h

diff --git a/src/cl_device_api.h b/src/cl_device_api.h
new file mode 100644
index 000..31ed44c
--- /dev/null
+++ b/src/cl_device_api.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ *
+ * Author: He Junyan 
+ */
+
+#ifndef __CL_DEVICE_API_H__
+#define __CL_DEVICE_API_H__
+
+#include "CL/cl.h"
+
+#define DEV_PRIVATE_DATA(PARENT, DEV, PRIV) \
+  do {  \
+PRIV = NULL;\
+assert(PARENT->each_device_num > 0);\
+for (cl_uint eedev = 0; eedev < PARENT->each_device_num; eedev++) { \
+  if (DEV == (PARENT->each_device[eedev])->device) {\
+PRIV = (void *)PARENT->each_device[eedev];  \
+break;  \
+  } \
+}   \
+assert(PRIV != NULL);   \
+  } while (0);
+
+typedef struct _cl_device_api {
+  cl_int (*compiler_unload)(cl_device_id device);
+
+  void *(*context_new)(cl_device_id device, cl_context ctx);
+  cl_int (*context_create)(cl_device_id device, cl_context ctx);
+  void (*context_delete)(cl_device_id device, cl_context ctx);
+
+  void *(*program_new)(cl_device_id device, cl_program p);
+  cl_int (*program_load_binary)(cl_device_id device, cl_program prog);
+  void (*program_delete)(cl_device_id device, cl_program p);
+  cl_int (*get_program_info)(cl_device_id device, cl_program program,
+ cl_uint param_name, void *param_value);
+
+  void *(*kernel_new)(cl_device_id device, cl_kernel kernel);
+  void (*kernel_delete)(cl_device_id device, cl_kernel kernel);
+  cl_int (*kernel_create)(cl_device_id device, cl_kernel kernel);
+  cl_int (*get_kernel_info)(cl_device_id device, cl_kernel kernel,
+cl_uint param_name, void *param_value);
+
+  cl_int (*ND_range_kernel)(cl_command_queue queue, cl_kernel ker,
+cl_event event, const uint32_t work_dim,
+const size_t *global_wk_off, const size_t 
*global_wk_sz,
+const size_t *local_wk_sz);
+  cl_int (*mem_copy)(cl_command_queue queue, cl_event event, cl_mem src, 
cl_mem dst,
+ size_t src_offset, size_t dst_offset, size_t cb);
+  cl_int (*mem_fill)(cl_command_queue queue, cl_event event, const void 
*pattern,
+ size_t pattern_size, cl_mem buffer, size_t offset, size_t 
size);
+  cl_int (*mem_copy_rect)(cl_command_queue queue, cl_event event, cl_mem 
src_buf,
+  cl_mem dst_buf, const size_t *src_origin, const 
size_t *dst_origin,
+  const size_t *region, size_t src_row_pitch, size_t 
src_slice_pitch,
+  size_t dst_row_pitch, size_t dst_slice_pitch);
+  cl_int (*image_fill)(cl_command_queue queue, cl_event e, const void 
*pattern, cl_mem src_image,
+   const size_t *origin, const size_t *region);
+  cl_int (*image_copy)(cl_command_queue queue, cl_event event, cl_mem 
src_image, cl_mem dst_image,
+   const size_t *src_origin, const size_t *dst_origin, 
const size_t *region);
+  cl_int (*copy_image_to_buffer)(cl_command_queue queue, cl_event event, 
cl_mem image, cl_mem buffer,
+ const size_t *src_origin, const size_t 
dst_offset, const size_t *region);
+  cl_int (*copy_buffer_to_image)(cl_command_queue queue, cl_event event, 
cl_mem buffer, cl_mem image,
+   

[Beignet] [PATCH V2 newRT] Add compiler API functions.

2017-03-02 Thread junyan . he
From: Junyan He 

We will split the compiler from the runtime. The runtime will
call the compiler using the standard Build, Compile, and Link
APIs to generate ELF and IR bitcode. This file implements all
of these APIs.

V2:
Add check option for gbe.
Fix some bugs.

Signed-off-by: Junyan He 
---
 backend/src/backend/compiler_api.cpp | 848 +++
 src/cl_compiler.h|   9 +-
 2 files changed, 852 insertions(+), 5 deletions(-)
 create mode 100644 backend/src/backend/compiler_api.cpp

diff --git a/backend/src/backend/compiler_api.cpp 
b/backend/src/backend/compiler_api.cpp
new file mode 100644
index 000..98f5d0b
--- /dev/null
+++ b/backend/src/backend/compiler_api.cpp
@@ -0,0 +1,848 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see .
+ *
+ */
+#include "llvm/ADT/Triple.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm-c/Linker.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Frontend/TextDiagnosticPrinter.h"
+#include "clang/Frontend/CompilerInvocation.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/CodeGen/CodeGenAction.h"
+
+#include "GBEConfig.h"
+#include "backend/gen_program.hpp"
+#include "sys/cvar.hpp"
+
+#include 
+#include 
+#include 
+#include 
+
+using namespace gbe;
+
+SVAR(OCL_PCH_PATH, OCL_PCH_OBJECT);
+SVAR(OCL_PCH_20_PATH, OCL_PCH_OBJECT_20);
+SVAR(OCL_HEADER_FILE_DIR, OCL_HEADER_DIR);
+BVAR(OCL_OUTPUT_KERNEL_SOURCE, false);
+BVAR(OCL_DEBUGINFO, false);
+BVAR(OCL_OUTPUT_BUILD_LOG, false);
+
+static llvm::Module *
+loadProgramFromLLVMIRBinary(uint32_t deviceID, const char *binary, size_t size)
+{
+  std::string binary_content;
+  //the first byte stands for binary_type.
+  binary_content.assign(binary, size);
+  llvm::StringRef llvm_bin_str(binary_content);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
+  llvm::LLVMContext &c = GBEGetLLVMContext();
+#else
+  llvm::LLVMContext &c = llvm::getGlobalContext();
+#endif
+  llvm::SMDiagnostic Err;
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+  std::unique_ptr memory_buffer = 
llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
+  acquireLLVMContextLock();
+  llvm::Module *module = llvm::parseIR(memory_buffer->getMemBufferRef(), Err, 
c).release();
+#else
+  llvm::MemoryBuffer *memory_buffer = 
llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
+  acquireLLVMContextLock();
+  llvm::Module *module = llvm::ParseIR(memory_buffer, Err, c);
+#endif
+
+  if (module == NULL)
+return NULL;
+
+  // if load 32 bit spir binary, the triple should be spir-unknown-unknown.
+  llvm::Triple triple(module->getTargetTriple());
+  if (triple.getArchName() == "spir" && triple.getVendorName() == "unknown" &&
+  triple.getOSName() == "unknown") {
+module->setTargetTriple("spir");
+  } else if (triple.getArchName() == "spir64" && triple.getVendorName() == 
"unknown" &&
+ triple.getOSName() == "unknown") {
+module->setTargetTriple("spir64");
+  }
+  releaseLLVMContextLock();
+
+  return module;
+}
+
+static bool
+processSourceAndOption(const char *source, const char *options, const char 
*temp_header_path,
+   std::vector &clOpt, std::string 
&dumpLLVMFileName,
+   std::string &dumpASMFileName, std::string 
&dumpSPIRBinaryName,
+   int &optLevel, size_t stringSize, char *err, size_t 
*errSize,
+   uint32_t &oclVersion)
+{
+  std::string pchFileName;
+  bool findPCH = false;
+#if defined(__ANDROID__)
+  bool invalidPCH = true;
+#else
+  bool invalidPCH = false;
+#endif
+  size_t start = 0, end = 0;
+
+  std::string hdirs = OCL_HEADER_FILE_DIR;
+  if (hdirs == "")
+hdirs = OCL_HEADER_DIR;
+  std::istringstream hidirs(hdirs);
+  std::string headerFilePath;
+  bool findOcl = false;
+
+  while (getline(hidirs, headerFilePath, ':')) {
+std::string oclDotHName = headerFilePath + "/ocl.h";
+if (access(oclDotHName.c_str(), R_OK) == 0) {
+  findOcl = true;
+  break;
+}
+  }
+  (void)findOcl;
+  assert(findOcl);
+  if (OCL_OUTPUT_KERNEL_SOURCE) {
+if (options) {
+  std::cout << "Build optio

[Beignet] [PATCH V4 newRT] Add GenProgram::toBinaryFormat to generate ELF format binary.

2017-03-02 Thread junyan . he
From: Junyan He 

We add this function to generate a standard ELF format binary.
All the verbose information we need at runtime will be stored
in the .note.gpu_info section. Then we can cleanly separate
the runtime and the compiler.

V2:
Add OpenCL info such as argument name, workgroup size, etc.
Add GPU version and OpenCL version info.
Use struct and template to clean up the code.

V3:
Fix some bugs.

V4:
Fix a compiler error

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_program_elf.cpp | 673 ++--
 backend/src/backend/program.h   |  16 +
 backend/src/backend/program.hpp |   4 +
 3 files changed, 492 insertions(+), 201 deletions(-)

diff --git a/backend/src/backend/gen_program_elf.cpp 
b/backend/src/backend/gen_program_elf.cpp
index 0440e81..45b3381 100644
--- a/backend/src/backend/gen_program_elf.cpp
+++ b/backend/src/backend/gen_program_elf.cpp
@@ -15,10 +15,12 @@
  * License along with this library. If not, see .
  *
  */
+#include "src/cl_device_data.h"
 #include "ocl_common_defines.h"
 #include "elfio/elfio.hpp"
 #include "backend/gen_program.hpp"
 #include "backend/gen_program.hpp"
+#include "sys/cvar.hpp"
 #include 
 #include 
 #include 
@@ -26,6 +28,9 @@ using namespace std;
 
 namespace gbe
 {
+
+BVAR(OCL_DUMP_ELF_FILE, false);
+
 /* The elf writer need to make sure seekp function work, so sstream
can not work, and we do not want the fostream to generate the real
file. We just want to keep the elf image in the memory. Implement
@@ -125,82 +130,338 @@ protected:
 
 using namespace ELFIO;
 
+/* The format for one Gen Kernel function is following note section format
+ --
+ | GEN_NOTE_TYPE_GPU_INFO |
+ --
+ | Function Name size:4 |
+ 
+ | Desc size:4  |
+ ---
+ | The kernel name(strlen) |
+ 
---
+ | SIMD:4 | Local Mem Size:4 | Scratch Size:4 | Stack Size :4 | Barrier/SLM 
Used:4 | Arg Num:4 |
+ 
---
+   Then the format for each argument is
+ 
--
+ | Index:4 | Size:4 | Type:4 | Offset:4 | Addr Space:4 | Align(if is ptr) | 
BTI(if buffer):4 / Index(sampler and image):4 |
+ 
--
+   Then all sampler info
+ ---
+ | Number:4 | SamperInfo:4 | ..|
+ ---
+   Then all image info
+ 

+ | Number:4 | BTI:4 | Width:4 | Height:4 | Depth:4 | Data Type:4 | Channel 
Order:4 | ...|
+ 

+   Last is the map table of special virtual register and phy register
+ 
+ | Number:4 | Virt Reg:4 | Phy Reg:4 | Size:4 |.|
+   */
+
+/* The format for one Gen Kernel function's OpenCL info is following note 
section format
+ --
+ | GEN_NOTE_TYPE_CL_INFO  |
+ 
+ | The kernel function's name: (strlen) |
+ 
+ | Function's attribute string: (strlen)|
+ 
+ | Work Group size: sizeof(size_t) * 3  |
+ 
+ | Argument TypeName: (strlen) |
+ -
+ | Argument AccessQual: (strlen) |
+ -
+ | Argument Name: (strlen) |
+ ---  */
+
+/* The format for GPU version is:
+ 
+ | GEN_NOTE_TYPE_GPU_VERSION |
+ -
+ | GEN string (HasWell e.g.) |
+ -
+ | GEN pci id |
+ --
+ | GEN version major:4 |
+ ---
+ | GEN version minor:4 |
+ ---  */
+
+/* The format for CL version is:
+ 
+ | GEN_NOTE_TYPE_CL_VERSION |
+ 
+ | CL version string (OpenCL 2.0  e.g.) |
+ 
+ | CL version major:4 |
+ --
+ | CL version minor:4 |
+ --  */
+
+/* The format for Compiler info is:
+ ---
+ | GEN_NOTE_TYPE_COMPILER_INFO |
+ 
+ | Compiler name (GBE_Compiler  e.g.) |
+ 
+ | LLVM version major:4 |
+ 
+ | LLVM version minor:4 |
+  */
+
 

[Beignet] [PATCH V3 newRT] Add GenProgram::toBinaryFormat to generate ELF format binary.

2017-03-02 Thread junyan . he
From: Junyan He 

We add this function to generate a standard ELF format binary.
All the verbose information we need at runtime will be stored
in the .note.gpu_info section. Then we can cleanly separate
the runtime and the compiler.

V2:
Add OpenCL info such as argument name, workgroup size, etc.
Add GPU version and OpenCL version info.
Use struct and template to clean up the code.

V3:
Fix some bugs.

Signed-off-by: Junyan He 
---
 backend/src/backend/gen_program_elf.cpp | 672 ++--
 1 file changed, 471 insertions(+), 201 deletions(-)

diff --git a/backend/src/backend/gen_program_elf.cpp 
b/backend/src/backend/gen_program_elf.cpp
index 0440e81..c750ca8 100644
--- a/backend/src/backend/gen_program_elf.cpp
+++ b/backend/src/backend/gen_program_elf.cpp
@@ -19,6 +19,7 @@
 #include "elfio/elfio.hpp"
 #include "backend/gen_program.hpp"
 #include "backend/gen_program.hpp"
+#include "sys/cvar.hpp"
 #include 
 #include 
 #include 
@@ -26,6 +27,9 @@ using namespace std;
 
 namespace gbe
 {
+
+BVAR(OCL_DUMP_ELF_FILE, false);
+
 /* The elf writer need to make sure seekp function work, so sstream
can not work, and we do not want the fostream to generate the real
file. We just want to keep the elf image in the memory. Implement
@@ -125,82 +129,338 @@ protected:
 
 using namespace ELFIO;
 
+/* The format for one Gen Kernel function is following note section format
+ --
+ | GEN_NOTE_TYPE_GPU_INFO |
+ --
+ | Function Name size:4 |
+ 
+ | Desc size:4  |
+ ---
+ | The kernel name(strlen) |
+ 
---
+ | SIMD:4 | Local Mem Size:4 | Scratch Size:4 | Stack Size :4 | Barrier/SLM 
Used:4 | Arg Num:4 |
+ 
---
+   Then the format for each argument is
+ 
--
+ | Index:4 | Size:4 | Type:4 | Offset:4 | Addr Space:4 | Align(if is ptr) | 
BTI(if buffer):4 / Index(sampler and image):4 |
+ 
--
+   Then all sampler info
+ ---
+ | Number:4 | SamperInfo:4 | ..|
+ ---
+   Then all image info
+ 

+ | Number:4 | BTI:4 | Width:4 | Height:4 | Depth:4 | Data Type:4 | Channel 
Order:4 | ...|
+ 

+   Last is the map table of special virtual register and phy register
+ 
+ | Number:4 | Virt Reg:4 | Phy Reg:4 | Size:4 |.|
+   */
+
+/* The format for one Gen Kernel function's OpenCL info is following note 
section format
+ --
+ | GEN_NOTE_TYPE_CL_INFO  |
+ 
+ | The kernel function's name: (strlen) |
+ 
+ | Function's attribute string: (strlen)|
+ 
+ | Work Group size: sizeof(size_t) * 3  |
+ 
+ | Argument TypeName: (strlen) |
+ -
+ | Argument AccessQual: (strlen) |
+ -
+ | Argument Name: (strlen) |
+ ---  */
+
+/* The format for GPU version is:
+ 
+ | GEN_NOTE_TYPE_GPU_VERSION |
+ -
+ | GEN string (HasWell e.g.) |
+ -
+ | GEN pci id |
+ --
+ | GEN version major:4 |
+ ---
+ | GEN version minor:4 |
+ ---  */
+
+/* The format for CL version is:
+ 
+ | GEN_NOTE_TYPE_CL_VERSION |
+ 
+ | CL version string (OpenCL 2.0  e.g.) |
+ 
+ | CL version major:4 |
+ --
+ | CL version minor:4 |
+ --  */
+
+/* The format for Compiler info is:
+ ---
+ | GEN_NOTE_TYPE_COMPILER_INFO |
+ 
+ | Compiler name (GBE_Compiler  e.g.) |
+ 
+ | LLVM version major:4 |
+ 
+ | LLVM version minor:4 |
+  */
+
 class GenProgramElfContext
 {
 public:
-  enum { // 0, 1, 2 already have meanings
+  enum {
+GEN_NOTE_TYPE_CL_VERSION = 1,
+GEN_NOTE_TYPE_GPU_VERSION = 2,
 GEN_NOTE_TYPE_GPU_INFO = 3,
-GEN_NOTE_TYPE_CL_ARG_INFO = 4,
-GEN_NOTE_TYPE_CL_WORKGROUP_SIZE = 5,
+GEN_

[Beignet] [PATCH 3/3] fix build error log not output issue.

2017-03-02 Thread xionghu . luo
From: Luo Xionghu 

Signed-off-by: Luo Xionghu 
---
 backend/src/backend/program.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 09c79d8..5f18dd3 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -1114,7 +1114,7 @@ EXTEND_QUOTE:
 stringSize, err, errSize, optLevel, 
options);
   if (err != NULL)
 *errSize += clangErrSize;
-  if (OCL_OUTPUT_BUILD_LOG && options)
+  if (OCL_OUTPUT_BUILD_LOG && err)
 llvm::errs() << options;
 } else
   p = NULL;
-- 
2.5.0

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 1/3] add extension cl_intel_media_block_io READ related function

2017-03-02 Thread xionghu . luo
From: Luo Xionghu 

Signed-off-by: Luo Xionghu 
---
 backend/src/backend/gen_insn_selection.cpp |  55 --
 backend/src/ir/instruction.cpp |  14 +++-
 backend/src/ir/instruction.hpp |   4 +-
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl   | 117 -
 backend/src/libocl/tmpl/ocl_simd.tmpl.h|  17 +
 backend/src/llvm/llvm_gen_backend.cpp  |  89 +-
 backend/src/llvm/llvm_gen_ocl_function.hxx |   6 ++
 backend/src/llvm/llvm_scalarize.cpp|   5 ++
 8 files changed, 274 insertions(+), 33 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 1cab40c..cabc6a3 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -7811,25 +7811,56 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   /*! Media Block Read pattern */
   DECL_PATTERN(MediaBlockReadInstruction)
   {
+uint32_t fixBlockSize(const ir::MediaBlockReadInstruction &insn, uint32_t 
typeSize, uint32_t simdWidth, uint32_t &block_width) const
+{
+  uint8_t width = insn.getWidth();
+  uint8_t height = insn.getHeight();
+  uint32_t vec_size = insn.getVectorSize();
+  uint32_t blocksize = 0;
+  if (width && height) {
+if (width * height * typeSize > vec_size * simdWidth * typeSize) {
+  if (width <= simdWidth * vec_size) {
+height = vec_size * simdWidth / width;
+  } else {
+height = 1;
+width = vec_size * simdWidth / height;
+  }
+}
+  }else {
+width = simdWidth;
+height = vec_size;
+  }
+  block_width = typeSize * (width < simdWidth ? width : simdWidth);
+  blocksize = (block_width - 1) % 32 | (height - 1) << 16;
+  return blocksize;
+}
+
 bool emitOne(Selection::Opaque &sel, const ir::MediaBlockReadInstruction 
&insn, bool &markChildren) const
 {
   using namespace ir;
   uint32_t vec_size = insn.getVectorSize();
   uint32_t simdWidth = sel.curr.execWidth;
   const Type type = insn.getType();
-  const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+  uint32_t typeSize = 0;
+  if(type == TYPE_U32) {
+typeSize = 4;
+  }else if(type == TYPE_U16) {
+typeSize = 2;
+  }else if(type == TYPE_U8) {
+typeSize = 1;
+  }else
+NOT_IMPLEMENTED;
   uint32_t response_size = simdWidth * vec_size * typeSize / 32;
   // ushort in simd8 will have half reg thus 0.5 reg size, but response 
lenght is still 1
   response_size = response_size ? response_size : 1;
-  uint32_t block_width = typeSize * simdWidth;
-  uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
-
+  uint32_t block_width = 0;
+  uint32_t blocksize = fixBlockSize(insn, typeSize, simdWidth, 
block_width);
 
   vector valuesVec;
   vector tmpVec;
   for (uint32_t i = 0; i < vec_size; ++i) {
 valuesVec.push_back(sel.selReg(insn.getDst(i), type));
-if(simdWidth == 16 && typeSize == 4)
+if((simdWidth == 16 && typeSize == 4) || typeSize == 1)
   tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
   }
   const GenRegister coordx = 
GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
@@ -7855,15 +7886,23 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
 sel.MOV(blocksizereg, GenRegister::immud(blocksize));
   sel.pop();
 
-  if (simdWidth * typeSize < 64) {
+  if (block_width < 64) {
 sel.push();
   sel.curr.execWidth = 8;
   sel.curr.predicate = GEN_PREDICATE_NONE;
   sel.curr.noMask = 1;
   // Now read the data
-  sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), 
response_size);
+  if(typeSize == 1) {
+sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), 
response_size);
+for (uint32_t i = 0; i < vec_size; i++) {
+  sel.MOV(valuesVec[i], 
sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i*simdWidth));
+  sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 16), 
sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i*simdWidth + 
8));
+}
+  }else
+sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), 
response_size);
+
 sel.pop();
-  } else if (simdWidth * typeSize == 64) {
+  } else if (block_width == 64) {
 sel.push();
   sel.curr.execWidth = 8;
   sel.curr.predicate = GEN_PREDICATE_NONE;
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index f0c3957..4b87e4a 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1070,7 +1070,7 @@ namespace ir {
   public TupleDstPolicy
 {
 public:
-  INLINE M

[Beignet] [PATCH 2/3] cl_intel_media_block_io READ related unit tests.

2017-03-02 Thread xionghu . luo
From: Luo Xionghu 

Signed-off-by: Luo Xionghu 
---
 kernels/compiler_subgroup_image_block_read.cl | 115 ++
 src/cl_extensions.h   |   1 +
 utests/CMakeLists.txt |   1 +
 utests/compiler_subgroup_media_block_read.cpp | 212 ++
 utests/utest_helper.cpp   |  20 +++
 utests/utest_helper.hpp   |   3 +
 6 files changed, 352 insertions(+)
 create mode 100644 utests/compiler_subgroup_media_block_read.cpp

diff --git a/kernels/compiler_subgroup_image_block_read.cl 
b/kernels/compiler_subgroup_image_block_read.cl
index fa079b7..596265f 100644
--- a/kernels/compiler_subgroup_image_block_read.cl
+++ b/kernels/compiler_subgroup_image_block_read.cl
@@ -62,3 +62,118 @@ __kernel void 
compiler_subgroup_image_block_read_us8(image2d_t src, global ushor
   dst[id] = tmp;
 }
 #endif
+#ifdef MEDIA_BLOCK_IO
+__kernel void compiler_subgroup_media_block_read_ui1(image2d_t src, global 
uint *dst)
+{
+  int id = get_global_id(0);
+  int yid = get_global_id(1);
+  int2 coord = (int2)(get_sub_group_size()*get_sub_group_id()*sizeof(uint) + 
sizeof(uint) * get_group_id(0) * get_local_size(0),yid);
+  uint tmp = intel_sub_group_media_block_read_ui(coord, 16, 1, src);
+  dst[yid * get_global_size(0) + id] = tmp;
+}
+__kernel void compiler_subgroup_media_block_read_ui2(image2d_t src, global 
uint2 *dst)
+{
+  int id = get_global_id(0);
+  int yid = get_global_id(1);
+  int2 coord = (int2)(get_sub_group_size()*get_sub_group_id()*sizeof(uint) + 
sizeof(uint) * get_group_id(0) * get_local_size(0),yid*2);
+  uint2 tmp = intel_sub_group_media_block_read_ui2(coord, 16, 2, src);
+  dst[yid * get_global_size(0) + id] = tmp;
+}
+__kernel void compiler_subgroup_media_block_read_ui4(image2d_t src, global 
uint4 *dst)
+{
+  int id = get_global_id(0);
+  int yid = get_global_id(1);
+  int2 coord = (int2)(get_sub_group_size()*get_sub_group_id()*sizeof(uint) + 
sizeof(uint) * get_group_id(0) * get_local_size(0),yid*4);
+  uint4 tmp = intel_sub_group_media_block_read_ui4(coord, 16, 4, src);
+  dst[yid * get_global_size(0) + id] = tmp;
+}
+__kernel void compiler_subgroup_media_block_read_ui8(image2d_t src, global 
uint8 *dst)
+{
+  int id = get_global_id(0);
+  int yid = get_global_id(1);
+  int2 coord = (int2)(get_sub_group_size()*get_sub_group_id()*sizeof(uint) + 
sizeof(uint) * get_group_id(0) * get_local_size(0),yid*8);
+  uint8 tmp = intel_sub_group_media_block_read_ui8(coord, 16, 8, src);
+  dst[yid * get_global_size(0) + id] = tmp;
+}
+__kernel void compiler_subgroup_media_block_read_us1(image2d_t src, global 
ushort *dst)
+{
+  int id = get_global_id(0);
+  int yid = get_global_id(1);
+  int2 coord = (int2)(get_sub_group_size()*get_sub_group_id()*sizeof(ushort) + 
sizeof(ushort) * get_group_id(0) * get_local_size(0),yid);
+  ushort tmp = intel_sub_group_media_block_read_us(coord, 16, 1, src);
+  dst[yid * get_global_size(0) + id] = tmp;
+}
+__kernel void compiler_subgroup_media_block_read_us2(image2d_t src, global 
ushort2 *dst)
+{
+  int id = get_global_id(0);
+  int yid = get_global_id(1);
+  int2 coord = (int2)(get_sub_group_size()*get_sub_group_id()*sizeof(ushort) + 
sizeof(ushort) * get_group_id(0) * get_local_size(0),yid*2);
+  ushort2 tmp = intel_sub_group_media_block_read_us2(coord, 16, 2, src);
+  dst[yid * get_global_size(0) + id] = tmp;
+}
+__kernel void compiler_subgroup_media_block_read_us4(image2d_t src, global 
ushort4 *dst)
+{
+  int id = get_global_id(0);
+  int yid = get_global_id(1);
+  int2 coord = (int2)(get_sub_group_size()*get_sub_group_id()*sizeof(ushort) + 
sizeof(ushort) * get_group_id(0) * get_local_size(0),yid*4);
+  ushort4 tmp = intel_sub_group_media_block_read_us4(coord, 16, 4, src);
+  dst[yid * get_global_size(0) + id] = tmp;
+}
+__kernel void compiler_subgroup_media_block_read_us8(image2d_t src, global 
ushort8 *dst)
+{
+  int id = get_global_id(0);
+  int yid = get_global_id(1);
+  int2 coord = (int2)(get_sub_group_size()*get_sub_group_id()*sizeof(ushort) + 
sizeof(ushort) * get_group_id(0) * get_local_size(0),yid*8);
+  ushort8 tmp = intel_sub_group_media_block_read_us8(coord, 16, 8, src);
+  dst[yid * get_global_size(0) + id] = tmp;
+}
+__kernel void  __attribute__((intel_reqd_sub_group_size(8)))
+compiler_subgroup_media_block_read_us16(image2d_t src, global ushort16 *dst)
+{
+  int id = get_global_id(0);
+  int yid = get_global_id(1);
+  int2 coord = (int2)(get_sub_group_size()*get_sub_group_id()*sizeof(ushort) + 
sizeof(ushort) * get_group_id(0) * get_local_size(0),yid*16);
+  ushort16 tmp = intel_sub_group_media_block_read_us16(coord, 8, 16, src);
+  dst[yid * get_global_size(0) + id] = tmp;
+}
+__kernel void compiler_subgroup_media_block_read_uc1(image2d_t src, global 
uchar *dst)
+{
+  int id = get_global_id(0);
+  int yid = get_global_id(1);
+  int2 coord = (int2)(get_sub_group_size()*get_sub_group_id()*sizeof(char) + 
sizeof(char) * get_group_id(0) * get_local_size(0),yid);
+  uchar