OK, that's good for me. Thanks for enabling BXT. On Fri, Dec 04, 2015 at 03:22:20AM +0800, Guo Yejun wrote: > Date: Fri, 4 Dec 2015 03:22:20 +0800 > From: Guo Yejun <yejun....@intel.com> > To: beignet@lists.freedesktop.org > Cc: Guo Yejun <yejun....@intel.com> > Subject: [Beignet] [PATCH] add Broxton support > X-Mailer: git-send-email 1.9.1 > > special versions of linux kernel and libdrm are needed. > utest and conformance test PASSED. > > Signed-off-by: Guo Yejun <yejun....@intel.com> > --- > GetGenID.sh | 2 +- > backend/src/backend/gen8_context.cpp | 2 +- > backend/src/backend/gen8_context.hpp | 2 + > backend/src/backend/gen9_context.cpp | 110 > +++++++++++++++++++++++++++++ > backend/src/backend/gen9_context.hpp | 22 ++++++ > backend/src/backend/gen_insn_selection.cpp | 11 +++ > backend/src/backend/gen_insn_selection.hpp | 7 ++ > backend/src/backend/gen_program.cpp | 17 ++++- > backend/src/gbe_bin_generater.cpp | 4 ++ > src/cl_device_data.h | 9 ++- > src/cl_device_id.c | 34 +++++++-- > src/intel/intel_gpgpu.c | 5 +- > 12 files changed, 213 insertions(+), 12 deletions(-) > > diff --git a/GetGenID.sh b/GetGenID.sh > index 7acf9bd..30296da 100755 > --- a/GetGenID.sh > +++ b/GetGenID.sh > @@ -1,5 +1,5 @@ > #!/bin/bash > -genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a > 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 > 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26) > +genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a > 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 > 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26 > 5a84) > pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F > ] '{print $1}')) > n=${#pciid[*]} > i=0 > diff --git a/backend/src/backend/gen8_context.cpp > b/backend/src/backend/gen8_context.cpp > index 71d900f..7455bfc 100644 > --- a/backend/src/backend/gen8_context.cpp 
> +++ b/backend/src/backend/gen8_context.cpp > @@ -417,7 +417,7 @@ namespace gbe > GBE_ASSERT(0); > } > > - static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0) > + GenRegister Gen8Context::unpacked_ud(GenRegister reg, uint32_t offset) > { > if(reg.hstride == GEN_HORIZONTAL_STRIDE_0) { > if(offset == 0) > diff --git a/backend/src/backend/gen8_context.hpp > b/backend/src/backend/gen8_context.hpp > index 537aef5..cc415c6 100644 > --- a/backend/src/backend/gen8_context.hpp > +++ b/backend/src/backend/gen8_context.hpp > @@ -76,6 +76,8 @@ namespace gbe > > virtual void emitF64DIVInstruction(const SelectionInstruction &insn); > > + static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0); > + > protected: > virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, > int sz = 0); > virtual void subTimestamps(GenRegister& t0, GenRegister& t1, > GenRegister& tmp); > diff --git a/backend/src/backend/gen9_context.cpp > b/backend/src/backend/gen9_context.cpp > index c35293a..47b1496 100644 > --- a/backend/src/backend/gen9_context.cpp > +++ b/backend/src/backend/gen9_context.cpp > @@ -55,4 +55,114 @@ namespace gbe > p->WAIT(); > p->pop(); > } > + > + void BxtContext::newSelection(void) { > + this->sel = GBE_NEW(SelectionBxt, *this); > + } > + > + void BxtContext::calculateFullU64MUL(GenRegister src0, GenRegister src1, > GenRegister dst_h, > + GenRegister dst_l, GenRegister > s0l_s1h, GenRegister s0h_s1l) > + { > + src0.type = src1.type = GEN_TYPE_UD; > + dst_h.type = dst_l.type = GEN_TYPE_UL; > + s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL; > + > + //GenRegister tmp; > + > + GenRegister s0l = unpacked_ud(src0); > + GenRegister s1l = unpacked_ud(src1); > + GenRegister s0h = unpacked_ud(s0l_s1h); //s0h only used before s0l_s1h, > reuse s0l_s1h > + GenRegister s1h = unpacked_ud(dst_l); //s1h only used before dst_l, > reuse dst_l > + > + p->MOV(s0h, GenRegister::offset(s0l, 0, 4)); > + p->MOV(s1h, GenRegister::offset(s1l, 0, 4)); > + > + /* 
High 32 bits X High 32 bits. */ > + p->MUL(dst_h, s0h, s1h); > + /* High 32 bits X low 32 bits. */ > + p->MUL(s0h_s1l, s0h, s1l); > + /* Low 32 bits X high 32 bits. */ > + p->MUL(s0l_s1h, s0l, s1h); > + /* Low 32 bits X low 32 bits. */ > + p->MUL(dst_l, s0l, s1l); > + > + /* Because the max product of s0l*s1h is (2^N - 1) * (2^N - 1) = 2^2N + > 1 - 2^(N+1), here N = 32 > + The max of addding 2 32bits integer to it is > + 2^2N + 1 - 2^(N+1) + 2*(2^N - 1) = 2^2N - 1 > + which means the product s0h_s1l adds dst_l's high 32 bits and then > adds s0l_s1h's low 32 bits will not > + overflow and have no carry. > + By this manner, we can avoid using acc register, which has a lot of > restrictions. */ > + > + GenRegister s0l_s1h_l = unpacked_ud(s0l_s1h); > + p->ADD(s0h_s1l, s0h_s1l, s0l_s1h_l); > + > + p->SHR(s0l_s1h, s0l_s1h, GenRegister::immud(32)); > + GenRegister s0l_s1h_h = unpacked_ud(s0l_s1h); > + p->ADD(dst_h, dst_h, s0l_s1h_h); > + > + GenRegister dst_l_h = unpacked_ud(s0l_s1h); > + p->MOV(dst_l_h, unpacked_ud(dst_l, 1)); > + p->ADD(s0h_s1l, s0h_s1l, dst_l_h); > + > + // No longer need s0l_s1h > + GenRegister tmp = s0l_s1h; > + > + p->SHL(tmp, s0h_s1l, GenRegister::immud(32)); > + GenRegister tmp_unpacked = unpacked_ud(tmp, 1); > + p->MOV(unpacked_ud(dst_l, 1), tmp_unpacked); > + > + p->SHR(tmp, s0h_s1l, GenRegister::immud(32)); > + p->ADD(dst_h, dst_h, tmp); > + } > + > + void BxtContext::emitI64MULInstruction(const SelectionInstruction &insn) > + { > + GenRegister src0 = ra->genReg(insn.src(0)); > + GenRegister src1 = ra->genReg(insn.src(1)); > + GenRegister dst = ra->genReg(insn.dst(0)); > + GenRegister res = ra->genReg(insn.dst(1)); > + > + src0.type = src1.type = GEN_TYPE_UD; > + dst.type = GEN_TYPE_UL; > + res.type = GEN_TYPE_UL; > + > + /* Low 32 bits X low 32 bits. */ > + GenRegister s0l = unpacked_ud(src0); > + GenRegister s1l = unpacked_ud(src1); > + p->MUL(dst, s0l, s1l); > + > + /* Low 32 bits X high 32 bits. 
*/ > + GenRegister s1h = unpacked_ud(res); > + p->MOV(s1h, unpacked_ud(src1, 1)); > + > + p->MUL(res, s0l, s1h); > + p->SHL(res, res, GenRegister::immud(32)); > + p->ADD(dst, dst, res); > + > + /* High 32 bits X low 32 bits. */ > + GenRegister s0h = unpacked_ud(res); > + p->MOV(s0h, unpacked_ud(src0, 1)); > + > + p->MUL(res, s0h, s1l); > + p->SHL(res, res, GenRegister::immud(32)); > + p->ADD(dst, dst, res); > + } > + > + void BxtContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, > int sz) { > + if (sz == 0) > + sz = 16; > + GBE_ASSERT(sz%4 == 0); > + GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096); > + > + p->push(); > + p->curr.execWidth = 1; > + p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > + for (int i = 0; i < sz/2; i++) { > + p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD), > + GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2])); > + } > + p->pop(); > + } > + > } > diff --git a/backend/src/backend/gen9_context.hpp > b/backend/src/backend/gen9_context.hpp > index 8acad8c..a2931cc 100644 > --- a/backend/src/backend/gen9_context.hpp > +++ b/backend/src/backend/gen9_context.hpp > @@ -46,5 +46,27 @@ namespace gbe > private: > virtual void newSelection(void); > }; > + > + //most code of BxtContext are copied from ChvContext, it results in two > physical copy of the same code. > + //there are two possible ways to resolve it: 1) virtual inheritance 2) > class template > + //but either way makes BxtContext and ChvContext tied closely, it might > impact the flexibility of future changes > + //so, choose the method of two physical copies. 
> + class BxtContext : public Gen9Context > + { > + public: > + virtual ~BxtContext(void) { } > + BxtContext(const ir::Unit &unit, const std::string &name, uint32_t > deviceID, bool relaxMath = false) > + : Gen9Context(unit, name, deviceID, relaxMath) { > + }; > + virtual void emitI64MULInstruction(const SelectionInstruction &insn); > + > + protected: > + virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, > int sz = 0); > + > + private: > + virtual void newSelection(void); > + virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, > GenRegister dst_h, > + GenRegister dst_l, GenRegister > s0l_s1h, GenRegister s0h_s1l); > + }; > } > #endif /* __GBE_GEN9_CONTEXT_HPP__ */ > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index cd7b2eb..b5da42d 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -2267,6 +2267,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling > BVAR in program.cpp > opt_features = SIOF_LOGICAL_SRCMOD; > } > > + SelectionBxt::SelectionBxt(GenContext &ctx) : Selection(ctx) { > + this->opaque->setHas32X32Mul(true); > + this->opaque->setHasLongType(true); > + this->opaque->setLongRegRestrict(true); > + this->opaque->setHasDoubleType(true); > + this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL); > + this->opaque->setSlowByteGather(true); > + this->opaque->setHasHalfType(true); > + opt_features = SIOF_LOGICAL_SRCMOD | SIOF_OP_MOV_LONG_REG_RESTRICT; > + } > + > void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, > uint32_t bti, bool is3D) { > uint32_t elemID = 0; > diff --git a/backend/src/backend/gen_insn_selection.hpp > b/backend/src/backend/gen_insn_selection.hpp > index f292566..83d64be 100644 > --- a/backend/src/backend/gen_insn_selection.hpp > +++ b/backend/src/backend/gen_insn_selection.hpp > @@ -333,6 +333,13 @@ namespace gbe > Selection9(GenContext &ctx); > }; > > + class 
SelectionBxt: public Selection > + { > + public: > + /*! Initialize internal structures used for the selection */ > + SelectionBxt(GenContext &ctx); > + }; > + > } /* namespace gbe */ > > #endif /* __GEN_INSN_SELECTION_HPP__ */ > diff --git a/backend/src/backend/gen_program.cpp > b/backend/src/backend/gen_program.cpp > index 5149d49..1427c25 100644 > --- a/backend/src/backend/gen_program.cpp > +++ b/backend/src/backend/gen_program.cpp > @@ -171,6 +171,8 @@ namespace gbe { > ctx = GBE_NEW(ChvContext, unit, name, deviceID, relaxMath); > } else if (IS_SKYLAKE(deviceID)) { > ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath); > + } else if (IS_BROXTON(deviceID)) { > + ctx = GBE_NEW(BxtContext, unit, name, deviceID, relaxMath); > } > GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n"); > > @@ -219,7 +221,8 @@ namespace gbe { > GBHI_HSW = 2, > GBHI_CHV = 3, > GBHI_BDW = 4, > - GBHI_SKL = 5,//remember update GBHI_MAX if add option. > + GBHI_SKL = 5, > + GBHI_BXT = 6, > GBHI_MAX, > }; > > @@ -229,7 +232,9 @@ namespace gbe { > {0, 'G','E', 'N', 'C', 'H', > 'S', 'W'}, > {0, 'G','E', 'N', 'C', 'C', > 'H', 'V'}, > {0, 'G','E', 'N', 'C', 'B', > 'D', 'W'}, > - {0, 'G','E', 'N', 'C', 'S', > 'K', 'L'}}; > + {0, 'G','E', 'N', 'C', 'S', > 'K', 'L'}, > + {0, 'G','E', 'N', 'C', 'B', > 'X', 'T'} > + }; > > #define FILL_GEN_HEADER(binary, index) do {int i = 0; do {*(binary+i) = > gen_binary_header[index][i]; i++; }while(i < > GEN_BINARY_HEADER_LENGTH);}while(0) > #define FILL_BYT_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_BYT) > @@ -238,6 +243,7 @@ namespace gbe { > #define FILL_CHV_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_CHV) > #define FILL_BDW_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_BDW) > #define FILL_SKL_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_SKL) > +#define FILL_BXT_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_BXT) > > static bool genHeaderCompare(const unsigned char *BufPtr, > GEN_BINARY_HEADER_INDEX index) > { > @@ -255,6 +261,7 @@ namespace gbe { > 
#define MATCH_CHV_HEADER(binary) genHeaderCompare(binary, GBHI_CHV) > #define MATCH_BDW_HEADER(binary) genHeaderCompare(binary, GBHI_BDW) > #define MATCH_SKL_HEADER(binary) genHeaderCompare(binary, GBHI_SKL) > +#define MATCH_BXT_HEADER(binary) genHeaderCompare(binary, GBHI_BXT) > > #define MATCH_DEVICE(deviceID, binary) ((IS_IVYBRIDGE(deviceID) && > MATCH_IVB_HEADER(binary)) || \ > (IS_IVYBRIDGE(deviceID) && > MATCH_IVB_HEADER(binary)) || \ > @@ -262,7 +269,9 @@ namespace gbe { > (IS_HASWELL(deviceID) && > MATCH_HSW_HEADER(binary)) || \ > (IS_BROADWELL(deviceID) && > MATCH_BDW_HEADER(binary)) || \ > (IS_CHERRYVIEW(deviceID) && > MATCH_CHV_HEADER(binary)) || \ > - (IS_SKYLAKE(deviceID) && > MATCH_SKL_HEADER(binary)) ) > + (IS_SKYLAKE(deviceID) && > MATCH_SKL_HEADER(binary)) || \ > + (IS_BROXTON(deviceID) && > MATCH_BXT_HEADER(binary)) \ > + ) > > static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char > *binary, size_t size) { > using namespace gbe; > @@ -359,6 +368,8 @@ namespace gbe { > FILL_CHV_HEADER(*binary); > }else if(IS_SKYLAKE(prog->deviceID)){ > FILL_SKL_HEADER(*binary); > + }else if(IS_BROXTON(prog->deviceID)){ > + FILL_BXT_HEADER(*binary); > }else { > free(*binary); > *binary = NULL; > diff --git a/backend/src/gbe_bin_generater.cpp > b/backend/src/gbe_bin_generater.cpp > index 86197e1..8225d4a 100644 > --- a/backend/src/gbe_bin_generater.cpp > +++ b/backend/src/gbe_bin_generater.cpp > @@ -186,6 +186,10 @@ void program_build_instance::serialize_program(void) > throw(int) > src_hw_info[0]='S'; > src_hw_info[1]='K'; > src_hw_info[2]='L'; > + }else if(IS_BROXTON(gen_pci_id)){ > + src_hw_info[0]='B'; > + src_hw_info[1]='X'; > + src_hw_info[2]='T'; > } > > if (str_fmt_out) { > diff --git a/src/cl_device_data.h b/src/cl_device_data.h > index 3552a16..63e078f 100644 > --- a/src/cl_device_data.h > +++ b/src/cl_device_data.h > @@ -287,7 +287,14 @@ > devid == PCI_CHIP_SKYLAKE_SRV_GT4) > > #define IS_SKYLAKE(devid) (IS_SKL_GT1(devid) || 
IS_SKL_GT2(devid) || > IS_SKL_GT3(devid) || IS_SKL_GT4(devid)) > -#define IS_GEN9(devid) IS_SKYLAKE(devid) > + > +/* BXT */ > +#define PCI_CHIP_BROXTON_P 0x5A84 /* Intel(R) BXT-P for mobile desktop */ > + > +#define IS_BROXTON(devid) \ > + (devid == PCI_CHIP_BROXTON_P) > + > +#define IS_GEN9(devid) (IS_SKYLAKE(devid) || IS_BROXTON(devid)) > > #endif /* __CL_DEVICE_DATA_H__ */ > > diff --git a/src/cl_device_id.c b/src/cl_device_id.c > index 5debf06..a98523f 100644 > --- a/src/cl_device_id.c > +++ b/src/cl_device_id.c > @@ -198,6 +198,17 @@ static struct _cl_device_id intel_skl_gt4_device = { > #include "cl_gen9_device.h" > }; > > +static struct _cl_device_id intel_bxt_device = { > + INIT_ICD(dispatch) > + .max_compute_unit = 18, > + .max_thread_per_unit = 6, > + .sub_slice_count = 3, > + .max_work_item_sizes = {512, 512, 512}, > + .max_work_group_size = 512, > + .max_clock_frequency = 1000, > +#include "cl_gen9_device.h" > +}; > + > LOCAL cl_device_id > cl_get_gt_device(void) > { > @@ -529,6 +540,16 @@ skl_gt4_break: > cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id); > break; > > + case PCI_CHIP_BROXTON_P: > + DECL_INFO_STRING(bxt_break, intel_bxt_device, name, "Intel(R) HD > Graphics Broxton-P"); > +bxt_break: > + intel_bxt_device.device_id = device_id; > + intel_bxt_device.platform = cl_get_platform_default(); > + ret = &intel_bxt_device; > + cl_intel_platform_get_default_extension(ret); > + cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id); > + break; > + > case PCI_CHIP_SANDYBRIDGE_BRIDGE: > case PCI_CHIP_SANDYBRIDGE_GT1: > case PCI_CHIP_SANDYBRIDGE_GT2: > @@ -754,7 +775,8 @@ cl_get_device_info(cl_device_id device, > device != &intel_skl_gt1_device && > device != &intel_skl_gt2_device && > device != &intel_skl_gt3_device && > - device != &intel_skl_gt4_device > + device != &intel_skl_gt4_device && > + device != &intel_bxt_device > )) > return CL_INVALID_DEVICE; > > @@ -868,7 +890,9 @@ cl_device_get_version(cl_device_id device, cl_int *ver) 
> device != &intel_skl_gt1_device && > device != &intel_skl_gt2_device && > device != &intel_skl_gt3_device && > - device != &intel_skl_gt4_device)) > + device != &intel_skl_gt4_device && > + device != &intel_bxt_device > + )) > return CL_INVALID_DEVICE; > if (ver == NULL) > return CL_SUCCESS; > @@ -883,7 +907,8 @@ cl_device_get_version(cl_device_id device, cl_int *ver) > || device == &intel_brw_gt3_device || device == &intel_chv_device) { > *ver = 8; > } else if (device == &intel_skl_gt1_device || device == > &intel_skl_gt2_device > - || device == &intel_skl_gt3_device || device == > &intel_skl_gt4_device) { > + || device == &intel_skl_gt3_device || device == &intel_skl_gt4_device > + || device == &intel_bxt_device) { > *ver = 9; > } else > return CL_INVALID_VALUE; > @@ -971,7 +996,8 @@ cl_get_kernel_workgroup_info(cl_kernel kernel, > device != &intel_skl_gt1_device && > device != &intel_skl_gt2_device && > device != &intel_skl_gt3_device && > - device != &intel_skl_gt4_device)) > + device != &intel_skl_gt4_device && > + device != &intel_bxt_device)) > return CL_INVALID_DEVICE; > > CHECK_KERNEL(kernel); > diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c > index 7f212e2..e83b586 100644 > --- a/src/intel/intel_gpgpu.c > +++ b/src/intel/intel_gpgpu.c > @@ -1208,7 +1208,8 @@ static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, > int index, cl_mem_object_ > IS_HASWELL(gpgpu->drv->device_id) || > IS_BROADWELL(gpgpu->drv->device_id) || > IS_CHERRYVIEW(gpgpu->drv->device_id) || > - IS_SKYLAKE(gpgpu->drv->device_id))) && > + IS_SKYLAKE(gpgpu->drv->device_id) || > + IS_BROXTON(gpgpu->drv->device_id))) && > index >= BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM && > type == CL_MEM_OBJECT_IMAGE1D_ARRAY) > surface_type = I965_SURFACE_2D; > @@ -2488,7 +2489,7 @@ intel_set_gpgpu_callbacks(int device_id) > intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7; > return; > } > - if (IS_SKYLAKE(device_id)) { > + if (IS_SKYLAKE(device_id) || 
IS_BROXTON(device_id)) { > cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) > intel_gpgpu_bind_image_gen9; > intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8; > cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb > *)intel_gpgpu_get_cache_ctrl_gen9; > -- > 1.9.1 > > _______________________________________________ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet
_______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet