The first 4 patches LGTM; I will push them. As for the implementation of intel_sub_group_shuffle_down/up, we'd better handle it at the OpenCL C level.
> -----Original Message----- > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of > Xiuli Pan > Sent: Thursday, July 7, 2016 11:10 > To: beignet@lists.freedesktop.org > Cc: Pan, Xiuli <xiuli....@intel.com> > Subject: [Beignet] [PATCH V2 5/6] Backend: Add > intel_sub_group_shuffle_down/up/xor > > From: Pan Xiuli <xiuli....@intel.com> > > Using a function shuffle delta for down/up, using some flags for current and > down/up src switch. The flags and index is pre caculated in libocl. > The shuffle delta only handle flag mask the dst with different src. > Using the old shuffle with xor for shuffle_xor. > > Signed-off-by: Pan Xiuli <xiuli....@intel.com> > --- > backend/src/backend/gen_insn_selection.cpp | 65 > ++++++++++++++++++++++++++++++ > backend/src/ir/instruction.cpp | 44 ++++++++++++++++++++ > backend/src/ir/instruction.hpp | 9 +++++ > backend/src/ir/instruction.hxx | 1 + > backend/src/libocl/script/ocl_simd.def | 9 +++++ > backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 34 ++++++++++++++++ > backend/src/libocl/tmpl/ocl_simd.tmpl.h | 9 +++++ > backend/src/llvm/llvm_gen_backend.cpp | 13 ++++++ > backend/src/llvm/llvm_gen_ocl_function.hxx | 2 + > 9 files changed, 186 insertions(+) > > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index e342161..7b646e0 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -6738,6 +6738,70 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > DECL_CTOR(MediaBlockWriteInstruction, 1, 1); > }; > > + /*! 
SIMD shuffle delta pattern */ > + DECL_PATTERN(SimdShuffleDeltaInstruction) > + { > + bool emitOne(Selection::Opaque &sel, const > ir::SimdShuffleDeltaInstruction &insn, bool &markChildren) const > + { > + using namespace ir; > + const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32); > + const GenRegister srcx = sel.selReg(insn.getSrc(0), TYPE_U32); > + const GenRegister srcy = sel.selReg(insn.getSrc(1), TYPE_U32); > + const GenRegister index = sel.selReg(insn.getSrc(2), TYPE_U32); > + const GenRegister inRange = sel.selReg(insn.getSrc(3), TYPE_U32); > + const GenRegister constZero = GenRegister::immud(0);; > + const GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD), > TYPE_U32); > + bool hasShiftL = false; > + > + sel.push(); > + sel.curr.predicate = GEN_PREDICATE_NONE; > + /* First shuffle for srcx */ > + if (sel.isScalarReg(insn.getSrc(0))) { > + sel.MOV(dst, srcx); > + } else { > + if (index.file == GEN_IMMEDIATE_VALUE) { > + sel.push(); > + uint32_t offset = index.value.ud % sel.curr.execWidth; > + GenRegister reg = GenRegister::subphysicaloffset(srcx, offset); > + reg.vstride = GEN_VERTICAL_STRIDE_0; > + reg.hstride = GEN_HORIZONTAL_STRIDE_0; > + reg.width = GEN_WIDTH_1; > + sel.MOV(dst, reg); > + sel.push(); > + } else { > + sel.SHL(shiftL, index, GenRegister::immud(0x2)); > + hasShiftL = true; > + sel.SIMD_SHUFFLE(dst, srcx, shiftL); > + } > + } > + sel.curr.flag = 0; > + sel.curr.subFlag = 1; > + sel.CMP(GEN_CONDITIONAL_EQ, inRange, constZero); > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + /* Now shuffle for srcy */ > + if (sel.isScalarReg(insn.getSrc(1))) { > + sel.MOV(dst, srcy); > + } else { > + if (index.file == GEN_IMMEDIATE_VALUE) { > + sel.push(); > + uint32_t offset = index.value.ud % sel.curr.execWidth; > + GenRegister reg = GenRegister::subphysicaloffset(srcy, offset); > + reg.vstride = GEN_VERTICAL_STRIDE_0; > + reg.hstride = GEN_HORIZONTAL_STRIDE_0; > + reg.width = GEN_WIDTH_1; > + sel.MOV(dst, reg); > + sel.pop(); > + } else { 
> + if (!hasShiftL) > + sel.SHL(shiftL, index, GenRegister::immud(0x2)); > + sel.SIMD_SHUFFLE(dst, srcy, shiftL); > + } > + } > + sel.pop(); > + return true; > + } > + DECL_CTOR(SimdShuffleDeltaInstruction, 1, 1); }; > > /*! Sort patterns */ > INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) > { @@ -6782,6 +6846,7 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > this->insert<PrintfInstructionPattern>(); > this->insert<MediaBlockReadInstructionPattern>(); > this->insert<MediaBlockWriteInstructionPattern>(); > + this->insert<SimdShuffleDeltaInstructionPattern>(); > > // Sort all the patterns with the number of instructions they output > for (uint32_t op = 0; op < ir::OP_INVALID; ++op) diff --git > a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index > ed64580..a274626 100644 > --- a/backend/src/ir/instruction.cpp > +++ b/backend/src/ir/instruction.cpp > @@ -1136,6 +1136,35 @@ namespace ir { > uint8_t vec_size; > }; > > + class ALIGNED_INSTRUCTION SimdShuffleDeltaInstruction : > + public BasePolicy, > + public TupleSrcPolicy<SimdShuffleDeltaInstruction >, > + public NDstPolicy<SimdShuffleDeltaInstruction, 1> > + { > + public: > + > + INLINE SimdShuffleDeltaInstruction(Register dst, Tuple srcTuple, > uint8_t > srcNum) { > + this->opcode = OP_SIMD_SHUFFLE_DELTA; > + this->dst[0] = dst; > + this->src = srcTuple; > + this->srcNum = srcNum; > + } > + INLINE bool wellFormed(const Function &fn, std::string &why) const; > + INLINE void out(std::ostream &out, const Function &fn) const { > + this->outOpcode(out); > + out << " %" << this->getDst(fn, 0); > + out << " {"; > + for (uint32_t i = 0; i < srcNum; ++i) > + out << "%" << this->getSrc(fn, i) << (i != (srcNum-1u) ? 
" " : ""); > + out << "}"; > + } > + > + Tuple src; > + Register dst[1]; > + uint8_t srcNum; > + }; > + > + > #undef ALIGNED_INSTRUCTION > > ///////////////////////////////////////////////////////////////////////// > @@ -1679,6 +1708,14 @@ namespace ir { > return true; > } > > + INLINE bool SimdShuffleDeltaInstruction::wellFormed(const Function &fn, > std::string &whyNot) const { > + if (this->srcNum != 4) { > + whyNot = "Wrong number of source."; > + return false; > + } > + return true; > + } > + > #undef CHECK_TYPE > > ///////////////////////////////////////////////////////////////////////// > @@ -2154,6 +2191,10 @@ > START_INTROSPECTION(MediaBlockWriteInstruction) > #include "ir/instruction.hxx" > END_INTROSPECTION(MediaBlockWriteInstruction) > > +START_INTROSPECTION(SimdShuffleDeltaInstruction) > +#include "ir/instruction.hxx" > +END_INTROSPECTION(SimdShuffleDeltaInstruction) > + > #undef END_INTROSPECTION > #undef START_INTROSPECTION > #undef DECL_INSN > @@ -2691,6 +2732,9 @@ DECL_MEM_FN(MemInstruction, void, > setBtiReg(Register reg), setBtiReg(reg)) > return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, > srcNum, vec_size).convert(); > } > > + Instruction SIMD_SHUFFLE_DELTA(Register dst, Tuple srcTuple, uint8_t > srcNum) { > + return internal::SimdShuffleDeltaInstruction(dst, srcTuple, > + srcNum).convert(); } > > std::ostream &operator<< (std::ostream &out, const Instruction &insn) { > const Function &fn = insn.getFunction(); diff --git > a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index > b2b0b49..7ee59a2 100644 > --- a/backend/src/ir/instruction.hpp > +++ b/backend/src/ir/instruction.hpp > @@ -653,6 +653,13 @@ namespace ir { > uint8_t getVectorSize() const; > }; > > + /*! simd shuffle */ > + class SimdShuffleDeltaInstruction : public Instruction { > + public: > + /*! Return true if the given instruction is an instance of this class */ > + static bool isClassOf(const Instruction &insn); }; > + > /*! 
Specialize the instruction. Also performs typechecking first based on > the > * opcode. Crashes if it fails > */ > @@ -889,6 +896,8 @@ namespace ir { > Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple > coord, uint8_t srcNum); > /*! media block write */ > Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, > uint8_t vec_size); > + /*! sub_group_shuffle_delta dst srctupel */ Instruction > + SIMD_SHUFFLE_DELTA(Register dst, Tuple srcTuple, uint8_t srcNum); > } /* namespace ir */ > } /* namespace gbe */ > > diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx > index 7d755ae..35f9623 100644 > --- a/backend/src/ir/instruction.hxx > +++ b/backend/src/ir/instruction.hxx > @@ -116,3 +116,4 @@ DECL_INSN(SUBGROUP, SubGroupInstruction) > DECL_INSN(PRINTF, PrintfInstruction) DECL_INSN(MBREAD, > MediaBlockReadInstruction) DECL_INSN(MBWRITE, > MediaBlockWriteInstruction) > +DECL_INSN(SIMD_SHUFFLE_DELTA, SimdShuffleDeltaInstruction) > diff --git a/backend/src/libocl/script/ocl_simd.def > b/backend/src/libocl/script/ocl_simd.def > index e26243e..aa47735 100644 > --- a/backend/src/libocl/script/ocl_simd.def > +++ b/backend/src/libocl/script/ocl_simd.def > @@ -2,3 +2,12 @@ > floatn intel_sub_group_shuffle(floatn x, uint c) intn > intel_sub_group_shuffle(intn x, uint c) uintn intel_sub_group_shuffle(uintn > x, uint c) > +floatn intel_sub_group_shuffle_down(floatn x, floatn y, uint c) intn > +intel_sub_group_shuffle_down(intn x, intn y, uint c) uintn > +intel_sub_group_shuffle_down(uintn x, uintn y, uint c) floatn > +intel_sub_group_shuffle_up(floatn x, floatn y, uint c) intn > +intel_sub_group_shuffle_up(intn x, intn y, uint c) uintn > +intel_sub_group_shuffle_up(uintn x, uintn y, uint c) floatn > +intel_sub_group_shuffle_xor(floatn x, uint c) intn > +intel_sub_group_shuffle_xor(intn x, uint c) uintn > +intel_sub_group_shuffle_xor(uintn x, uint c) > diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > 
b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > index b066502..6aee94e 100644 > --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > @@ -18,6 +18,7 @@ > > #include "ocl_simd.h" > #include "ocl_workitem.h" > +#include "ocl_as.h" > > uint get_max_sub_group_size(void) > { > @@ -216,3 +217,36 @@ OVERLOADABLE void > intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 dat { > __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data); } > + > +PURE CONST uint __gen_ocl_sub_group_shuffle_delta(uint x, uint y, uint > +c, uint inRange); static OVERLOADABLE INLINE uint as_uint(uint x) { > + return x; > +} > +#define SHUFFLE_DOWN(TYPE) \ > +OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) > +{ \ > + uint inRange = ((int)c + (int)get_sub_group_local_id() < > +get_max_sub_group_size()); \ > + return as_##TYPE(__gen_ocl_sub_group_shuffle_delta(as_uint(x), > +as_uint(y), (get_sub_group_local_id() + c) % get_max_sub_group_size(), > +inRange ));\ } > +SHUFFLE_DOWN(float) > +SHUFFLE_DOWN(int) > +SHUFFLE_DOWN(uint) > +#undef SHUFFLE_DOWN > + > +#define SHUFFLE_UP(TYPE) \ > +OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE y, uint c) { > +\ > + uint inRange = ((int)c - (int)get_sub_group_local_id() > 0); \ > + return as_##TYPE(__gen_ocl_sub_group_shuffle_delta(as_uint(x), > +as_uint(y), (get_max_sub_group_size() + get_sub_group_local_id() - c) % > +get_max_sub_group_size(), inRange ));\ } > +SHUFFLE_UP(float) > +SHUFFLE_UP(int) > +SHUFFLE_UP(uint) > +#undef SHUFFLE_UP > +#define SHUFFLE_XOR(TYPE) \ > +OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \ > + return intel_sub_group_shuffle(x, (get_sub_group_local_id() ^ c) % > +get_max_sub_group_size()); \ } > +SHUFFLE_XOR(float) > +SHUFFLE_XOR(int) > +SHUFFLE_XOR(uint) > +#undef SHUFFLE_XOR > diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h > b/backend/src/libocl/tmpl/ocl_simd.tmpl.h > index 799f772..15da0e7 100644 > --- 
a/backend/src/libocl/tmpl/ocl_simd.tmpl.h > +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h > @@ -132,6 +132,15 @@ OVERLOADABLE double > sub_group_scan_exclusive_max(double x); OVERLOADABLE float > intel_sub_group_shuffle(float x, uint c); OVERLOADABLE int > intel_sub_group_shuffle(int x, uint c); OVERLOADABLE uint > intel_sub_group_shuffle(uint x, uint c); > +OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint > +c); OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint > +c); OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint > +c); OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, > +uint c); OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint > +c); OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint > +c); OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c); > +OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c); > +OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c); > > /* blocak read/write */ > OVERLOADABLE uint intel_sub_group_block_read(const global uint* p); diff > --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index 41cb783..8f0bcea 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -3733,6 +3733,7 @@ namespace gbe > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8: > + case GEN_OCL_SUB_GROUP_SHUFFLE_DELTA: > this->newRegister(&I); > break; > case GEN_OCL_PRINTF: > @@ -4897,6 +4898,18 @@ namespace gbe > this->emitBlockReadWriteImageInst(I, CS, true, 4); break; > case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8: > this->emitBlockReadWriteImageInst(I, CS, true, 8); break; > + case GEN_OCL_SUB_GROUP_SHUFFLE_DELTA: > + { > + vector<ir::Register> srcTupleData; > + for (uint32_t i = 0; i < 4; ++i) { > + srcTupleData.push_back(this->getRegister(*AI)); > + ++AI; > + } > + const 
ir::Register dst = this->getRegister(&I); > + ir::Tuple tuple = ctx.arrayTuple(&srcTupleData[0], 4); > + ctx.SIMD_SHUFFLE_DELTA(dst, tuple, 4); > + break; > + } > default: break; > } > } > diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx > b/backend/src/llvm/llvm_gen_ocl_function.hxx > index 48a72d1..dbd25b0 100644 > --- a/backend/src/llvm/llvm_gen_ocl_function.hxx > +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx > @@ -234,5 +234,7 @@ > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, > __gen_ocl_sub_group_block_w > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, > __gen_ocl_sub_group_block_write_image4) > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, > __gen_ocl_sub_group_block_write_image8) > > +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SHUFFLE_DELTA, > +__gen_ocl_sub_group_shuffle_delta) > + > // common function > DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp) > -- > 2.7.4 > > _______________________________________________ > Beignet mailing list > Beignet@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet