Yes, according to spec, for the packed signed/unsigned halfbyte integer vector, it could only be converted to an 8-element signed/unsigned word vector.
For the other 2 comments, I'll refine the code accordingly. -----Original Message----- From: Yang, Rong R Sent: Monday, August 24, 2015 5:00 PM To: Guo, Yejun; beignet@lists.freedesktop.org Cc: Guo, Yejun Subject: RE: [Beignet] [PATCH] remove GBE_CURBE_STACK_POINTER in payload Some comments. > -----Original Message----- > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf > Of Guo Yejun > Sent: Friday, August 14, 2015 02:52 > To: beignet@lists.freedesktop.org > Cc: Guo, Yejun > Subject: [Beignet] [PATCH] remove GBE_CURBE_STACK_POINTER in payload > > initialize the data inside kernel with packed integer vector > > Signed-off-by: Guo Yejun <yejun....@intel.com> > --- > backend/src/backend/context.cpp | 66 > ++---------------------------- > backend/src/backend/context.hpp | 61 > ++++++++++++++++++++++++++- > backend/src/backend/gen75_context.cpp | 4 +- > backend/src/backend/gen_context.cpp | 33 ++++++++++++++- > backend/src/backend/gen_context.hpp | 2 + > backend/src/backend/gen_reg_allocation.cpp | 26 +++++++----- > backend/src/backend/program.h | 1 - > backend/src/backend/program.hpp | 2 +- > src/cl_command_queue_gen7.c | 9 ---- > 9 files changed, 116 insertions(+), 88 deletions(-) > > diff --git a/backend/src/backend/context.cpp > b/backend/src/backend/context.cpp index b8dfa8c..16b5961 100644 > --- a/backend/src/backend/context.cpp > +++ b/backend/src/backend/context.cpp > @@ -35,66 +35,6 @@ > > namespace gbe > { > - class SimpleAllocator > - { > - public: > - SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail); > - ~SimpleAllocator(void); > - > - /*! Allocate some memory from the pool. > - */ > - int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false); > - > - /*! Free the given register file piece */ > - void deallocate(int16_t offset); > - > - /*! Spilt a block into 2 blocks */ > - void splitBlock(int16_t offset, int16_t subOffset); > - > - protected: > - /*! Double chained list of free spaces */ > - struct Block { > - Block(int16_t offset, int16_t size) : > - prev(NULL), next(NULL), offset(offset), size(size) {} > - Block *prev, *next; //!< Previous and next free blocks > - int16_t offset; //!< Where the free block starts > - int16_t size; //!< Size of the free block > - }; > - > - /*! Try to coalesce two blocks (left and right). They must be in that > order. > - * If the colascing was done, the left block is deleted > - */ > - void coalesce(Block *left, Block *right); > - /*! the maximum offset */ > - int16_t maxOffset; > - /*! whether trigger an assertion on allocation failure */ > - bool assertFail; > - /*! Head and tail of the free list */ > - Block *head; > - Block *tail; > - /*! Handle free list element allocation */ > - DECL_POOL(Block, blockPool); > - /*! Track allocated memory blocks <offset, size> */ > - map<int16_t, int16_t> allocatedBlocks; > - /*! Use custom allocators */ > - GBE_CLASS(SimpleAllocator); > - }; > - > - /*! Structure that keeps track of allocation in the register file. This is > - * actually needed by Context (and not only by GenContext) because both > - * simulator and hardware have to deal with constant pushing which uses > the > - * register file > - * > - * Since Gen is pretty flexible, we just reuse the Simpleallocator > - */ > - > - class RegisterAllocator: public SimpleAllocator { > - public: > - RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, > size, > false) {} > - > - GBE_CLASS(RegisterAllocator); > - }; > - > /*! > * an allocator for scratch memory allocation. Scratch memory are > used for register spilling. > * You can query how much scratch memory needed through > getMaxScatchMemUsed(). > @@ -396,10 +336,10 @@ namespace gbe > > void Context::buildStack(void) { > const auto &stackUse = dag->getUse(ir::ocl::stackptr); > - if (stackUse.size() == 0) // no stack is used if stackptr is unused > + if (stackUse.size() == 0) { // no stack is used if stackptr is unused > + this->kernel->stackSize = 0; > return; > - // Be sure that the stack pointer is set > - // GBE_ASSERT(this->kernel- > >getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0); > + } > uint32_t stackSize = 128; > while (stackSize < fn.getStackSize()) { > stackSize *= 3; > diff --git a/backend/src/backend/context.hpp > b/backend/src/backend/context.hpp index faa7c8a..48b46ed 100644 > --- a/backend/src/backend/context.hpp > +++ b/backend/src/backend/context.hpp > @@ -42,9 +42,68 @@ namespace ir { > namespace gbe > { > class Kernel; // context creates Kernel > - class RegisterAllocator; // allocator for physical register allocation > class ScratchAllocator; // allocator for scratch memory allocation > > + class SimpleAllocator > + { > + public: > + SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail); > + ~SimpleAllocator(void); > + > + /*! Allocate some memory from the pool. > + */ > + int16_t allocate(int16_t size, int16_t alignment, bool > + bFwd=false); > + > + /*! Free the given register file piece */ > + void deallocate(int16_t offset); > + > + /*! Spilt a block into 2 blocks */ > + void splitBlock(int16_t offset, int16_t subOffset); > + > + protected: > + /*! Double chained list of free spaces */ > + struct Block { > + Block(int16_t offset, int16_t size) : > + prev(NULL), next(NULL), offset(offset), size(size) {} > + Block *prev, *next; //!< Previous and next free blocks > + int16_t offset; //!< Where the free block starts > + int16_t size; //!< Size of the free block > + }; > + > + /*! Try to coalesce two blocks (left and right). They must be in that > order. > + * If the colascing was done, the left block is deleted > + */ > + void coalesce(Block *left, Block *right); > + /*! the maximum offset */ > + int16_t maxOffset; > + /*! whether trigger an assertion on allocation failure */ > + bool assertFail; > + /*! Head and tail of the free list */ > + Block *head; > + Block *tail; > + /*! Handle free list element allocation */ > + DECL_POOL(Block, blockPool); > + /*! Track allocated memory blocks <offset, size> */ > + map<int16_t, int16_t> allocatedBlocks; > + /*! Use custom allocators */ > + GBE_CLASS(SimpleAllocator); > + }; > + > + /*! Structure that keeps track of allocation in the register file. This is > + * actually needed by Context (and not only by GenContext) because both > + * simulator and hardware have to deal with constant pushing which > + uses > the > + * register file > + * > + * Since Gen is pretty flexible, we just reuse the Simpleallocator > + */ > + > + class RegisterAllocator: public SimpleAllocator { > + public: > + RegisterAllocator(int16_t offset, int16_t size): > + SimpleAllocator(offset, size, false) {} > + > + GBE_CLASS(RegisterAllocator); > + }; > + > /*! Context is the helper structure to build the Gen ISA or simulation code > * from GenIR > */ > diff --git a/backend/src/backend/gen75_context.cpp > b/backend/src/backend/gen75_context.cpp > index b9dfb18..7d407c3 100644 > --- a/backend/src/backend/gen75_context.cpp > +++ b/backend/src/backend/gen75_context.cpp > @@ -67,7 +67,7 @@ namespace gbe > using namespace ir; > > // Only emit stack pointer computation if we use a stack > - if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0) > + if (kernel->getStackSize() == 0) > return; > > // Check that everything is consistent in the kernel code @@ > -80,6 +80,8 @@ namespace gbe > GenRegister::ud16grf(ir::ocl::stackptr); > const GenRegister stackptr = ra->genReg(selStatckPtr); > > + loadLaneID(stackptr); > + > // We compute the per-lane stack pointer here > // private address start from zero > p->push(); > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index 0c301dd..25fdf08 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -176,11 +176,39 @@ namespace gbe > p->pop(); > } > > + void GenContext::loadLaneID(GenRegister dst) { > + const GenRegister laneID = GenRegister::immv(0x76543210); > + GenRegister dst_; > + if (dst.type == GEN_TYPE_UW) > + dst_ = dst; > + else > + dst_ = GenRegister::uw16grf(126,0); Does GEN only support move immv register to word or unsigned word? > + > + p->push(); > + uint32_t execWidth = p->curr.execWidth; > + p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > + if (execWidth == 8) > + p->MOV(dst_, laneID); > + else { > + p->curr.execWidth = 8; > + p->MOV(dst_, laneID); > + //Packed Unsigned Half-Byte Integer Vector does not work > + //have to mock by adding 8 to the singed vector > + const GenRegister eight = GenRegister::immuw(8); > + p->ADD(GenRegister::offset(dst_, 0, 16), dst_, eight); > + p->curr.execWidth = 16; > + } > + if (dst.type != GEN_TYPE_UW) > + p->MOV(dst, dst_); > + p->pop(); > + } > + > void GenContext::emitStackPointer(void) { > using namespace ir; > > // Only emit stack pointer computation if we use a stack > - if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0) > + if (kernel->getStackSize() == 0) > return; > > // Check that everything is consistent in the kernel code @@ > -193,6 +221,8 @@ namespace gbe > GenRegister::ud16grf(ir::ocl::stackptr); > const GenRegister stackptr = ra->genReg(selStatckPtr); > > + loadLaneID(stackptr); > + > // We compute the per-lane stack pointer here > // threadId * perThreadSize + laneId*perLaneSize > // let private address start from zero @@ -2254,7 +2284,6 @@ > namespace gbe > INSERT_REG(numgroup0, GROUP_NUM_X) > INSERT_REG(numgroup1, GROUP_NUM_Y) > INSERT_REG(numgroup2, GROUP_NUM_Z) > - INSERT_REG(stackptr, STACK_POINTER) > INSERT_REG(printfbptr, PRINTF_BUF_POINTER) > INSERT_REG(printfiptr, PRINTF_INDEX_POINTER) > do {} while(0); > diff --git a/backend/src/backend/gen_context.hpp > b/backend/src/backend/gen_context.hpp > index 8ef725f..34f9293 100644 > --- a/backend/src/backend/gen_context.hpp > +++ b/backend/src/backend/gen_context.hpp > @@ -107,6 +107,8 @@ namespace gbe > return this->liveness->getLiveIn(bb); > } > > + void loadLaneID(GenRegister dst); > + > void collectShifter(GenRegister dest, GenRegister src); > void loadTopHalf(GenRegister dest, GenRegister src); > void storeTopHalf(GenRegister dest, GenRegister src); diff --git > a/backend/src/backend/gen_reg_allocation.cpp > b/backend/src/backend/gen_reg_allocation.cpp > index 4cb88e9..36ee80a 100644 > --- a/backend/src/backend/gen_reg_allocation.cpp > +++ b/backend/src/backend/gen_reg_allocation.cpp > @@ -133,8 +133,8 @@ namespace gbe > void validateFlag(Selection &selection, SelectionInstruction &insn); > /*! Allocate the GRF registers */ > bool allocateGRFs(Selection &selection); > - /*! Create gen registers for all preallocated curbe registers. */ > - void allocatePayloadRegs(void); > + /*! Create gen registers for all preallocated special registers. */ > + void allocateSpecialRegs(void); > /*! Create a Gen register from a register set in the payload */ > void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t > subOffset = 0); > /*! Create the intervals for each register */ @@ -228,7 +228,7 @@ > namespace gbe > this->intervals[reg].maxID = 0; > } > > - INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) { > + INLINE void GenRegAllocator::Opaque::allocateSpecialRegs(void) { > using namespace ir; > for(auto &it : this->ctx.curbeRegs) > allocatePayloadReg(it.first, it.second); @@ -248,6 +248,18 @@ > namespace gbe > allocatePayloadReg(reg, it->second, subOffset); > ctx.splitBlock(it->second, subOffset); > } > + > + if (RA.contains(ocl::stackbuffer)) { > + uint32_t regSize = this->ctx.ra->getRegSize(ocl::stackptr); You can use this->getRegAttrib directly. > + uint32_t offset = > + this->ctx.registerAllocator->allocate(regSize, regSize, 1); Should not call ctx.registerAllocator->allocate, you can use ctx.allocate. > + RA.insert(std::make_pair(ocl::stackptr, offset)); > + } > + > + // Group and barrier IDs are always allocated by the hardware in r0 > + RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1 > + RA.insert(std::make_pair(ocl::groupid1, 6*sizeof(float))); // r0.6 > + RA.insert(std::make_pair(ocl::groupid2, 7*sizeof(float))); // r0.7 > + RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // > + r0.2 > } > > bool GenRegAllocator::Opaque::createGenReg(const Selection > &selection, const GenRegInterval &interval) { @@ -1001,13 +1013,7 @@ > namespace gbe > this->intervals.push_back(ir::Register(regID)); > > // Allocate the special registers (only those which are actually used) > - this->allocatePayloadRegs(); > - > - // Group and barrier IDs are always allocated by the hardware in r0 > - RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1 > - RA.insert(std::make_pair(ocl::groupid1, 6*sizeof(float))); // r0.6 > - RA.insert(std::make_pair(ocl::groupid2, 7*sizeof(float))); // r0.7 > - RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2 > + this->allocateSpecialRegs(); > > // block IP used to handle the mask in SW is always allocated > > diff --git a/backend/src/backend/program.h > b/backend/src/backend/program.h index fa75052..af19732 100644 > --- a/backend/src/backend/program.h > +++ b/backend/src/backend/program.h > @@ -91,7 +91,6 @@ enum gbe_curbe_type { > GBE_CURBE_GROUP_NUM_Z, > GBE_CURBE_WORK_DIM, > GBE_CURBE_IMAGE_INFO, > - GBE_CURBE_STACK_POINTER, > GBE_CURBE_PRINTF_BUF_POINTER, > GBE_CURBE_PRINTF_INDEX_POINTER, > GBE_CURBE_KERNEL_ARGUMENT, > diff --git a/backend/src/backend/program.hpp > b/backend/src/backend/program.hpp index cff2463..efe192f 100644 > --- a/backend/src/backend/program.hpp > +++ b/backend/src/backend/program.hpp > @@ -223,7 +223,7 @@ namespace gbe { > uint32_t argNum; //!< Number of function arguments > uint32_t curbeSize; //!< Size of the data to push > uint32_t simdWidth; //!< SIMD size for the kernel (lane number) > - uint32_t stackSize; //!< Stack size (may be 0 if unused) > + uint32_t stackSize; //!< Stack size (0 if unused) > uint32_t scratchSize; //!< Scratch memory size (may be 0 if unused) > bool useSLM; //!< SLM requires a special HW config > uint32_t slmSize; //!< slm size for kernel variable > diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c > index 4adbd2b..0e60528 100644 > --- a/src/cl_command_queue_gen7.c > +++ b/src/cl_command_queue_gen7.c > @@ -210,15 +210,6 @@ cl_curbe_fill(cl_kernel ker, > UPLOAD(GBE_CURBE_WORK_DIM, work_dim); #undef UPLOAD > > - /* Write identity for the stack pointer. This is required by the stack > pointer > - * computation in the kernel > - */ > - if ((offset = interp_kernel_get_curbe_offset(ker->opaque, > GBE_CURBE_STACK_POINTER, 0)) >= 0) { > - const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque); > - uint32_t *stackptr = (uint32_t *) (ker->curbe + offset); > - int32_t i; > - for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i; > - } > /* Handle the various offsets to SLM */ > const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque); > int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque); > -- > 1.9.1 > > _______________________________________________ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet