From: Pan Xiuli <xiuli....@intel.com> Using media block read/write to read data in blocks. In simd16 mode the data needs some register relocation for later use. GEN7 has a different data port. V2: Refine block read simd16 with tmp reg to avoid MOVs
Signed-off-by: Pan Xiuli <xiuli....@intel.com> --- backend/src/backend/gen/gen_mesa_disasm.c | 27 +++- backend/src/backend/gen7_encoder.cpp | 48 +++++++ backend/src/backend/gen7_encoder.hpp | 4 + backend/src/backend/gen7_instruction.hpp | 16 +++ backend/src/backend/gen8_instruction.hpp | 16 +++ backend/src/backend/gen_context.cpp | 155 +++++++++++++++++++++ backend/src/backend/gen_context.hpp | 2 + backend/src/backend/gen_defs.hpp | 16 +++ backend/src/backend/gen_encoder.cpp | 47 +++++++ backend/src/backend/gen_encoder.hpp | 4 + .../src/backend/gen_insn_gen7_schedule_info.hxx | 2 + backend/src/backend/gen_insn_selection.cpp | 115 ++++++++++++++- backend/src/backend/gen_insn_selection.hpp | 4 + backend/src/backend/gen_insn_selection.hxx | 2 + backend/src/ir/instruction.cpp | 112 ++++++++++++++- backend/src/ir/instruction.hpp | 22 +++ backend/src/ir/instruction.hxx | 2 + backend/src/ir/liveness.cpp | 3 +- backend/src/libocl/src/ocl_substore.ll | 33 +++++ backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 21 +++ backend/src/libocl/tmpl/ocl_simd.tmpl.h | 10 ++ backend/src/llvm/llvm_gen_backend.cpp | 62 ++++++++- backend/src/llvm/llvm_gen_ocl_function.hxx | 8 ++ backend/src/llvm/llvm_scalarize.cpp | 14 ++ 24 files changed, 732 insertions(+), 13 deletions(-) diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c index 9200c26..9955dfc 100644 --- a/backend/src/backend/gen/gen_mesa_disasm.c +++ b/backend/src/backend/gen/gen_mesa_disasm.c @@ -1476,6 +1476,15 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac SAMPLER_MSG_TYPE(inst), SAMPLER_SIMD_MODE(inst)); break; + case GEN_SFID_DATAPORT_RENDER: + if(UNTYPED_RW_MSG_TYPE(inst) == 4 || UNTYPED_RW_MSG_TYPE(inst) == 10) + format(file, " (bti: %d, %s, %s)", + UNTYPED_RW_BTI(inst), + data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)], + data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]); + else + format(file, " not implemented"); + break; case 
GEN_SFID_DATAPORT_DATA: if(UNTYPED_RW_CATEGORY(inst) == 0) { if(UNTYPED_RW_MSG_TYPE(inst) == 5 || UNTYPED_RW_MSG_TYPE(inst) == 13) @@ -1510,12 +1519,18 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac } break; case GEN_SFID_DATAPORT1_DATA: - format(file, " (bti: %d, rgba: %d, %s, %s, %s)", - UNTYPED_RW_BTI(inst), - UNTYPED_RW_RGBA(inst), - data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)], - data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)], - data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]); + if(UNTYPED_RW_MSG_TYPE(inst) == 4 || UNTYPED_RW_MSG_TYPE(inst) == 10) + format(file, " (bti: %d, %s, %s)", + UNTYPED_RW_BTI(inst), + data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)], + data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]); + else + format(file, " (bti: %d, rgba: %d, %s, %s, %s)", + UNTYPED_RW_BTI(inst), + UNTYPED_RW_RGBA(inst), + data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)], + data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)], + data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]); break; case GEN_SFID_DATAPORT_CONSTANT: format(file, " (bti: %d, %s)", diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp index fc358be..abb8b77 100644 --- a/backend/src/backend/gen7_encoder.cpp +++ b/backend/src/backend/gen7_encoder.cpp @@ -239,5 +239,53 @@ namespace gbe } } + static void setMBlockRWGEN7(GenEncoder *p, + GenNativeInstruction *insn, + uint32_t bti, + uint32_t msg_type, + uint32_t msg_length, + uint32_t response_length) + { + const GenMessageTarget sfid = GEN_SFID_DATAPORT_RENDER; + p->setMessageDescriptor(insn, sfid, msg_length, response_length); + insn->bits3.gen7_mblock_rw.msg_type = msg_type; + insn->bits3.gen7_mblock_rw.bti = bti; + insn->bits3.gen7_mblock_rw.header_present = 1; + } + + + void Gen7Encoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { + GenNativeInstruction *insn = 
this->next(GEN_OPCODE_SEND); + const uint32_t msg_length = 1; + const uint32_t response_length = size; // Size of registers + this->setHeader(insn); + this->setDst(insn, GenRegister::ud8grf(dst.nr, 0)); + this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); + this->setSrc1(insn, GenRegister::immud(0)); + setMBlockRWGEN7(this, + insn, + bti, + GEN75_P1_MEDIA_BREAD, + msg_length, + response_length); + } + + void Gen7Encoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) { + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); + const uint32_t msg_length = 1 + size; + const uint32_t response_length = 0; // Size of registers + this->setHeader(insn); + this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); + this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); + this->setSrc1(insn, GenRegister::immud(0)); + setMBlockRWGEN7(this, + insn, + bti, + GEN75_P1_MEDIA_TYPED_BWRITE, + msg_length, + response_length); + } + + #undef NO_SWIZZLE } diff --git a/backend/src/backend/gen7_encoder.hpp b/backend/src/backend/gen7_encoder.hpp index 1276c67..edb711d 100644 --- a/backend/src/backend/gen7_encoder.hpp +++ b/backend/src/backend/gen7_encoder.hpp @@ -42,6 +42,10 @@ namespace gbe virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg); virtual void alu3(uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, GenRegister src2); + /*! MBlock read */ + virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize); + /*! MBlock write */ + virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize); }; } #endif /* __GBE_GEN7_ENCODER_HPP__ */ diff --git a/backend/src/backend/gen7_instruction.hpp b/backend/src/backend/gen7_instruction.hpp index 258dd24..7d7eada 100644 --- a/backend/src/backend/gen7_instruction.hpp +++ b/backend/src/backend/gen7_instruction.hpp @@ -531,6 +531,22 @@ union Gen7NativeInstruction uint32_t uip:16; } gen7_branch; + /*! 
Data port Media block read / write */ + struct { + uint32_t bti:8; + uint32_t ver_line_stride_offset:1; + uint32_t ver_line_stride:1; + uint32_t ver_line_stride_override:1; + uint32_t ignored:3; + uint32_t msg_type:4; + uint32_t category:1; + uint32_t header_present:1; + uint32_t response_length:5; + uint32_t msg_length:4; + uint32_t pad2:2; + uint32_t end_of_thread:1; + } gen7_mblock_rw; + int d; uint32_t ud; float f; diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp index ada9ffc..549948a 100644 --- a/backend/src/backend/gen8_instruction.hpp +++ b/backend/src/backend/gen8_instruction.hpp @@ -608,6 +608,22 @@ union Gen8NativeInstruction uint32_t jip:32; } gen8_branch; + /*! Data port Media block read / write */ + struct { + uint32_t bti:8; + uint32_t ver_line_stride_offset:1; + uint32_t ver_line_stride:1; + uint32_t ver_line_stride_override:1; + uint32_t ignored:3; + uint32_t msg_type:4; + uint32_t category:1; + uint32_t header_present:1; + uint32_t response_length:5; + uint32_t msg_length:4; + uint32_t pad2:2; + uint32_t end_of_thread:1; + } gen7_mblock_rw; + int d; uint32_t ud; float f; diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 90b8b45..98a94ba 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -3538,6 +3538,161 @@ namespace gbe p->OBWRITE(header, insn.getbti(), insn.extra.elem); } + void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) { + const GenRegister dst = ra->genReg(insn.dst(0)); + const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D); + const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D); + GenRegister header, offsetx, offsety, blocksizereg; + if (simdWidth == 8) + header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); + else + header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(2)),1), GEN_TYPE_UD); + + 
offsetx = GenRegister::offset(header, 0, 0*4); + offsety = GenRegister::offset(header, 0, 1*4); + blocksizereg = GenRegister::offset(header, 0, 2*4); + size_t vec_size = insn.extra.elem; + uint32_t blocksize = 0x1F | (vec_size-1) << 16; + + if (simdWidth == 8) + { + p->push(); + // Copy r0 into the header first + p->curr.execWidth = 8; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->MOV(header, GenRegister::ud8grf(0,0)); + + // Update the header with the coord + p->curr.execWidth = 1; + p->MOV(offsetx, coordx); + p->MOV(offsety, coordy); + // Update block width and height + p->MOV(blocksizereg, GenRegister::immud(blocksize)); + // Now read the data + p->curr.execWidth = 8; + p->MBREAD(dst, header, insn.getbti(), vec_size); + p->pop(); + + } + else + { + const GenRegister tmp = ra->genReg(insn.dst(vec_size)); + p->push(); + // Copy r0 into the header first + p->curr.execWidth = 8; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->MOV(header, GenRegister::ud8grf(0,0)); + + // First half + // Update the header with the coord + p->curr.execWidth = 1; + p->MOV(offsetx, coordx); + p->MOV(offsety, coordy); + // Update block width and height + p->MOV(blocksizereg, GenRegister::immud(blocksize)); + // Now read the data + p->curr.execWidth = 8; + p->MBREAD(tmp, header, insn.getbti(), vec_size); + + // Second half + // Update the header with the coord + p->curr.execWidth = 1; + p->ADD(offsetx, offsetx, GenRegister::immud(32)); + + const GenRegister tmp2 = GenRegister::offset(tmp, vec_size); + // Now read the data + p->curr.execWidth = 8; + p->MBREAD(tmp2, header, insn.getbti(), vec_size); + + // Move the reg to fit vector rule. 
+ for (int i = 0; i < vec_size; i++) { + p->MOV(GenRegister::offset(dst, i * 2), GenRegister::offset(tmp, i)); + p->MOV(GenRegister::offset(dst, i * 2 + 1), + GenRegister::offset(tmp2, i)); + } + p->pop(); + } + } + + void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) { + const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D); + const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D); + GenRegister header, offsetx, offsety, blocksizereg; + size_t vec_size = insn.extra.elem; + uint32_t blocksize = 0x1F | (vec_size-1) << 16; + if (simdWidth == 8) + header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); + else + header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.dst(0)),1), GEN_TYPE_UD); + + offsetx = GenRegister::offset(header, 0, 0*4); + offsety = GenRegister::offset(header, 0, 1*4); + blocksizereg = GenRegister::offset(header, 0, 2*4); + + if (simdWidth == 8) + { + p->push(); + // Copy r0 into the header first + p->curr.execWidth = 8; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->MOV(header, GenRegister::ud8grf(0,0)); + + // Update the header with the coord + p->curr.execWidth = 1; + p->MOV(offsetx, coordx); + p->MOV(offsety, coordy); + // Update block width and height + p->MOV(blocksizereg, GenRegister::immud(blocksize)); + p->curr.execWidth = 8; + // Mov what we need into msgs + for(uint32_t i = 0; i < vec_size; i++) + p->MOV(ra->genReg(insn.dst(1 + i)), ra->genReg(insn.src(2 + i))); + // Now read the data + p->MBWRITE(header, insn.getbti(), vec_size); + p->pop(); + + } + else + { + p->push(); + // Copy r0 into the header first + p->curr.execWidth = 8; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->MOV(header, GenRegister::ud8grf(0,0)); + + // First half + // Update the header with the coord + p->curr.execWidth = 1; + p->MOV(offsetx, coordx); + p->MOV(offsety, coordy); + // Update block width and height + 
p->MOV(blocksizereg, GenRegister::immud(blocksize)); + // Now read the data + p->curr.execWidth = 8; + // Mov what we need into msgs + for(uint32_t i = 0; i < vec_size; i++) + p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i))); + p->MBWRITE(header, insn.getbti(), vec_size); + + // Second half + // Update the header with the coord + p->curr.execWidth = 1; + p->ADD(offsetx, offsetx, GenRegister::immud(32)); + + p->curr.execWidth = 8; + // Mov what we need into msgs + for(uint32_t i = 0; i < vec_size; i++) + p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1)); + // Now write the data + p->MBWRITE(header, insn.getbti(), vec_size); + + p->pop(); + } + } BVAR(OCL_OUTPUT_REG_ALLOC, false); BVAR(OCL_OUTPUT_ASM, false); diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index a634338..fb3d4fe 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -189,6 +189,8 @@ namespace gbe void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, GenRegister btiTmp, unsigned jip0); void emitOBReadInstruction(const SelectionInstruction &insn); void emitOBWriteInstruction(const SelectionInstruction &insn); + void emitMBReadInstruction(const SelectionInstruction &insn); + void emitMBWriteInstruction(const SelectionInstruction &insn); /*! Implements base class */ virtual Kernel *allocateKernel(void); diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp index 09cb2ba..66ae5b5 100644 --- a/backend/src/backend/gen_defs.hpp +++ b/backend/src/backend/gen_defs.hpp @@ -784,6 +784,22 @@ union GenNativeInstruction uint32_t jip:32; } gen8_branch; + /*! 
Data port Media block read / write */ + struct { + uint32_t bti:8; + uint32_t ver_line_stride_offset:1; + uint32_t ver_line_stride:1; + uint32_t ver_line_stride_override:1; + uint32_t ignored:3; + uint32_t msg_type:4; + uint32_t category:1; + uint32_t header_present:1; + uint32_t response_length:5; + uint32_t msg_length:4; + uint32_t pad2:2; + uint32_t end_of_thread:1; + } gen7_mblock_rw; + int d; uint32_t ud; float f; diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index e745b9c..eb9fbeb 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -276,6 +276,21 @@ namespace gbe insn->bits3.gen7_oblock_rw.header_present = 1; } + static void setMBlockRW(GenEncoder *p, + GenNativeInstruction *insn, + uint32_t bti, + uint32_t msg_type, + uint32_t msg_length, + uint32_t response_length) + { + const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA; + p->setMessageDescriptor(insn, sfid, msg_length, response_length); + insn->bits3.gen7_mblock_rw.msg_type = msg_type; + insn->bits3.gen7_mblock_rw.bti = bti; + insn->bits3.gen7_mblock_rw.header_present = 1; + } + + static void setDWordScatterMessgae(GenEncoder *p, GenNativeInstruction *insn, uint32_t bti, @@ -1277,6 +1292,38 @@ namespace gbe response_length); } + void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); + const uint32_t msg_length = 1; + const uint32_t response_length = size; // Size of registers + this->setHeader(insn); + this->setDst(insn, GenRegister::ud8grf(dst.nr, 0)); + this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); + this->setSrc1(insn, GenRegister::immud(0)); + setMBlockRW(this, + insn, + bti, + GEN75_P1_MEDIA_BREAD, + msg_length, + response_length); + } + + void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) { + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); + const uint32_t msg_length = 1 + 
size; + const uint32_t response_length = 0; // Size of registers + this->setHeader(insn); + this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); + this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); + this->setSrc1(insn, GenRegister::immud(0)); + setMBlockRW(this, + insn, + bti, + GEN75_P1_MEDIA_TYPED_BWRITE, + msg_length, + response_length); + } + void GenEncoder::EOT(uint32_t msg) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index a53c879..4979305 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -271,6 +271,10 @@ namespace gbe void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize); /*! OBlock write */ void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize); + /*! MBlock read */ + virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize); + /*! 
MBlock write */ + virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize); GBE_CLASS(GenEncoder); //!< Use custom allocators virtual void alu3(uint32_t opcode, GenRegister dst, diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx index d297726..c396626 100644 --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx @@ -52,3 +52,5 @@ DECL_GEN7_SCHEDULE(SubGroupOp, 80, 1, 1) DECL_GEN7_SCHEDULE(Printf, 80, 1, 1) DECL_GEN7_SCHEDULE(OBRead, 80, 1, 1) DECL_GEN7_SCHEDULE(OBWrite, 80, 1, 1) +DECL_GEN7_SCHEDULE(MBRead, 80, 1, 1) +DECL_GEN7_SCHEDULE(MBWrite, 80, 1, 1) diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index e974e97..39688ad 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -189,7 +189,8 @@ namespace gbe this->opcode == SEL_OP_SAMPLE || this->opcode == SEL_OP_VME || this->opcode == SEL_OP_DWORD_GATHER || - this->opcode == SEL_OP_OBREAD; + this->opcode == SEL_OP_OBREAD || + this->opcode == SEL_OP_MBREAD; } bool SelectionInstruction::modAcc(void) const { @@ -212,7 +213,8 @@ namespace gbe this->opcode == SEL_OP_ATOMIC || this->opcode == SEL_OP_BYTE_SCATTER || this->opcode == SEL_OP_TYPED_WRITE || - this->opcode == SEL_OP_OBWRITE; + this->opcode == SEL_OP_OBWRITE || + this->opcode == SEL_OP_MBWRITE; } bool SelectionInstruction::isBranch(void) const { @@ -703,6 +705,10 @@ namespace gbe void OBREAD(GenRegister dst, GenRegister addr, GenRegister header, uint32_t bti, uint32_t size); /*! Oblock write */ void OBWRITE(GenRegister addr, GenRegister value, GenRegister header, uint32_t bti, uint32_t size); + /*! Media block read */ + void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size); + /*! 
Media block write */ + void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size); /* common functions for both binary instruction and sel_cmp and compare instruction. It will handle the IMM or normal register assignment, and will try to avoid LOADI @@ -2055,6 +2061,63 @@ namespace gbe vector->isSrc = 1; } + void Selection::Opaque::MBREAD(GenRegister* dsts, + GenRegister coordx, + GenRegister coordy, + GenRegister header, + GenRegister* tmp, + uint32_t bti, + uint32_t vec_size) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * 2, 3); + SelectionVector *vector = this->appendVector(); + SelectionVector *vectortmp = this->appendVector(); + for (uint32_t i = 0; i < vec_size; ++i) { + insn->dst(i) = dsts[i]; + insn->dst(i + vec_size) = tmp[i]; + } + insn->src(0) = coordx; + insn->src(1) = coordy; + insn->src(2) = header; + insn->setbti(bti); + insn->extra.elem = vec_size; // vector size + + vector->regNum = vec_size; + vector->reg = &insn->dst(0); + vector->offsetID = 0; + vector->isSrc = 0; + vectortmp->regNum = vec_size; + vectortmp->reg = &insn->dst(vec_size); + vectortmp->offsetID = 0; + vectortmp->isSrc = 0; + + } + + void Selection::Opaque::MBWRITE(GenRegister coordx, + GenRegister coordy, + GenRegister* values, + GenRegister header, + GenRegister* tmp, + uint32_t bti, + uint32_t vec_size) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size); + SelectionVector *vector = this->appendVector(); + insn->src(0) = coordx; + insn->src(1) = coordy; + for (uint32_t i = 0; i < vec_size; ++i) + insn->src(2 + i) = values[i]; + insn->dst(0) = header; + for (uint32_t i = 0; i < vec_size; ++i) + insn->dst(1 + i) = tmp[i]; + insn->state = this->curr; + insn->setbti(bti); + insn->extra.elem = vec_size; // vector size + + // We need to put the header and the data together + vector->regNum = 1 + vec_size; + vector->reg = 
&insn->dst(0); + vector->offsetID = 0; + vector->isSrc = 0; + } // Boiler plate to initialize the selection library at c++ pre-main static SelectionLibrary *selLib = NULL; @@ -6583,6 +6646,52 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp } }; + /*! Media Block Read pattern */ + DECL_PATTERN(MediaBlockReadInstruction) + { + bool emitOne(Selection::Opaque &sel, const ir::MediaBlockReadInstruction &insn, bool &markChildren) const + { + using namespace ir; + uint32_t vec_size = insn.getVectorSize(); + GenRegister values[vec_size]; + GenRegister tmp[vec_size]; + for (uint32_t i = 0; i < vec_size; ++i) { + values[i] = sel.selReg(insn.getDst(i), TYPE_U32); + tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + } + const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32); + const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32); + const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + sel.MBREAD(values, coordx, coordy, header, tmp, insn.getImageIndex(), insn.getVectorSize()); + return true; + } + DECL_CTOR(MediaBlockReadInstruction, 1, 1); + }; + + /*! Media Block Write pattern */ + DECL_PATTERN(MediaBlockWriteInstruction) + { + bool emitOne(Selection::Opaque &sel, const ir::MediaBlockWriteInstruction &insn, bool &markChildren) const + { + using namespace ir; + uint32_t vec_size = insn.getVectorSize(); + const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32); + const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32); + GenRegister values[vec_size]; + GenRegister tmp[vec_size]; + for(uint32_t i = 0; i < vec_size; i++) + { + values[i] = sel.selReg(insn.getSrc(2 + i), TYPE_U32); + tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + } + const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + sel.MBWRITE(coordx, coordy, values, header, tmp, insn.getImageIndex(), vec_size); + return true; + } + DECL_CTOR(MediaBlockWriteInstruction, 1, 1); + }; + + /*! 
Sort patterns */ INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) { if (p0->insnNum != p1->insnNum) @@ -6624,6 +6733,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp this->insert<NullaryInstructionPattern>(); this->insert<WaitInstructionPattern>(); this->insert<PrintfInstructionPattern>(); + this->insert<MediaBlockReadInstructionPattern>(); + this->insert<MediaBlockWriteInstructionPattern>(); // Sort all the patterns with the number of instructions they output for (uint32_t op = 0; op < ir::OP_INVALID; ++op) diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp index 51af686..b481de8 100644 --- a/backend/src/backend/gen_insn_selection.hpp +++ b/backend/src/backend/gen_insn_selection.hpp @@ -177,6 +177,8 @@ namespace gbe switch (opcode) { case SEL_OP_OBREAD: case SEL_OP_OBWRITE: + case SEL_OP_MBREAD: + case SEL_OP_MBWRITE: case SEL_OP_DWORD_GATHER: return extra.function; case SEL_OP_SAMPLE: return extra.rdbti; case SEL_OP_VME: return extra.vme_bti; @@ -192,6 +194,8 @@ namespace gbe switch (opcode) { case SEL_OP_OBREAD: case SEL_OP_OBWRITE: + case SEL_OP_MBREAD: + case SEL_OP_MBWRITE: case SEL_OP_DWORD_GATHER: extra.function = bti; return; case SEL_OP_SAMPLE: extra.rdbti = bti; return; case SEL_OP_VME: extra.vme_bti = bti; return; diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx index 4a7caff..ccaf526 100644 --- a/backend/src/backend/gen_insn_selection.hxx +++ b/backend/src/backend/gen_insn_selection.hxx @@ -98,3 +98,5 @@ DECL_SELECTION_IR(SUBGROUP_OP, SubGroupOpInstruction) DECL_SELECTION_IR(PRINTF, PrintfInstruction) DECL_SELECTION_IR(OBREAD, OBReadInstruction) DECL_SELECTION_IR(OBWRITE, OBWriteInstruction) +DECL_SELECTION_IR(MBREAD, MBReadInstruction) +DECL_SELECTION_IR(MBWRITE, MBWriteInstruction) diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index 88491a7..ed64580 100644 --- 
a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -1064,6 +1064,78 @@ namespace ir { Register dst[1]; }; + class ALIGNED_INSTRUCTION MediaBlockReadInstruction : + public BasePolicy, + public TupleSrcPolicy<MediaBlockReadInstruction>, + public TupleDstPolicy<MediaBlockReadInstruction> + { + public: + INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum) { + this->opcode = OP_MBREAD; + this->dst = dst; + this->dstNum = vec_size; + this->src = srcTuple; + this->srcNum = srcNum; + this->imageIdx = imageIdx; + } + INLINE bool wellFormed(const Function &fn, std::string &why) const; + INLINE void out(std::ostream &out, const Function &fn) const { + this->outOpcode(out); + out << (int)this->getVectorSize(); + out << " {"; + for (uint32_t i = 0; i < dstNum; ++i) + out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : ""); + out << "}"; + out << " 2D surface id " << (int)this->getImageIndex() + << " byte coord x %" << this->getSrc(fn, 0) + << " row coord y %" << this->getSrc(fn, 1); + } + INLINE uint8_t getImageIndex(void) const { return this->imageIdx; } + INLINE uint8_t getVectorSize(void) const { return this->dstNum; } + + Tuple src; + Tuple dst; + uint8_t imageIdx; + uint8_t srcNum; + uint8_t dstNum; + }; + + class ALIGNED_INSTRUCTION MediaBlockWriteInstruction : + public BasePolicy, + public TupleSrcPolicy<MediaBlockWriteInstruction>, + public NDstPolicy<MediaBlockWriteInstruction, 0> + { + public: + + INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) { + this->opcode = OP_MBWRITE; + this->src = srcTuple; + this->srcNum = srcNum; + this->imageIdx = imageIdx; + this->vec_size = vec_size; + } + INLINE bool wellFormed(const Function &fn, std::string &why) const; + INLINE void out(std::ostream &out, const Function &fn) const { + this->outOpcode(out); + out << (int)this->getVectorSize() + << " 2D surface id " << 
(int)this->getImageIndex() + << " byte coord x %" << this->getSrc(fn, 0) + << " row coord y %" << this->getSrc(fn, 1); + out << " {"; + for (uint32_t i = 0; i < vec_size; ++i) + out << "%" << this->getSrc(fn, i + 2) << (i != (vec_size-1u) ? " " : ""); + out << "}"; + } + INLINE uint8_t getImageIndex(void) const { return this->imageIdx; } + INLINE uint8_t getVectorSize(void) const { return this->vec_size; } + + Tuple src; + Register dst[0]; + uint8_t imageIdx; + uint8_t srcNum; + uint8_t vec_size; + }; + #undef ALIGNED_INSTRUCTION ///////////////////////////////////////////////////////////////////////// @@ -1591,6 +1663,22 @@ namespace ir { return true; } + INLINE bool MediaBlockReadInstruction::wellFormed(const Function &fn, std::string &whyNot) const { + if (this->srcNum != 2) { + whyNot = "Wrong number of source."; + return false; + } + return true; + } + + INLINE bool MediaBlockWriteInstruction::wellFormed(const Function &fn, std::string &whyNot) const { + if (this->srcNum != 2 + this->vec_size) { + whyNot = "Wrong number of source."; + return false; + } + return true; + } + #undef CHECK_TYPE ///////////////////////////////////////////////////////////////////////// @@ -2058,6 +2146,14 @@ START_INTROSPECTION(PrintfInstruction) #include "ir/instruction.hxx" END_INTROSPECTION(PrintfInstruction) +START_INTROSPECTION(MediaBlockReadInstruction) +#include "ir/instruction.hxx" +END_INTROSPECTION(MediaBlockReadInstruction) + +START_INTROSPECTION(MediaBlockWriteInstruction) +#include "ir/instruction.hxx" +END_INTROSPECTION(MediaBlockWriteInstruction) + #undef END_INTROSPECTION #undef START_INTROSPECTION #undef DECL_INSN @@ -2205,7 +2301,8 @@ END_FUNCTION(Instruction, Register) opcode == OP_CALC_TIMESTAMP || opcode == OP_STORE_PROFILING || opcode == OP_WAIT || - opcode == OP_PRINTF; + opcode == OP_PRINTF || + opcode == OP_MBWRITE; } #define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \ @@ -2275,6 +2372,10 @@ DECL_MEM_FN(SubGroupInstruction, WorkGroupOps, 
getWorkGroupOpcode(void), getWork DECL_MEM_FN(PrintfInstruction, uint32_t, getNum(void), getNum()) DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), getBti()) DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), getType(fn, ID)) +DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void), getImageIndex()) +DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void), getVectorSize()) +DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex()) +DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), getVectorSize()) #undef DECL_MEM_FN @@ -2582,6 +2683,15 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg)) return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, num).convert(); } + Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum) { + return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum).convert(); + } + + Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) { + return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size).convert(); + } + + std::ostream &operator<< (std::ostream &out, const Instruction &insn) { const Function &fn = insn.getFunction(); const BasicBlock *bb = insn.getParent(); diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index 4e7d5b7..b2b0b49 100644 --- a/backend/src/ir/instruction.hpp +++ b/backend/src/ir/instruction.hpp @@ -635,6 +635,24 @@ namespace ir { static bool isClassOf(const Instruction &insn); }; + /*! Media Block Read. */ + class MediaBlockReadInstruction : public Instruction { + public: + /*! Return true if the given instruction is an instance of this class */ + static bool isClassOf(const Instruction &insn); + uint8_t getImageIndex() const; + uint8_t getVectorSize() const; + }; + + /*! Media Block Write. 
*/ + class MediaBlockWriteInstruction : public Instruction { + public: + /*! Return true if the given instruction is an instance of this class */ + static bool isClassOf(const Instruction &insn); + uint8_t getImageIndex() const; + uint8_t getVectorSize() const; + }; + /*! Specialize the instruction. Also performs typechecking first based on the * opcode. Crashes if it fails */ @@ -867,6 +885,10 @@ namespace ir { Instruction SUBGROUP(WorkGroupOps opcode, Register dst, Tuple srcTuple, uint8_t srcNum, Type type); /*! printf */ Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t srcNum, uint8_t bti, uint16_t num); + /*! media block read */ + Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum); + /*! media block write */ + Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size); } /* namespace ir */ } /* namespace gbe */ diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx index 57e13eb..7d755ae 100644 --- a/backend/src/ir/instruction.hxx +++ b/backend/src/ir/instruction.hxx @@ -114,3 +114,5 @@ DECL_INSN(WAIT, WaitInstruction) DECL_INSN(WORKGROUP, WorkGroupInstruction) DECL_INSN(SUBGROUP, SubGroupInstruction) DECL_INSN(PRINTF, PrintfInstruction) +DECL_INSN(MBREAD, MediaBlockReadInstruction) +DECL_INSN(MBWRITE, MediaBlockWriteInstruction) diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp index 3162d13..43d4c87 100644 --- a/backend/src/ir/liveness.cpp +++ b/backend/src/ir/liveness.cpp @@ -118,7 +118,8 @@ namespace ir { uniform = false; // do not change dst uniform for block read - if (insn.getOpcode() == ir::OP_LOAD && ir::cast<ir::LoadInstruction>(insn).isBlock()) + if ((insn.getOpcode() == ir::OP_LOAD && ir::cast<ir::LoadInstruction>(insn).isBlock()) || + insn.getOpcode() == ir::OP_MBREAD) uniform = false; for (uint32_t srcID = 0; srcID < srcNum; ++srcID) { diff --git a/backend/src/libocl/src/ocl_substore.ll 
b/backend/src/libocl/src/ocl_substore.ll index 665cdfa..f6c2c70 100644 --- a/backend/src/libocl/src/ocl_substore.ll +++ b/backend/src/libocl/src/ocl_substore.ll @@ -1,9 +1,42 @@ target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir" +%opencl.image2d_t = type opaque declare void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* nocapture, i32) nounwind alwaysinline noduplicate +declare void @__gen_ocl_sub_group_block_write_image(%opencl.image2d_t addrspace(1)*, i32, i32, i32) nounwind alwaysinline noduplicate +declare void @__gen_ocl_sub_group_block_write_image2(%opencl.image2d_t addrspace(1)*, i32, i32, <2 x i32>) nounwind alwaysinline noduplicate +declare void @__gen_ocl_sub_group_block_write_image4(%opencl.image2d_t addrspace(1)*, i32, i32, <4 x i32>) nounwind alwaysinline noduplicate +declare void @__gen_ocl_sub_group_block_write_image8(%opencl.image2d_t addrspace(1)*, i32, i32, <8 x i32>) nounwind alwaysinline noduplicate define void @_Z27intel_sub_group_block_writePKU3AS1jj(i32 addrspace(1)* %p, i32 %data) nounwind alwaysinline noduplicate { call void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* %p, i32 %data) ret void } + +define void @_Z27intel_sub_group_block_write11ocl_image2dDv2_ij(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, i32 %data) nounwind alwaysinline noduplicate { + %1 = extractelement <2 x i32> %byte_coord, i32 0 + %2 = extractelement <2 x i32> %byte_coord, i32 1 + call void @__gen_ocl_sub_group_block_write_image(%opencl.image2d_t addrspace(1)* %image, i32 %1, i32 %2, i32 %data) + ret void +} + +define void @_Z28intel_sub_group_block_write211ocl_image2dDv2_iDv2_j(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, <2 x i32> %data) nounwind alwaysinline noduplicate { + %1 = extractelement <2 x i32> %byte_coord, i32 0 + %2 = extractelement <2 x i32> %byte_coord, i32 1 + call void 
@__gen_ocl_sub_group_block_write_image2(%opencl.image2d_t addrspace(1)* %image, i32 %1, i32 %2, <2 x i32> %data) + ret void +} + +define void @_Z28intel_sub_group_block_write411ocl_image2dDv2_iDv4_j(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, <4 x i32> %data) nounwind alwaysinline noduplicate { + %1 = extractelement <2 x i32> %byte_coord, i32 0 + %2 = extractelement <2 x i32> %byte_coord, i32 1 + call void @__gen_ocl_sub_group_block_write_image4(%opencl.image2d_t addrspace(1)* %image, i32 %1, i32 %2, <4 x i32> %data) + ret void +} + +define void @_Z28intel_sub_group_block_write811ocl_image2dDv2_iDv8_j(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, <8 x i32> %data) nounwind alwaysinline noduplicate { + %1 = extractelement <2 x i32> %byte_coord, i32 0 + %2 = extractelement <2 x i32> %byte_coord, i32 1 + call void @__gen_ocl_sub_group_block_write_image8(%opencl.image2d_t addrspace(1)* %image, i32 %1, i32 %2, <8 x i32> %data) + ret void +} diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl index 66490cc..753a045 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl @@ -187,3 +187,24 @@ OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data) intel_sub_group_block_write(p + get_simd_size() * 6, data.s6); intel_sub_group_block_write(p + get_simd_size() * 7, data.s7); } + +PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, int x, int y); +PURE CONST uint2 __gen_ocl_sub_group_block_read_image2(image2d_t p, int x, int y); +PURE CONST uint4 __gen_ocl_sub_group_block_read_image4(image2d_t p, int x, int y); +PURE CONST uint8 __gen_ocl_sub_group_block_read_image8(image2d_t p, int x, int y); +OVERLOADABLE uint intel_sub_group_block_read(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_image(p, cord.x, cord.y); +} +OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t p, int2 cord) +{ + return 
__gen_ocl_sub_group_block_read_image2(p, cord.x, cord.y); +} +OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_image4(p, cord.x, cord.y); +} +OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_image8(p, cord.x, cord.y); +} diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h index d0676be..799f772 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h @@ -143,3 +143,13 @@ OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data) OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data); OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data); OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data); + +OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2 byte_coord); +OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2 byte_coord); +OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t image, int2 byte_coord); +OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t image, int2 byte_coord); + +OVERLOADABLE void intel_sub_group_block_write(image2d_t image, int2 byte_coord, uint data); +OVERLOADABLE void intel_sub_group_block_write2(image2d_t image, int2 byte_coord, uint2 data); +OVERLOADABLE void intel_sub_group_block_write4(image2d_t image, int2 byte_coord, uint4 data); +OVERLOADABLE void intel_sub_group_block_write8(image2d_t image, int2 byte_coord, uint8 data); diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index ffa838c..38c0f2b 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -699,6 +699,7 @@ namespace gbe void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode); // Emit subgroup instructions void 
emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite); + void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size); uint8_t appendSampler(CallSite::arg_iterator AI); uint8_t getImageID(CallInst &I); @@ -3744,10 +3745,12 @@ namespace gbe case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MAX: case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN: case GEN_OCL_LRP: - this->newRegister(&I); - break; case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: - this->newRegister(&I, NULL, false); + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8: + this->newRegister(&I); break; case GEN_OCL_PRINTF: this->newRegister(&I); // fall through @@ -3764,6 +3767,10 @@ namespace gbe case GEN_OCL_STORE_PROFILING: case GEN_OCL_DEBUGWAIT: case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8: break; case GEN_OCL_NOT_FOUND: default: @@ -4013,6 +4020,39 @@ namespace gbe GBE_ASSERT(AI == AE); } + void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) { + CallSite::arg_iterator AI = CS.arg_begin(); + CallSite::arg_iterator AE = CS.arg_end(); + GBE_ASSERT(AI != AE); + + const uint8_t imageID = getImageID(I); + AI++; + + if(isWrite){ + ir::Register src[2 + vec_size]; + src[0] = getRegister(*(AI++)); + src[1] = getRegister(*(AI++)); + for(int i = 0;i < vec_size; i++) + src[2 + i] = getRegister(*(AI), i); + AI++; + const ir::Tuple srctuple = ctx.arrayTuple(src, 2 + vec_size); + ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size); + } else { + ir::Register src[2]; + src[0] = getRegister(*(AI++)); + src[1] = getRegister(*(AI++)); + ir::Register dst[vec_size]; + for(int i = 0;i < vec_size; i++) + dst[i] = getRegister(&I, i); + const ir::Tuple srctuple = 
ctx.arrayTuple(src, 2); + const ir::Tuple dsttuple = ctx.arrayTuple(dst, vec_size); + ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2); + } + + GBE_ASSERT(AI == AE); + } + + /* append a new sampler. should be called before any reference to * a sampler_t value. */ uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) { @@ -4841,6 +4881,22 @@ namespace gbe this->emitBlockReadWriteMemInst(I, CS, false); break; case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: this->emitBlockReadWriteMemInst(I, CS, true); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE: + this->emitBlockReadWriteImageInst(I, CS, false, 1); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: + this->emitBlockReadWriteImageInst(I, CS, false, 2); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: + this->emitBlockReadWriteImageInst(I, CS, false, 4); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8: + this->emitBlockReadWriteImageInst(I, CS, false, 8); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE: + this->emitBlockReadWriteImageInst(I, CS, true, 1); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2: + this->emitBlockReadWriteImageInst(I, CS, true, 2); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4: + this->emitBlockReadWriteImageInst(I, CS, true, 4); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8: + this->emitBlockReadWriteImageInst(I, CS, true, 8); break; default: break; } } diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 003be91..456ab58 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -219,6 +219,14 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_in DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, __gen_ocl_sub_group_block_read_image) 
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, __gen_ocl_sub_group_block_read_image2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, __gen_ocl_sub_group_block_read_image4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE8, __gen_ocl_sub_group_block_read_image8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE, __gen_ocl_sub_group_block_write_image) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, __gen_ocl_sub_group_block_write_image2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, __gen_ocl_sub_group_block_write_image4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, __gen_ocl_sub_group_block_write_image8) // common function DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp) diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp index 53fd320..e60bf4b 100644 --- a/backend/src/llvm/llvm_scalarize.cpp +++ b/backend/src/llvm/llvm_scalarize.cpp @@ -682,7 +682,21 @@ namespace gbe { *CI = InsertToVector(call, *CI); break; } + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8: + { + ++CI; + ++CI; + if ((*CI)->getType()->isVectorTy()) + *CI = InsertToVector(call, *CI); + break; + } case GEN_OCL_VME: + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8: setAppendPoint(call); extractFromVector(call); break; -- 2.7.4 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet