From: Pan Xiuli <xiuli....@intel.com> Use up to 8 OWORDs as the read/write size for high performance. V4: Reuse tmp for oword read for small and less vector.
Signed-off-by: Pan Xiuli <xiuli....@intel.com> --- backend/src/backend/gen_context.cpp | 154 ++++++++++++++++++++++++++--- backend/src/backend/gen_encoder.cpp | 6 +- backend/src/backend/gen_insn_selection.cpp | 84 +++++++++++----- backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 41 +++----- backend/src/llvm/llvm_gen_backend.cpp | 46 ++++++--- backend/src/llvm/llvm_gen_ocl_function.hxx | 6 ++ backend/src/llvm/llvm_scalarize.cpp | 12 +++ 7 files changed, 263 insertions(+), 86 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 081033a..5303191 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -3488,11 +3488,17 @@ namespace gbe } void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) { - const GenRegister dst = ra->genReg(insn.dst(0)); + const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD); - GenRegister header = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD); + const GenRegister header = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD); + const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4); + const uint32_t vec_size = insn.extra.elem; + const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size)), GEN_TYPE_UD); + const uint32_t simdWidth = p->curr.execWidth; + // Make header p->push(); + { // Copy r0 into the header first p->curr.execWidth = 8; p->curr.predicate = GEN_PREDICATE_NONE; @@ -3501,23 +3507,81 @@ namespace gbe // Update the header with the current address p->curr.execWidth = 1; - p->SHR(GenRegister::offset(header, 0, 2*4), addr, GenRegister::immud(4)); + p->SHR(headeraddr, addr, GenRegister::immud(4)); // Put zero in the general state base address - p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0)); - + p->MOV(GenRegister::offset(header, 0, 5 * 4), 
GenRegister::immud(0)); + } p->pop(); - // Now read the data - p->OBREAD(dst, header, insn.getbti(), insn.extra.elem); + // Now read the data, oword block read can only work with simd16 and no mask + if (vec_size == 1) { + p->push(); + { + p->curr.execWidth = 16; + p->curr.noMask = 1; + p->OBREAD(dst, header, insn.getbti(), simdWidth / 4); + } + p->pop(); + } else if (vec_size == 2) { + p->push(); + { + p->curr.execWidth = 16; + p->curr.noMask = 1; + p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2); + } + p->pop(); + p->MOV(ra->genReg(insn.dst(0)), GenRegister::offset(tmp, 0)); + p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, simdWidth / 8)); + } else if (vec_size == 4 || vec_size == 8) { + if (simdWidth == 8) { + for (uint32_t i = 0; i < vec_size / 4; i++) { + if (i > 0) { + p->push(); + { + // Update the address in header + p->curr.execWidth = 1; + p->ADD(headeraddr, headeraddr, GenRegister::immud(8)); + } + p->pop(); + } + p->push(); + { + p->curr.execWidth = 16; + p->curr.noMask = 1; + p->OBREAD(tmp, header, insn.getbti(), 8); + } + p->pop(); + for (uint32_t j = 0; j < 4; j++) + p->MOV(ra->genReg(insn.dst(j + i * 4)), GenRegister::offset(tmp, j)); + } + } else { + for (uint32_t i = 0; i < vec_size / 2; i++) { + if (i > 0) { + p->push(); + { + // Update the address in header + p->curr.execWidth = 1; + p->ADD(headeraddr, headeraddr, GenRegister::immud(8)); + } + p->pop(); + } + p->OBREAD(tmp, header, insn.getbti(), 8); + for (uint32_t j = 0; j < 2; j++) + p->MOV(ra->genReg(insn.dst(j + i * 2)), GenRegister::offset(tmp, j*2)); + } + } + } else NOT_SUPPORTED; } void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) { - const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(2)), GEN_TYPE_UD); - GenRegister header; - if (simdWidth == 8) - header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD); - else - header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(0)),1), GEN_TYPE_UD); + const GenRegister 
addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD); + const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); + const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4); + const uint32_t vec_size = insn.extra.elem; + const GenRegister tmp = GenRegister::offset(header, 1); + const uint32_t simdWidth = p->curr.execWidth; + uint32_t tmp_size = simdWidth * vec_size / 8; + tmp_size = tmp_size > 4 ? 4 : tmp_size; p->push(); // Copy r0 into the header first @@ -3528,14 +3592,72 @@ namespace gbe // Update the header with the current address p->curr.execWidth = 1; - p->SHR(GenRegister::offset(header, 0, 2*4), addr, GenRegister::immud(4)); + p->SHR(headeraddr, addr, GenRegister::immud(4)); // Put zero in the general state base address p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0)); p->pop(); - // Now write the data - p->OBWRITE(header, insn.getbti(), insn.extra.elem); + // Now write the data, oword block write can only work with simd16 and no mask + if (vec_size == 1) { + p->MOV(tmp, ra->genReg(insn.src(1))); + p->push(); + { + p->curr.execWidth = 16; + p->curr.noMask = 1; + p->OBWRITE(header, insn.getbti(), simdWidth / 4); + } + p->pop(); + } else if (vec_size == 2) { + p->MOV(GenRegister::offset(tmp, 0), ra->genReg(insn.src(1))) ; + p->MOV(GenRegister::offset(tmp, simdWidth / 8), ra->genReg(insn.src(2))) ; + p->push(); + { + p->curr.execWidth = 16; + p->curr.noMask = 1; + p->OBWRITE(header, insn.getbti(), simdWidth / 2); + } + p->pop(); + } else if (vec_size == 4 || vec_size == 8) { + if (simdWidth == 8) { + for (uint32_t i = 0; i < vec_size / 4; i++) { + for (uint32_t j = 0; j < 4; j++) + p->MOV(GenRegister::offset(tmp, j), ra->genReg(insn.src(1 + j + i*4))) ; + if (i > 0) { + p->push(); + { + // Update the address in header + p->curr.execWidth = 1; + p->ADD(headeraddr, headeraddr, GenRegister::immud(8)); + } + p->pop(); + } + p->push(); + { + p->curr.execWidth = 16; + p->curr.noMask = 1; + 
p->OBWRITE(header, insn.getbti(), 8); + } + p->pop(); + } + } else { + for (uint32_t i = 0; i < vec_size / 2; i++) { + for (uint32_t j = 0; j < 2; j++) + p->MOV(GenRegister::offset(tmp, j * 2), ra->genReg(insn.src(1 + j + i*2))) ; + if (i > 0) { + p->push(); + { + // Update the address in header + p->curr.execWidth = 1; + p->ADD(headeraddr, headeraddr, GenRegister::immud(8)); + } + p->pop(); + } + p->OBWRITE(header, insn.getbti(), 8); + } + } + } else NOT_SUPPORTED; + } void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) { diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index eb9fbeb..f8c99b2 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -269,10 +269,10 @@ namespace gbe { const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA; p->setMessageDescriptor(insn, sfid, msg_length, response_length); - assert(size == 2 || size == 4); + assert(size == 2 || size == 4 || size == 8); insn->bits3.gen7_oblock_rw.msg_type = msg_type; insn->bits3.gen7_oblock_rw.bti = bti; - insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3; + insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : (size == 4 ? 
3 : 4); insn->bits3.gen7_oblock_rw.header_present = 1; } @@ -1261,7 +1261,7 @@ namespace gbe void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); const uint32_t msg_length = 1; - const uint32_t response_length = size / 2; // Size is in owords + const uint32_t response_length = size / 2; // Size is in regs this->setHeader(insn); this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 9a5567d..c566957 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -702,9 +702,9 @@ namespace gbe void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister tmpData1, GenRegister tmpData2); /*! Oblock read */ - void OBREAD(GenRegister dst, GenRegister addr, GenRegister header, uint32_t bti, uint32_t size); + void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size); /*! Oblock write */ - void OBWRITE(GenRegister addr, GenRegister value, GenRegister header, uint32_t bti, uint32_t size); + void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size); /*! Media block read */ void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size); /*! 
Media block write */ @@ -2027,38 +2027,54 @@ namespace gbe insn->src(0) = src; insn->src(1) = tmpData2; } - void Selection::Opaque::OBREAD(GenRegister dst, + void Selection::Opaque::OBREAD(GenRegister* dsts, + uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, - uint32_t size) { - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1, 2); - insn->dst(0) = dst; + GenRegister* tmp, + uint32_t tmp_size) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size + tmp_size, 2); + SelectionVector *vector = this->appendVector(); + for (uint32_t i = 0; i < vec_size; ++i) + insn->dst(i) = dsts[i]; + for (uint32_t i = 0; i < tmp_size; ++i) + insn->dst(i + vec_size) = tmp[i]; insn->src(0) = addr; insn->src(1) = header; insn->setbti(bti); - insn->extra.elem = size / sizeof(int[4]); // number of owords + insn->extra.elem = vec_size; // number of vector size + + // tmp regs for OWORD read dst + vector->regNum = tmp_size; + vector->reg = &insn->dst(vec_size); + vector->offsetID = vec_size; + vector->isSrc = 0; } void Selection::Opaque::OBWRITE(GenRegister addr, - GenRegister value, + GenRegister* values, + uint32_t vec_size, GenRegister header, uint32_t bti, - uint32_t size) { - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 3); + GenRegister* tmp, + uint32_t tmp_size) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1); SelectionVector *vector = this->appendVector(); - insn->src(0) = header; - insn->src(1) = value; - insn->src(2) = addr; - insn->state = this->curr; + insn->src(0) = addr; + for (uint32_t i = 0; i < vec_size; ++i) + insn->src(i + 1) = values[i]; + insn->dst(0) = header; + for (uint32_t i = 0; i < tmp_size; ++i) + insn->dst(i + 1) = tmp[i]; insn->setbti(bti); - insn->extra.elem = size / sizeof(int[4]); // number of owords + insn->extra.elem = vec_size; // number of vector_size - // We need to put the header and the data together - vector->regNum = 2; - 
vector->reg = &insn->src(0); + // tmp regs for OWORD read dst + vector->regNum = tmp_size + 1; + vector->reg = &insn->dst(0); vector->offsetID = 0; - vector->isSrc = 1; + vector->isSrc = 0; } void Selection::Opaque::MBREAD(GenRegister* dsts, @@ -4113,10 +4129,19 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp ir::BTI bti) const { using namespace ir; - const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); - const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32); + const uint32_t vec_size = insn.getValueNum(); const uint32_t simdWidth = sel.ctx.getSimdWidth(); - sel.OBREAD(value, address, header, bti.imm, simdWidth * sizeof(int)); + const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32); + vector<GenRegister> valuesVec; + for(uint32_t i = 0; i < vec_size; i++) + valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32)); + // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs + uint32_t tmp_size = simdWidth * vec_size / 8; + tmp_size = tmp_size > 4 ? 
4 : tmp_size; + vector<GenRegister> tmpVec; + for(uint32_t i = 0; i < tmp_size; i++) + tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32)); + sel.OBREAD(&valuesVec[0], vec_size, address, header, bti.imm, &tmpVec[0], tmp_size); } // check whether all binded table index point to constant memory @@ -4289,10 +4314,19 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp ir::BTI bti) const { using namespace ir; - const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); - const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32); + const uint32_t vec_size = insn.getValueNum(); const uint32_t simdWidth = sel.ctx.getSimdWidth(); - sel.OBWRITE(address, value, header, bti.imm, simdWidth * sizeof(int)); + const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32); + vector<GenRegister> valuesVec; + for(uint32_t i = 0; i < vec_size; i++) + valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32)); + // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs + uint32_t tmp_size = simdWidth * vec_size / 8; + tmp_size = tmp_size > 4 ? 
4 : tmp_size; + vector<GenRegister> tmpVec; + for(uint32_t i = 0; i < tmp_size; i++) + tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32)); + sel.OBWRITE(address, &valuesVec[0], vec_size, header, bti.imm, &tmpVec[0], tmp_size); } virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl index 5d3d0bb..b066502 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl @@ -134,63 +134,46 @@ RANGE_OP(scan_exclusive, max, double, true) #undef RANGE_OP PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p); +PURE CONST uint2 __gen_ocl_sub_group_block_read_mem2(const global uint* p); +PURE CONST uint4 __gen_ocl_sub_group_block_read_mem4(const global uint* p); +PURE CONST uint8 __gen_ocl_sub_group_block_read_mem8(const global uint* p); OVERLOADABLE uint intel_sub_group_block_read(const global uint* p) { return __gen_ocl_sub_group_block_read_mem(p); } OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p) { - return (uint2)(intel_sub_group_block_read(p), - intel_sub_group_block_read(p + get_simd_size())); + return __gen_ocl_sub_group_block_read_mem2(p); } OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p) { - return (uint4)(intel_sub_group_block_read(p), - intel_sub_group_block_read(p + get_simd_size()), - intel_sub_group_block_read(p + get_simd_size() * 2), - intel_sub_group_block_read(p + get_simd_size() * 3)); + return __gen_ocl_sub_group_block_read_mem4(p); } OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p) { - return (uint8)(intel_sub_group_block_read(p), - intel_sub_group_block_read(p + get_simd_size()), - intel_sub_group_block_read(p + get_simd_size() * 2), - intel_sub_group_block_read(p + get_simd_size() * 3), - intel_sub_group_block_read(p + get_simd_size() * 4), - intel_sub_group_block_read(p + get_simd_size() 
* 5), - intel_sub_group_block_read(p + get_simd_size() * 6), - intel_sub_group_block_read(p + get_simd_size() * 7)); + return __gen_ocl_sub_group_block_read_mem8(p); } - void __gen_ocl_sub_group_block_write_mem(const global uint* p, uint data); +void __gen_ocl_sub_group_block_write_mem2(const global uint* p, uint2 data); +void __gen_ocl_sub_group_block_write_mem4(const global uint* p, uint4 data); +void __gen_ocl_sub_group_block_write_mem8(const global uint* p, uint8 data); OVERLOADABLE void intel_sub_group_block_write(const global uint* p, uint data) { __gen_ocl_sub_group_block_write_mem(p, data); } OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data) { - intel_sub_group_block_write(p, data.s0); - intel_sub_group_block_write(p + get_simd_size(), data.s1); + __gen_ocl_sub_group_block_write_mem2(p, data); } OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data) { - intel_sub_group_block_write(p, data.s0); - intel_sub_group_block_write(p + get_simd_size(), data.s1); - intel_sub_group_block_write(p + get_simd_size() * 2, data.s2); - intel_sub_group_block_write(p + get_simd_size() * 3, data.s3); + __gen_ocl_sub_group_block_write_mem4(p, data); } OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data) { - intel_sub_group_block_write(p, data.s0); - intel_sub_group_block_write(p + get_simd_size(), data.s1); - intel_sub_group_block_write(p + get_simd_size() * 2, data.s2); - intel_sub_group_block_write(p + get_simd_size() * 3, data.s3); - intel_sub_group_block_write(p + get_simd_size() * 4, data.s4); - intel_sub_group_block_write(p + get_simd_size() * 5, data.s5); - intel_sub_group_block_write(p + get_simd_size() * 6, data.s6); - intel_sub_group_block_write(p + get_simd_size() * 7, data.s7); + __gen_ocl_sub_group_block_write_mem8(p, data); } PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, int x, int y); diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp index 419f585..074391f 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -698,7 +698,7 @@ namespace gbe // Emit subgroup instructions void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode); // Emit subgroup instructions - void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite); + void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size); void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size); uint8_t appendSampler(CallSite::arg_iterator AI); @@ -3726,6 +3726,9 @@ namespace gbe case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN: case GEN_OCL_LRP: case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8: case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE: case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: @@ -3747,6 +3750,9 @@ namespace gbe case GEN_OCL_STORE_PROFILING: case GEN_OCL_DEBUGWAIT: case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8: case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE: case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2: case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4: @@ -3945,13 +3951,12 @@ namespace gbe GBE_ASSERT(AI == AE); } - void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite) { + void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) { CallSite::arg_iterator AI = CS.arg_begin(); CallSite::arg_iterator AE = CS.arg_end(); GBE_ASSERT(AI != AE); Value *llvmPtr = *(AI++); - Value *llvmValues; ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace()); GBE_ASSERT(addrSpace == ir::MEM_GLOBAL); ir::Register pointer = 
this->getRegister(llvmPtr); @@ -3986,15 +3991,18 @@ namespace gbe GBE_ASSERT(AM != ir::AM_DynamicBti); if(isWrite){ - llvmValues = *(AI++); - const ir::Register values = getRegister(llvmValues); - const ir::Tuple tuple = ctx.arrayTuple(&values, 1); - ctx.STORE(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true); + Value *llvmValues = *(AI++); + vector<ir::Register> srcTupleData; + for(int i = 0;i < vec_size; i++) + srcTupleData.push_back(getRegister(llvmValues, i)); + const ir::Tuple tuple = ctx.arrayTuple(&srcTupleData[0], vec_size); + ctx.STORE(type, tuple, ptr, addrSpace, vec_size, true, AM, SurfaceIndex, true); } else { - llvmValues = &I; - const ir::Register values = getRegister(llvmValues); - const ir::Tuple tuple = ctx.arrayTuple(&values, 1); - ctx.LOAD(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true); + vector<ir::Register> dstTupleData; + for(int i = 0;i < vec_size; i++) + dstTupleData.push_back(getRegister(&I, i)); + const ir::Tuple tuple = ctx.arrayTuple(&dstTupleData[0], vec_size); + ctx.LOAD(type, tuple, ptr, addrSpace, vec_size, true, AM, SurfaceIndex, true); } GBE_ASSERT(AI == AE); @@ -4858,9 +4866,21 @@ namespace gbe break; } case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: - this->emitBlockReadWriteMemInst(I, CS, false); break; + this->emitBlockReadWriteMemInst(I, CS, false, 1); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2: + this->emitBlockReadWriteMemInst(I, CS, false, 2); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4: + this->emitBlockReadWriteMemInst(I, CS, false, 4); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8: + this->emitBlockReadWriteMemInst(I, CS, false, 8); break; case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: - this->emitBlockReadWriteMemInst(I, CS, true); break; + this->emitBlockReadWriteMemInst(I, CS, true, 1); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2: + this->emitBlockReadWriteMemInst(I, CS, true, 2); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4: + this->emitBlockReadWriteMemInst(I, CS, true, 4); break; + 
case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8: + this->emitBlockReadWriteMemInst(I, CS, true, 8); break; case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE: this->emitBlockReadWriteImageInst(I, CS, false, 1); break; case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 456ab58..48a72d1 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -218,7 +218,13 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_in DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM2, __gen_ocl_sub_group_block_read_mem2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM4, __gen_ocl_sub_group_block_read_mem4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM8, __gen_ocl_sub_group_block_read_mem8) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM2, __gen_ocl_sub_group_block_write_mem2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM4, __gen_ocl_sub_group_block_write_mem4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM8, __gen_ocl_sub_group_block_write_mem8) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, __gen_ocl_sub_group_block_read_image) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, __gen_ocl_sub_group_block_read_image2) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, __gen_ocl_sub_group_block_read_image4) diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp index e60bf4b..615fb50 100644 --- a/backend/src/llvm/llvm_scalarize.cpp +++ b/backend/src/llvm/llvm_scalarize.cpp @@ -693,7 +693,19 @@ namespace gbe { *CI = InsertToVector(call, *CI); break; } + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: + case 
GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8: + { + if ((*CI)->getType()->isVectorTy()) + *CI = InsertToVector(call, *CI); + break; + } case GEN_OCL_VME: + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8: case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8: -- 2.7.4 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet