LGTM, thanks. -----Original Message----- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli Pan Sent: Monday, December 19, 2016 3:58 PM To: beignet@lists.freedesktop.org Cc: Pan, Xiuli Subject: [Beignet] [PATCH V4] Backend: Refine block read/write instruction selection
From: Pan Xiuli <xiuli....@intel.com> Move the block pack/unpack into instruction selection so that it can benefit from optimization. Also rename some variables to avoid misleading names, and add some new functions to the GenEncoder class. V2: Use ud8grf instead of f8grf to save a retype. V3: Merge the rename patch and fix some comments. V4: Fix some SIMD8-related bugs and comment typos. Signed-off-by: Pan Xiuli <xiuli....@intel.com> --- backend/src/backend/gen8_encoder.cpp | 40 ++- backend/src/backend/gen_context.cpp | 459 ++--------------------------- backend/src/backend/gen_encoder.cpp | 105 ++++--- backend/src/backend/gen_encoder.hpp | 18 +- backend/src/backend/gen_insn_selection.cpp | 448 +++++++++++++++++++++------- 5 files changed, 440 insertions(+), 630 deletions(-) diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp index 8f73346..39dcfd3 100644 --- a/backend/src/backend/gen8_encoder.cpp +++ b/backend/src/backend/gen8_encoder.cpp @@ -840,20 +840,15 @@ namespace gbe gen8_insn->bits3.gen8_block_rw_a64.header_present = 1; } - void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { - GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); + void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size) { + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); const uint32_t msg_length = 1; - uint32_t rsize = size / 2; - uint32_t msgsize = size; - // When size is 1 OWord, which means half a reg, we need to know which half to use - if (size == 1) { - if (dst.subnr == 0) - msgsize = 0; - else - msgsize = 1; - } - rsize = rsize == 0 ? 1 : rsize; - const uint32_t response_length = rsize; // Size is in regs + uint32_t sizeinreg = ow_size / 2; + // half reg should also have size 1 + sizeinreg = sizeinreg == 0 ? 
1 : sizeinreg; + const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0); + const uint32_t response_length = sizeinreg; // Size is in reg + this->setHeader(insn); this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); @@ -861,21 +856,22 @@ namespace gbe setOBlockRWA64(this, insn, bti, - msgsize, + block_size, GEN8_P1_BLOCK_READ_A64, msg_length, response_length); } - void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t size) { - GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); - uint32_t rsize = size / 2; - rsize = rsize == 0 ? 1 : rsize; - const uint32_t msg_length = 1 + rsize; // Size is in owords + void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size) { + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); + uint32_t sizeinreg = ow_size / 2; + // half reg should also have size 1 + sizeinreg = sizeinreg == 0 ? 1 : sizeinreg; + const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header const uint32_t response_length = 0; - uint32_t msgsize = size; - msgsize = msgsize == 1 ? 
0 : msgsize; + const uint32_t block_size = getOBlockSize(ow_size); + this->setHeader(insn); this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); this->setSrc1(insn, GenRegister::immud(0)); @@ -883,7 +879,7 @@ namespace gbe setOBlockRWA64(this, insn, bti, - msgsize, + block_size, GEN8_P1_BLOCK_WRITE_A64, msg_length, response_length); diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 8288fa5..791e607 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -3551,458 +3551,39 @@ namespace gbe } void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) { - const GenRegister dst= ra->genReg(insn.dst(1)); - const GenRegister addrreg = ra->genReg(insn.src(0)); - uint32_t type = dst.type; - uint32_t typesize = typeSize(type); - const uint32_t vec_size = insn.extra.elem; - const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type); - const uint32_t simdWidth = p->curr.execWidth; - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); - const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type); - GenRegister headeraddr; - bool isA64 = insn.getbti() == 255; + const GenRegister header = ra->genReg(insn.src(0)); + const GenRegister tmp = ra->genReg(insn.dst(0)); + const uint32_t bti = insn.getbti(); + const uint32_t ow_size = insn.extra.elem; + bool isA64 = bti == 255; if (isA64) - headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL); + p->OBREADA64(tmp, header, bti, ow_size); else - headeraddr = GenRegister::offset(header, 0, 2*4); - - // Make header - p->push(); - { - // Copy r0 into the header first - p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->MOV(header, GenRegister::ud8grf(0, 0)); - - // Update the header with the current address - p->curr.execWidth = 1; - p->MOV(headeraddr, addr); - - // Put zero in the general state base address 
- if (!isA64) - p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0)); - - } - p->pop(); - // Now read the data, oword block read can only work with simd16 and no mask - if (vec_size == 1) { - p->push(); - { - p->curr.execWidth = 16; - p->curr.noMask = 1; - if (isA64) { - //p->curr.execWidth = 8; - p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16); - } - else - p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16); - } - p->pop(); - } else if (vec_size == 2) { - p->push(); - { - p->curr.execWidth = 16; - p->curr.noMask = 1; - if (isA64) - p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8); - else - p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8); - } - p->pop(); - p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0)); - p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * typesize )); - } else if (vec_size == 4) { - if (simdWidth == 8) { - p->push(); - { - p->curr.execWidth = 16; - p->curr.noMask = 1; - if (isA64) - p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize); - else - p->OBREAD(tmp, header, insn.getbti(), 2 * typesize); - } - p->pop(); - for (uint32_t j = 0; j < 4; j++) - p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * simdWidth * typesize )); - } else { - for (uint32_t i = 0; i < typesize / 2; i++) { - if (i > 0) { - p->push(); - { - // Update the address in header - p->curr.execWidth = 1; - p->ADD(headeraddr, headeraddr, GenRegister::immud(128)); - } - p->pop(); - } - if (isA64) - p->OBREADA64(tmp, header, insn.getbti(), 8); - else - p->OBREAD(tmp, header, insn.getbti(), 8); - for (uint32_t j = 0; j < 8 / typesize ; j++) - p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize )); - } - } - } else if (vec_size == 8) { - if (simdWidth == 8) { - for (uint32_t i = 0; i < typesize / 2; i++) { - if (i > 0) { - p->push(); - { - // Update the address in header - p->curr.execWidth = 1; - 
p->ADD(headeraddr, headeraddr, GenRegister::immud(128)); - } - p->pop(); - } - p->push(); - { - p->curr.execWidth = 16; - p->curr.noMask = 1; - if (isA64) - p->OBREADA64(tmp, header, insn.getbti(), 8); - else - p->OBREAD(tmp, header, insn.getbti(), 8); - } - p->pop(); - for (uint32_t j = 0; j < 16 / typesize; j++) - p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, 0, j * simdWidth * typesize )); - } - } else { - for (uint32_t i = 0; i < typesize ; i++) { - if (i > 0) { - p->push(); - { - // Update the address in header - p->curr.execWidth = 1; - p->ADD(headeraddr, headeraddr, GenRegister::immud(128)); - } - p->pop(); - } - if (isA64) - p->OBREADA64(tmp, header, insn.getbti(), 8); - else - p->OBREAD(tmp, header, insn.getbti(), 8); - for (uint32_t j = 0; j < 8 / typesize; j++) - p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize )); - } - } - } else NOT_SUPPORTED; + p->OBREAD(tmp, header, bti, ow_size); } void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) { - const GenRegister addrreg = ra->genReg(insn.src(0)); - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); - uint32_t type = ra->genReg(insn.src(1)).type; - uint32_t typesize = typeSize(type); - const uint32_t vec_size = insn.extra.elem; - const GenRegister tmp = GenRegister::offset(header, 1); - const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type); - GenRegister headeraddr; - bool isA64 = insn.getbti() == 255; + const GenRegister header = ra->genReg(insn.src(0)); + const uint32_t bti = insn.getbti(); + const uint32_t ow_size = insn.extra.elem; + bool isA64 = bti == 255; if (isA64) - headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL); + p->OBWRITEA64(header, bti, ow_size); else - headeraddr = GenRegister::offset(header, 0, 2*4); - const uint32_t simdWidth = p->curr.execWidth; - uint32_t tmp_size = simdWidth * vec_size / 8; - tmp_size 
= tmp_size > 4 ? 4 : tmp_size; - uint32_t offset_size = isA64 ? 128 : 8; - - p->push(); - // Copy r0 into the header first - p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->MOV(header, GenRegister::ud8grf(0,0)); - - // Update the header with the current address - p->curr.execWidth = 1; - if (isA64) - p->MOV(headeraddr, addr); - else - p->SHR(headeraddr, addr, GenRegister::immud(4)); - - // Put zero in the general state base address - if (!isA64) - p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0)); - - p->pop(); - // Now write the data, oword block write can only work with simd16 and no mask - if (vec_size == 1) { - p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1))); - p->push(); - { - p->curr.execWidth = 16; - p->curr.noMask = 1; - if (isA64) - p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16); - else - p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16); - } - p->pop(); - } else if (vec_size == 2) { - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra->genReg(insn.src(1))); - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth * typesize), type), ra->genReg(insn.src(2))); - p->push(); - { - p->curr.execWidth = 16; - p->curr.noMask = 1; - if (isA64) - p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8); - else - p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8); - } - p->pop(); - } else if (vec_size == 4) { - if (simdWidth == 8) { - for (uint32_t i = 0; i < 4; i++) - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * simdWidth * typesize), type), ra->genReg(insn.src(1 + i))); - p->push(); - { - p->curr.execWidth = 16; - p->curr.noMask = 1; - if (isA64) - p->OBWRITEA64(header, insn.getbti(), 2 * typesize); - else - p->OBWRITE(header, insn.getbti(), 2 * typesize); - } - p->pop(); - } else { - for (uint32_t i = 0; i < typesize / 2; i++) { - for (uint32_t j = 0; j < 8 / typesize; j++) - 
p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize))); - if (i > 0) { - p->push(); - { - // Update the address in header - p->curr.execWidth = 1; - p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size)); - } - p->pop(); - } - if (isA64) - p->OBWRITEA64(header, insn.getbti(), 8); - else - p->OBWRITE(header, insn.getbti(), 8); - } - } - } else if (vec_size == 8) { - if (simdWidth == 8) { - for (uint32_t i = 0; i < typesize / 2; i++) { - for (uint32_t j = 0; j < 16 / typesize; j++) - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize))); - if (i > 0) { - p->push(); - { - // Update the address in header - p->curr.execWidth = 1; - p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size)); - } - p->pop(); - } - p->push(); - { - p->curr.execWidth = 16; - p->curr.noMask = 1; - if (isA64) - p->OBWRITEA64(header, insn.getbti(), 8); - else - p->OBWRITE(header, insn.getbti(), 8); - } - p->pop(); - } - } else { - for (uint32_t i = 0; i < typesize; i++) { - for (uint32_t j = 0; j < 8 / typesize; j++) - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize))); - if (i > 0) { - p->push(); - { - // Update the address in header - p->curr.execWidth = 1; - p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size)); - } - p->pop(); - } - if (isA64) - p->OBWRITEA64(header, insn.getbti(), 8); - else - p->OBWRITE(header, insn.getbti(), 8); - } - } - } else NOT_SUPPORTED; - + p->OBWRITE(header, bti, ow_size); } void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) { - const GenRegister dst = ra->genReg(insn.dst(1)); - const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D); - const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D); - const 
GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); - const GenRegister offsetx = GenRegister::offset(header, 0, 0*4); - const GenRegister offsety = GenRegister::offset(header, 0, 1*4); - const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4); - size_t vec_size = insn.extra.elem; - uint32_t type = dst.type; - uint32_t typesize = typeSize(type); - uint32_t block_width = typesize * simdWidth; - uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16; - - if (simdWidth == 8) - { - p->push(); - // Copy r0 into the header first - p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->MOV(header, GenRegister::ud8grf(0,0)); - - // Update the header with the coord - p->curr.execWidth = 1; - p->MOV(offsetx, coordx); - p->MOV(offsety, coordy); - // Update block width and height - p->MOV(blocksizereg, GenRegister::immud(blocksize)); - p->curr.execWidth = 8; - // ushort in simd8 will have half reg, but response lenght is still 1 - uint32_t rsize = vec_size * typesize / 4; - rsize = rsize ? 
rsize : 1; - // Now read the data - p->MBREAD(dst, header, insn.getbti(), rsize); - p->pop(); - - } - else if (simdWidth == 16) - { - const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD); - p->push(); - // Copy r0 into the header first - p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->MOV(header, GenRegister::ud8grf(0,0)); - - // First half - // Update the header with the coord - p->curr.execWidth = 1; - p->MOV(offsetx, coordx); - p->MOV(offsety, coordy); - // Update block width and height - p->MOV(blocksizereg, GenRegister::immud(blocksize)); - // Now read the data - p->curr.execWidth = 8; - p->MBREAD(tmp, header, insn.getbti(), vec_size); - for (uint32_t i = 0; i < vec_size; i++) - p->MOV(GenRegister::retype(ra->genReg(insn.dst(i + 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i)); - - if (typesize == 4) - { - // Second half - // Update the header with the coord - p->curr.execWidth = 1; - p->ADD(offsetx, offsetx, GenRegister::immud(32)); - - // Now read the data - p->curr.execWidth = 8; - p->MBREAD(tmp, header, insn.getbti(), vec_size); - - // Move the reg to fit vector rule. 
- for (uint32_t i = 0; i < vec_size; i++) - p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1), - GenRegister::offset(tmp, i)); - } - p->pop(); - } else NOT_IMPLEMENTED; + const GenRegister dst = ra->genReg(insn.dst(0)); + const GenRegister header = ra->genReg(insn.src(0)); + const size_t response_size = insn.extra.elem; + p->MBREAD(dst, header, insn.getbti(), response_size); } void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) { - const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D); - const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D); - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); - const GenRegister tmp = GenRegister::offset(header, 1); - GenRegister offsetx, offsety, blocksizereg; - size_t vec_size = insn.extra.elem; - uint32_t type = ra->genReg(insn.src(2)).type; - uint32_t typesize = typeSize(type); - uint32_t block_width = typesize * simdWidth; - uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16; - - offsetx = GenRegister::offset(header, 0, 0*4); - offsety = GenRegister::offset(header, 0, 1*4); - blocksizereg = GenRegister::offset(header, 0, 2*4); - - if (simdWidth == 8) - { - p->push(); - // Copy r0 into the header first - p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->MOV(header, GenRegister::ud8grf(0,0)); - - // Update the header with the coord - p->curr.execWidth = 1; - p->MOV(offsetx, coordx); - p->MOV(offsety, coordy); - // Update block width and height - p->MOV(blocksizereg, GenRegister::immud(blocksize)); - p->curr.execWidth = 8; - // Mov what we need into msgs - for(uint32_t i = 0; i < vec_size; i++) - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * block_width), type), - ra->genReg(insn.src(2 + i))); - // ushort in simd8 will have half reg, but reponse lenght is still 1 - uint32_t rsize = vec_size * typesize / 4; - rsize = rsize ? 
rsize : 1; - // Now read the data - p->MBWRITE(header, insn.getbti(), rsize); - p->pop(); - - } - else - { - p->push(); - // Copy r0 into the header first - p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->MOV(header, GenRegister::ud8grf(0,0)); - - // First half - // Update the header with the coord - p->curr.execWidth = 1; - p->MOV(offsetx, coordx); - p->MOV(offsety, coordy); - // Update block width and height - p->MOV(blocksizereg, GenRegister::immud(blocksize)); - // Now read the data - p->curr.execWidth = 8; - // Mov what we need into msgs - for(uint32_t i = 0; i < vec_size; i++) - p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra->genReg(insn.src(2 + i)), GEN_TYPE_UD)); - p->MBWRITE(header, insn.getbti(), vec_size); - - if (typesize == 4) - { - // Second half - // Update the header with the coord - p->curr.execWidth = 1; - p->ADD(offsetx, offsetx, GenRegister::immud(32)); - - p->curr.execWidth = 8; - // Mov what we need into msgs - for(uint32_t i = 0; i < vec_size; i++) - p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1)); - // Now write the data - p->MBWRITE(header, insn.getbti(), vec_size); - } - - p->pop(); - } + const GenRegister header = ra->genReg(insn.dst(0)); + const size_t data_size = insn.extra.elem; + p->MBWRITE(header, insn.getbti(), data_size); } BVAR(OCL_OUTPUT_REG_ALLOC, false); diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index 49d93e8..1bca668 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -257,32 +257,47 @@ namespace gbe NOT_SUPPORTED; } - static void setOBlockRW(GenEncoder *p, - GenNativeInstruction *insn, - uint32_t bti, - uint32_t size, - uint32_t msg_type, - uint32_t msg_length, - uint32_t response_length) + void GenEncoder::setOBlockRW(GenNativeInstruction *insn, + uint32_t bti, + uint32_t block_size, + uint32_t msg_type, + uint32_t msg_length, + uint32_t 
response_length) { const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA; - p->setMessageDescriptor(insn, sfid, msg_length, response_length); - assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8); + setMessageDescriptor(insn, sfid, msg_length, response_length); insn->bits3.gen7_oblock_rw.msg_type = msg_type; insn->bits3.gen7_oblock_rw.bti = bti; - insn->bits3.gen7_oblock_rw.block_size = size <= 2 ? size : (size == 4 ? 3 : 4); + insn->bits3.gen7_oblock_rw.block_size = block_size; insn->bits3.gen7_oblock_rw.header_present = 1; } - static void setMBlockRW(GenEncoder *p, - GenNativeInstruction *insn, - uint32_t bti, - uint32_t msg_type, - uint32_t msg_length, - uint32_t response_length) + uint32_t GenEncoder::getOBlockSize(uint32_t oword_size, bool low_half) + { + /* 000: 1 OWord, read into or written from the low 128 bits of the destination register. + * 001: 1 OWord, read into or written from the high 128 bits of the destination register. + * 010: 2 OWords + * 011: 4 OWords + * 100: 8 OWords */ + switch(oword_size) + { + case 1: return low_half ? 
0 : 1; + case 2: return 2; + case 4: return 3; + case 8: return 4; + default: NOT_SUPPORTED; + } + return 0; + } + + void GenEncoder::setMBlockRW(GenNativeInstruction *insn, + uint32_t bti, + uint32_t msg_type, + uint32_t msg_length, + uint32_t response_length) { const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA; - p->setMessageDescriptor(insn, sfid, msg_length, response_length); + setMessageDescriptor(insn, sfid, msg_length, response_length); insn->bits3.gen7_mblock_rw.msg_type = msg_type; insn->bits3.gen7_mblock_rw.bti = bti; insn->bits3.gen7_mblock_rw.header_present = 1; @@ -1312,80 +1327,72 @@ namespace gbe setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num); } - void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { + void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); const uint32_t msg_length = 1; - uint32_t rsize = size / 2; - uint32_t msgsize = size; - // When size is 1 OWord, which means half a reg, we need to know which half to use - if (size == 1) { - if (dst.subnr == 0) - msgsize = 0; - else - msgsize = 1; - } - rsize = rsize == 0 ? 1 : rsize; - const uint32_t response_length = rsize; // Size is in regs + uint32_t sizeinreg = ow_size / 2; + // half reg should also have size 1 + sizeinreg = sizeinreg == 0 ? 
1 : sizeinreg; + const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0); + const uint32_t response_length = sizeinreg; // Size is in reg + this->setHeader(insn); this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); this->setSrc1(insn, GenRegister::immud(0)); - setOBlockRW(this, - insn, + setOBlockRW(insn, bti, - msgsize, + block_size, GEN7_UNALIGNED_OBLOCK_READ, msg_length, response_length); } - void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) { + void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t ow_size) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); - uint32_t rsize = size / 2; - rsize = rsize == 0 ? 1 : rsize; - const uint32_t msg_length = 1 + rsize; // Size is in owords + uint32_t sizeinreg = ow_size / 2; + // half reg should also have size 1 + sizeinreg = sizeinreg == 0 ? 1 : sizeinreg; + const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header const uint32_t response_length = 0; - uint32_t msgsize = size; - msgsize = msgsize == 1 ? 
0 : msgsize; + const uint32_t block_size = getOBlockSize(ow_size); + this->setHeader(insn); this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); this->setSrc1(insn, GenRegister::immud(0)); this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); - setOBlockRW(this, - insn, + setOBlockRW(insn, bti, - msgsize, + block_size, GEN7_OBLOCK_WRITE, msg_length, response_length); } - void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { + void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t response_size) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); const uint32_t msg_length = 1; - const uint32_t response_length = size; // Size of registers + const uint32_t response_length = response_size; // Size of registers this->setHeader(insn); this->setDst(insn, GenRegister::ud8grf(dst.nr, 0)); this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); this->setSrc1(insn, GenRegister::immud(0)); - setMBlockRW(this, - insn, + setMBlockRW(insn, bti, GEN75_P1_MEDIA_BREAD, msg_length, response_length); } - void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) { + void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t data_size) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); - const uint32_t msg_length = 1 + size; + const uint32_t msg_length = 1 + data_size; const uint32_t response_length = 0; // Size of registers this->setHeader(insn); this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); this->setSrc1(insn, GenRegister::immud(0)); - setMBlockRW(this, - insn, + setMBlockRW(insn, bti, GEN75_P1_MEDIA_TYPED_BWRITE, msg_length, diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index e5eb2e2..46ec53b 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -286,18 +286,24 @@ namespace gbe virtual bool 
canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null()); virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null()); + + /*! OBlock helper function */ + uint32_t getOBlockSize(uint32_t oword_size, bool low_half = true); + void setMBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t msg_type, uint32_t msg_length, uint32_t response_length); + void setOBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t block_size, uint32_t msg_type, uint32_t msg_length, uint32_t response_lengtha); + /*! OBlock read */ - void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize); + void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size); /*! OBlock write */ - void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize); + void OBWRITE(GenRegister header, uint32_t bti, uint32_t ow_size); /*! MBlock read */ - virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize); + virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t response_size); /*! MBlock write */ - virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize); + virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t data_size); /*! A64 OBlock read */ - virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize); + virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size); /*! 
A64 OBlock write */ - virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize); + virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size); GBE_CLASS(GenEncoder); //!< Use custom allocators virtual void alu3(uint32_t opcode, GenRegister dst, diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 1cd6137..223c384 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -759,13 +759,13 @@ namespace gbe void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister tmpData1, GenRegister tmpData2); /*! Oblock read */ - void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size); + void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t ow_size); /*! Oblock write */ - void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size); + void OBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t ow_size); /*! Media block read */ - void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size); + void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t response_size); /*! Media block write */ - void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size); + void MBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t data_size); /* common functions for both binary instruction and sel_cmp and compare instruction. 
It will handle the IMM or normal register assignment, and will try to avoid LOADI @@ -2267,118 +2267,84 @@ namespace gbe } void Selection::Opaque::OBREAD(GenRegister* dsts, uint32_t vec_size, - GenRegister addr, GenRegister header, uint32_t bti, - GenRegister* tmp, - uint32_t tmp_size) { - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + vec_size + tmp_size, 1); + uint32_t ow_size) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size, 1); SelectionVector *vector = this->appendVector(); - insn->dst(0) = header; + insn->src(0) = header; for (uint32_t i = 0; i < vec_size; ++i) - insn->dst(1 + i) = dsts[i]; - for (uint32_t i = 0; i < tmp_size; ++i) - insn->dst(1 + i + vec_size) = tmp[i]; - insn->src(0) = addr; + insn->dst(i) = dsts[i]; insn->setbti(bti); - insn->extra.elem = vec_size; // number of vector size + insn->extra.elem = ow_size; // number of OWord size // tmp regs for OWORD read dst - vector->regNum = tmp_size; - vector->reg = &insn->dst(1 + vec_size); - vector->offsetID = 1 + vec_size; + vector->regNum = vec_size; + vector->reg = &insn->dst(0); + vector->offsetID = 0; vector->isSrc = 0; } - void Selection::Opaque::OBWRITE(GenRegister addr, + void Selection::Opaque::OBWRITE(GenRegister header, GenRegister* values, uint32_t vec_size, - GenRegister header, uint32_t bti, - GenRegister* tmp, - uint32_t tmp_size) { - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1); + uint32_t ow_size) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, vec_size + 1); SelectionVector *vector = this->appendVector(); - insn->src(0) = addr; + insn->src(0) = header; for (uint32_t i = 0; i < vec_size; ++i) insn->src(i + 1) = values[i]; - insn->dst(0) = header; - for (uint32_t i = 0; i < tmp_size; ++i) - insn->dst(i + 1) = tmp[i]; insn->setbti(bti); - insn->extra.elem = vec_size; // number of vector_size + insn->extra.elem = ow_size; // number of OWord_size - // tmp regs for OWORD read dst - 
vector->regNum = tmp_size + 1; - vector->reg = &insn->dst(0); + // tmp regs for OWORD write header and values + vector->regNum = vec_size + 1; + vector->reg = &insn->src(0); vector->offsetID = 0; - vector->isSrc = 0; + vector->isSrc = 1; + } void Selection::Opaque::MBREAD(GenRegister* dsts, - GenRegister coordx, - GenRegister coordy, + uint32_t tmp_size, GenRegister header, - GenRegister* tmp, uint32_t bti, - uint32_t vec_size) { - - uint32_t simdWidth = curr.execWidth; - SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2); - insn->dst(0) = header; - for (uint32_t i = 0; i < vec_size; ++i) { - insn->dst(i + 1) = dsts[i]; - if(simdWidth == 16) - insn->dst(i + vec_size + 1) = tmp[i]; - } - insn->src(0) = coordx; - insn->src(1) = coordy; + uint32_t response_size) { + + SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, tmp_size, 1); + insn->src(0) = header; insn->setbti(bti); - insn->extra.elem = vec_size; // vector size + insn->extra.elem = response_size; // send response length - // Only in simd 8 the data is in vector form - if(simdWidth == 8) { - SelectionVector *vector = this->appendVector(); - vector->regNum = vec_size; - vector->reg = &insn->dst(1); - vector->offsetID = 1; - vector->isSrc = 0; - } - if(simdWidth == 16) - { - SelectionVector *vectortmp = this->appendVector(); - vectortmp->regNum = vec_size; - vectortmp->reg = &insn->dst(vec_size + 1); - vectortmp->offsetID = vec_size + 1; - vectortmp->isSrc = 0; + for (uint32_t i = 0; i < tmp_size; ++i) { + insn->dst(i) = dsts[i]; } + SelectionVector *vector = this->appendVector(); + vector->regNum = tmp_size; + vector->reg = &insn->dst(0); + vector->offsetID = 0; + vector->isSrc = 0; } - void Selection::Opaque::MBWRITE(GenRegister coordx, - GenRegister coordy, + void Selection::Opaque::MBWRITE(GenRegister header, GenRegister* values, - GenRegister header, - GenRegister* tmp, + uint32_t tmp_size, uint32_t bti, - uint32_t vec_size) { - SelectionInstruction *insn 
= this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size); + uint32_t data_size) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 + tmp_size); SelectionVector *vector = this->appendVector(); - insn->src(0) = coordx; - insn->src(1) = coordy; - for (uint32_t i = 0; i < vec_size; ++i) - insn->src(2 + i) = values[i]; - insn->dst(0) = header; - for (uint32_t i = 0; i < vec_size; ++i) - insn->dst(1 + i) = tmp[i]; - insn->state = this->curr; + insn->src(0) = header; + for (uint32_t i = 0; i < tmp_size; ++i) + insn->src(1 + i) = values[i]; insn->setbti(bti); - insn->extra.elem = vec_size; // vector size + insn->extra.elem = data_size; // msg data part size // We need to put the header and the data together - vector->regNum = 1 + vec_size; - vector->reg = &insn->dst(0); + vector->regNum = 1 + tmp_size; + vector->reg = &insn->src(0); vector->offsetID = 0; - vector->isSrc = 0; + vector->isSrc = 1; } // Boiler plate to initialize the selection library at c++ pre-main @@ -4715,18 +4681,79 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp const uint32_t simdWidth = sel.ctx.getSimdWidth(); const Type type = insn.getValueType(); const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; - const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); + const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW; + const RegisterFamily family = getFamily(type); + bool isA64 = SI == 255; + + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG)); vector<GenRegister> valuesVec; + vector<GenRegister> tmpVec; for(uint32_t i = 0; i < vec_size; i++) valuesVec.push_back(sel.selReg(insn.getValue(i), type)); - // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs - uint32_t tmp_size = simdWidth * vec_size * typeSize / 32; - tmp_size = tmp_size == 0 ? 1 : tmp_size; - tmp_size = tmp_size > 4 ? 
4 : tmp_size; - vector<GenRegister> tmpVec; + + GenRegister headeraddr; + if (isA64) + headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL); + else + headeraddr = sel.getOffsetReg(header, 0, 2 * 4); + // Make header + sel.push(); + { + // Copy r0 into the header first + sel.curr.execWidth = 8; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + sel.MOV(header, GenRegister::ud8grf(0, 0)); + + // Update the header with the current address + sel.curr.execWidth = 1; + + // Put zero in the general state base address + if (isA64) + sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL)); + else { + sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD)); + sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0)); + } + } + sel.pop(); + + /* For block read we need to unpack the block data into values, and for different + * simdwidth and vector size with different type size, we may need to split the + * block read send message. + * We can only get a send message with 5 reg length + * so for different combination we have different message length and tmp vector size + * | simd8 | simd16 | simd8 | simd16 + * r0 |header | | | | + * r1 |data | w0,w1 | w0 | dw0 | dw0 + * r2 |data | w2,w3 | w1 | dw1 | dw0 + * r3 |data | ...... | ...... | ...... | dw1 + * r4 |data | ....... | ...... | ...... | dw1 + */ + + uint32_t totalSize = simdWidth * typeSize * vec_size; + uint32_t valueSize = simdWidth * typeSize; + uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size; + uint32_t msg_num = vec_size / tmp_size; + uint32_t ow_size = msg_num > 1 ? 
8 : (totalSize / 16); + for(uint32_t i = 0; i < tmp_size; i++) - tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD)); - sel.OBREAD(&valuesVec[0], vec_size, address, header, SI, &tmpVec[0], tmp_size); + tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType)); + for (uint32_t i = 0; i < msg_num; i++) { + if (i > 0) { + sel.push(); + { + // Update the address in header + sel.curr.execWidth = 1; + sel.ADD(headeraddr, headeraddr, GenRegister::immud(128)); + } + sel.pop(); + } + sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size); + for (uint32_t j = 0; j < tmp_size; j++) + sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]); + } + } // check whether all binded table index point to constant memory @@ -5161,18 +5188,87 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp const uint32_t simdWidth = sel.ctx.getSimdWidth(); const Type type = insn.getValueType(); const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; - const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); + const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW; + const RegisterFamily family = getFamily(type); + bool isA64 = SI == 255; + + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG)); vector<GenRegister> valuesVec; + vector<GenRegister> tmpVec; for(uint32_t i = 0; i < vec_size; i++) valuesVec.push_back(sel.selReg(insn.getValue(i), type)); - // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs - uint32_t tmp_size = simdWidth * vec_size * typeSize / 32; - tmp_size = tmp_size == 0 ? 1 : tmp_size; - tmp_size = tmp_size > 4 ? 
4 : tmp_size; - vector<GenRegister> tmpVec; + + GenRegister headeraddr; + if (isA64) + headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL); + else + headeraddr = sel.getOffsetReg(header, 0, 2 * 4); + // Make header + sel.push(); + { + // Copy r0 into the header first + sel.curr.execWidth = 8; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + sel.MOV(header, GenRegister::ud8grf(0, 0)); + + // Update the header with the current address + sel.curr.execWidth = 1; + + // Put zero in the general state base address + if (isA64) + sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL)); + else { + sel.SHR(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD), GenRegister::immud(4)); + sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0)); + } + } + sel.pop(); + + /* For block write we need to pack the block data into the tmp, and for different + * simdwidth and vector size with different type size, we may need to split the + * block write send message. + * We can only get a send message with 5 reg length + * so for different combination we have different message length and tmp vector size + * | simd8 | simd16 | simd8 | simd16 + * r0 |header | | | | + * r1 |data | w0,w1 | w0 | dw0 | dw0 + * r2 |data | w2,w3 | w1 | dw1 | dw0 + * r3 |data | ...... | ...... | ...... | dw1 + * r4 |data | ....... | ...... | ...... | dw1 + */ + + uint32_t totalSize = simdWidth * typeSize * vec_size; + uint32_t valueSize = simdWidth * typeSize; + uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size; + uint32_t msg_num = vec_size / tmp_size; + uint32_t ow_size = msg_num > 1 ? 
8 : (totalSize / 16); + for(uint32_t i = 0; i < tmp_size; i++) - tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD)); - sel.OBWRITE(address, &valuesVec[0], vec_size, header, SI, &tmpVec[0], tmp_size); + tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType)); + for (uint32_t i = 0; i < msg_num; i++) { + for (uint32_t j = 0; j < tmp_size; j++) + sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]); + if (i > 0) { + sel.push(); + { + // Update the address in header + sel.curr.execWidth = 1; + sel.ADD(headeraddr, headeraddr, GenRegister::immud(8)); + } + sel.pop(); + } + sel.push(); + // In simd8 mode, when data reg has more than 1 reg, execWidth 8 will get wrong + // result, so set the execWidth to 16. + sel.curr.execWidth = 16; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size); + sel.pop(); + } + + } virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const @@ -7662,20 +7758,77 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp uint32_t vec_size = insn.getVectorSize(); uint32_t simdWidth = sel.curr.execWidth; const Type type = insn.getType(); + const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; + uint32_t response_size = simdWidth * vec_size * typeSize / 32; + // ushort in simd8 will have half reg thus 0.5 reg size, but response lenght is still 1 + response_size = response_size ? 
response_size : 1; + uint32_t block_width = typeSize * simdWidth; + uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16; + + vector<GenRegister> valuesVec; vector<GenRegister> tmpVec; for (uint32_t i = 0; i < vec_size; ++i) { valuesVec.push_back(sel.selReg(insn.getDst(i), type)); - if(simdWidth == 16) - tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD)); + if(simdWidth == 16 && typeSize == 4) + tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG))); } - const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32); - const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32); - const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); - GenRegister *tmp = NULL; - if(simdWidth == 16) - tmp = &tmpVec[0]; - sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp, insn.getImageIndex(), insn.getVectorSize()); + const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD); + const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD); + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG)); + const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD); + const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD); + const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4); + + // Make header + sel.push(); + // Copy r0 into the header first + sel.curr.execWidth = 8; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + sel.MOV(header, GenRegister::ud8grf(0, 0)); + + // Update the header with the coord + sel.curr.execWidth = 1; + sel.MOV(offsetx, coordx); + sel.MOV(offsety, coordy); + // Update block width and height + sel.MOV(blocksizereg, GenRegister::immud(blocksize)); + sel.pop(); + + if (simdWidth * typeSize < 64) { + sel.push(); + sel.curr.execWidth = 8; + 
sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + // Now read the data + sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), response_size); + sel.pop(); + } else if (simdWidth * typeSize == 64) { + sel.push(); + sel.curr.execWidth = 8; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(), vec_size); + for (uint32_t i = 0; i < vec_size; i++) + sel.MOV(valuesVec[i], tmpVec[i]); + + // Second half + // Update the header with the coord + sel.curr.execWidth = 1; + sel.ADD(offsetx, offsetx, GenRegister::immud(32)); + + // Now read the data + sel.curr.execWidth = 8; + sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), vec_size); + + // Move the reg to fit vector rule. + for (uint32_t i = 0; i < vec_size; i++) + sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]); + sel.pop(); + } else NOT_IMPLEMENTED; + + return true; } DECL_CTOR(MediaBlockReadInstruction, 1, 1); @@ -7689,17 +7842,84 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp using namespace ir; uint32_t vec_size = insn.getVectorSize(); const Type type = insn.getType(); - const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32); - const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32); + uint32_t simdWidth = sel.curr.execWidth; + const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW; + const RegisterFamily family = getFamily(type); + const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; + // ushort in simd8 will have half reg, but data length is still 1 + uint32_t data_size = simdWidth * vec_size * typeSize / 32; + data_size = data_size? 
data_size : 1; + uint32_t block_width = typeSize * simdWidth; + uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16; + + vector<GenRegister> valuesVec; vector<GenRegister> tmpVec; - for(uint32_t i = 0; i < vec_size; i++) - { - valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type)); - tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD)); - } - const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); - sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size); + for (uint32_t i = 0; i < vec_size; ++i) { + valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type)); + if(simdWidth == 16 && typeSize == 4) + tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG))); + else + tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType)); + } + const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD); + const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD); + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG)); + const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD); + const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD); + const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4); + + // Make header + sel.push(); + // Copy r0 into the header first + sel.curr.execWidth = 8; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + sel.MOV(header, GenRegister::ud8grf(0, 0)); + + // Update the header with the coord + sel.curr.execWidth = 1; + sel.MOV(offsetx, coordx); + sel.MOV(offsety, coordy); + // Update block width and height + sel.MOV(blocksizereg, GenRegister::immud(blocksize)); + sel.pop(); + + if (simdWidth * typeSize < 64) { + for (uint32_t i = 0; i < vec_size; ++i) { + sel.MOV(tmpVec[i], 
valuesVec[i]); + } + sel.push(); + sel.curr.execWidth = 8; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + // Now write the data + sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size); + sel.pop(); + } else if (simdWidth * typeSize == 64) { + sel.push(); + sel.curr.execWidth = 8; + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.noMask = 1; + for (uint32_t i = 0; i < vec_size; i++) + sel.MOV(tmpVec[i], valuesVec[i]); + sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size); + + // Second half + // Update the header with the coord + sel.curr.execWidth = 1; + sel.ADD(offsetx, offsetx, GenRegister::immud(32)); + + sel.curr.execWidth = 8; + for (uint32_t i = 0; i < vec_size; i++) + sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32)); + // Now write the data + sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size); + + // Move the reg to fit vector rule. + sel.pop(); + } else NOT_IMPLEMENTED; + return true; } DECL_CTOR(MediaBlockWriteInstruction, 1, 1); -- 2.7.4 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet