Re: [Beignet] [PATCH] Backend: Refine block read/write instruction selection
1. I will refine them with ud8grf 2. The mov was used for vector. %65 %66 %67 %68 %69 are in a vector. If we replace it with %65 %56 %57 %58 %59, then the vector will have a longer liveness. These may case some register pressure. These MOVs is used to minimize the liveness of vectors. -Original Message- From: Guo, Yejun Sent: Wednesday, December 14, 2016 3:43 PM To: Pan, Xiuli ; beignet@lists.freedesktop.org Cc: Pan, Xiuli Subject: RE: [Beignet] [PATCH] Backend: Refine block read/write instruction selection two comments, thanks. 1. for header register, we can call: const GenRegister header = GenRegister::ud8grf(sel.reg(ir::FAMILY_REG)); instead of: const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD); 2. how about separate the logic for SIMD8 and SIMD16? two consideration: a) In current patch, I see you have finished an elaborate algorithm to handle all the cases. If we separate it, the logic can be simpler, easier to be understood. b) at SIMD8, help to decrease the reg pressure. For example the following instructions: [46]MOV(8) %66<1>:UD : %56<8,8,1>:UD [48]MOV(8) %67<1>:UD : %57<8,8,1>:UD [50]MOV(8) %68<1>:UD : %58<8,8,1>:UD [52]MOV(8) %69<1>:UD : %59<8,8,1>:UD [54]OBWRITE(8) : %65<8,8,1>:UD %66<8,8,1>:UD %67<8,8,1>:UD %68<8,8,1>:UD %69<8,8,1>:UD can be replaced with: OBWRITE(8) : %65<8,8,1>:UD %56<8,8,1>:UD %57<8,8,1>:UD %58<8,8,1>:UD %59<8,8,1>:UD due to the reg vector requirement, the previous instructions could not be optimized at sel ir level, and it is feasible for us to generate the optimized ir. thanks yejun -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli Pan Sent: Friday, December 09, 2016 3:01 PM To: beignet@lists.freedesktop.org Cc: Pan, Xiuli Subject: [Beignet] [PATCH] Backend: Refine block read/write instruction selection From: Pan Xiuli Move the block pack/unpack into instruction selection in order to get optimization. Signed-off-by: Pan Xiuli --- backend/src/backend/gen_context.cpp| 459 ++--- backend/src/backend/gen_insn_selection.cpp | 439 --- 2 files changed, 346 insertions(+), 552 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 798fac8..4e971a2 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -3551,458 +3551,39 @@ namespace gbe } void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) { -const GenRegister dst= ra->genReg(insn.dst(1)); -const GenRegister addrreg = ra->genReg(insn.src(0)); -uint32_t type = dst.type; -uint32_t typesize = typeSize(type); -const uint32_t vec_size = insn.extra.elem; -const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type); -const uint32_t simdWidth = p->curr.execWidth; -const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); -const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type); -GenRegister headeraddr; -bool isA64 = insn.getbti() == 255; +const GenRegister header = ra->genReg(insn.src(0)); +const GenRegister tmp = ra->genReg(insn.dst(0)); +const uint32_t bti = insn.getbti(); +const uint32_t ow_size = insn.extra.elem; +bool isA64 = bti == 255; if (isA64) - headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL); + p->OBREADA64(tmp, header, bti, ow_size); else - headeraddr = GenRegister::offset(header, 0, 2*4); - -// Make header -p->push(); -{ - // Copy r0 into the header first - p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->MOV(header, GenRegister::ud8grf(0, 0)); - - // Update the header with the current address - p->curr.execWidth = 1; - p->MOV(headeraddr, addr); - - // Put zero in the general state base address - if (!isA64) -p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0)); - -} -p->pop(); -// Now read the data, oword block read can only work with simd16 and no mask -if (vec_size == 1) { - p->push(); - { -p->curr.execWidth = 16; -p->curr.noMask = 1; -if (isA64) { - //p->curr.execWidth = 8; - p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16); -} -else - p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16); - } -
Re: [Beignet] [PATCH] Backend: Refine block read/write instruction selection
two comments, thanks. 1. for header register, we can call: const GenRegister header = GenRegister::ud8grf(sel.reg(ir::FAMILY_REG)); instead of: const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD); 2. how about separate the logic for SIMD8 and SIMD16? two consideration: a) In current patch, I see you have finished an elaborate algorithm to handle all the cases. If we separate it, the logic can be simpler, easier to be understood. b) at SIMD8, help to decrease the reg pressure. For example the following instructions: [46]MOV(8) %66<1>:UD : %56<8,8,1>:UD [48]MOV(8) %67<1>:UD : %57<8,8,1>:UD [50]MOV(8) %68<1>:UD : %58<8,8,1>:UD [52]MOV(8) %69<1>:UD : %59<8,8,1>:UD [54]OBWRITE(8) : %65<8,8,1>:UD %66<8,8,1>:UD %67<8,8,1>:UD %68<8,8,1>:UD %69<8,8,1>:UD can be replaced with: OBWRITE(8) : %65<8,8,1>:UD %56<8,8,1>:UD %57<8,8,1>:UD %58<8,8,1>:UD %59<8,8,1>:UD due to the reg vector requirement, the previous instructions could not be optimized at sel ir level, and it is feasible for us to generate the optimized ir. thanks yejun -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Xiuli Pan Sent: Friday, December 09, 2016 3:01 PM To: beignet@lists.freedesktop.org Cc: Pan, Xiuli Subject: [Beignet] [PATCH] Backend: Refine block read/write instruction selection From: Pan Xiuli Move the block pack/unpack into instruction selection in order to get optimization. Signed-off-by: Pan Xiuli --- backend/src/backend/gen_context.cpp| 459 ++--- backend/src/backend/gen_insn_selection.cpp | 439 --- 2 files changed, 346 insertions(+), 552 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 798fac8..4e971a2 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -3551,458 +3551,39 @@ namespace gbe } void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) { -const GenRegister dst= ra->genReg(insn.dst(1)); -const GenRegister addrreg = ra->genReg(insn.src(0)); -uint32_t type = dst.type; -uint32_t typesize = typeSize(type); -const uint32_t vec_size = insn.extra.elem; -const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type); -const uint32_t simdWidth = p->curr.execWidth; -const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); -const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type); -GenRegister headeraddr; -bool isA64 = insn.getbti() == 255; +const GenRegister header = ra->genReg(insn.src(0)); +const GenRegister tmp = ra->genReg(insn.dst(0)); +const uint32_t bti = insn.getbti(); +const uint32_t ow_size = insn.extra.elem; +bool isA64 = bti == 255; if (isA64) - headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL); + p->OBREADA64(tmp, header, bti, ow_size); else - headeraddr = GenRegister::offset(header, 0, 2*4); - -// Make header -p->push(); -{ - // Copy r0 into the header first - p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->MOV(header, GenRegister::ud8grf(0, 0)); - - // Update the header with the current address - p->curr.execWidth = 1; - p->MOV(headeraddr, addr); - - // Put zero in the general state base address - if (!isA64) -p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0)); - -} -p->pop(); -// Now read the data, oword block read can only work with simd16 and no mask -if (vec_size == 1) { - p->push(); - { -p->curr.execWidth = 16; -p->curr.noMask = 1; -if (isA64) { - //p->curr.execWidth = 8; - p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16); -} -else - p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16); - } - p->pop(); -} else if (vec_size == 2) { - p->push(); - { -p->curr.execWidth = 16; -p->curr.noMask = 1; -if (isA64) - p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8); -else - p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8); - } - p->pop(); - p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0)); - p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWid
[Beignet] [PATCH] Backend: Refine block read/write instruction selection
From: Pan Xiuli Move the block pack/unpack into instruction selection in order to get optimization. Signed-off-by: Pan Xiuli --- backend/src/backend/gen_context.cpp| 459 ++--- backend/src/backend/gen_insn_selection.cpp | 439 --- 2 files changed, 346 insertions(+), 552 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 798fac8..4e971a2 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -3551,458 +3551,39 @@ namespace gbe } void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) { -const GenRegister dst= ra->genReg(insn.dst(1)); -const GenRegister addrreg = ra->genReg(insn.src(0)); -uint32_t type = dst.type; -uint32_t typesize = typeSize(type); -const uint32_t vec_size = insn.extra.elem; -const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type); -const uint32_t simdWidth = p->curr.execWidth; -const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); -const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type); -GenRegister headeraddr; -bool isA64 = insn.getbti() == 255; +const GenRegister header = ra->genReg(insn.src(0)); +const GenRegister tmp = ra->genReg(insn.dst(0)); +const uint32_t bti = insn.getbti(); +const uint32_t ow_size = insn.extra.elem; +bool isA64 = bti == 255; if (isA64) - headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL); + p->OBREADA64(tmp, header, bti, ow_size); else - headeraddr = GenRegister::offset(header, 0, 2*4); - -// Make header -p->push(); -{ - // Copy r0 into the header first - p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->MOV(header, GenRegister::ud8grf(0, 0)); - - // Update the header with the current address - p->curr.execWidth = 1; - p->MOV(headeraddr, addr); - - // Put zero in the general state base address - if (!isA64) -p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0)); - -} -p->pop(); -// Now read the data, oword block read can only work with simd16 and no mask -if (vec_size == 1) { - p->push(); - { -p->curr.execWidth = 16; -p->curr.noMask = 1; -if (isA64) { - //p->curr.execWidth = 8; - p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16); -} -else - p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16); - } - p->pop(); -} else if (vec_size == 2) { - p->push(); - { -p->curr.execWidth = 16; -p->curr.noMask = 1; -if (isA64) - p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8); -else - p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8); - } - p->pop(); - p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0)); - p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * typesize )); -} else if (vec_size == 4) { - if (simdWidth == 8) { -p->push(); -{ - p->curr.execWidth = 16; - p->curr.noMask = 1; - if (isA64) -p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize); - else -p->OBREAD(tmp, header, insn.getbti(), 2 * typesize); -} -p->pop(); -for (uint32_t j = 0; j < 4; j++) - p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * simdWidth * typesize )); - } else { -for (uint32_t i = 0; i < typesize / 2; i++) { - if (i > 0) { -p->push(); -{ - // Update the address in header - p->curr.execWidth = 1; - p->ADD(headeraddr, headeraddr, GenRegister::immud(128)); -} -p->pop(); - } - if (isA64) -p->OBREADA64(tmp, header, insn.getbti(), 8); - else -p->OBREAD(tmp, header, insn.getbti(), 8); - for (uint32_t j = 0; j < 8 / typesize ; j++) -p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize )); -} - } -} else if (vec_size == 8) { - if (simdWidth == 8) { -for (uint32_t i = 0; i < typesize / 2; i++) { - if (i > 0) { -p->push(); -{ - // Update the address in header - p->curr.execWidth = 1; - p->ADD(headeraddr, headeraddr, GenRegister::immud(128)); -} -p->pop(); - } - p->push(); - { -p->curr.execWidth = 16; -p->curr.noMask = 1; -if (isA64) - p->OBREADA64(tmp, header, insn.getbti(),