Replace sqrt, maskload, fp min/max, cttz, and ctlz with their LLVM equivalents. Replace the AVX maskstore intrinsic with the LLVM masked store intrinsic. Add generated LLVM intrinsic helper macros for stacksave, stackrestore, debugtrap, and popcnt. --- src/gallium/drivers/swr/Makefile.am | 8 ++ src/gallium/drivers/swr/SConscript | 9 ++ src/gallium/drivers/swr/meson.build | 2 +- .../swr/rasterizer/codegen/gen_llvm_ir_macros.py | 100 ++++++++++++++------- .../rasterizer/codegen/templates/gen_builder.hpp | 20 ++++- .../drivers/swr/rasterizer/jitter/builder.h | 1 + .../drivers/swr/rasterizer/jitter/builder_mem.cpp | 50 +---------- .../drivers/swr/rasterizer/jitter/builder_mem.h | 5 -- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 13 --- .../drivers/swr/rasterizer/jitter/builder_misc.h | 11 --- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 8 +- .../drivers/swr/rasterizer/jitter/meson.build | 11 +++ .../swr/rasterizer/jitter/streamout_jit.cpp | 18 ++-- 13 files changed, 130 insertions(+), 126 deletions(-)
diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am index 5ec9213..32dd9e5 100644 --- a/src/gallium/drivers/swr/Makefile.am +++ b/src/gallium/drivers/swr/Makefile.am @@ -81,6 +81,7 @@ BUILT_SOURCES = \ rasterizer/jitter/gen_state_llvm.h \ rasterizer/jitter/gen_builder.hpp \ rasterizer/jitter/gen_builder_x86.hpp \ + rasterizer/jitter/gen_builder_intrin.hpp \ rasterizer/archrast/gen_ar_event.hpp \ rasterizer/archrast/gen_ar_event.cpp \ rasterizer/archrast/gen_ar_eventhandler.hpp \ @@ -140,6 +141,13 @@ rasterizer/jitter/gen_builder_x86.hpp: rasterizer/codegen/gen_llvm_ir_macros.py --output rasterizer/jitter \ --gen_x86_h +rasterizer/jitter/gen_builder_intrin.hpp: rasterizer/codegen/gen_llvm_ir_macros.py rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py + $(MKDIR_GEN) + $(PYTHON_GEN) \ + $(srcdir)/rasterizer/codegen/gen_llvm_ir_macros.py \ + --output rasterizer/jitter \ + --gen_intrin_h + rasterizer/archrast/gen_ar_event.hpp: rasterizer/codegen/gen_archrast.py rasterizer/codegen/templates/gen_ar_event.hpp rasterizer/archrast/events.proto rasterizer/archrast/events_private.proto rasterizer/codegen/gen_common.py $(MKDIR_GEN) $(PYTHON_GEN) \ diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript index cc4025b..5097be6 100644 --- a/src/gallium/drivers/swr/SConscript +++ b/src/gallium/drivers/swr/SConscript @@ -85,6 +85,15 @@ Depends('rasterizer/jitter/gen_builder.hpp', swrroot + 'rasterizer/codegen/templates/gen_builder.hpp') env.CodeGenerate( + target = 'rasterizer/jitter/gen_builder_intrin.hpp', + script = swrroot + 'rasterizer/codegen/gen_llvm_ir_macros.py', + source = '', + command = python_cmd + ' $SCRIPT --output ' + bldroot + '/rasterizer/jitter --gen_intrin_h' +) +Depends('rasterizer/jitter/gen_builder.hpp', + swrroot + 'rasterizer/codegen/templates/gen_builder.hpp') + +env.CodeGenerate( target = './gen_swr_context_llvm.h', script = swrroot + 
'rasterizer/codegen/gen_llvm_types.py', source = 'swr_context.h', diff --git a/src/gallium/drivers/swr/meson.build b/src/gallium/drivers/swr/meson.build index 4bcd4f4..b28abd6 100644 --- a/src/gallium/drivers/swr/meson.build +++ b/src/gallium/drivers/swr/meson.build @@ -296,7 +296,7 @@ endif libmesaswr = static_library( 'mesaswr', [files_swr_mesa, files_swr_common, gen_knobs_h, gen_knobs_cpp, - gen_builder_hpp, gen_builder_x86_hpp], + gen_builder_hpp, gen_builder_x86_hpp, gen_builder_intrin_hpp], cpp_args : [cpp_vis_args, swr_cpp_args, swr_avx_args, swr_arch_defines], include_directories : [inc_common, swr_incs], dependencies : dep_llvm, diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index 3e1fbfe..9dfc1e7 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -42,32 +42,40 @@ inst_aliases = { } intrinsics = [ - ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']], - ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']], - ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']], - ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']], - ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']], - ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']], - ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']], - ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']], - ['VMINPS', 'x86_avx_min_ps_256', ['a', 'b']], - ['VMAXPS', 'x86_avx_max_ps_256', ['a', 'b']], - ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding']], - ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control']], - ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b']], - ['VPERMD', 'x86_avx2_permd', ['a', 'idx']], - ['VPERMPS', 'x86_avx2_permps', ['idx', 'a']], - ['VCVTPD2PS', 
'x86_avx_cvt_pd2_ps_256', ['a']], - ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a']], - ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round']], - ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b']], - ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b']], - ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b']], - ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c']], - ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a']], - ['INTERRUPT', 'x86_int', ['a']], - ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b']], - ] + ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']], + ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']], + ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']], + ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']], + ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']], + ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']], + ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding']], + ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control']], + ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b']], + ['VPERMD', 'x86_avx2_permd', ['a', 'idx']], + ['VPERMPS', 'x86_avx2_permps', ['idx', 'a']], + ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a']], + ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a']], + ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round']], + ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b']], + ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b']], + ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b']], + ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c']], + ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a']], + ['INTERRUPT', 'x86_int', ['a']], + ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b']], +] + +llvm_intrinsics = [ + ['CTTZ', 'cttz', ['a', 'flag'], ['a']], + ['CTLZ', 'ctlz', ['a', 'flag'], ['a']], + ['VSQRTPS', 'sqrt', ['a'], ['a']], + ['STACKSAVE', 'stacksave', [], []], + ['STACKRESTORE', 'stackrestore', ['a'], []], + ['VMINPS', 'minnum', ['a', 'b'], ['a']], + 
['VMAXPS', 'maxnum', ['a', 'b'], ['a']], + ['DEBUGTRAP', 'debugtrap', [], []], + ['POPCNT', 'ctpop', ['a'], ['a']] +] this_dir = os.path.dirname(os.path.abspath(__file__)) template = os.path.join(this_dir, 'templates', 'gen_builder.hpp') @@ -195,7 +203,7 @@ def generate_gen_h(functions, output_dir): templfuncs.append({ 'decl' : decl, 'intrin' : func['name'], - 'args' : ', '.join(func['arg_names']), + 'args' : func['arg_names'], }) MakoTemplateWriter.to_file( @@ -205,7 +213,7 @@ def generate_gen_h(functions, output_dir): comment='Builder IR Wrappers', filename=filename, functions=templfuncs, - isX86=False) + isX86=False, isIntrin=False) ''' Auto-generates macros for LLVM IR @@ -221,8 +229,8 @@ def generate_x86_h(output_dir): functions.append({ 'decl' : 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], declargs), - 'args' : ', '.join(inst[2]), 'intrin' : inst[1], + 'args' : inst[2], }) MakoTemplateWriter.to_file( @@ -232,8 +240,36 @@ def generate_x86_h(output_dir): comment='x86 intrinsics', filename=filename, functions=functions, - isX86=True) + isX86=True, isIntrin=False) +def generate_intrin_h(output_dir): + filename = 'gen_builder_intrin.hpp' + output_filename = os.path.join(output_dir, filename) + + functions = [] + for inst in llvm_intrinsics: + #print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2]))) + if len(inst[2]) != 0: + declargs = 'Value* ' + ', Value* '.join(inst[2]) + decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], declargs) + else: + decl = 'Value* %s(const llvm::Twine& name = "")' % (inst[0]) + + functions.append({ + 'decl' : decl, + 'intrin' : inst[1], + 'args' : inst[2], + 'types' : inst[3], + }) + + MakoTemplateWriter.to_file( + template, + output_filename, + cmdline=sys.argv, + comment='llvm intrinsics', + filename=filename, + functions=functions, + isX86=False, isIntrin=True) ''' Function which is invoked when this script is started from a command line. 
Will present and consume a set of arguments which will tell this script how @@ -247,6 +283,7 @@ def main(): parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True) parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False) parser.add_argument('--gen_x86_h', help='Generate x86 intrinsics. No input is needed.', action='store_true', default=False) + parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False) args = parser.parse_args() if not os.path.exists(args.output): @@ -264,6 +301,9 @@ def main(): if args.gen_x86_h: generate_x86_h(args.output) + if args.gen_intrin_h: + generate_intrin_h(args.output) + if __name__ == '__main__': main() # END OF FILE diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp index b6cf03e..5a47c9a 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp @@ -35,15 +35,27 @@ //============================================================================ // Auto-generated ${comment} //============================================================================ - %for func in functions: +<%argList = ', '.join(func['args'])%>\ ${func['decl']} { %if isX86: - Function *pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}); - return CALL(pFunc, std::initializer_list<Value*>{${func['args']}}, name); + Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}); + return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name); +%elif isIntrin: + %if len(func['types']) != 0: + SmallVector<Type*, ${len(func['types'])}> args; + %for arg in func['types']: + args.push_back(${arg}->getType()); + %endfor + Function * 
pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args); + return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name); + %else: + Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}); + return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name); + %endif %else: - return IRB()->${func['intrin']}(${func['args']}); + return IRB()->${func['intrin']}(${argList}); %endif } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index 763d29f..516e872 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -92,6 +92,7 @@ namespace SwrJit #include "gen_builder.hpp" #include "gen_builder_x86.hpp" +#include "gen_builder_intrin.hpp" #include "builder_misc.h" #include "builder_math.h" #include "builder_mem.h" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp index 7c223d1..b4f30fb 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp @@ -129,30 +129,6 @@ namespace SwrJit return STORE(val, GEPA(basePtr, valIndices)); } - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate an i32 masked load operation in LLVM IR. 
If not - /// supported on the underlying platform, emulate it with float masked load - /// @param src - base address pointer for the load - /// @param vMask - SIMD wide mask that controls whether to access memory load 0 - Value *Builder::MASKLOADD(Value* src, Value* mask) - { - Value* vResult; - // use avx2 gather instruction is available - if (JM()->mArch.AVX2()) - { - Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); - vResult = CALL(func, { src,mask }); - } - else - { - // maskload intrinsic expects integer mask operand in llvm >= 3.8 - mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth)); - Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256); - vResult = BITCAST(CALL(func, { src,mask }), VectorType::get(mInt32Ty, mVWidth)); - } - return vResult; - } - Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset) { return GEP(base, offset); @@ -390,9 +366,7 @@ namespace SwrJit /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru) { - Function* pMaskedGather = llvm::Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::masked_gather, { pVecPassthru->getType(), pVecSrcPtr->getType() }); - - return CALL(pMaskedGather, { pVecSrcPtr, C(0), pVecMask, pVecPassthru }); + return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru); } void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, @@ -791,14 +765,11 @@ namespace SwrJit Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty)); - // Get cttz function - Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty }); - // Setup loop basic block BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc); // compute first set bit - Value* pIndex = CALL(pfnCttz, { pMask, C(false) }); + Value* pIndex = 
CTTZ(pMask, C(false)); Value* pIsUndef = ICMP_EQ(pIndex, C(32)); @@ -835,7 +806,7 @@ namespace SwrJit Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi))); // Terminator - Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) }); + Value* pNewIndex = CTTZ(pNewMask, C(false)); pIsUndef = ICMP_EQ(pNewIndex, C(32)); COND_BR(pIsUndef, pPostLoop, pLoop); @@ -848,19 +819,4 @@ namespace SwrJit IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin()); } - ////////////////////////////////////////////////////////////////////////// - /// @brief save/restore stack, providing ability to push/pop the stack and - /// reduce overall stack requirements for temporary stack use - Value* Builder::STACKSAVE() - { - Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); - return CALLA(pfnStackSave); - } - - void Builder::STACKRESTORE(Value* pSaved) - { - Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); - CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved}); - } - } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h index 8f79425..c694104 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h @@ -60,8 +60,6 @@ LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, con StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset); StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset); -Value *MASKLOADD(Value* src, Value* mask); - void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_RASTY); @@ -87,9 +85,6 @@ void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask); void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, 
Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput); void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput); -Value* STACKSAVE(); -void STACKRESTORE(Value* pSaved); - // Static stack allocations for scatter operations Value* pScatterStackSrc{ nullptr }; Value* pScatterStackOffsets{ nullptr }; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 704b0f2..c266018 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -335,13 +335,6 @@ namespace SwrJit return CALLA(Callee, args); } - ////////////////////////////////////////////////////////////////////////// - Value *Builder::DEBUGTRAP() - { - Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap); - return CALL(func); - } - Value *Builder::VRCP(Value *va, const llvm::Twine& name) { return FDIV(VIMMED1(1.0f), va, name); // 1 / a @@ -841,12 +834,6 @@ namespace SwrJit return vOut; } - Value* Builder::POPCNT(Value* a) - { - Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); - return CALL(pCtPop, std::initializer_list<Value*>{a}); - } - ////////////////////////////////////////////////////////////////////////// /// @brief pop count on vector mask (e.g. <8 x i1>) Value* Builder::VPOPCNT(Value* a) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 549f328..343a9b0 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -124,15 +124,6 @@ Value *PMINUD(Value* a, Value* b); Value *VABSPS(Value* a); Value *FMADDPS(Value* a, Value* b, Value* c); -// LLVM removed VPCMPGTD x86 intrinsic. 
This emulates that behavior -Value *VPCMPGTD(Value* a, Value* b) -{ - Value* vIndexMask = ICMP_UGT(a,b); - - // need to set the high bit for x86 intrinsic masks - return S_EXT(vIndexMask,VectorType::get(mInt32Ty,JM()->mVWidth)); -} - Value *ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name = ""); Value *FCLAMP(Value* src, Value* low, Value* high); Value *FCLAMP(Value* src, float low, float high); @@ -140,10 +131,8 @@ Value *FCLAMP(Value* src, float low, float high); CallInst *PRINT(const std::string &printStr); CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs); -Value* POPCNT(Value* a); Value* VPOPCNT(Value* a); -Value* DEBUGTRAP(); Value* INT3() { return DEBUGTRAP(); } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 1ee6691..5c8d813 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -1884,13 +1884,11 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0 Value* vMaxIndex = VBROADCAST(numIndicesLeft); - Value* vIndexMask = VPCMPGTD(vMaxIndex, vIndexOffsets); - - // VMASKLOAD takes an *i8 src pointer - pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0)); + Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets); // Load the indices; OOB loads 0 - return MASKLOADD(pIndices,vIndexMask); + pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0)); + return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0)); } ////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/jitter/meson.build b/src/gallium/drivers/swr/rasterizer/jitter/meson.build index dd1ddcf..4a2f46a 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/meson.build +++ 
b/src/gallium/drivers/swr/rasterizer/jitter/meson.build @@ -48,3 +48,14 @@ gen_builder_x86_hpp = custom_target( depend_files : swr_gen_builder_depends, ) +gen_builder_intrin_hpp = custom_target( + 'gen_builder_intrin.hpp', + input : '../codegen/gen_llvm_ir_macros.py', + output : 'gen_builder_intrin.hpp', + command : [ + prog_python2, '@INPUT0@', '--gen_intrin_h', '--output', '@OUTPUT@', + '--output-dir', '@OUTDIR@' + ], + depend_files : swr_gen_builder_depends, +) + diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp index ac4436e..f9d8580 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -112,11 +112,11 @@ struct StreamOutJit : public Builder { if (bitmask & (1 << i)) { - indices.push_back(C(-1.0f)); + indices.push_back(C(true)); } else { - indices.push_back(C(0.0f)); + indices.push_back(C(false)); } } return ConstantVector::get(indices); @@ -131,9 +131,6 @@ struct StreamOutJit : public Builder // @param decl - input decl void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl) { - // @todo add this to x86 macros - Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps); - uint32_t numComponents = _mm_popcnt_u32(decl.componentMask); uint32_t packedMask = (1 << numComponents) - 1; if (!decl.hole) @@ -152,15 +149,14 @@ struct StreamOutJit : public Builder // store to output buffer // cast SO buffer to i8*, needed by maskstore - Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0)); + Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(simd4Ty, 0)); // cast input to <4xfloat> Value* src = BITCAST(vpackedAttrib, simd4Ty); - // cast mask to <4xint> + // cast mask to <4xi1> Value* mask = ToMask(packedMask); - mask = BITCAST(mask, VectorType::get(IRB()->getInt32Ty(), 4)); - 
CALL(maskStore, {pOut, mask, src}); + MASKED_STORE(src, pOut, 4, mask); } // increment SO buffer @@ -325,13 +321,15 @@ struct StreamOutJit : public Builder /// @return PFN_SO_FUNC - pointer to SOS function PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc) { - const llvm::Function *func = (const llvm::Function*)hFunc; + llvm::Function *func = (llvm::Function*)hFunc; JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); PFN_SO_FUNC pfnStreamOut; pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module pJitMgr->mIsModuleFinalized = true; + pJitMgr->DumpAsm(func, "SoFunc_optimized"); + return pfnStreamOut; } -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev