[Mesa-dev] [PATCH 5/6] swr/rast: don't use 32-bit gathers for elements < 32-bits in size
Using a gather for elements less than 32-bits in size can cause pagefaults when loading the last elements in a page-aligned-sized buffer. --- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 61 +- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 99a936d176..ad70cbe95d 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -741,7 +741,66 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB // only works if pixel size is <= 32bits SWR_ASSERT(info.bpp <= 32); -Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask); +Value *pGather; +if (info.bpp == 32) +{ +pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask); +} +else +{ +// Can't use 32-bit gather for items less than 32-bits, could cause page faults. +Value *pMem = ALLOCA(mSimdInt32Ty); +STORE(VIMMED1(0u), pMem); + +pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0)); +Value* pDstMem = BITCAST(pMem, mInt32PtrTy); + +for (uint32_t lane = 0; lane < mVWidth; ++lane) +{ +// Get index +Value* index = VEXTRACT(pOffsets, C(lane)); +Value* mask = VEXTRACT(pMask, C(lane)); +switch (info.bpp) +{ +case 8: +{ +Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0)); +Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0)); +STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst); +break; +} + +case 16: +{ +Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0)); +Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0)); +STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst); +break; +} +break; + +case 24: +{ +// First 16-bits of data +Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0)); +Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0)); +STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst); + +// 
Last 8-bits of data +pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0)); +pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0)); +STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst); +break; +} + +default: +SWR_INVALID("Shouldn't have BPP = %d now", info.bpp); +break; +} +} + +pGather = LOAD(pMem); +} for (uint32_t comp = 0; comp < 4; ++comp) { -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/6] swr/rast: SIMD16 builder - cleanup naming (simd2 -> simd16)
--- .../drivers/swr/rasterizer/jitter/builder.cpp | 76 +- .../drivers/swr/rasterizer/jitter/builder.h| 45 +++--- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 133 .../drivers/swr/rasterizer/jitter/builder_misc.h | 50 +++--- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 168 +++-- 5 files changed, 239 insertions(+), 233 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index 4b83a3204c..c46159a35a 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -40,52 +40,56 @@ namespace SwrJit Builder::Builder(JitManager *pJitMgr) : mpJitMgr(pJitMgr) { +SWR_ASSERT(pJitMgr->mVWidth == 8); + mVWidth = pJitMgr->mVWidth; -#if USE_SIMD16_BUILDER -mVWidth2 = pJitMgr->mVWidth * 2; -#endif +mVWidth16 = pJitMgr->mVWidth * 2; mpIRBuilder = &pJitMgr->mBuilder; -mVoidTy = Type::getVoidTy(pJitMgr->mContext); -mFP16Ty = Type::getHalfTy(pJitMgr->mContext); -mFP32Ty = Type::getFloatTy(pJitMgr->mContext); -mFP32PtrTy = PointerType::get(mFP32Ty, 0); -mDoubleTy = Type::getDoubleTy(pJitMgr->mContext); -mInt1Ty = Type::getInt1Ty(pJitMgr->mContext); -mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); -mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); -mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); -mInt8PtrTy = PointerType::get(mInt8Ty, 0); +// Built in types: scalar + +mVoidTy = Type::getVoidTy(pJitMgr->mContext); +mFP16Ty = Type::getHalfTy(pJitMgr->mContext); +mFP32Ty = Type::getFloatTy(pJitMgr->mContext); +mFP32PtrTy = PointerType::get(mFP32Ty, 0); +mDoubleTy = Type::getDoubleTy(pJitMgr->mContext); +mInt1Ty = Type::getInt1Ty(pJitMgr->mContext); +mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); +mInt16Ty= Type::getInt16Ty(pJitMgr->mContext); +mInt32Ty= Type::getInt32Ty(pJitMgr->mContext); +mInt8PtrTy = PointerType::get(mInt8Ty, 0); mInt16PtrTy = PointerType::get(mInt16Ty, 0); mInt32PtrTy = PointerType::get(mInt32Ty, 0); -mInt64Ty = 
Type::getInt64Ty(pJitMgr->mContext); -mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth); -mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); -mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); -mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); -mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); -mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); -mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4); +mInt64Ty= Type::getInt64Ty(pJitMgr->mContext); + +// Built in types: simd8 + +mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth); +mSimdInt16Ty= VectorType::get(mInt16Ty, mVWidth); +mSimdInt32Ty= VectorType::get(mInt32Ty, mVWidth); +mSimdInt64Ty= VectorType::get(mInt64Ty, mVWidth); +mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); +mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); +mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4); mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5); -#if USE_SIMD16_BUILDER -mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2); -mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2); -mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2); -mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2); -mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2); -mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2); -mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4); -mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5); -#endif + +// Built in types: simd16 + +mSimd16Int1Ty = VectorType::get(mInt1Ty, mVWidth16); +mSimd16Int16Ty = VectorType::get(mInt16Ty, mVWidth16); +mSimd16Int32Ty = VectorType::get(mInt32Ty, mVWidth16); +mSimd16Int64Ty = VectorType::get(mInt64Ty, mVWidth16); +mSimd16FP16Ty = VectorType::get(mFP16Ty, mVWidth16); +mSimd16FP32Ty = VectorType::get(mFP32Ty, mVWidth16); +mSimd16VectorTy = ArrayType::get(mSimd16FP32Ty, 4); +mSimd16VectorTRTy = ArrayType::get(mSimd16FP32Ty, 5); if (sizeof(uint32_t*) == 4) { mIntPtrTy = mInt32Ty; mSimdIntPtrTy = mSimdInt32Ty; -#if USE_SIMD16_BUILDER -mSimd2IntPtrTy = mSimd2Int32Ty; -#endif +mSimd16IntPtrTy = mSimd16Int32Ty; } else { @@ -93,9 
+97,7 @@ namespace SwrJit mIntPtrTy = mInt64Ty; mSimdIntPtrTy = mSimdInt64Ty; -#if USE_SIMD16_BUILDER -mSimd2IntPtrTy = mSimd2Int64Ty; -#endif +mSimd1
[Mesa-dev] [PATCH 4/6] swr/rast: autogenerate named structs instead of literal structs
Results in far smaller and more useful IR output. --- .../swr/rasterizer/codegen/templates/gen_llvm.hpp | 23 ++ 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp index 18ea781713..574ee5aaa7 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp @@ -40,15 +40,22 @@ namespace SwrJit INLINE static StructType *Gen_${type['name']}(JitManager* pJitMgr) { LLVMContext& ctx = pJitMgr->mContext; -std::vector members; -<% -(max_type_len, max_name_len) = calc_max_len(type['members']) -%> -%for member in type['members']: -/* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ members.push_back( ${member['type']} ); -%endfor -return StructType::get(ctx, members, false); +StructType* pRetType = pJitMgr->mpCurrentModule->getTypeByName("${type['name']}"); +if (pRetType == nullptr) +{ +std::vector members; +<% +(max_type_len, max_name_len) = calc_max_len(type['members']) +%> +%for member in type['members']: +/* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ members.push_back(${ member['type'] }); +%endfor + +pRetType = StructType::create(members, "${type['name']}", false); +} + +return pRetType; } %for member in type['members']: -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/6] swr/rast: SIMD16 fetch shader jitter cleanup
Bake in USE_SIMD16_BUILDER code paths (for USE_SIMD16_SHADERS defined), remove USE_SIMD16_BUILDER define, remove deprecated pseudo-SIMD16 code paths. --- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 1118 +++- 1 file changed, 383 insertions(+), 735 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index ac09a82f6c..99a936d176 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -50,7 +50,6 @@ enum ConversionType #if USE_SIMD16_SHADERS #define USE_SIMD16_GATHERS 0 -#define USE_SIMD16_BUILDER 0 #endif // @@ -61,6 +60,7 @@ struct FetchJit : public Builder FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){}; Function* Create(const FETCH_COMPILE_STATE& fetchState); + Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); @@ -69,43 +69,49 @@ struct FetchJit : public Builder typedef std::tuple Shuffle8bpcArgs; + #if USE_SIMD16_SHADERS +#if USE_SIMD16_GATHERS +void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args); +#else void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2); +#endif #else void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); #endif -#if USE_SIMD16_BUILDER -void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args); -#endif typedef std::tuple Shuffle16bpcArgs; + #if USE_SIMD16_SHADERS +#if USE_SIMD16_GATHERS +void Shuffle16bpcGather16(Shuffle16bpcArgs &args); +#else void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2); +#endif #else void Shuffle16bpcGather(Shuffle16bpcArgs &args); #endif -#if USE_SIMD16_BUILDER -void Shuffle16bpcGather2(Shuffle16bpcArgs &args); -#endif +#if USE_SIMD16_GATHERS +void StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); +#else void 
StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); -#if USE_SIMD16_BUILDER -void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); #endif #if USE_SIMD16_SHADERS -Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2); +#if USE_SIMD16_GATHERS +Value *GenerateCompCtrlVector16(const ComponentControl ctrl); #else -Value* GenerateCompCtrlVector(const ComponentControl ctrl); +Value *GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2); #endif -#if USE_SIMD16_BUILDER -Value* GenerateCompCtrlVector2(const ComponentControl ctrl); +#else +Value *GenerateCompCtrlVector(const ComponentControl ctrl); #endif void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); -#if USE_SIMD16_SHADERS +#if USE_SIMD16_SHADERS #if USE_SIMD16_GATHERS void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2); #else @@ -833,21 +839,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, uint32_t outputElt = 0; Value* vVertexElements[4]; #if USE_SIMD16_GATHERS -Value* vVertexElements2[4]; -#if USE_SIMD16_BUILDER Value *pVtxSrc2[4]; -#endif #endif Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); -#else -Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); -#endif #else Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); #endif @@ -874,11 +873,7 @@ void 
FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *vStride16 = VBROADCAST_16(stride); -#else -Value *vStride = VBROADCAST(stride); -#endif #else Value *vStride = VBROADCAST(stride); #endif @@ -901,20 +896,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, curInstance = ADD(curInstance, startInstance); } -Value *vCurIndices; #if USE_SIMD16_GATHERS -Value *vCurIndices2; -#if USE_SIMD16_BUILDER Value *vCurIndices16; -#endif +#e
[Mesa-dev] [PATCH 2/6] swr/rast: shuffle header files for msvc pre-compiled header usage
--- src/gallium/drivers/swr/Makefile.sources | 1 + .../drivers/swr/rasterizer/jitter/JitManager.cpp | 36 +- .../drivers/swr/rasterizer/jitter/JitManager.h | 46 +-- .../drivers/swr/rasterizer/jitter/blend_jit.cpp| 3 +- .../drivers/swr/rasterizer/jitter/builder.cpp | 1 + .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 1 + .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 3 +- .../drivers/swr/rasterizer/jitter/jit_api.h| 1 - .../drivers/swr/rasterizer/jitter/jit_pch.hpp | 134 + .../swr/rasterizer/jitter/streamout_jit.cpp| 5 +- 10 files changed, 143 insertions(+), 88 deletions(-) create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources index 53f8bf011b..cd2040e137 100644 --- a/src/gallium/drivers/swr/Makefile.sources +++ b/src/gallium/drivers/swr/Makefile.sources @@ -145,6 +145,7 @@ JITTER_CXX_SOURCES := \ rasterizer/jitter/fetch_jit.cpp \ rasterizer/jitter/fetch_jit.h \ rasterizer/jitter/jit_api.h \ + rasterizer/jitter/jit_pch.hpp \ rasterizer/jitter/JitManager.cpp \ rasterizer/jitter/JitManager.h \ rasterizer/jitter/streamout_jit.cpp \ diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index 59672bb545..883ac77482 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -27,41 +27,7 @@ * Notes: * **/ -#if defined(_WIN32) -#pragma warning(disable: 4800 4146 4244 4267 4355 4996) -#endif - -#pragma push_macro("DEBUG") -#undef DEBUG - -#if defined(_WIN32) -#include "llvm/ADT/Triple.h" -#endif -#include "llvm/IR/Function.h" - -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/SourceMgr.h" - -#include "llvm/Analysis/CFGPrinter.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/Path.h" -#include 
"llvm/Support/MemoryBuffer.h" -#include "llvm/Config/llvm-config.h" - -#if LLVM_VERSION_MAJOR < 4 -#include "llvm/Bitcode/ReaderWriter.h" -#else -#include "llvm/Bitcode/BitcodeWriter.h" -#include "llvm/Bitcode/BitcodeReader.h" -#endif - -#if LLVM_USE_INTEL_JITEVENTS -#include "llvm/ExecutionEngine/JITEventListener.h" -#endif - -#pragma pop_macro("DEBUG") +#include "jit_pch.hpp" #include "JitManager.h" #include "jit_api.h" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h index c30a807222..9e5e4cf2b6 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -29,52 +29,9 @@ **/ #pragma once -#if defined(_WIN32) -#pragma warning(disable : 4146 4244 4267 4800 4996) -#endif - -// llvm 3.7+ reuses "DEBUG" as an enum value -#pragma push_macro("DEBUG") -#undef DEBUG - -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/ExecutionEngine/ObjectCache.h" - -#include "llvm/Config/llvm-config.h" - -#include "llvm/IR/Verifier.h" -#include "llvm/ExecutionEngine/MCJIT.h" -#include "llvm/Support/FileSystem.h" -#define LLVM_F_NONE sys::fs::F_None - -#include "llvm/Analysis/Passes.h" - -#include "llvm/IR/LegacyPassManager.h" -using FunctionPassManager = llvm::legacy::FunctionPassManager; -using PassManager = llvm::legacy::PassManager; - -#include "llvm/CodeGen/Passes.h" -#include "llvm/ExecutionEngine/ExecutionEngine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Support/Host.h" -#include "llvm/Support/DynamicLibrary.h" - - -#include "common/os.h" +#include "jit_pch.hpp" #include "common/isa.hpp" -#include - -#pragma 
pop_macro("DEBUG") // /// JitInstructionSet @@ -173,6 +130,7 @@ struct JitManager uint32_t mVWidth; + // Built in types. llvm::Type*mInt8Ty; llvm::Type*mInt32Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp index 3258639d38..cc92622978 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -27,13 +27,12 @@ * Notes: * ***
[Mesa-dev] [PATCH 0/6] swr: update rasterizer
Highlights include simd16 cleanup (renaming and removing old codepaths), fixing a potential crash with the fetch shader, and code cleanups. Tim Rowley (6): swr/rast: SIMD16 builder - cleanup naming (simd2 -> simd16) swr/rast: shuffle header files for msvc pre-compiled header usage swr/rast: SIMD16 fetch shader jitter cleanup swr/rast: autogenerate named structs instead of literal structs swr/rast: don't use 32-bit gathers for elements < 32-bits in size swr/rast: switch win32 jit format to COFF src/gallium/drivers/swr/Makefile.sources |1 + .../swr/rasterizer/codegen/templates/gen_llvm.hpp | 23 +- .../drivers/swr/rasterizer/jitter/JitManager.cpp | 40 +- .../drivers/swr/rasterizer/jitter/JitManager.h | 46 +- .../drivers/swr/rasterizer/jitter/blend_jit.cpp|3 +- .../drivers/swr/rasterizer/jitter/builder.cpp | 77 +- .../drivers/swr/rasterizer/jitter/builder.h| 45 +- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 134 +- .../drivers/swr/rasterizer/jitter/builder_misc.h | 50 +- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 1304 .../drivers/swr/rasterizer/jitter/jit_api.h|1 - .../drivers/swr/rasterizer/jitter/jit_pch.hpp | 134 ++ .../swr/rasterizer/jitter/streamout_jit.cpp|5 +- 13 files changed, 819 insertions(+), 1044 deletions(-) create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 6/6] swr/rast: switch win32 jit format to COFF
Allows for call-stack and exception handling for jitted functions. --- src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index 883ac77482..508bc027dd 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -92,7 +92,7 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core) #if defined(_WIN32) // Needed for MCJIT on windows Triple hostTriple(sys::getProcessTriple()); -hostTriple.setObjectFormat(Triple::ELF); +hostTriple.setObjectFormat(Triple::COFF); mpCurrentModule->setTargetTriple(hostTriple.getTriple()); #endif // _WIN32 @@ -486,4 +486,4 @@ std::unique_ptr JitCache::getObject(const llvm::Module* M) fclose(fpIn); return pBuf; -} +} \ No newline at end of file -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] swr/rast: fix invalid sign masks in avx512 simdlib code
Should be 0x80000000 instead of 0x8000000. Cc: mesa-sta...@lists.freedesktop.org --- src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl | 2 +- src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl | 2 +- src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl index 66e8309610..b70a7691e2 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl @@ -270,7 +270,7 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In { __mmask16 m = 0xf; m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)), -_mm512_set1_epi32(0x8000000)); +_mm512_set1_epi32(0x80000000)); return __conv(_mm512_mask_i32gather_ps( __conv(old), m, diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl index 3f93cfbd7f..3fcfd250f9 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl @@ -271,7 +271,7 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In { __mmask16 m = 0xff; m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)), -_mm512_set1_epi32(0x8000000)); +_mm512_set1_epi32(0x80000000)); return __conv(_mm512_mask_i32gather_ps( __conv(old), m, diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl index c13b9f616a..8de62f2a7e 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl @@ -540,7 +540,7 @@ static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a) 
} static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a) { -__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x8000000)); +__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000)); return static_cast(m); } -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] swr/rast: fix build break for llvm-6
LLVM api change. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104381 --- src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 4 1 file changed, 4 insertions(+) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index 3f0772c942..59672bb545 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -498,7 +498,11 @@ std::unique_ptr JitCache::getObject(const llvm::Module* M) break; } +#if LLVM_VERSION_MAJOR < 6 pBuf = llvm::MemoryBuffer::getNewUninitMemBuffer(size_t(header.GetBufferSize())); +#else +pBuf = llvm::WritableMemoryBuffer::getNewUninitMemBuffer(size_t(header.GetBufferSize())); +#endif if (!fread(const_cast(pBuf->getBufferStart()), header.GetBufferSize(), 1, fpIn)) { pBuf = nullptr; -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 19/20] swr/rast: EXTRACT2 changed from vextract/vinsert to vshuffle
--- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 60 ++ .../drivers/swr/rasterizer/jitter/builder_misc.h | 3 +- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 30 +-- 3 files changed, 32 insertions(+), 61 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index bdcafd28a3..0774889af1 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -653,16 +653,14 @@ namespace SwrJit } else { -Value *src0 = EXTRACT2_F(vSrc, 0); -Value *src1 = EXTRACT2_F(vSrc, 1); +Value *src0 = EXTRACT2(vSrc, 0); +Value *src1 = EXTRACT2(vSrc, 1); -Value *indices0 = EXTRACT2_I(vIndices, 0); -Value *indices1 = EXTRACT2_I(vIndices, 1); +Value *indices0 = EXTRACT2(vIndices, 0); +Value *indices1 = EXTRACT2(vIndices, 1); -Value *vmask16 = VMASK2(vMask); - -Value *mask0 = MASK(EXTRACT2_I(vmask16, 0)); // TODO: do this better.. -Value *mask1 = MASK(EXTRACT2_I(vmask16, 1)); +Value *mask0 = EXTRACT2(vMask, 0); +Value *mask1 = EXTRACT2(vMask, 1); Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale); Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale); @@ -738,16 +736,14 @@ namespace SwrJit } else { -Value *src0 = EXTRACT2_F(vSrc, 0); -Value *src1 = EXTRACT2_F(vSrc, 1); - -Value *indices0 = EXTRACT2_I(vIndices, 0); -Value *indices1 = EXTRACT2_I(vIndices, 1); +Value *src0 = EXTRACT2(vSrc, 0); +Value *src1 = EXTRACT2(vSrc, 1); -Value *vmask16 = VMASK2(vMask); +Value *indices0 = EXTRACT2(vIndices, 0); +Value *indices1 = EXTRACT2(vIndices, 1); -Value *mask0 = MASK(EXTRACT2_I(vmask16, 0)); // TODO: do this better.. 
-Value *mask1 = MASK(EXTRACT2_I(vmask16, 1)); +Value *mask0 = EXTRACT2(vMask, 0); +Value *mask1 = EXTRACT2(vMask, 1); Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale); Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale); @@ -809,34 +805,12 @@ namespace SwrJit } #if USE_SIMD16_BUILDER -// -/// @brief -Value *Builder::EXTRACT2_F(Value *a2, uint32_t imm) -{ -const uint32_t i0 = (imm > 0) ? mVWidth : 0; - -Value *result = VUNDEF_F(); - -for (uint32_t i = 0; i < mVWidth; i += 1) -{ -#if 1 -if (!a2->getType()->getScalarType()->isFloatTy()) -{ -a2 = BITCAST(a2, mSimd2FP32Ty); -} - -#endif -Value *temp = VEXTRACT(a2, C(i0 + i)); - -result = VINSERT(result, temp, C(i)); -} - -return result; -} - -Value *Builder::EXTRACT2_I(Value *a2, uint32_t imm) +Value *Builder::EXTRACT2(Value *x, uint32_t imm) { -return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty); +if (imm == 0) +return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7}); +else +return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15}); } Value *Builder::JOIN2(Value *a, Value *b) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 98bc563351..646ed0efb2 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -117,8 +117,7 @@ Value *VMASK2(Value *mask); // #if USE_SIMD16_BUILDER -Value *EXTRACT2_F(Value *a2, uint32_t imm); -Value *EXTRACT2_I(Value *a2, uint32_t imm); +Value *EXTRACT2(Value *x, uint32_t imm); Value *JOIN2(Value *a, Value *b); #endif diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 8d97ddfdc9..aa911b58f3 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -1078,14 +1078,12 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE &fetchState, vOffsets16 = ADD(vOffsets16, vInstanceStride16); // TODO: remove the following simd8 interop stuff once all code paths are fully widened to SIMD16.. -Value *vmask16 = VMASK2(vGatherMask16); -Value *vGatherMask = MASK(EXTRACT2_I(vmask16, 0)); -Value *vGatherMask2 = MASK(EXTRACT2_I(vmask16, 1)); - -Value *vOffsets = EXTRACT2_I(vOffsets16, 0); -Va
[Mesa-dev] [PATCH 17/20] swr/rast: Replace VPSRL with LSHR
Replace use of x86 intrinsic with general llvm IR instruction. Generates the same final assembly. --- .../swr/rasterizer/codegen/gen_llvm_ir_macros.py | 2 -- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 30 -- .../drivers/swr/rasterizer/jitter/builder_misc.h | 5 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 8 +++--- 4 files changed, 4 insertions(+), 41 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index 8bbf36d9b8..9544353eb9 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -47,8 +47,6 @@ intrinsics = [ ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']], -['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']], -['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']], ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']], ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']], ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 684c9fac54..bdcafd28a3 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -808,36 +808,6 @@ namespace SwrJit return vGather; } -#if USE_SIMD16_BUILDER -Value *Builder::PSRLI(Value *a, Value *imm) -{ -return VPSRLI(a, imm); -} - -Value *Builder::PSRLI_16(Value *a, Value *imm) -{ -Value *result = VUNDEF2_I(); - -// use avx512 shift right instruction if available -if (JM()->mArch.AVX512F()) -{ -result = VPSRLI_16(a, imm); -} -else -{ -Value *a0 = EXTRACT2_I(a, 0); -Value *a1 = EXTRACT2_I(a, 1); - -Value *result0 = PSRLI(a0, 
imm); -Value *result1 = PSRLI(a1, imm); - -result = JOIN2(result0, result1); -} - -return result; -} - -#endif #if USE_SIMD16_BUILDER // /// @brief diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 6c883d8f52..98bc563351 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -143,11 +143,6 @@ void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); -#if USE_SIMD16_BUILDER -Value *PSRLI(Value *a, Value *imm); -Value *PSRLI_16(Value *a, Value *imm); - -#endif void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask); void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 1312ac0009..8d97ddfdc9 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -1422,12 +1422,12 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // But, we know that elements must be aligned for FETCH. :) // Right shift the offset by a bit and then scale by 2 to remove the sign extension. 
#if USE_SIMD16_BUILDER -Value *shiftedOffsets = VPSRLI_16(vOffsets16, C(1)); +Value *shiftedOffsets = LSHR(vOffsets16, 1); pVtxSrc2[currentVertexElement] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets, vGatherMask16, 2); #else -Value *vShiftedOffsets = VPSRLI(vOffsets, C(1)); -Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1)); +Value *vShiftedOffsets = LSHR(vOffsets, 1); +Value *vShiftedOffsets2 = LSHR(vOffsets2, 1); vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2); vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2); @@ -1492,7 +1492,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
[Mesa-dev] [PATCH 20/20] swr/rast: Move more RTAI handling out of binner
--- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 13 + src/gallium/drivers/swr/rasterizer/core/clip.h | 1 + 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 7ef87c4443..9aa9f9e79b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -1023,18 +1023,7 @@ void BinPostSetupPointsImpl( SIMD_T::store_si(reinterpret_cast(aMTBottom), bbox.ymax); // store render target array index -OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH]; -if (state.backendState.readRenderTargetArrayIndex) -{ -typename SIMD_T::Vec4 vRtai[2]; -pa.Assemble(VERTEX_SGV_SLOT, vRtai); -typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); -SIMD_T::store_si(reinterpret_cast(aRTAI), vRtaii); -} -else -{ -SIMD_T::store_si(reinterpret_cast(aRTAI), SIMD_T::setzero_si()); -} +const uint32_t *aRTAI = reinterpret_cast(&rtIdx); OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH]; SIMD_T::store_ps(reinterpret_cast(aPointSize), vPointSize); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index e5e00d49b0..592c9bfa73 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -646,6 +646,7 @@ public: PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology); clipPA.viewportArrayActive = pa.viewportArrayActive; +clipPA.rtArrayActive = pa.rtArrayActive; static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f }; -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 13/20] swr/rast: SIMD16 Fetch - Fully widen 32-bit integer vertex components
Also widen the 16-bit and 8-bit integer vertex component gathers to SIMD16. --- .../swr/rasterizer/codegen/gen_llvm_ir_macros.py | 1 + .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 36 + .../drivers/swr/rasterizer/jitter/builder_misc.h | 3 + .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 86 +- 4 files changed, 109 insertions(+), 17 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index ac8b3badf6..8bbf36d9b8 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -46,6 +46,7 @@ intrinsics = [ ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']], +['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']], ['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']], ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 3a486e4c1e..684c9fac54 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -723,6 +723,42 @@ namespace SwrJit return vGather; } +#if USE_SIMD16_BUILDER +Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) +{ +Value *vGather = VUNDEF2_F(); + +// use avx512 gather instruction if available +if (JM()->mArch.AVX512F()) +{ +// force mask to , required by vgather2 +Value *mask = BITCAST(vMask, mInt16Ty); + +vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); +} +else +{ +Value *src0 
= EXTRACT2_F(vSrc, 0); +Value *src1 = EXTRACT2_F(vSrc, 1); + +Value *indices0 = EXTRACT2_I(vIndices, 0); +Value *indices1 = EXTRACT2_I(vIndices, 1); + +Value *vmask16 = VMASK2(vMask); + +Value *mask0 = MASK(EXTRACT2_I(vmask16, 0)); // TODO: do this better.. +Value *mask1 = MASK(EXTRACT2_I(vmask16, 1)); + +Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale); +Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale); + +vGather = JOIN2(gather0, gather1); +} + +return vGather; +} + +#endif // /// @brief Generate a masked gather operation in LLVM IR. If not /// supported on the underlying platform, emulate it with loads diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 231bd6ad85..6c883d8f52 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -135,6 +135,9 @@ void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); +#if USE_SIMD16_BUILDER +Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1); +#endif void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index e0a0770560..ec3b5eafcc 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -1349,14 +1349,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (compMask) { #if USE_SIMD16_BUILDER -#if USE_SIMD16_BUILDER -#else -Value *gatherResult[2]; - -gatherResult[0] = JOIN2(vGatherResult[0], vGatherResult2[0]); 
-gatherResult[1] = JOIN2(vGatherResult[1], vGatherResult2[1]); - -#endif Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE, @@ -1701,6 +1693,9 @@ void FetchJit::JitGather
[Mesa-dev] [PATCH 18/20] swr/rast: Fix cache of API thread event manager
--- src/gallium/drivers/swr/rasterizer/core/api.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 25a3f34841..09b482dcc0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -166,7 +166,7 @@ HANDLE SwrCreateContext( #if defined(KNOB_ENABLE_AR) // cache the API thread event manager, for use with sim layer -pCreateInfo->hArEventManager = pContext->pArContext[16]; +pCreateInfo->hArEventManager = pContext->pArContext[pContext->NumWorkerThreads + 1]; #endif // State setup AFTER context is fully initialized -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 12/20] swr/rast: Replace INSERT2 vextract/vinsert with JOIN2 vshuffle
--- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 38 ++--- .../drivers/swr/rasterizer/jitter/builder_misc.h | 5 +- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 92 ++ 3 files changed, 30 insertions(+), 105 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index b2210db717..3a486e4c1e 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -667,8 +667,7 @@ namespace SwrJit Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale); Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale); -vGather = INSERT2_F(vGather, gather0, 0); -vGather = INSERT2_F(vGather, gather1, 1); +vGather = JOIN2(gather0, gather1); } return vGather; @@ -796,8 +795,7 @@ namespace SwrJit Value *result0 = PSRLI(a0, imm); Value *result1 = PSRLI(a1, imm); -result = INSERT2_I(result, result0, 0); -result = INSERT2_I(result, result1, 1); +result = JOIN2(result0, result1); } return result; @@ -835,37 +833,13 @@ namespace SwrJit return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty); } -// -/// @brief -Value *Builder::INSERT2_F(Value *a2, Value *b, uint32_t imm) +Value *Builder::JOIN2(Value *a, Value *b) { -const uint32_t i0 = (imm > 0) ? 
mVWidth : 0; - -Value *result = BITCAST(a2, mSimd2FP32Ty); - -for (uint32_t i = 0; i < mVWidth; i += 1) -{ -#if 1 -if (!b->getType()->getScalarType()->isFloatTy()) -{ -b = BITCAST(b, mSimdFP32Ty); -} - -#endif -Value *temp = VEXTRACT(b, C(i)); - -result = VINSERT(result, temp, C(i0 + i)); -} - -return result; +return VSHUFFLE(a, b, +{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); } - -Value *Builder::INSERT2_I(Value *a2, Value *b, uint32_t imm) -{ -return BITCAST(INSERT2_F(a2, b, imm), mSimd2Int32Ty); -} - #endif + // /// @brief convert x86 mask to llvm mask Value *Builder::MASK(Value *vmask) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 62360a3ad7..231bd6ad85 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -119,10 +119,9 @@ Value *VMASK2(Value *mask); #if USE_SIMD16_BUILDER Value *EXTRACT2_F(Value *a2, uint32_t imm); Value *EXTRACT2_I(Value *a2, uint32_t imm); -Value *INSERT2_F(Value *a2, Value *b, uint32_t imm); -Value *INSERT2_I(Value *a2, Value *b, uint32_t imm); - +Value *JOIN2(Value *a, Value *b); #endif + Value *MASKLOADD(Value* src, Value* mask); void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index c960dc77fb..e0a0770560 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -960,10 +960,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // offset indices by baseVertex #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER -Value *vIndices16 = VUNDEF2_I(); - -vIndices16 = INSERT2_I(vIndices16, vIndices, 0); -vIndices16 = INSERT2_I(vIndices16, vIndices2, 1); +Value *vIndices16 = JOIN2(vIndices, vIndices2); vCurIndices16 = 
ADD(vIndices16, vBaseVertex16); #else @@ -982,10 +979,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // offset indices by baseVertex #if USE_SIMD16_GATHERS #if USE_SIMD16_BUILDER -Value *vIndices16 = VUNDEF2_I(); - -vIndices16 = INSERT2_I(vIndices16, vIndices, 0); -vIndices16 = INSERT2_I(vIndices16, vIndices2, 1); +Value *vIndices16 = JOIN2(vIndices, vIndices2); vCurIndices16 = ADD(vIndices16, vBaseVertex16); #else @@ -1206,9 +1200,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { #if USE_SIMD16_BUILDER // pack adjacent pairs of SIMD8s into SIMD16s -pVtxSrc2[currentVertexElement] = VUNDEF2_F(); -pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c], 0); -pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement]
[Mesa-dev] [PATCH 15/20] swr/rast: Pull of RTAI gather & offset out of clip/bin code
--- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 118 +++- src/gallium/drivers/swr/rasterizer/core/clip.cpp | 30 ++-- src/gallium/drivers/swr/rasterizer/core/clip.h | 35 +++-- src/gallium/drivers/swr/rasterizer/core/context.h | 4 +- .../drivers/swr/rasterizer/core/frontend.cpp | 153 +++-- src/gallium/drivers/swr/rasterizer/core/frontend.h | 8 +- src/gallium/drivers/swr/rasterizer/core/pa.h | 1 + 7 files changed, 203 insertions(+), 146 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index a664ed812f..7ef87c4443 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -45,7 +45,8 @@ void BinPostSetupLinesImpl( typename SIMD_T::Float recipW[], uint32_t primMask, typename SIMD_T::Integer const &primID, -typename SIMD_T::Integer const &viewportIdx); +typename SIMD_T::Integer const &viewportIdx, +typename SIMD_T::Integer const &rtIdx); template void BinPostSetupPointsImpl( @@ -55,7 +56,8 @@ void BinPostSetupPointsImpl( typename SIMD_T::Vec4 prim[], uint32_t primMask, typename SIMD_T::Integer const &primID, -typename SIMD_T::Integer const &viewportIdx); +typename SIMD_T::Integer const &viewportIdx, +typename SIMD_T::Integer const &rtIdx); // /// @brief Processes attributes for the backend based on linkage mask and @@ -308,9 +310,11 @@ void SIMDCALL BinTrianglesImpl( typename SIMD_T::Vec4 tri[3], uint32_t triMask, typename SIMD_T::Integer const &primID, -typename SIMD_T::Integer const &viewportIdx) +typename SIMD_T::Integer const &viewportIdx, +typename SIMD_T::Integer const &rtIdx) { SWR_CONTEXT *pContext = pDC->pContext; +const uint32_t *aRTAI = reinterpret_cast(&rtIdx); AR_BEGIN(FEBinTriangles, pDC->drawId); @@ -604,21 +608,21 @@ endBinTriangles: recipW[0] = vRecipW0; recipW[1] = vRecipW1; -BinPostSetupLinesImpl(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); +BinPostSetupLinesImpl(pDC, pa, workerId, 
line, recipW, triMask, primID, viewportIdx, rtIdx); line[0] = tri[1]; line[1] = tri[2]; recipW[0] = vRecipW1; recipW[1] = vRecipW2; -BinPostSetupLinesImpl(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); +BinPostSetupLinesImpl(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx); line[0] = tri[2]; line[1] = tri[0]; recipW[0] = vRecipW2; recipW[1] = vRecipW0; -BinPostSetupLinesImpl(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); +BinPostSetupLinesImpl(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx); AR_END(FEBinTriangles, 1); return; @@ -626,9 +630,9 @@ endBinTriangles: else if (rastState.fillMode == SWR_FILLMODE_POINT) { // Bin 3 points -BinPostSetupPointsImpl(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx); -BinPostSetupPointsImpl(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx); -BinPostSetupPointsImpl(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx); +BinPostSetupPointsImpl(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx); +BinPostSetupPointsImpl(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx); +BinPostSetupPointsImpl(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx); AR_END(FEBinTriangles, 1); return; @@ -659,22 +663,6 @@ endBinTriangles: TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z); TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2); -// store render target array index -OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH]; -if (state.backendState.readRenderTargetArrayIndex) -{ -typename SIMD_T::Vec4 vRtai[3]; -pa.Assemble(VERTEX_SGV_SLOT, vRtai); -typename SIMD_T::Integer vRtaii; -vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); -SIMD_T::store_si(reinterpret_cast(aRTAI), vRtaii); -} -else -{ -SIMD_T::store_si(reinterpret_cast(aRTAI), SIMD_T::setzero_si()); -} - - // scan remaining valid triangles and bin each separately while (_BitScanForward(&triIndex, triMask)) { @@ -763,9 +751,10 @@ void 
BinTriangles( simdvector tri[3], uint32_t triMask, simdscalari const &primID, -simdscalari const &viewportIdx) +simdscalari const &viewportIdx, +simdscalari const &rtIdx) { -BinTrianglesImpl(pDC, pa, workerId, tri, triMask, primID, viewportIdx); +BinTrianglesImpl(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx); } #if USE_SIMD16_FR
[Mesa-dev] [PATCH 16/20] swr/rast: Rework thread binding parameters for machine partitioning
Add BASE_NUMA_NODE, BASE_CORE, BASE_THREAD parameters to SwrCreateContext. Add optional SWR_API_THREADING_INFO parameter to SwrCreateContext to control reservation of API threads. Add SwrBindApiThread() function to allow binding of API threads to reserved HW threads. --- .../drivers/swr/rasterizer/codegen/knob_defs.py| 29 +- src/gallium/drivers/swr/rasterizer/core/api.cpp| 40 ++- src/gallium/drivers/swr/rasterizer/core/api.h | 33 +++ src/gallium/drivers/swr/rasterizer/core/context.h | 1 + .../drivers/swr/rasterizer/core/threads.cpp| 299 +++-- src/gallium/drivers/swr/rasterizer/core/threads.h | 4 + .../drivers/swr/rasterizer/core/tilemgr.cpp| 4 +- 7 files changed, 322 insertions(+), 88 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py index 09e3124602..30803927e3 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py @@ -62,15 +62,33 @@ KNOBS = [ 'category' : 'perf', }], -['MAX_NUMA_NODES', { +['BASE_NUMA_NODE', { 'type' : 'uint32_t', 'default' : '0', +'desc' : ['Starting NUMA node index to use when allocating compute resources.', + 'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'], +'category' : 'perf', +'advanced' : True, +}], + +['MAX_NUMA_NODES', { +'type' : 'uint32_t', +'default' : '1' if sys.platform == 'win32' else '0', 'desc' : ['Maximum # of NUMA-nodes per system used for worker threads', ' 0 == ALL NUMA-nodes in the system', ' N == Use at most N NUMA-nodes for rendering'], 'category' : 'perf', }], +['BASE_CORE', { +'type' : 'uint32_t', +'default' : '0', +'desc' : ['Starting core index to use when allocating compute resources.', + 'Setting this to a non-zero value will reduce the maximum # of cores used.'], +'category' : 'perf', +'advanced' : True, +}], + ['MAX_CORES_PER_NUMA_NODE', { 'type' : 'uint32_t', 'default' : '0', @@ -80,6 +98,15 @@ KNOBS = [ 'category' : 
'perf', }], +['BASE_THREAD', { +'type' : 'uint32_t', +'default' : '0', +'desc' : ['Starting thread index to use when allocating compute resources.', + 'Setting this to a non-zero value will reduce the maximum # of threads used.'], +'category' : 'perf', +'advanced' : True, +}], + ['MAX_THREADS_PER_CORE', { 'type' : 'uint32_t', 'default' : '1', diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 9265440904..25a3f34841 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -95,16 +95,32 @@ HANDLE SwrCreateContext( pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); } -pContext->threadInfo.MAX_WORKER_THREADS= KNOB_MAX_WORKER_THREADS; -pContext->threadInfo.MAX_NUMA_NODES= KNOB_MAX_NUMA_NODES; -pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; -pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; -pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; - if (pCreateInfo->pThreadInfo) { pContext->threadInfo = *pCreateInfo->pThreadInfo; } +else +{ +pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; +pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE; +pContext->threadInfo.BASE_CORE = KNOB_BASE_CORE; +pContext->threadInfo.BASE_THREAD= KNOB_BASE_THREAD; +pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; +pContext->threadInfo.MAX_CORES_PER_NUMA_NODE= KNOB_MAX_CORES_PER_NUMA_NODE; +pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; +pContext->threadInfo.SINGLE_THREADED= KNOB_SINGLE_THREADED; +} + +if (pCreateInfo->pApiThreadInfo) +{ +pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo; +} +else +{ +pContext->apiThreadInfo.bindAPIThread0 = true; +pContext->apiThreadInfo.numAPIReservedThreads = 1; +pContext->apiThreadInfo.numAPIThreadsPerCore= 1; +} memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); 
memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); @@ -113,6 +129,11 @@ HANDLE SwrCreateContext( Create
[Mesa-dev] [PATCH 14/20] swr/rast: Remove no-op VBROADCAST of vID
--- src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index ec3b5eafcc..1312ac0009 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -3101,7 +3101,7 @@ Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) #else Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); #endif -return VBROADCAST(pId); +return pId; } case StoreInstanceId: { @@ -3129,7 +3129,7 @@ Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl) Value *pId = JOIN2(pId_lo, pId_hi); -return VBROADCAST2(pId); +return pId; } case StoreInstanceId: { -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 01/20] swr/rast: Remove unneeded copy of gather mask
--- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 22 +- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 80 ++ 2 files changed, 23 insertions(+), 79 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 8ffe05b41c..0221106664 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -1107,23 +1107,19 @@ namespace SwrJit } void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, -Value* mask, Value* vGatherComponents[], bool bPackedOutput) +Value* vMask, Value* vGatherComponents[], bool bPackedOutput) { switch(info.bpp / info.numComps) { case 16: { Value* vGatherResult[2]; -Value *vMask; // TODO: vGatherMaskedVal Value* vGatherMaskedVal = VIMMED1((float)0); // always have at least one component out of x or y to fetch -// save mask as it is zero'd out after each gather -vMask = mask; - vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 01234567 @@ -1135,7 +1131,6 @@ namespace SwrJit { // offset base to the next components(zw) in the vertex to gather pSrcBase = GEP(pSrcBase, C((char)4)); -vMask = mask; vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); // e.g. 
result of second 8x32bit integer gather for 16bit components @@ -1164,9 +1159,6 @@ namespace SwrJit { uint32_t swizzleIndex = info.swizzle[i]; -// save mask as it is zero'd out after each gather -Value *vMask = mask; - // Gather a SIMD of components vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); @@ -1182,14 +1174,14 @@ namespace SwrJit } void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, -Value* mask, Value* vGatherComponents[], bool bPackedOutput) +Value* vMask, Value* vGatherComponents[], bool bPackedOutput) { switch (info.bpp / info.numComps) { case 8: { Value* vGatherMaskedVal = VIMMED1((int32_t)0); -Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask); +Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); // e.g. result of an 8x32bit integer gather for 8bit components // 256i - 01234567 //xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw @@ -1200,16 +1192,12 @@ namespace SwrJit case 16: { Value* vGatherResult[2]; -Value *vMask; // TODO: vGatherMaskedVal Value* vGatherMaskedVal = VIMMED1((int32_t)0); // always have at least one component out of x or y to fetch -// save mask as it is zero'd out after each gather -vMask = mask; - vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 01234567 @@ -1221,7 +1209,6 @@ namespace SwrJit { // offset base to the next components(zw) in the vertex to gather pSrcBase = GEP(pSrcBase, C((char)4)); -vMask = mask; vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); // e.g. 
result of second 8x32bit integer gather for 16bit components @@ -1251,9 +1238,6 @@ namespace SwrJit { uint32_t swizzleIndex = info.swizzle[i]; -// save mask as it is zero'd out after each gather -Value *vMask = mask; - // Gather a SIMD of components vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/raster
[Mesa-dev] [PATCH 02/20] swr/rast: Binner fixes for viewport index offset handling
--- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 9 - src/gallium/drivers/swr/rasterizer/core/clip.h | 5 - 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 9d1f0d8799..52375f8956 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -470,6 +470,10 @@ void SIMDCALL BinTrianglesImpl( typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); viewportIdx = SIMD_T::and_si(vClearMask, vpai); } +else +{ +viewportIdx = vpai; +} if (feState.vpTransformDisable) { @@ -1326,6 +1330,10 @@ void BinPointsImpl( typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); viewportIdx = SIMD_T::and_si(vClearMask, vpai); } +else +{ +viewportIdx = vpai; +} if (!feState.vpTransformDisable) { @@ -1647,7 +1655,6 @@ void SIMDCALL BinLinesImpl( if (state.backendState.readViewportArrayIndex) { pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); - vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); } diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 0d3d78057f..9d8bbc19e6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -694,7 +694,6 @@ public: if (state.backendState.readViewportArrayIndex) { pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); - vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); } @@ -707,6 +706,10 @@ public: typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); viewportIdx = SIMD_T::and_si(vClearMask, vpai); } +else +{ +viewportIdx = vpai; +} ComputeClipCodes(prim, viewportIdx); -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 00/20] swr: update rasterizer
Highlights include simd16 work, thread pool initialization rework, and code cleanup. Tim Rowley (20): swr/rast: Remove unneeded copy of gather mask swr/rast: Binner fixes for viewport index offset handling swr/rast: Corrections to multi-scissor handling swr/rast: WIP - Widen fetch shader to SIMD16 swr/rast: Convert gather masks to Nx1bit swr/rast: Rewrite Shuffle8bpcGatherd using shuffle swr/rast: Move GatherScissors to header swr/rast: Pull most of the VPAI manipulation out of the binner/clipper swr/rast: Pass prim to ClipSimd swr/rast: SIMD16 Fetch - Fully widen 32-bit float vertex components swr/rast: SIMD16 Fetch - Fully widen 16-bit float vertex components swr/rast: Replace INSERT2 vextract/vinsert with JOIN2 vshuffle swr/rast: SIMD16 Fetch - Fully widen 32-bit integer vertex components swr/rast: Remove no-op VBROADCAST of vID swr/rast: Pull of RTAI gather & offset out of clip/bin code swr/rast: Rework thread binding parameters for machine partitioning swr/rast: Replace VPSRL with LSHR swr/rast: Fix cache of API thread event manager swr/rast: EXTRACT2 changed from vextract/vinsert to vshuffle swr/rast: Move more RTAI handling out of binner .../swr/rasterizer/codegen/gen_llvm_ir_macros.py |4 +- .../drivers/swr/rasterizer/codegen/knob_defs.py| 29 +- src/gallium/drivers/swr/rasterizer/core/api.cpp| 42 +- src/gallium/drivers/swr/rasterizer/core/api.h | 33 + src/gallium/drivers/swr/rasterizer/core/binner.cpp | 345 ++- src/gallium/drivers/swr/rasterizer/core/binner.h | 127 +++ src/gallium/drivers/swr/rasterizer/core/clip.cpp | 31 +- src/gallium/drivers/swr/rasterizer/core/clip.h | 67 +- src/gallium/drivers/swr/rasterizer/core/context.h |5 +- .../drivers/swr/rasterizer/core/frontend.cpp | 179 +++- src/gallium/drivers/swr/rasterizer/core/frontend.h |8 +- src/gallium/drivers/swr/rasterizer/core/pa.h |5 +- .../drivers/swr/rasterizer/core/threads.cpp| 299 -- src/gallium/drivers/swr/rasterizer/core/threads.h |4 + .../drivers/swr/rasterizer/core/tilemgr.cpp|4 +- 
.../drivers/swr/rasterizer/jitter/builder_misc.cpp | 157 ++- .../drivers/swr/rasterizer/jitter/builder_misc.h | 13 +- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 1038 18 files changed, 1657 insertions(+), 733 deletions(-) -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 04/20] swr/rast: WIP - Widen fetch shader to SIMD16
Widen vertex gather/storage to SIMD16 for all component types. --- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 716 - 1 file changed, 689 insertions(+), 27 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 337bb7f660..6c0e658e68 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -70,6 +70,9 @@ struct FetchJit : public Builder #else void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); #endif +#if USE_SIMD16_BUILDER +void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args); +#endif typedef std::tuple Shuffle16bpcArgs; @@ -78,6 +81,9 @@ struct FetchJit : public Builder #else void Shuffle16bpcGather(Shuffle16bpcArgs &args); #endif +#if USE_SIMD16_BUILDER +void Shuffle16bpcGather2(Shuffle16bpcArgs &args); +#endif void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); #if USE_SIMD16_BUILDER @@ -726,7 +732,7 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB // only works if pixel size is <= 32bits SWR_ASSERT(info.bpp <= 32); - Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask); +Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask); for (uint32_t comp = 0; comp < 4; ++comp) { @@ -825,6 +831,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* vVertexElements[4]; #if USE_SIMD16_GATHERS Value* vVertexElements2[4]; +#if USE_SIMD16_BUILDER +Value *pVtxSrc2[4]; +#endif #endif Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); @@ -961,6 +970,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_GATHERS // override cur indices with 0 if pitch is 0 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); +vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); vCurIndices2 = 
SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2); // are vertices partially OOB? @@ -983,7 +993,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // only fetch lanes that pass both tests vGatherMask = AND(vMaxGatherMask, vMinGatherMask); -vGatherMask2 = AND(vMaxGatherMask, vMinGatherMask2); +vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2); } else { @@ -1074,15 +1084,32 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { if (isComponentEnabled(compMask, c)) { -vVertexElements[currentVertexElement] = pResults[c]; +#if USE_SIMD16_BUILDER +// pack adjacent pairs of SIMD8s into SIMD16s +pVtxSrc2[currentVertexElement] = VUNDEF2_F(); +pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c], 0); +pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1); + +#else +vVertexElements[currentVertexElement] = pResults[c]; vVertexElements2[currentVertexElement] = pResults2[c]; -currentVertexElement++; + +#endif +currentVertexElement += 1; if (currentVertexElement > 3) { +#if USE_SIMD16_BUILDER +// store SIMD16s +Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + +StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); + +#else StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); +#endif outputElt += 1; // reset to the next vVertexElement to output @@ -1113,9 +1140,12 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, else if(info.type[0] == SWR_TYPE_FLOAT) { ///@todo: support 64 bit vb accesses -Value* gatherSrc = VIMMED1(0.0f); +Value *gatherSrc = VIMMED1(0.0f); #if USE_SIMD16_GATHERS -Value* gatherSrc2 = VIMMED1(0.0f); +Value *gatherSrc2 = VIMMED1(0.0f); +#if USE_SIMD16_BUILDER +Value *gatherSrc16 = VIMMED2_1(0.0f); +#endif #endif SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), @@ -1127,8 +1157,8 @@ void 
FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, case 16: { #if USE_SIMD16_GATHERS -Value* vGatherResult[2]; -
[Mesa-dev] [PATCH 11/20] swr/rast: SIMD16 Fetch - Fully widen 16-bit float vertex components
--- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 55 +++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 2065db3475..c960dc77fb 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -1277,6 +1277,43 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, case 16: { #if USE_SIMD16_GATHERS +#if USE_SIMD16_BUILDER +Value *gatherResult[2]; + +// if we have at least one component out of x or y to fetch +if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) +{ +gatherResult[0] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16); + +// e.g. result of first 8x32bit integer gather for 16bit components +// 256i - 01234567 +//xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy +// +} +else +{ +gatherResult[0] = VUNDEF2_I(); +} + +// if we have at least one component out of z or w to fetch +if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) +{ +// offset base to the next components(zw) in the vertex to gather +pStreamBase = GEP(pStreamBase, C((char)4)); + +gatherResult[1] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16); + +// e.g. 
result of second 8x32bit integer gather for 16bit components +// 256i - 01234567 +//zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw +// +} +else +{ +gatherResult[1] = VUNDEF2_I(); +} + +#else Value *vGatherResult[2]; Value *vGatherResult2[2]; @@ -1315,10 +1352,13 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, vGatherResult2[1] = VUNDEF_I(); } +#endif // if we have at least one component to shuffle into place if (compMask) { #if USE_SIMD16_BUILDER +#if USE_SIMD16_BUILDER +#else Value *gatherResult[2]; gatherResult[0] = VUNDEF2_I(); @@ -1330,6 +1370,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0); gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1); +#endif Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE, @@ -1511,21 +1552,21 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // if we need to gather the component if (compCtrl[i] == StoreSrc) { -Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 })); +Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 })); Value *vMaskLo2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 })); -Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 })); +Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 })); Value *vMaskHi2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 })); -Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0)); +Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0)); Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0)); -Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1)); +Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1)); Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1)); Value *vZeroDouble = VECTOR_SPLAT(4, 
ConstantFP::get(IRB()->getDoubleTy(),
[Mesa-dev] [PATCH 03/20] swr/rast: Corrections to multi-scissor handling
binner's GatherScissors() will be turned into a real gather in the not too distant future. --- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 176 ++--- 1 file changed, 88 insertions(+), 88 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 52375f8956..8a5356b168 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -226,117 +226,117 @@ static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax) { scisXmin = _simd_set_epi32( -pScissorsInFixedPoint[pViewportIndex[0]].xmin, -pScissorsInFixedPoint[pViewportIndex[1]].xmin, -pScissorsInFixedPoint[pViewportIndex[2]].xmin, -pScissorsInFixedPoint[pViewportIndex[3]].xmin, -pScissorsInFixedPoint[pViewportIndex[4]].xmin, -pScissorsInFixedPoint[pViewportIndex[5]].xmin, +pScissorsInFixedPoint[pViewportIndex[7]].xmin, pScissorsInFixedPoint[pViewportIndex[6]].xmin, -pScissorsInFixedPoint[pViewportIndex[7]].xmin); +pScissorsInFixedPoint[pViewportIndex[5]].xmin, +pScissorsInFixedPoint[pViewportIndex[4]].xmin, +pScissorsInFixedPoint[pViewportIndex[3]].xmin, +pScissorsInFixedPoint[pViewportIndex[2]].xmin, +pScissorsInFixedPoint[pViewportIndex[1]].xmin, +pScissorsInFixedPoint[pViewportIndex[0]].xmin); scisYmin = _simd_set_epi32( -pScissorsInFixedPoint[pViewportIndex[0]].ymin, -pScissorsInFixedPoint[pViewportIndex[1]].ymin, -pScissorsInFixedPoint[pViewportIndex[2]].ymin, -pScissorsInFixedPoint[pViewportIndex[3]].ymin, -pScissorsInFixedPoint[pViewportIndex[4]].ymin, -pScissorsInFixedPoint[pViewportIndex[5]].ymin, +pScissorsInFixedPoint[pViewportIndex[7]].ymin, pScissorsInFixedPoint[pViewportIndex[6]].ymin, -pScissorsInFixedPoint[pViewportIndex[7]].ymin); +pScissorsInFixedPoint[pViewportIndex[5]].ymin, +pScissorsInFixedPoint[pViewportIndex[4]].ymin, 
+pScissorsInFixedPoint[pViewportIndex[3]].ymin, +pScissorsInFixedPoint[pViewportIndex[2]].ymin, +pScissorsInFixedPoint[pViewportIndex[1]].ymin, +pScissorsInFixedPoint[pViewportIndex[0]].ymin); scisXmax = _simd_set_epi32( -pScissorsInFixedPoint[pViewportIndex[0]].xmax, -pScissorsInFixedPoint[pViewportIndex[1]].xmax, -pScissorsInFixedPoint[pViewportIndex[2]].xmax, -pScissorsInFixedPoint[pViewportIndex[3]].xmax, -pScissorsInFixedPoint[pViewportIndex[4]].xmax, -pScissorsInFixedPoint[pViewportIndex[5]].xmax, +pScissorsInFixedPoint[pViewportIndex[7]].xmax, pScissorsInFixedPoint[pViewportIndex[6]].xmax, -pScissorsInFixedPoint[pViewportIndex[7]].xmax); +pScissorsInFixedPoint[pViewportIndex[5]].xmax, +pScissorsInFixedPoint[pViewportIndex[4]].xmax, +pScissorsInFixedPoint[pViewportIndex[3]].xmax, +pScissorsInFixedPoint[pViewportIndex[2]].xmax, +pScissorsInFixedPoint[pViewportIndex[1]].xmax, +pScissorsInFixedPoint[pViewportIndex[0]].xmax); scisYmax = _simd_set_epi32( -pScissorsInFixedPoint[pViewportIndex[0]].ymax, -pScissorsInFixedPoint[pViewportIndex[1]].ymax, -pScissorsInFixedPoint[pViewportIndex[2]].ymax, -pScissorsInFixedPoint[pViewportIndex[3]].ymax, -pScissorsInFixedPoint[pViewportIndex[4]].ymax, -pScissorsInFixedPoint[pViewportIndex[5]].ymax, +pScissorsInFixedPoint[pViewportIndex[7]].ymax, pScissorsInFixedPoint[pViewportIndex[6]].ymax, -pScissorsInFixedPoint[pViewportIndex[7]].ymax); +pScissorsInFixedPoint[pViewportIndex[5]].ymax, +pScissorsInFixedPoint[pViewportIndex[4]].ymax, +pScissorsInFixedPoint[pViewportIndex[3]].ymax, +pScissorsInFixedPoint[pViewportIndex[2]].ymax, +pScissorsInFixedPoint[pViewportIndex[01]].ymax, +pScissorsInFixedPoint[pViewportIndex[00]].ymax); } static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex, simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, simd16scalari &scisYmax) { scisXmin = _simd16_set_epi32( -pScissorsInFixedPoint[pViewportIndex[0]].xmin, 
-pScissorsInFixedPoint[pViewportIndex[1]].xmin, -pScissorsInFixedPoint[pViewportIndex[2]].xmin, -pScissorsInFixedPoint[pViewportIndex[3]].xmin, -pScissorsInFixedPoint[pViewportIndex[4]].xmin, -pScissorsInFixedPoint[pViewportIndex[5]].xmin, -pScissorsInFixedPoint[pViewportIndex[6]].xmin, -pScissorsInFixedPoint[pViewportIndex[7]].xmin, -pScissorsInFixedPoint[pViewportIndex[8]].xmin, -pScissorsInFixedPoint[pViewportIn
[Mesa-dev] [PATCH 08/20] swr/rast: Pull most of the VPAI manipulation out of the binner/clipper
Move the VPAI manipulation out of the binner/clipper; hand the viewport indices down from the frontend code instead. --- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 124 ++--- src/gallium/drivers/swr/rasterizer/core/clip.cpp | 25 ++--- src/gallium/drivers/swr/rasterizer/core/clip.h | 58 +++--- src/gallium/drivers/swr/rasterizer/core/context.h | 4 +- .../drivers/swr/rasterizer/core/frontend.cpp | 112 ++- src/gallium/drivers/swr/rasterizer/core/frontend.h | 8 +- src/gallium/drivers/swr/rasterizer/core/pa.h | 4 +- 7 files changed, 177 insertions(+), 158 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 22996c5a5d..a664ed812f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -307,7 +307,8 @@ void SIMDCALL BinTrianglesImpl( uint32_t workerId, typename SIMD_T::Vec4 tri[3], uint32_t triMask, -typename SIMD_T::Integer const &primID) +typename SIMD_T::Integer const &primID, +typename SIMD_T::Integer const &viewportIdx) { SWR_CONTEXT *pContext = pDC->pContext; @@ -323,31 +324,6 @@ void SIMDCALL BinTrianglesImpl( typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f); typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f); -typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si(); -typename SIMD_T::Vec4 vpiAttrib[3]; -typename SIMD_T::Integer vpai = SIMD_T::setzero_si(); - -if (state.backendState.readViewportArrayIndex) -{ -pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); - -vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); -} - - -if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 -{ -// OOB indices => forced to zero. 
-vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si()); -typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); -typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); -viewportIdx = SIMD_T::and_si(vClearMask, vpai); -} -else -{ -viewportIdx = vpai; -} - if (feState.vpTransformDisable) { // RHW is passed in directly when VP transform is disabled @@ -375,7 +351,7 @@ void SIMDCALL BinTrianglesImpl( tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2); // Viewport transform to screen space coords -if (state.backendState.readViewportArrayIndex) +if (pa.viewportArrayActive) { viewportTransform<3>(tri, state.vpMatrices, viewportIdx); } @@ -568,8 +544,8 @@ void SIMDCALL BinTrianglesImpl( /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. { typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax; +if (pa.viewportArrayActive) -if (state.backendState.readViewportArrayIndex) { GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); } @@ -786,9 +762,10 @@ void BinTriangles( uint32_t workerId, simdvector tri[3], uint32_t triMask, -simdscalari const &primID) +simdscalari const &primID, +simdscalari const &viewportIdx) { -BinTrianglesImpl(pDC, pa, workerId, tri, triMask, primID); +BinTrianglesImpl(pDC, pa, workerId, tri, triMask, primID, viewportIdx); } #if USE_SIMD16_FRONTEND @@ -799,9 +776,10 @@ void SIMDCALL BinTriangles_simd16( uint32_t workerId, simd16vector tri[3], uint32_t triMask, -simd16scalari const &primID) +simd16scalari const &primID, +simd16scalari const &viewportIdx) { -BinTrianglesImpl(pDC, pa, workerId, tri, triMask, primID); +BinTrianglesImpl(pDC, pa, workerId, tri, triMask, primID, viewportIdx); } #endif @@ -1026,7 +1004,7 @@ void BinPostSetupPointsImpl( { typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax; -if (state.backendState.readViewportArrayIndex) +if (pa.viewportArrayActive) { 
GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); } @@ -1176,38 +1154,13 @@ void BinPointsImpl( uint32_t workerId, typename SIMD_T::Vec4 prim[3], uint32_t primMask, -typename SIMD_T::Integer const &primID) +typename SIMD_T::Integer const &primID, +typename SIMD_T::Integer const &viewportIdx) { const API_STATE& state = GetApiState(pDC); const SWR_FRONTEND_STATE& feState = state.frontendState; const SWR_RASTSTATE& rastState = state.rastState; -// Read back viewport index if required -typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si(); -typename SIMD_T::Vec4 vpiAttrib[1]; -ty
[Mesa-dev] [PATCH 05/20] swr/rast: Convert gather masks to Nx1bit
Simplifies calling code, gets gather function interface closer to llvm's masked_gather. --- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 20 + .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 34 +- 2 files changed, 14 insertions(+), 40 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 0221106664..04092541e5 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -602,7 +602,7 @@ namespace SwrJit if(JM()->mArch.AVX2()) { // force mask to <N x float>, required by vgather -Value *mask = BITCAST(vMask, mSimdFP32Ty); +Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty); vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale)); } @@ -617,7 +617,6 @@ namespace SwrJit vGather = VUNDEF_F(); Value *vScaleVec = VIMMED1((uint32_t)scale); Value *vOffsets = MUL(vIndices,vScaleVec); -Value *mask = MASK(vMask); for(uint32_t i = 0; i < mVWidth; ++i) { // single component byte index @@ -627,7 +626,7 @@ namespace SwrJit loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0)); // pointer to the value to load if we're masking off a component Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); -Value *selMask = VEXTRACT(mask,C(i)); +Value *selMask = VEXTRACT(vMask,C(i)); // switch in a safe address to load if we're trying to access a vertex Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); Value *val = LOAD(validAddress); @@ -648,7 +647,7 @@ namespace SwrJit if (JM()->mArch.AVX512F()) { // force mask to <N-bit Integer>, required by vgather2 -Value *mask = BITCAST(MASK2(vMask), mInt16Ty); +Value *mask = BITCAST(vMask, mInt16Ty); vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); } @@ -689,7 +688,7 @@ namespace SwrJit // use avx2 gather instruction if available if(JM()->mArch.AVX2()) { -vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale)); +vGather = VGATHERDD(vSrc, pBase, 
vIndices, VMASK(vMask), C(scale)); } else { @@ -702,7 +701,6 @@ namespace SwrJit vGather = VUNDEF_I(); Value *vScaleVec = VIMMED1((uint32_t)scale); Value *vOffsets = MUL(vIndices, vScaleVec); -Value *mask = MASK(vMask); for(uint32_t i = 0; i < mVWidth; ++i) { // single component byte index @@ -712,7 +710,7 @@ namespace SwrJit loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); // pointer to the value to load if we're masking off a component Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)}); -Value *selMask = VEXTRACT(mask, C(i)); +Value *selMask = VEXTRACT(vMask, C(i)); // switch in a safe address to load if we're trying to access a vertex Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); Value *val = LOAD(validAddress, C(0)); @@ -739,6 +737,7 @@ namespace SwrJit // use avx2 gather instruction if available if(JM()->mArch.AVX2()) { +vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2)); vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); } else @@ -752,7 +751,6 @@ namespace SwrJit vGather = UndefValue::get(VectorType::get(mDoubleTy, 4)); Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale)); Value *vOffsets = MUL(vIndices,vScaleVec); -Value *mask = MASK(vMask); for(uint32_t i = 0; i < mVWidth/2; ++i) { // single component byte index @@ -762,7 +760,7 @@ namespace SwrJit loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0)); // pointer to the value to load if we're masking off a component Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); -Value *selMask = VEXTRACT(mask,C(i)); +Value *selMask = VEXTRACT(vMask,C(i)); // switch in a safe address to load if we're trying to access a vertex Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); Value *val = LOAD(validAddress); @@ -1094,14 +1092,10 @@ namespace SwrJit const SWR_FORMAT_INFO &info = GetFormatInfo(format); if(info.type[0] ==
[Mesa-dev] [PATCH 06/20] swr/rast: Rewrite Shuffle8bpcGatherd using shuffle
Ease future code maintenance, prepare for folding simd8 and simd16 versions. --- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 244 ++--- 1 file changed, 62 insertions(+), 182 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 67a4a04072..a847cb74da 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -2014,206 +2014,86 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) const uint32_t (&swizzle)[4] = std::get<9>(args); // cast types -Type* vGatherTy = mSimdInt32Ty; Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits -// have to do extra work for sign extending -if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){ -Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane -Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - -// shuffle mask, including any swizzling -const char x = (char)swizzle[0]; const char y = (char)swizzle[1]; -const char z = (char)swizzle[2]; const char w = (char)swizzle[3]; -Value* vConstMask = C({char(x), char(x+4), char(x+8), char(x+12), -char(y), char(y+4), char(y+8), char(y+12), -char(z), char(z+4), char(z+8), char(z+12), -char(w), char(w+4), char(w+8), char(w+12), -char(x), char(x+4), char(x+8), char(x+12), -char(y), char(y+4), char(y+8), char(y+12), -char(z), char(z+4), char(z+8), char(z+12), -char(w), char(w+4), char(w+8), char(w+12)}); - -Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy); -// after pshufb: group components together in each 128bit lane -// 256i - 01234567 -// - -Value* vi128XY = nullptr; -if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ -vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 0, 
0})), v128Ty); -// after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane -// 256i - 01234567 -// dcdc dcdc dcdc dcdc (dc - don't care) -} - -// do the same for zw components -Value* vi128ZW = nullptr; -if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ -vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); -} - -// init denormalize variables if needed -Instruction::CastOps fpCast; -Value* conversionFactor; - -switch (conversionType) -{ -case CONVERT_NORMALIZED: -fpCast = Instruction::CastOps::SIToFP; -conversionFactor = VIMMED1((float)(1.0 / 127.0)); -break; -case CONVERT_SSCALED: -fpCast = Instruction::CastOps::SIToFP; -conversionFactor = VIMMED1((float)(1.0)); -break; -case CONVERT_USCALED: -SWR_INVALID("Type should not be sign extended!"); -conversionFactor = nullptr; -break; -default: -SWR_ASSERT(conversionType == CONVERT_NONE); -conversionFactor = nullptr; -break; -} +for (uint32_t i = 0; i < 4; i++) +{ +if (!isComponentEnabled(compMask, i)) +continue; -// sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex -for (uint32_t i = 0; i < 4; i++) +if (compCtrl[i] == ComponentControl::StoreSrc) { -if (isComponentEnabled(compMask, i)) -{ -if (compCtrl[i] == ComponentControl::StoreSrc) -{ -// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 -uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; -// if x or y, use vi128XY permute result, else use vi128ZW -Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; - -// sign extend -vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty)); - -// denormalize if needed -if (conversionType != CONVERT_NONE) -{ -vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); -} -
[Mesa-dev] [PATCH 07/20] swr/rast: Move GatherScissors to header
--- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 127 - src/gallium/drivers/swr/rasterizer/core/binner.h | 127 + 2 files changed, 127 insertions(+), 127 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 8a5356b168..22996c5a5d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -212,133 +212,6 @@ INLINE void ProcessAttributes( } } -// -/// @brief Gather scissor rect data based on per-prim viewport indices. -/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point. -/// @param pViewportIndex - array of per-primitive vewport indexes. -/// @param scisXmin - output vector of per-prmitive scissor rect Xmin data. -/// @param scisYmin - output vector of per-prmitive scissor rect Ymin data. -/// @param scisXmax - output vector of per-prmitive scissor rect Xmax data. -/// @param scisYmax - output vector of per-prmitive scissor rect Ymax data. -// -/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. 
-static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex, -simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax) -{ -scisXmin = _simd_set_epi32( -pScissorsInFixedPoint[pViewportIndex[7]].xmin, -pScissorsInFixedPoint[pViewportIndex[6]].xmin, -pScissorsInFixedPoint[pViewportIndex[5]].xmin, -pScissorsInFixedPoint[pViewportIndex[4]].xmin, -pScissorsInFixedPoint[pViewportIndex[3]].xmin, -pScissorsInFixedPoint[pViewportIndex[2]].xmin, -pScissorsInFixedPoint[pViewportIndex[1]].xmin, -pScissorsInFixedPoint[pViewportIndex[0]].xmin); -scisYmin = _simd_set_epi32( -pScissorsInFixedPoint[pViewportIndex[7]].ymin, -pScissorsInFixedPoint[pViewportIndex[6]].ymin, -pScissorsInFixedPoint[pViewportIndex[5]].ymin, -pScissorsInFixedPoint[pViewportIndex[4]].ymin, -pScissorsInFixedPoint[pViewportIndex[3]].ymin, -pScissorsInFixedPoint[pViewportIndex[2]].ymin, -pScissorsInFixedPoint[pViewportIndex[1]].ymin, -pScissorsInFixedPoint[pViewportIndex[0]].ymin); -scisXmax = _simd_set_epi32( -pScissorsInFixedPoint[pViewportIndex[7]].xmax, -pScissorsInFixedPoint[pViewportIndex[6]].xmax, -pScissorsInFixedPoint[pViewportIndex[5]].xmax, -pScissorsInFixedPoint[pViewportIndex[4]].xmax, -pScissorsInFixedPoint[pViewportIndex[3]].xmax, -pScissorsInFixedPoint[pViewportIndex[2]].xmax, -pScissorsInFixedPoint[pViewportIndex[1]].xmax, -pScissorsInFixedPoint[pViewportIndex[0]].xmax); -scisYmax = _simd_set_epi32( -pScissorsInFixedPoint[pViewportIndex[7]].ymax, -pScissorsInFixedPoint[pViewportIndex[6]].ymax, -pScissorsInFixedPoint[pViewportIndex[5]].ymax, -pScissorsInFixedPoint[pViewportIndex[4]].ymax, -pScissorsInFixedPoint[pViewportIndex[3]].ymax, -pScissorsInFixedPoint[pViewportIndex[2]].ymax, -pScissorsInFixedPoint[pViewportIndex[01]].ymax, -pScissorsInFixedPoint[pViewportIndex[00]].ymax); -} - -static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex, -simd16scalari &scisXmin, simd16scalari 
&scisYmin, simd16scalari &scisXmax, simd16scalari &scisYmax) -{ -scisXmin = _simd16_set_epi32( -pScissorsInFixedPoint[pViewportIndex[15]].xmin, -pScissorsInFixedPoint[pViewportIndex[14]].xmin, -pScissorsInFixedPoint[pViewportIndex[13]].xmin, -pScissorsInFixedPoint[pViewportIndex[12]].xmin, -pScissorsInFixedPoint[pViewportIndex[11]].xmin, -pScissorsInFixedPoint[pViewportIndex[10]].xmin, -pScissorsInFixedPoint[pViewportIndex[ 9]].xmin, -pScissorsInFixedPoint[pViewportIndex[ 8]].xmin, -pScissorsInFixedPoint[pViewportIndex[ 7]].xmin, -pScissorsInFixedPoint[pViewportIndex[ 6]].xmin, -pScissorsInFixedPoint[pViewportIndex[ 5]].xmin, -pScissorsInFixedPoint[pViewportIndex[ 4]].xmin, -pScissorsInFixedPoint[pViewportIndex[ 3]].xmin, -pScissorsInFixedPoint[pViewportIndex[ 2]].xmin, -pScissorsInFixedPoint[pViewportIndex[ 1]].xmin, -pScissorsInFixedPoint[pViewportIndex[ 0]].xmin); - -scisYmin = _simd16_set_epi32( -pScissorsInFixedPoint[pViewportIndex[15]].ymin, -pScissorsInFixedPoint[pViewportIndex[14]].ymin, -pScissorsInFixedPoint[pViewportIndex[13]].ymin, -pScissorsInFixedPoint[pViewportIndex[12]].ymin, -pScissorsInFixedPoint[pViewportIndex[11]].ymin, -pScissorsInFixedPoint[pViewportIndex[10]].ymin, -pScissorsInFixedPoint[pViewportIndex[ 9]].ymin, -pScisso
[Mesa-dev] [PATCH 10/20] swr/rast: SIMD16 Fetch - Fully widen 32-bit float vertex components
--- .../swr/rasterizer/codegen/gen_llvm_ir_macros.py | 3 +- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 41 - .../drivers/swr/rasterizer/jitter/builder_misc.h | 7 +- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 175 ++--- 4 files changed, 194 insertions(+), 32 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index 44fc857371..ac8b3badf6 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -44,9 +44,10 @@ inst_aliases = { intrinsics = [ ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']], -['VGATHERPS2', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']], +['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']], +['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']], ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']], ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']], ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 04092541e5..b2210db717 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -639,7 +639,7 @@ namespace SwrJit } #if USE_SIMD16_BUILDER -Value *Builder::GATHERPS2(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) +Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) { Value *vGather = VUNDEF2_F(); @@ -649,7 +649,7 @@ namespace SwrJit // force mask to , 
required by vgather2 Value *mask = BITCAST(vMask, mInt16Ty); -vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); +vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); } else { @@ -659,8 +659,10 @@ namespace SwrJit Value *indices0 = EXTRACT2_I(vIndices, 0); Value *indices1 = EXTRACT2_I(vIndices, 1); -Value *mask0 = EXTRACT2_I(vMask, 0); -Value *mask1 = EXTRACT2_I(vMask, 1); +Value *vmask16 = VMASK2(vMask); + +Value *mask0 = MASK(EXTRACT2_I(vmask16, 0)); // TODO: do this better.. +Value *mask1 = MASK(EXTRACT2_I(vmask16, 1)); Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale); Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale); @@ -771,6 +773,37 @@ namespace SwrJit return vGather; } +#if USE_SIMD16_BUILDER +Value *Builder::PSRLI(Value *a, Value *imm) +{ +return VPSRLI(a, imm); +} + +Value *Builder::PSRLI_16(Value *a, Value *imm) +{ +Value *result = VUNDEF2_I(); + +// use avx512 shift right instruction if available +if (JM()->mArch.AVX512F()) +{ +result = VPSRLI_16(a, imm); +} +else +{ +Value *a0 = EXTRACT2_I(a, 0); +Value *a1 = EXTRACT2_I(a, 1); + +Value *result0 = PSRLI(a0, imm); +Value *result1 = PSRLI(a1, imm); + +result = INSERT2_I(result, result0, 0); +result = INSERT2_I(result, result1, 1); +} + +return result; +} + +#endif #if USE_SIMD16_BUILDER // /// @brief diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index d858a827db..62360a3ad7 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -130,7 +130,7 @@ void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1); #if USE_SIMD16_BUILDER -Value *GATHERPS2(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1); +Value *GATHERPS_16(Value *src, Value *pBase, Value 
*indices, Value *mask, uint8_t scale = 1); #endif void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); @@ -141,6 +141,11 @@ void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value *GATHERPD(Value* src,
[Mesa-dev] [PATCH 09/20] swr/rast: Pass prim to ClipSimd
--- src/gallium/drivers/swr/rasterizer/core/clip.h | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 148f661ab4..8b947668d3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -437,7 +437,7 @@ public: return SIMD_T::movemask_ps(vClipCullMask); } -void ClipSimd(const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa, const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx) +void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa, const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx) { // input/output vertex store for clipper SIMDVERTEX_T vertices[7]; // maximum 7 verts generated per triangle @@ -452,10 +452,9 @@ public: // assemble pos typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim]; -pa.Assemble(VERTEX_POSITION_SLOT, tmpVector); for (uint32_t i = 0; i < NumVertsPerPrim; ++i) { -vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i]; +vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i]; } // assemble attribs @@ -568,7 +567,8 @@ public: SIMDVERTEX_T transposedPrims[2]; #endif -for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim) +uint32_t numInputPrims = pa.NumPrims(); +for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) { uint32_t numEmittedVerts = pVertexCount[inputPrim]; if (numEmittedVerts < NumVertsPerPrim) @@ -716,7 +716,7 @@ public: AR_BEGIN(FEGuardbandClip, pa.pDC->drawId); // we have to clip tris, execute the clipper, which will also // call the binner -ClipSimd(SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx); +ClipSimd(prim, SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx); 
AR_END(FEGuardbandClip, 1); } else if (validMask) -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 08/10] swr/rast: Simplify GATHER* jit builder api
General cleanup, and prep work for possibly moving to llvm masked gather intrinsic. --- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 32 ++--- .../drivers/swr/rasterizer/jitter/builder_misc.h | 6 +-- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 56 +++--- src/gallium/drivers/swr/swr_shader.cpp | 2 +- 4 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index daa9cb1ec1..bd3a52566d 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -554,7 +554,7 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by -Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) +Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) { Value* vGather; @@ -563,7 +563,7 @@ namespace SwrJit { // force mask to <N x float>, required by vgather vMask = BITCAST(vMask, mSimdFP32Ty); -vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale); +vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale)); } else { @@ -574,7 +574,7 @@ namespace SwrJit STORE(vSrc, vSrcPtr); vGather = VUNDEF_F(); -Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); +Value *vScaleVec = VIMMED1((uint32_t)scale); Value *vOffsets = MUL(vIndices,vScaleVec); Value *mask = MASK(vMask); for(uint32_t i = 0; i < mVWidth; ++i) @@ -606,14 +606,14 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by -Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) +Value *Builder::GATHERDD(Value* 
vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) { Value* vGather; // use avx2 gather instruction if available if(JM()->mArch.AVX2()) { -vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale); +vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale)); } else { @@ -624,7 +624,7 @@ namespace SwrJit STORE(vSrc, vSrcPtr); vGather = VUNDEF_I(); -Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); +Value *vScaleVec = VIMMED1((uint32_t)scale); Value *vOffsets = MUL(vIndices, vScaleVec); Value *mask = MASK(vMask); for(uint32_t i = 0; i < mVWidth; ++i) @@ -656,14 +656,14 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by -Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) +Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) { Value* vGather; // use avx2 gather instruction if available if(JM()->mArch.AVX2()) { -vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale); +vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); } else { @@ -674,7 +674,7 @@ namespace SwrJit STORE(vSrc, vSrcPtr); vGather = UndefValue::get(VectorType::get(mDoubleTy, 4)); -Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty)); +Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale)); Value *vOffsets = MUL(vIndices,vScaleVec); Value *mask = MASK(vMask); for(uint32_t i = 0; i < mVWidth/2; ++i) @@ -1016,7 +1016,7 @@ namespace SwrJit // save mask as it is zero'd out after each gather vMask = mask; -vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); +vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); // e.g. 
result of first 8x32bit integer gather for 16bit components // 256i - 01234567 //xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy @@ -1029,7 +1029,7 @@ namespace SwrJit pSrcBase = GEP(pSrcBase, C((char)4)); vMask = mask; -
[Mesa-dev] [PATCH 06/10] swr/rast: Cache eventmanager
--- src/gallium/drivers/swr/rasterizer/archrast/archrast.h | 1 + src/gallium/drivers/swr/rasterizer/core/api.cpp| 5 + src/gallium/drivers/swr/rasterizer/core/api.h | 3 +++ 3 files changed, 9 insertions(+) diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h index fa88a4948c..c74d6ad909 100644 --- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h +++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h @@ -29,6 +29,7 @@ #include "common/os.h" #include "gen_ar_event.hpp" +#include "eventmanager.h" namespace ArchRast { diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 20eeb29681..9265440904 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -143,6 +143,11 @@ HANDLE SwrCreateContext( #endif } +#if defined(KNOB_ENABLE_AR) +// cache the API thread event manager, for use with sim layer +pCreateInfo->hArEventManager = pContext->pArContext[16]; +#endif + // State setup AFTER context is fully initialized SetupDefaultState(pContext); diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index 60f56c6d76..c032b0bb10 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -213,6 +213,9 @@ struct SWR_CREATECONTEXT_INFO // Output: size required memory passed to for SwrSaveState / SwrRestoreState size_t contextSaveSize; +// ArchRast event manager. +HANDLE hArEventManager; + // Input (optional): Threading info that overrides any set KNOB values. SWR_THREADING_INFO* pThreadInfo; -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 05/10] swr/rast: Enable AVX-512 targets in the jitter
--- src/gallium/drivers/swr/rasterizer/core/knobs.h| 8 src/gallium/drivers/swr/rasterizer/jitter/JitManager.h | 2 -- 2 files changed, 10 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index fe0a044ae8..e00e2da650 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -61,18 +61,10 @@ #define KNOB_SIMD_WIDTH 8 #define KNOB_SIMD_BYTES 32 #elif (KNOB_ARCH == KNOB_ARCH_AVX512) -#if 0 -// not ready to enable this globally, enabled on the side (below) #define KNOB_ARCH_ISA AVX512F #define KNOB_ARCH_STR "AVX512" -#define KNOB_SIMD_WIDTH 16 -#define KNOB_SIMD_BYTES 64 -#else -#define KNOB_ARCH_ISA AVX2 -#define KNOB_ARCH_STR "AVX2" #define KNOB_SIMD_WIDTH 8 #define KNOB_SIMD_BYTES 32 -#endif #else #error "Unknown architecture" #endif diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h index 46ffe276a0..c30a807222 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -102,14 +102,12 @@ public: bForceAVX2 = true; bForceAVX512 = false; } -#if 0 else if(isaRequest == "avx512") { bForceAVX = false; bForceAVX2 = false; bForceAVX512 = true; } -#endif }; bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); } -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 02/10] swr/rast: Widen fetch shader to SIMD16
Widen fetch shader to SIMD16, enable SIMD16 types in the jitter, and provide utility EXTRACT/INSERT SIMD8 <-> SIMD16 utility functions. --- .../drivers/swr/rasterizer/jitter/builder.cpp | 20 .../drivers/swr/rasterizer/jitter/builder.h| 16 ++ .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 52 .../drivers/swr/rasterizer/jitter/builder_misc.h | 9 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 57 -- 5 files changed, 151 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index 6a33ec265f..4b83a3204c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -41,6 +41,9 @@ namespace SwrJit : mpJitMgr(pJitMgr) { mVWidth = pJitMgr->mVWidth; +#if USE_SIMD16_BUILDER +mVWidth2 = pJitMgr->mVWidth * 2; +#endif mpIRBuilder = &pJitMgr->mBuilder; @@ -65,17 +68,34 @@ namespace SwrJit mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4); mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5); +#if USE_SIMD16_BUILDER +mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2); +mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2); +mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2); +mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2); +mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2); +mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2); +mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4); +mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5); +#endif if (sizeof(uint32_t*) == 4) { mIntPtrTy = mInt32Ty; mSimdIntPtrTy = mSimdInt32Ty; +#if USE_SIMD16_BUILDER +mSimd2IntPtrTy = mSimd2Int32Ty; +#endif } else { SWR_ASSERT(sizeof(uint32_t*) == 8); + mIntPtrTy = mInt64Ty; mSimdIntPtrTy = mSimdInt64Ty; +#if USE_SIMD16_BUILDER +mSimd2IntPtrTy = mSimd2Int64Ty; +#endif } } } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
index 8210e49b18..c6ab64e06e 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -32,6 +32,8 @@ #include "JitManager.h" #include "common/formats.h" +#define USE_SIMD16_BUILDER 0 + namespace SwrJit { using namespace llvm; @@ -45,6 +47,9 @@ namespace SwrJit IRBuilder<>* mpIRBuilder; uint32_t mVWidth; +#if USE_SIMD16_BUILDER +uint32_t mVWidth2; +#endif // Built in types. Type*mVoidTy; @@ -70,6 +75,17 @@ namespace SwrJit Type*mSimdIntPtrTy; Type*mSimdVectorTy; Type*mSimdVectorTRTy; +#if USE_SIMD16_BUILDER +Type*mSimd2FP16Ty; +Type*mSimd2FP32Ty; +Type*mSimd2Int1Ty; +Type*mSimd2Int16Ty; +Type*mSimd2Int32Ty; +Type*mSimd2Int64Ty; +Type*mSimd2IntPtrTy; +Type*mSimd2VectorTy; +Type*mSimd2VectorTRTy; +#endif #include "gen_builder.hpp" #include "gen_builder_x86.hpp" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 9ca36b2467..daa9cb1ec1 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -231,6 +231,13 @@ namespace SwrJit return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); } +#if USE_SIMD16_BUILDER +Value *Builder::VUNDEF2_F() +{ +return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2)); +} + +#endif Value *Builder::VUNDEF(Type* t) { return UndefValue::get(VectorType::get(t, mVWidth)); @@ -690,6 +697,51 @@ namespace SwrJit return vGather; } +#if USE_SIMD16_BUILDER +// +/// @brief +Value *Builder::EXTRACT(Value *a2, uint32_t imm) +{ +const uint32_t i0 = (imm > 0) ? mVWidth : 0; + +Value *result = VUNDEF_F(); + +for (uint32_t i = 0; i < mVWidth; i += 1) +{ +Value *temp = VEXTRACT(a2, C(i0 + i)); + +result = VINSERT(result, temp, C(i)); +} + +return result; +} + +// +/// @brief +Value *Builder::INSERT(Value *a2, Value * b, uint32_t imm) +{
[Mesa-dev] [PATCH 03/10] swr/rast: Code style change (NFC)
--- src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index e15b300979..2fe6cfcf69 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -39,6 +39,7 @@ #include "tilemgr.h" #include "tessellator.h" #include +#include // /// @brief Helper macro to generate a bitmask @@ -770,6 +771,7 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t } } + // /// @brief Implements GS stage. /// @param pDC - pointer to draw context. @@ -1335,8 +1337,11 @@ static void TessellationStages( SWR_ASSERT(pfnClipFunc); #if USE_SIMD16_FRONTEND -tessPa.useAlternateOffset = false; -pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID); + +{ +tessPa.useAlternateOffset = false; +pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID); +} #else pfnClipFunc(pDC, tessPa, workerId, prim, GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID)); -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 00/10] swr: update rasterizer
Highlights are code cleanups and more progress on simd16. Tim Rowley (10): swr/rast: support flexible vertex layout for DS output swr/rast: Widen fetch shader to SIMD16 swr/rast: Code style change (NFC) swr/rast: Points with clipdistance can't go through simplepoints path swr/rast: Enable AVX-512 targets in the jitter swr/rast: Cache eventmanager swr/rast: Add alignment to transpose targets swr/rast: Simplify GATHER* jit builder api swr/rast: Implement AVX-512 GATHERPS in SIMD16 fetch shader swr/rast: Repair simd8 frontend code rot .../drivers/swr/rasterizer/archrast/archrast.h | 1 + .../swr/rasterizer/codegen/gen_llvm_ir_macros.py | 1 + src/gallium/drivers/swr/rasterizer/core/api.cpp| 5 + src/gallium/drivers/swr/rasterizer/core/api.h | 3 + src/gallium/drivers/swr/rasterizer/core/binner.cpp | 16 +- .../drivers/swr/rasterizer/core/frontend.cpp | 12 +- src/gallium/drivers/swr/rasterizer/core/frontend.h | 3 +- src/gallium/drivers/swr/rasterizer/core/knobs.h| 8 - src/gallium/drivers/swr/rasterizer/core/state.h| 2 + .../drivers/swr/rasterizer/jitter/JitManager.h | 2 - .../drivers/swr/rasterizer/jitter/builder.cpp | 20 ++ .../drivers/swr/rasterizer/jitter/builder.h| 16 ++ .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 202 ++--- .../drivers/swr/rasterizer/jitter/builder_misc.h | 38 +++- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 180 ++ src/gallium/drivers/swr/swr_shader.cpp | 2 +- 16 files changed, 429 insertions(+), 82 deletions(-) -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 07/10] swr/rast: Add alignment to transpose targets
Needed to ensure alignment for avx512. Fixes address sanitizer crash. --- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index b624ae69b3..9d1f0d8799 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -796,10 +796,10 @@ endBinTriangles: // transpose verts needed for backend /// @todo modify BE to take non-transformed verts -simd4scalar vHorizX[SIMD_WIDTH]; -simd4scalar vHorizY[SIMD_WIDTH]; -simd4scalar vHorizZ[SIMD_WIDTH]; -simd4scalar vHorizW[SIMD_WIDTH]; +OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH]; +OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH]; +OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH]; +OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH]; TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x); TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y); @@ -1510,10 +1510,10 @@ void BinPostSetupLinesImpl( // transpose verts needed for backend /// @todo modify BE to take non-transformed verts -simd4scalar vHorizX[SIMD_WIDTH]; -simd4scalar vHorizY[SIMD_WIDTH]; -simd4scalar vHorizZ[SIMD_WIDTH]; -simd4scalar vHorizW[SIMD_WIDTH]; +OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH]; +OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH]; +OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH]; +OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH]; if (!primMask) { -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 01/10] swr/rast: support flexible vertex layout for DS output
--- src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 1 + src/gallium/drivers/swr/rasterizer/core/state.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 211e9e4b07..e15b300979 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1237,6 +1237,7 @@ static void TessellationStages( dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; +dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset; #if USE_SIMD16_FRONTEND dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16 #else diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 2af384fd90..d11ffc69b0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -288,6 +288,7 @@ struct SWR_DS_CONTEXT uint32_tPrimitiveID;// IN: (SCALAR) PrimitiveID for the patch associated with the DS invocation uint32_tvectorOffset; // IN: (SCALAR) vector index offset into SIMD data. uint32_tvectorStride; // IN: (SCALAR) stride (in vectors) of output data per attribute-component +uint32_toutVertexAttribOffset; // IN: (SCALAR) Offset to the attributes as processed by the next shader stage. 
ScalarPatch*pCpIn; // IN: (SCALAR) Control patch simdscalar* pDomainU; // IN: (SIMD) Domain Point U coords simdscalar* pDomainV; // IN: (SIMD) Domain Point V coords @@ -819,6 +820,7 @@ struct SWR_TS_STATE uint32_tnumHsOutputAttribs; uint32_tnumDsOutputAttribs; uint32_tdsAllocationSize; +uint32_tdsOutVtxAttribOffset; // Offset to the start of the attributes of the input vertices, in simdvector units uint32_tvertexAttribOffset; -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 04/10] swr/rast: Points with clipdistance can't go through simplepoints path
Fixes piglit glsl-1.20:vs-clip-vertex-primitives and glsl-1.30:vs-clip-distance-primitives. --- src/gallium/drivers/swr/rasterizer/core/frontend.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index 5cb2f87c15..11099d6449 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -352,7 +352,8 @@ bool CanUseSimplePoints(DRAW_CONTEXT *pDC) return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X && state.rastState.pointSize == 1.0f && !state.rastState.pointParam && -!state.rastState.pointSpriteEnable); +!state.rastState.pointSpriteEnable && +!state.backendState.clipDistanceMask); } INLINE -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 09/10] swr/rast: Implement AVX-512 GATHERPS in SIMD16 fetch shader
Disabled for now. --- .../swr/rasterizer/codegen/gen_llvm_ir_macros.py | 1 + .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 126 +++-- .../drivers/swr/rasterizer/jitter/builder_misc.h | 31 - .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 91 --- 4 files changed, 220 insertions(+), 29 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index ce892a9abe..44fc857371 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -44,6 +44,7 @@ inst_aliases = { intrinsics = [ ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']], +['VGATHERPS2', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']], ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index bd3a52566d..8ffe05b41c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -211,6 +211,28 @@ namespace SwrJit return ConstantVector::getSplat(mVWidth, cast(C(i))); } +#if USE_SIMD16_BUILDER +Value *Builder::VIMMED2_1(int i) +{ +return ConstantVector::getSplat(mVWidth2, cast(C(i))); +} + +Value *Builder::VIMMED2_1(uint32_t i) +{ +return ConstantVector::getSplat(mVWidth2, cast(C(i))); +} + +Value *Builder::VIMMED2_1(float i) +{ +return ConstantVector::getSplat(mVWidth2, cast(C(i))); +} + +Value *Builder::VIMMED2_1(bool i) +{ +return ConstantVector::getSplat(mVWidth2, cast(C(i))); +} + +#endif Value *Builder::VUNDEF_IPTR() { return 
UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); @@ -237,6 +259,11 @@ namespace SwrJit return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2)); } +Value *Builder::VUNDEF2_I() +{ +return UndefValue::get(VectorType::get(mInt32Ty, mVWidth2)); +} + #endif Value *Builder::VUNDEF(Type* t) { @@ -254,6 +281,19 @@ namespace SwrJit return VECTOR_SPLAT(mVWidth, src); } +#if USE_SIMD16_BUILDER +Value *Builder::VBROADCAST2(Value *src) +{ +// check if src is already a vector +if (src->getType()->isVectorTy()) +{ +return src; +} + +return VECTOR_SPLAT(mVWidth2, src); +} + +#endif uint32_t Builder::IMMED(Value* v) { SWR_ASSERT(isa(v)); @@ -554,16 +594,17 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by -Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) +Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) { -Value* vGather; +Value *vGather; // use avx2 gather instruction if available if(JM()->mArch.AVX2()) { // force mask to , required by vgather -vMask = BITCAST(vMask, mSimdFP32Ty); -vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale)); +Value *mask = BITCAST(vMask, mSimdFP32Ty); + +vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale)); } else { @@ -598,6 +639,41 @@ namespace SwrJit return vGather; } +#if USE_SIMD16_BUILDER +Value *Builder::GATHERPS2(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) +{ +Value *vGather = VUNDEF2_F(); + +// use avx512 gather instruction if available +if (JM()->mArch.AVX512F()) +{ +// force mask to , required by vgather2 +Value *mask = BITCAST(MASK2(vMask), mInt16Ty); + +vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); +} +else +{ +Value *src0 = EXTRACT2_F(vSrc, 0); +Value *src1 = EXTRACT2_F(vSrc, 1); + +Value *indices0 = 
EXTRACT2_I(vIndices, 0); +Value *indices1 = EXTRACT2_I(vIndices, 1); + +Value *mask0 = EXTRACT2_I(vMask, 0); +Value *mask1 = EXTRACT2_I(vMask, 1); + +Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale); +Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scal
[Mesa-dev] [PATCH 10/10] swr/rast: Repair simd8 frontend code rot
Keep non-default simd8 frontend code running for comparison purposes. --- src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 2fe6cfcf69..5a61dc33a0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -956,7 +956,7 @@ static void GeometryShaderStage( PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim); #else -PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); +PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim); #endif while (gsPa.GetNextStreamOutput()) -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] swr/rast: Use gather instruction for i32gather_ps on simd16/avx512
Speed up avx512 platforms; fixes performance regression caused by switch to simdlib. Cc: mesa-sta...@lists.freedesktop.org --- .../drivers/swr/rasterizer/common/simdlib_512_avx512.inl | 12 +--- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl index 95e4c31909..c13b9f616a 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl @@ -484,17 +484,7 @@ SIMD_WRAPPER_2(unpacklo_ps); template static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { -uint32_t *pOffsets = (uint32_t*)&idx; -Float vResult; -float* pResult = (float*)&vResult; -for (uint32_t i = 0; i < SIMD_WIDTH; ++i) -{ -uint32_t offset = pOffsets[i]; -offset = offset * static_cast(ScaleT); -pResult[i] = *(float const*)(((uint8_t const*)p + offset)); -} - -return vResult; +return _mm512_i32gather_ps(idx, p, static_cast(ScaleT)); } static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements) -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] swr/rast: Faster emulated simd16 permute
Speed up simd16 frontend (default) on avx/avx2 platforms; fixes performance regression caused by switch to simdlib. Cc: mesa-sta...@lists.freedesktop.org --- .../swr/rasterizer/common/simdlib_512_emu.inl | 34 +++--- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl index d6af7b1c64..44eba0b126 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl @@ -521,36 +521,24 @@ SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32) { -Integer result; - -// Ugly slow implementation -uint32_t const *pA = reinterpret_cast(&a); -uint32_t const *pSwiz = reinterpret_cast(&swiz); -uint32_t *pResult = reinterpret_cast(&result); - -for (uint32_t i = 0; i < SIMD_WIDTH; ++i) -{ -pResult[i] = pA[0xF & pSwiz[i]]; -} - -return result; +return castps_si(permute_ps(castsi_ps(a), swiz)); } static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)// return a[swiz[i]] for each 32-bit lane i (float) { -Float result; +const auto mask = SIMD256T::set1_epi32(7); -// Ugly slow implementation -float const *pA = reinterpret_cast(&a); -uint32_t const *pSwiz = reinterpret_cast(&swiz); -float *pResult = reinterpret_cast(&result); +auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask)); +auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask)); -for (uint32_t i = 0; i < SIMD_WIDTH; ++i) -{ -pResult[i] = pA[0xF & pSwiz[i]]; -} +auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask)); +auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask)); -return result; +return Float +{ +SIMD256T::blendv_ps(lolo, lohi, 
SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))), +SIMD256T::blendv_ps(hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))), +}; } // All of the 512-bit permute2f128_XX intrinsics do the following: -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: allow arch rounding with avx512
Fixes piglit vs-roundeven-{float,vec[234]} with simd16 VS. --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index cf1958b3b6..a1edd349f1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1953,7 +1953,8 @@ arch_rounding_available(const struct lp_type type) { if ((util_cpu_caps.has_sse4_1 && (type.length == 1 || type.width*type.length == 128)) || - (util_cpu_caps.has_avx && type.width*type.length == 256)) + (util_cpu_caps.has_avx && type.width*type.length == 256) || + (util_cpu_caps.has_avx512f && type.width*type.length == 512)) return TRUE; else if ((util_cpu_caps.has_altivec && (type.width == 32 && type.length == 4))) -- 2.14.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallium: add more exceptions to tgsi_util_get_inst_usage_mask
A number of double/int64 operations don't have matching read and write usage masks, which the fallthrough case of tgsi_util_get_inst_usage_mask assumes for componentwise tagged instructions. No regressions in llvmpipe piglit; fixes a large number of swr regressions. --- src/gallium/auxiliary/tgsi/tgsi_util.c | 12 1 file changed, 12 insertions(+) diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c index cfce59093c..afe5690ce0 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_util.c +++ b/src/gallium/auxiliary/tgsi/tgsi_util.c @@ -230,13 +230,25 @@ tgsi_util_get_inst_usage_mask(const struct tgsi_full_instruction *inst, read_mask = TGSI_WRITEMASK_XYZ; break; + case TGSI_OPCODE_DSEQ: + case TGSI_OPCODE_DSNE: + case TGSI_OPCODE_DSLT: + case TGSI_OPCODE_DSGE: case TGSI_OPCODE_DP4: case TGSI_OPCODE_PK4B: case TGSI_OPCODE_PK4UB: case TGSI_OPCODE_D2F: + case TGSI_OPCODE_D2I: + case TGSI_OPCODE_D2U: case TGSI_OPCODE_I2F: case TGSI_OPCODE_U2F: + case TGSI_OPCODE_U64SEQ: + case TGSI_OPCODE_U64SNE: + case TGSI_OPCODE_U64SLT: + case TGSI_OPCODE_U64SGE: case TGSI_OPCODE_U642F: + case TGSI_OPCODE_I64SLT: + case TGSI_OPCODE_I64SGE: case TGSI_OPCODE_I642F: read_mask = TGSI_WRITEMASK_XYZW; break; -- 2.11.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 6/7] swr/rast: Add api to override draws in flight
Allow draws in flight to be overridden via SWR_CREATECONTEXT_INFO. Patch by Jan Zielinski. --- src/gallium/drivers/swr/rasterizer/core/api.cpp| 26 +- src/gallium/drivers/swr/rasterizer/core/api.h | 4 src/gallium/drivers/swr/rasterizer/core/context.h | 2 ++ .../drivers/swr/rasterizer/core/threads.cpp| 18 +++ 4 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 6323098..20eeb29 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -74,13 +74,19 @@ HANDLE SwrCreateContext( pContext->privateStateSize = pCreateInfo->privateStateSize; -pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); -pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); +pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT; +if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0) +{ +pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT; +} + +pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT); +pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT); -pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); -pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); +pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64); +pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64); -for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) +for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc) { pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena); @@ -173,7 +179,7 @@ template void QueueWork(SWR_CONTEXT *pContext) { DRAW_CONTEXT* pDC = pContext->pCurDrawContext; 
-uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; +uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT; if (IsDraw) { @@ -257,7 +263,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) } uint64_t curDraw = pContext->dcRing.GetHead(); -uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT; +uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT; if ((pContext->frameCount - pContext->lastFrameChecked) > 2 || (curDraw - pContext->lastDrawChecked) > 0x1) @@ -273,7 +279,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) pContext->pCurDrawContext = pCurDrawContext; // Assign next available entry in DS ring to this DC. -uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; +uint32_t dsIndex = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT; pCurDrawContext->pState = &pContext->dsRing[dsIndex]; // Copy previous state to current state. @@ -361,7 +367,7 @@ void SwrDestroyContext(HANDLE hContext) DestroyThreadPool(pContext, &pContext->threadPool); // free the fifos -for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) +for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i) { AlignedFree(pContext->dcRing[i].dynState.pStats); delete pContext->dcRing[i].pArena; @@ -1481,7 +1487,7 @@ void SwrDispatch( pTaskData->threadGroupCountZ = threadGroupCountZ; uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; -uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; +uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT; pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex]; pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE); diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index 577cfb1..60f56c6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -215,6 +215,10 @@ struct 
SWR_CREATECONTEXT_INFO // Input (optional): Threading info that overrides any set KNOB values. SWR_THREADING_INFO* pThreadInfo; + +// Input: if set to non-zero value, overrides KNOB value for maximum +// number of draws in flight +uint32_t MAX_DRAWS_IN_FLIGHT; }; // diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index bcd5801..ae942f1 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer
[Mesa-dev] [PATCH 4/7] swr/rast: Change DS memory allocation
--- src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 4 ++-- src/gallium/drivers/swr/rasterizer/core/state.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index a803512..211e9e4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1212,9 +1212,9 @@ static void TessellationStages( // Allocate DS Output memory uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; #if USE_SIMD16_FRONTEND -size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding +size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize; // simd8 -> simd16, padding #else -size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs; +size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize; size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors; #endif if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize) diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index d9450fc..2af384f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -818,6 +818,7 @@ struct SWR_TS_STATE uint32_tnumHsInputAttribs; uint32_tnumHsOutputAttribs; uint32_tnumDsOutputAttribs; +uint32_tdsAllocationSize; // Offset to the start of the attributes of the input vertices, in simdvector units uint32_tvertexAttribOffset; -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/7] swr/rast: Minor changes for os-x
--- src/gallium/drivers/swr/rasterizer/core/threads.cpp | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 4bb395d..9ece064 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -30,7 +30,7 @@ #include #include -#if defined(__linux__) || defined(__gnu_linux__) +#if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__) #include #include #include @@ -218,6 +218,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread } } +#elif defined(__APPLE__) + #else #error Unsupported platform @@ -291,7 +293,7 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr); -#else +#elif defined(__linux__) || defined(__gnu_linux__) cpu_set_t cpuset; pthread_t thread = pthread_self(); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 0/7] swr: rasterizer update
Highlights are code cleanups, some more simd16 work (disabled by default), and tuning for the Intel Xeon Phi architecture. Tim Rowley (7): swr/rast: Minor changes for os-x swr/rast: Miscellaneous viewport array code changes swr/rast: Fix indentation swr/rast: Change DS memory allocation swr/rast: Widen fetch shader to SIMD16 (disabled for now) swr/rast: Add api to override draws in flight swr: knob overrides for Intel Xeon Phi src/gallium/drivers/swr/rasterizer/core/api.cpp| 26 +- src/gallium/drivers/swr/rasterizer/core/api.h | 4 + src/gallium/drivers/swr/rasterizer/core/binner.cpp | 45 ++- src/gallium/drivers/swr/rasterizer/core/clip.h | 14 +- src/gallium/drivers/swr/rasterizer/core/context.h | 2 + .../drivers/swr/rasterizer/core/frontend.cpp | 26 +- src/gallium/drivers/swr/rasterizer/core/pa.h | 24 +- src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 4 +- src/gallium/drivers/swr/rasterizer/core/state.h| 3 +- .../drivers/swr/rasterizer/core/threads.cpp| 24 +- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 441 - src/gallium/drivers/swr/swr_context.cpp| 27 ++ src/gallium/drivers/swr/swr_context.h | 2 + src/gallium/drivers/swr/swr_loader.cpp | 4 + src/gallium/drivers/swr/swr_scratch.cpp| 2 +- src/gallium/drivers/swr/swr_screen.h | 3 + 16 files changed, 575 insertions(+), 76 deletions(-) -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/7] swr/rast: Fix indentation
--- src/gallium/drivers/swr/rasterizer/core/state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index f7c9308..d9450fc 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -820,7 +820,7 @@ struct SWR_TS_STATE uint32_tnumDsOutputAttribs; // Offset to the start of the attributes of the input vertices, in simdvector units -uint32_t vertexAttribOffset; +uint32_tvertexAttribOffset; }; // output merger state -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/7] swr/rast: Miscellaneous viewport array code changes
--- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 45 -- src/gallium/drivers/swr/rasterizer/core/clip.h | 14 +-- .../drivers/swr/rasterizer/core/frontend.cpp | 22 ++- src/gallium/drivers/swr/rasterizer/core/pa.h | 24 ++-- src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 4 +- 5 files changed, 71 insertions(+), 38 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index e08e489..b624ae6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -450,16 +450,22 @@ void SIMDCALL BinTrianglesImpl( typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f); typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f); -typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0); +typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si(); +typename SIMD_T::Vec4 vpiAttrib[3]; +typename SIMD_T::Integer vpai = SIMD_T::setzero_si(); if (state.backendState.readViewportArrayIndex) { -typename SIMD_T::Vec4 vpiAttrib[3]; pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); +vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); +} + + +if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 +{ // OOB indices => forced to zero. 
-typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); -vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai); +vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si()); typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); viewportIdx = SIMD_T::and_si(vClearMask, vpai); @@ -815,6 +821,7 @@ endBinTriangles: SIMD_T::store_si(reinterpret_cast(aRTAI), SIMD_T::setzero_si()); } + // scan remaining valid triangles and bin each separately while (_BitScanForward(&triIndex, triMask)) { @@ -1299,15 +1306,22 @@ void BinPointsImpl( const SWR_RASTSTATE& rastState = state.rastState; // Read back viewport index if required -typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0); +typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si(); +typename SIMD_T::Vec4 vpiAttrib[1]; +typename SIMD_T::Integer vpai = SIMD_T::setzero_si(); + if (state.backendState.readViewportArrayIndex) { -typename SIMD_T::Vec4 vpiAttrib[1]; pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); +vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); +} + + +if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 +{ // OOB indices => forced to zero. 
-typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); -vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai); +vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si()); typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); viewportIdx = SIMD_T::and_si(vClearMask, vpai); @@ -1626,15 +1640,22 @@ void SIMDCALL BinLinesImpl( typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) }; -typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0); +typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si(); +typename SIMD_T::Vec4 vpiAttrib[2]; +typename SIMD_T::Integer vpai = SIMD_T::setzero_si(); + if (state.backendState.readViewportArrayIndex) { -typename SIMD_T::Vec4 vpiAttrib[2]; pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); +vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); +} + + +if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 +{ // OOB indices => forced to zero. -typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); -vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai); +vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si()); typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); viewportIdx = SIMD_T::and_si(vClearMask, vpai); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index e9a410d..0d3d780 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@
[Mesa-dev] [PATCH 7/7] swr: knob overrides for Intel Xeon Phi
Architecture benefits from having more threads/work outstanding. --- src/gallium/drivers/swr/swr_context.cpp | 27 +++ src/gallium/drivers/swr/swr_context.h | 2 ++ src/gallium/drivers/swr/swr_loader.cpp | 4 src/gallium/drivers/swr/swr_scratch.cpp | 2 +- src/gallium/drivers/swr/swr_screen.h| 3 +++ 5 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp index 34d9a25..b61720c 100644 --- a/src/gallium/drivers/swr/swr_context.cpp +++ b/src/gallium/drivers/swr/swr_context.cpp @@ -39,6 +39,7 @@ #include "api.h" #include "backend.h" +#include "knobs.h" static struct pipe_surface * swr_create_surface(struct pipe_context *pipe, @@ -483,6 +484,8 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags) ctx->blendJIT = new std::unordered_map; + ctx->max_draws_in_flight = KNOB_MAX_DRAWS_IN_FLIGHT; + SWR_CREATECONTEXT_INFO createInfo; memset(&createInfo, 0, sizeof(createInfo)); createInfo.privateStateSize = sizeof(swr_draw_context); @@ -491,6 +494,30 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags) createInfo.pfnClearTile = swr_StoreHotTileClear; createInfo.pfnUpdateStats = swr_UpdateStats; createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE; + + SWR_THREADING_INFO threadingInfo {0}; + + threadingInfo.MAX_WORKER_THREADS= KNOB_MAX_WORKER_THREADS; + threadingInfo.MAX_NUMA_NODES= KNOB_MAX_NUMA_NODES; + threadingInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; + threadingInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; + threadingInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; + + // Use non-standard settings for KNL + if (swr_screen(p_screen)->is_knl) + { + if (nullptr == getenv("KNOB_MAX_THREADS_PER_CORE")) + threadingInfo.MAX_THREADS_PER_CORE = 2; + + if (nullptr == getenv("KNOB_MAX_DRAWS_IN_FLIGHT")) + { + ctx->max_draws_in_flight = 2048; + createInfo.MAX_DRAWS_IN_FLIGHT = ctx->max_draws_in_flight; + } + } + + 
createInfo.pThreadInfo = &threadingInfo; + ctx->swrContext = ctx->api.pfnSwrCreateContext(&createInfo); ctx->api.pfnSwrInit(); diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h index 8bed78f..5c280ee 100644 --- a/src/gallium/drivers/swr/swr_context.h +++ b/src/gallium/drivers/swr/swr_context.h @@ -173,6 +173,8 @@ struct swr_context { unsigned dirty; /**< Mask of SWR_NEW_x flags */ SWR_INTERFACE api; + + uint32_t max_draws_in_flight; }; static INLINE struct swr_context * diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp index e205fe2..9d6f918 100644 --- a/src/gallium/drivers/swr/swr_loader.cpp +++ b/src/gallium/drivers/swr/swr_loader.cpp @@ -38,11 +38,14 @@ swr_create_screen(struct sw_winsys *winsys) util_cpu_detect(); + bool is_knl = false; + if (!strlen(filename) && util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512er) { #if HAVE_SWR_KNL fprintf(stderr, "KNL "); sprintf(filename, "%s%s%s", UTIL_DL_PREFIX, "swrKNL", UTIL_DL_EXT); + is_knl = true; #else fprintf(stderr, "KNL (not built) "); #endif @@ -99,6 +102,7 @@ swr_create_screen(struct sw_winsys *winsys) struct pipe_screen *screen = swr_create_screen_internal(winsys); swr_screen(screen)->pfnSwrGetInterface = (PFNSwrGetInterface)pApiProc; + swr_screen(screen)->is_knl = is_knl; return screen; } diff --git a/src/gallium/drivers/swr/swr_scratch.cpp b/src/gallium/drivers/swr/swr_scratch.cpp index d298a48..8afe73c 100644 --- a/src/gallium/drivers/swr/swr_scratch.cpp +++ b/src/gallium/drivers/swr/swr_scratch.cpp @@ -45,7 +45,7 @@ swr_copy_to_scratch_space(struct swr_context *ctx, ptr = ctx->api.pfnSwrAllocDrawContextMemory(ctx->swrContext, size, 4); } else { /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. 
*/ - unsigned int max_size_in_flight = size * KNOB_MAX_DRAWS_IN_FLIGHT; + uint32_t max_size_in_flight = size * ctx->max_draws_in_flight; /* Need to grow space */ if (max_size_in_flight > space->current_size) { diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h index 1c4e331..81b1a18 100644 --- a/src/gallium/drivers/swr/swr_screen.h +++ b/src/gallium/drivers/swr/swr_screen.h @@ -54,6 +54,9 @@ struct swr_screen { #endif PFNSwrGetInterface pfnSwrGetInterface; + + /* Do we run on Xeon Phi? */ + bool is_knl; }; static INLINE struct swr_screen * -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 5/7] swr/rast: Widen fetch shader to SIMD16 (disabled for now)
Refactored the gather operation to process 16 elements at a time via paired SIMD8 operations. --- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 441 - 1 file changed, 428 insertions(+), 13 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 1e3db90..30dbcfc 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -89,7 +89,13 @@ struct FetchJit : public Builder void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); #if USE_SIMD16_SHADERS +#define USE_SIMD16_GATHERS 0 + +#if USE_SIMD16_GATHERS +void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2); +#else void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2); +#endif #else void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); #endif @@ -279,8 +285,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) } else { +#if USE_SIMD16_GATHERS +JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false); +#else JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false); JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true); +#endif } #else (fetchState.bDisableVGATHER) ? 
JitLoadVertices(fetchState, streams, vIndices, pVtxOut) @@ -792,8 +802,13 @@ void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4]) /// @param vIndices - vector value of indices to gather /// @param pVtxOut - value pointer to output simdvertex struct #if USE_SIMD16_SHADERS +#if USE_SIMD16_GATHERS +void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, +Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2) +#else void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2) +#endif #else void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut) @@ -802,6 +817,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, uint32_t currentVertexElement = 0; uint32_t outputElt = 0; Value* vVertexElements[4]; +#if USE_SIMD16_GATHERS +Value* vVertexElements2[4]; +#endif Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); @@ -809,7 +827,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex})); curInstance->setName("curInstance"); -for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt) +for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1) { const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt]; @@ -836,7 +854,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, maxVertex = LOAD(maxVertex); Value *minVertex = NULL; -if (fetchState.bPartialVertexBuffer) { +if (fetchState.bPartialVertexBuffer) +{ // min vertex index for low bounds OOB checking minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)}); minVertex = LOAD(minVertex); @@ -849,10 +868,13 @@ 
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } Value *vCurIndices; +#if USE_SIMD16_GATHERS +Value *vCurIndices2; +#endif Value *startOffset; Value *vInstanceStride = VIMMED1(0); -if(ied.InstanceEnable) +if (ied.InstanceEnable) { Value* stepRate = C(ied.InstanceAdvancementState); @@ -867,6 +889,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); vCurIndices = VBROADCAST(calcInstance); +#if USE_SIMD16_GATHERS +vCurIndices2 = VBROADCAST(calcInstance); +#endif startOffset = startInstance; } @@ -878,6 +903,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // offset indices by baseVertex vCurIndices = ADD(vIndices, vBaseVertex); +#if USE_SIMD16_GATHERS +vCurIndices2 = ADD(vIndices2, vBaseVertex); +#endif startOffset = startVertex; SWR_ASSERT((0), "TODO: Fill out more once driver sends this down."); @@ -8
[Mesa-dev] [PATCH 0/2] gallium/swr: simd16 work in progress
Changes to support the swr work-in-progress native simd16 pipeline. Currently, enabling this via USE_SIMD16_SHADERS in knobs.h will run the fetch shader with double-pumped simd8, the vertex shaders in native simd16, and the rest of the pipeline in simd8. Tim Rowley (2): gallium: allow 512-bit vectors swr: simd16 shaders work in progress src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 14 +++--- src/gallium/auxiliary/gallivm/lp_bld_type.h | 4 ++-- src/gallium/drivers/swr/swr_screen.cpp | 6 ++ src/gallium/drivers/swr/swr_screen.h| 3 +++ src/gallium/drivers/swr/swr_shader.cpp | 14 -- 5 files changed, 30 insertions(+), 11 deletions(-) -- 2.11.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] gallium: allow 512-bit vectors
Increase the max allowed vector size from 256 to 512. No piglit llvmpipe regressions running on avx2. Cc: Dave Airlie Cc: Jose Fonseca --- src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 14 +++--- src/gallium/auxiliary/gallivm/lp_bld_type.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index de18f629cd..97efc3a399 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -1272,9 +1272,9 @@ emit_fetch_constant( /** * Fetch 64-bit values from two separate channels. * 64-bit values are stored split across two channels, like xy and zw. - * This function creates a set of 16 floats, + * This function creates a set of vec_length*2 floats, * extracts the values from the two channels, - * puts them in the correct place, then casts to 8 64-bits. + * puts them in the correct place, then casts to vec_length 64-bits. */ static LLVMValueRef emit_fetch_64bit( @@ -1289,9 +1289,9 @@ emit_fetch_64bit( LLVMValueRef res; struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype); int i; - LLVMValueRef shuffles[16]; + LLVMValueRef shuffles[2 * (LP_MAX_VECTOR_WIDTH/32)]; int len = bld_base->base.type.length * 2; - assert(len <= 16); + assert(len <= (2 * (LP_MAX_VECTOR_WIDTH/32))); for (i = 0; i < bld_base->base.type.length * 2; i+=2) { shuffles[i] = lp_build_const_int32(gallivm, i / 2); @@ -1691,7 +1691,7 @@ emit_fetch_deriv( } /** - * store an array of 8 64-bit into two arrays of 8 floats + * store an array of vec-length 64-bit into two arrays of vec_length floats * i.e. * value is d0, d1, d2, d3 etc. 
* each 64-bit has high and low pieces x, y @@ -1710,8 +1710,8 @@ emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base, struct lp_build_context *float_bld = &bld_base->base; unsigned i; LLVMValueRef temp, temp2; - LLVMValueRef shuffles[8]; - LLVMValueRef shuffles2[8]; + LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH/32]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32]; for (i = 0; i < bld_base->base.type.length; i++) { shuffles[i] = lp_build_const_int32(gallivm, i * 2); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h index afe8722b05..62f1f85461 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h @@ -59,7 +59,7 @@ extern unsigned lp_native_vector_width; * Should only be used when lp_native_vector_width isn't available, * i.e. sizing/alignment of non-malloced variables. */ -#define LP_MAX_VECTOR_WIDTH 256 +#define LP_MAX_VECTOR_WIDTH 512 /** * Minimum vector alignment for static variable alignment @@ -67,7 +67,7 @@ extern unsigned lp_native_vector_width; * It should always be a constant equal to LP_MAX_VECTOR_WIDTH/8. An * expression is non-portable. */ -#define LP_MIN_VECTOR_ALIGN 32 +#define LP_MIN_VECTOR_ALIGN 64 /** * Several functions can only cope with vectors of length up to this value. -- 2.11.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] swr: simd16 shaders work in progress
Start building vertex shaders as simd16. Disabled by default, set USE_SIMD16_SHADERS in knobs.h to experiment. Cc: Bruce Cherniak --- src/gallium/drivers/swr/swr_screen.cpp | 6 ++ src/gallium/drivers/swr/swr_screen.h | 3 +++ src/gallium/drivers/swr/swr_shader.cpp | 14 -- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp index 639b18f930..46b3a003c6 100644 --- a/src/gallium/drivers/swr/swr_screen.cpp +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -1058,6 +1058,9 @@ swr_destroy_screen(struct pipe_screen *p_screen) swr_fence_reference(p_screen, &screen->flush_fence, NULL); JitDestroyContext(screen->hJitMgr); +#if USE_SIMD16_SHADERS + JitDestroyContext(screen->hJitMgr16); +#endif if (winsys->destroy) winsys->destroy(winsys); @@ -1141,6 +1144,9 @@ swr_create_screen_internal(struct sw_winsys *winsys) // Pass in "" for architecture for run-time determination screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, "", "swr"); +#if USE_SIMD16_SHADERS + screen->hJitMgr16 = JitCreateContext(16, "", "swr"); +#endif swr_fence_init(&screen->base); diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h index a11ea9f41d..1c4e331583 100644 --- a/src/gallium/drivers/swr/swr_screen.h +++ b/src/gallium/drivers/swr/swr_screen.h @@ -49,6 +49,9 @@ struct swr_screen { uint32_t client_copy_limit; HANDLE hJitMgr; +#if USE_SIMD16_SHADERS + HANDLE hJitMgr16; +#endif PFNSwrGetInterface pfnSwrGetInterface; }; diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp index 510bc0e457..732e08dae7 100644 --- a/src/gallium/drivers/swr/swr_shader.cpp +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -693,7 +693,7 @@ swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key) void BuilderSWR::WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, unsigned slot, unsigned channel) { -#if USE_SIMD16_FRONTEND +#if USE_SIMD16_FRONTEND && 
!USE_SIMD16_SHADERS // interleave the simdvertex components into the dest simd16vertex // slot16offset = slot8offset * 2 // comp16offset = comp8offset * 2 + alternateOffset @@ -756,6 +756,9 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) const_sizes_ptr->setName("num_vs_constants"); Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin}); +#if USE_SIMD16_SHADERS + vtxInput = BITCAST(vtxInput, PointerType::get(Gen_simd16vertex(JM()), 0)); +#endif for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { const unsigned mask = swr_vs->info.base.input_usage_mask[attrib]; @@ -777,7 +780,7 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) lp_build_tgsi_soa(gallivm, swr_vs->pipe.tokens, - lp_type_float_vec(32, 32 * 8), + lp_type_float_vec(32, 32 * mVWidth), NULL, // mask wrap(consts_ptr), wrap(const_sizes_ptr), @@ -795,6 +798,9 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout}); +#if USE_SIMD16_SHADERS + vtxOutput = BITCAST(vtxOutput, PointerType::get(Gen_simd16vertex(JM()), 0)); +#endif for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) { @@ -905,7 +911,11 @@ swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key) return NULL; BuilderSWR builder( +#if USE_SIMD16_SHADERS + reinterpret_cast(swr_screen(ctx->pipe.screen)->hJitMgr16), +#else reinterpret_cast(swr_screen(ctx->pipe.screen)->hJitMgr), +#endif "VS"); PFN_VERTEX_FUNC func = builder.CompileVS(ctx, key); -- 2.11.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] swr/rast: use proper alignment for debug transposedPrims
The malloc'ed debug transposedPrims buffer was causing a crash in the ParaView waveletcontour.py test when _DEBUG is defined, due to a vector-aligned copy with an unaligned address. --- src/gallium/drivers/swr/rasterizer/core/clip.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index cde5261521..e9a410daa3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -561,7 +561,7 @@ public: #if defined(_DEBUG) // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds -SIMDVERTEX_T *transposedPrims = reinterpret_cast *>(malloc(sizeof(SIMDVERTEX_T) * 2)); +SIMDVERTEX_T *transposedPrims = reinterpret_cast *>(AlignedMalloc(sizeof(SIMDVERTEX_T) * 2, 64)); #else SIMDVERTEX_T transposedPrims[2]; @@ -667,7 +667,7 @@ public: } #if defined(_DEBUG) -free(transposedPrims); +AlignedFree(transposedPrims); #endif // update global pipeline stat -- 2.11.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] configure.ac: add _DEBUG to strip_unwanted_llvm_flags
Assert-enabled builds of llvm add _DEBUG to the LLVM_CFLAGS. This was causing a crash with swr running the ParaView waveletcontour.py test, due to a bug in our _DEBUG code. --- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index 903a3979d4..b2768f46c0 100644 --- a/configure.ac +++ b/configure.ac @@ -987,6 +987,7 @@ strip_unwanted_llvm_flags() { echo " `$1` " | sed -E \ -e 's/[[[:space:]]]+-m[[^[:space:]]]*//g' \ -e 's/[[[:space:]]]+-DNDEBUG[[[:space:]]]/ /g' \ +-e 's/[[[:space:]]]+-D_DEBUG[[[:space:]]]/ /g' \ -e 's/[[[:space:]]]+-D_GNU_SOURCE[[[:space:]]]/ /g' \ -e 's/[[[:space:]]]+-pedantic[[[:space:]]]/ /g' \ -e 's/[[[:space:]]]+-W[[^[:space:]]]*//g' \ -- 2.11.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 7/9] swr/rast: Fix allocation of DS output data for USE_SIMD16_FRONTEND
--- src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 16 ++-- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 22a5705..aea8e88 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1062,7 +1062,7 @@ struct TessellationThreadLocalData size_t tsCtxSize; simdscalar* pDSOutput; -size_t numDSOutputVectors; +size_t dsOutputAllocSize; }; THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr; @@ -1210,24 +1210,20 @@ static void TessellationStages( // Allocate DS Output memory uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; -size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs; #if USE_SIMD16_FRONTEND size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding #else +size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs; size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors; #endif -if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors) +if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize) { AlignedFree(gt_pTessellationThreadData->pDSOutput); gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64); -#if USE_SIMD16_FRONTEND -gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding -#else -gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors; -#endif +gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize; } SWR_ASSERT(gt_pTessellationThreadData->pDSOutput); -SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= 
requiredDSOutputVectors); +SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize); #if defined(_DEBUG) memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize); @@ -1356,7 +1352,7 @@ static void TessellationStages( AlignedFree(gt_pTessellationThreadData->pDSOutput); gt_pTessellationThreadData->pDSOutput = nullptr; } -gt_pTessellationThreadData->numDSOutputVectors = 0; +gt_pTessellationThreadData->dsOutputAllocSize = 0; #endif TSDestroyCtx(tsCtx); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 0/9] swr: update rasterizer
Highlights: large change in the geometry shader api, cleanups. Tim Rowley (9): swr/rast: Add support for R10G10B10_FLOAT_A2_UNORM pixel format swr/rast: New GS state/context API swr/rast: Fetch compile state changes swr/rast: Move SWR_GS_CONTEXT from thread local storage to stack swr/rast: Properly sized null GS buffer swr/rast: Slightly more efficient blend jit swr/rast: Fix allocation of DS output data for USE_SIMD16_FRONTEND swr/rast: Remove code supporting legacy llvm (<3.9) swr/rast: Handle instanceID offset / Instance Stride enable .../drivers/swr/rasterizer/common/formats.cpp | 27 ++- .../drivers/swr/rasterizer/core/format_traits.h| 2 +- .../drivers/swr/rasterizer/core/frontend.cpp | 252 +++-- src/gallium/drivers/swr/rasterizer/core/state.h| 55 +++-- .../drivers/swr/rasterizer/jitter/JitManager.cpp | 11 +- .../drivers/swr/rasterizer/jitter/JitManager.h | 7 - .../drivers/swr/rasterizer/jitter/blend_jit.cpp| 30 +-- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 118 ++ .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 40 +++- .../drivers/swr/rasterizer/jitter/fetch_jit.h | 7 +- src/gallium/drivers/swr/swr_shader.cpp | 183 +++ 11 files changed, 361 insertions(+), 371 deletions(-) -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 4/9] swr/rast: Move SWR_GS_CONTEXT from thread local storage to stack
Move structure, as the size is significantly reduced due to dynamic allocation of the GS buffers. --- .../drivers/swr/rasterizer/core/frontend.cpp | 23 +++--- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 26e76a9..15bc93d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -708,8 +708,6 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num } } -THREAD SWR_GS_CONTEXT tlsGsContext; - // Buffers that are allocated if GS is enabled struct GsBuffers { @@ -798,21 +796,22 @@ static void GeometryShaderStage( const API_STATE& state = GetApiState(pDC); const SWR_GS_STATE* pState = &state.gsState; +SWR_GS_CONTEXT gsContext; static uint8_t sNullBuffer[1024] = { 0 }; for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) { -tlsGsContext.pStreams[i] = pGsBuffers->pGsOut[i]; +gsContext.pStreams[i] = pGsBuffers->pGsOut[i]; } -tlsGsContext.pVerts = (simdvector*)pGsBuffers->pGsIn; -tlsGsContext.PrimitiveID = primID; +gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn; +gsContext.PrimitiveID = primID; uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); simdvector attrib[MAX_NUM_VERTS_PER_PRIM]; // assemble all attributes for the input primitive -tlsGsContext.inputVertStride = pState->inputVertStride; +gsContext.inputVertStride = pState->inputVertStride; for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) { uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot; @@ -821,7 +820,7 @@ static void GeometryShaderStage( for (uint32_t i = 0; i < numVertsPerPrim; ++i) { -tlsGsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i]; +gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i]; } } @@ -829,7 +828,7 @@ static void GeometryShaderStage( pa.Assemble(VERTEX_POSITION_SLOT, attrib); for (uint32_t i = 0; i 
< numVertsPerPrim; ++i) { -tlsGsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i]; +gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i]; } // record valid prims from the frontend to avoid over binning the newly generated @@ -842,15 +841,15 @@ static void GeometryShaderStage( for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) { -tlsGsContext.InstanceID = instance; -tlsGsContext.mask = GenerateMask(numInputPrims); +gsContext.InstanceID = instance; +gsContext.mask = GenerateMask(numInputPrims); // execute the geometry shader -state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext); +state.pfnGsFunc(GetPrivateState(pDC), &gsContext); for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) { -tlsGsContext.pStreams[i] += pState->allocationSize; +gsContext.pStreams[i] += pState->allocationSize; } } -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/9] swr/rast: Add support for R10G10B10_FLOAT_A2_UNORM pixel format
--- .../drivers/swr/rasterizer/common/formats.cpp | 27 +++--- .../drivers/swr/rasterizer/core/format_traits.h| 2 +- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 16 ++--- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp index 263dec6..1c086ff 100644 --- a/src/gallium/drivers/swr/rasterizer/common/formats.cpp +++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp @@ -2729,16 +2729,27 @@ const SWR_FORMAT_INFO gFormatInfo[] = { { 0.0f, 0.0f, 0.0f, 0.0f }, 1, 1 }, -// padding (0xD5) + +// R10G10B10_FLOAT_A2_UNORM (0xD5) { -nullptr, -{ SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 }, -0, 0, 0, false, false, false, false, -{ false, false, false, false }, -{ 0.0f, 0.0f, 0.0f, 0.0f }, -1, 1 +"R10G10B10_FLOAT_A2_UNORM", +{ SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNORM }, +{ 0, 0, 0, 0x3f80 }, // Defaults for missing components +{ 0, 1, 2, 3 }, // Swizzle +{ 10, 10, 10, 2 }, // Bits per component +32, // Bits per element +4, // Bytes per element +4, // Num components +false, // isSRGB +false, // isBC +false, // isSubsampled +false, // isLuminance +{ false, false, false, false }, // Is normalized? 
+{ 1.0f, 1.0f, 1.0f, 1.0f / 3.0f }, // To float scale factor +1, // bcWidth +1, // bcHeight }, + // R32_SINT (0xD6) { "R32_SINT", diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h index c04ea5f..bc585dd 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_traits.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h @@ -1237,7 +1237,7 @@ template<> struct FormatTraits : /// FormatTraits - Format traits specialization for R10G10B10_FLOAT_A2_UNORM // template<> struct FormatTraits : -ComponentTraits, +ComponentTraits, FormatSwizzle<0, 1, 2, 3>, Defaults<0, 0, 0, 0x3f80> { diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 402fd26..b943909 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -42,7 +42,7 @@ namespace SwrJit ///number of mantissa bits. /// @param val - 32-bit float /// @todo Maybe move this outside of this file into a header? -static uint16_t Convert32To16Float(float val) +static uint16_t ConvertFloat32ToFloat16(float val) { uint32_t sign, exp, mant; uint32_t roundBits; @@ -112,7 +112,7 @@ namespace SwrJit ///float /// @param val - 16-bit float /// @todo Maybe move this outside of this file into a header? 
-static float ConvertSmallFloatTo32(uint32_t val) +static float ConvertFloat16ToFloat32(uint32_t val) { uint32_t result; if ((val & 0x7fff) == 0) @@ -888,11 +888,11 @@ namespace SwrJit else { FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty); -Function* pCvtPh2Ps = cast(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy)); +Function* pCvtPh2Ps = cast(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy)); -if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr) +if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr) { -sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32); +sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32); } Value* pResult = UndefValue::get(mSimdFP32Ty); @@ -921,11 +921,11 @@ namespace SwrJit { // call scalar C function for now FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); -Function* pCvtPs2Ph = cast(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy)); +Function* pCvtPs2Ph = cast(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy)); -if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr) +if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr) { -sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32
[Mesa-dev] [PATCH 5/9] swr/rast: Properly sized null GS buffer
--- src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 15bc93d..22a5705 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -798,7 +798,7 @@ static void GeometryShaderStage( const SWR_GS_STATE* pState = &state.gsState; SWR_GS_CONTEXT gsContext; -static uint8_t sNullBuffer[1024] = { 0 }; +static uint8_t sNullBuffer[128] = { 0 }; for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) { -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/9] swr/rast: Fetch compile state changes
Add ForceSequentialAccessEnable and InstanceIDOffsetEnable bools to FETCH_COMPILE_STATE. --- src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 6 ++ src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h | 7 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index f3a4b27..9061298 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -275,6 +275,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) : JitGatherVertices(fetchState, streams, vIndices, pVtxOut); #endif +if (fetchState.bInstanceIDOffsetEnable) +{ +// TODO: +SWR_ASSERT((0), "Add support for handling InstanceID Offset Enable."); +} + RET_VOID(); JitManager::DumpToFile(fetch, "src"); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h index 0dd6de7..18fa963 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h @@ -107,6 +107,9 @@ struct FETCH_COMPILE_STATE bool bVertexIDOffsetEnable{ false };// Offset vertexID by StartVertex for non-indexed draws or BaseVertex for indexed draws bool bPartialVertexBuffer{ false }; // for indexed draws, map illegal indices to a known resident vertex +bool bForceSequentialAccessEnable{ false }; +bool bInstanceIDOffsetEnable{ false }; + FETCH_COMPILE_STATE(bool disableVGATHER = false, bool diableIndexOOBCheck = false): bDisableVGATHER(disableVGATHER), bDisableIndexOOBCheck(diableIndexOOBCheck){ }; @@ -120,11 +123,13 @@ struct FETCH_COMPILE_STATE if (cutIndex != other.cutIndex) return false; if (bVertexIDOffsetEnable != other.bVertexIDOffsetEnable) return false; if (bPartialVertexBuffer != other.bPartialVertexBuffer) return false; +if (bForceSequentialAccessEnable != other.bForceSequentialAccessEnable) 
return false; +if (bInstanceIDOffsetEnable != other.bInstanceIDOffsetEnable) return false; for(uint32_t i = 0; i < numAttribs; ++i) { if((layout[i].bits != other.layout[i].bits) || - ((layout[i].InstanceEnable == 1) && + (((layout[i].InstanceEnable == 1) || (layout[i].InstanceStrideEnable == 1)) && (layout[i].InstanceAdvancementState != other.layout[i].InstanceAdvancementState))){ return false; } -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 8/9] swr/rast: Remove code supporting legacy llvm (<3.9)
--- .../drivers/swr/rasterizer/jitter/JitManager.cpp | 11 ++- .../drivers/swr/rasterizer/jitter/JitManager.h | 7 -- .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 102 ++--- 3 files changed, 15 insertions(+), 105 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index e4281f8..3f0772c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -48,8 +48,9 @@ #include "llvm/Support/FormattedStream.h" #include "llvm/Support/Path.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Config/llvm-config.h" -#if HAVE_LLVM < 0x400 +#if LLVM_VERSION_MAJOR < 4 #include "llvm/Bitcode/ReaderWriter.h" #else #include "llvm/Bitcode/BitcodeWriter.h" @@ -231,8 +232,8 @@ void JitManager::DumpAsm(Function* pFunction, const char* fileName) #if defined(_WIN32) DWORD pid = GetCurrentProcessId(); -TCHAR procname[MAX_PATH]; -GetModuleFileName(NULL, procname, MAX_PATH); +char procname[MAX_PATH]; +GetModuleFileNameA(NULL, procname, MAX_PATH); const char* pBaseName = strrchr(procname, '\\'); std::stringstream outDir; outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; @@ -269,8 +270,8 @@ void JitManager::DumpToFile(Function *f, const char *fileName) { #if defined(_WIN32) DWORD pid = GetCurrentProcessId(); -TCHAR procname[MAX_PATH]; -GetModuleFileName(NULL, procname, MAX_PATH); +char procname[MAX_PATH]; +GetModuleFileNameA(NULL, procname, MAX_PATH); const char* pBaseName = strrchr(procname, '\\'); std::stringstream outDir; outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h index 4bc543b..46ffe27 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -47,13 +47,6 @@ #include 
"llvm/ExecutionEngine/ObjectCache.h" #include "llvm/Config/llvm-config.h" -#ifndef LLVM_VERSION_MAJOR -#include "llvm/Config/config.h" -#endif - -#ifndef HAVE_LLVM -#define HAVE_LLVM ((LLVM_VERSION_MAJOR << 8) | LLVM_VERSION_MINOR) -#endif #include "llvm/IR/Verifier.h" #include "llvm/ExecutionEngine/MCJIT.h" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index b943909..9ca36b2 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -763,22 +763,10 @@ namespace SwrJit /// lower 8 values are used. Value *Builder::PMOVSXBD(Value* a) { -// llvm-3.9 removed the pmovsxbd intrinsic -#if HAVE_LLVM < 0x309 -// use avx2 byte sign extend instruction if available -if(JM()->mArch.AVX2()) -{ -Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd); -return CALL(pmovsxbd, std::initializer_list{a}); -} -else -#endif -{ -// VPMOVSXBD output type -Type* v8x32Ty = VectorType::get(mInt32Ty, 8); -// Extract 8 values from 128bit lane and sign extend -return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); -} +// VPMOVSXBD output type +Type* v8x32Ty = VectorType::get(mInt32Ty, 8); +// Extract 8 values from 128bit lane and sign extend +return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); } // @@ -787,22 +775,10 @@ namespace SwrJit /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values. 
Value *Builder::PMOVSXWD(Value* a) { -// llvm-3.9 removed the pmovsxwd intrinsic -#if HAVE_LLVM < 0x309 -// use avx2 word sign extend if available -if(JM()->mArch.AVX2()) -{ -Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd); -return CALL(pmovsxwd, std::initializer_list{a}); -} -else -#endif -{ -// VPMOVSXWD output type -Type* v8x32Ty = VectorType::get(mInt32Ty, 8); -// Extract 8 values from 128bit lane and sign extend -return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); -} +// VPMOVSXWD output type +Type* v8x32Ty = VectorType::get(mInt32Ty, 8); +// Extract 8 values from 128bit lane and sign extend +return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); } //
[Mesa-dev] [PATCH 9/9] swr/rast: Handle instanceID offset / Instance Stride enable
Supported in JitGatherVertices(); FetchJit::JitLoadVertices() may require similar changes, will need to address this if it is determined that this path is still in use. Handle Force Sequential Access in FetchJit::Create. --- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 46 ++ 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 9061298..1e3db90 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -222,6 +222,18 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break; } +if(fetchState.bForceSequentialAccessEnable) +{ +Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 }); + +// VertexData buffers are accessed sequentially, the index is equal to the vertex number +vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex })); +vIndices = ADD(vIndices, pOffsets); +#if USE_SIMD16_SHADERS +vIndices2 = ADD(vIndices, VIMMED1(8)); +#endif +} + Value* vVertexId = vIndices; #if USE_SIMD16_SHADERS Value* vVertexId2 = vIndices2; @@ -275,12 +287,6 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) : JitGatherVertices(fetchState, streams, vIndices, pVtxOut); #endif -if (fetchState.bInstanceIDOffsetEnable) -{ -// TODO: -SWR_ASSERT((0), "Add support for handling InstanceID Offset Enable."); -} - RET_VOID(); JitManager::DumpToFile(fetch, "src"); @@ -362,6 +368,11 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* str vectors.clear(); +if (fetchState.bInstanceIDOffsetEnable) +{ +SWR_ASSERT((0), "TODO: Fill out more once driver sends this down"); +} + Value *vCurIndices; Value *startOffset; if(ied.InstanceEnable) @@ -831,8 +842,16 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, minVertex = LOAD(minVertex); } 
+if (fetchState.bInstanceIDOffsetEnable) +{ +// the InstanceID (curInstance) value is offset by StartInstanceLocation +curInstance = ADD(curInstance, startInstance); +} + Value *vCurIndices; Value *startOffset; +Value *vInstanceStride = VIMMED1(0); + if(ied.InstanceEnable) { Value* stepRate = C(ied.InstanceAdvancementState); @@ -853,11 +872,19 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else if (ied.InstanceStrideEnable) { +// grab the instance advancement state, determines stride in bytes from one instance to the next +Value* stepRate = C(ied.InstanceAdvancementState); +vInstanceStride = VBROADCAST(MUL(curInstance, stepRate)); + +// offset indices by baseVertex +vCurIndices = ADD(vIndices, vBaseVertex); + +startOffset = startVertex; SWR_ASSERT((0), "TODO: Fill out more once driver sends this down."); } else { -// offset indices by baseVertex +// offset indices by baseVertex vCurIndices = ADD(vIndices, vBaseVertex); startOffset = startVertex; @@ -925,6 +952,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* vOffsets = MUL(vCurIndices, vStride); vOffsets = ADD(vOffsets, vAlignmentOffsets); +// if instance stride enable is: +// true - add product of the instanceID and advancement state to the offset into the VB +// false - value of vInstanceStride has been initialized to zero +vOffsets = ADD(vOffsets, vInstanceStride); + // Packing and component control ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking; const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1, -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API
One piglit regression, which was a false pass: spec@glsl-1.50@execution@geometry@dynamic_input_array_index --- .../drivers/swr/rasterizer/core/frontend.cpp | 227 - src/gallium/drivers/swr/rasterizer/core/state.h| 55 +++-- src/gallium/drivers/swr/swr_shader.cpp | 183 - 3 files changed, 253 insertions(+), 212 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index f882869..26e76a9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num THREAD SWR_GS_CONTEXT tlsGsContext; -template -struct GsBufferInfo +// Buffers that are allocated if GS is enabled +struct GsBuffers { -GsBufferInfo(const SWR_GS_STATE &gsState) -{ -const uint32_t vertexCount = gsState.maxNumVerts; -const uint32_t vertexStride = sizeof(SIMDVERTEX); -const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH; +uint8_t* pGsIn; +uint8_t* pGsOut[KNOB_SIMD_WIDTH]; +uint8_t* pGsTransposed; +void* pStreamCutBuffer; +}; -vertexPrimitiveStride = vertexStride * numSimdBatches; -vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH; +// +/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler +/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler +/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader +/// @param numVerts - Number of vertices outputted by the GS +/// @param numAttribs - Number of attributes per vertex +template +void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs) +{ +uint32_t srcVertexStride = numAttribs * sizeof(float) * 4; +uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4; -if (gsState.isSingleStream) -{ -cutPrimitiveStride = (vertexCount + 7) / 8; 
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; +OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth]; -streamCutPrimitiveStride = 0; -streamCutInstanceStride = 0; -} -else -{ -cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4); -cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; - -streamCutPrimitiveStride = (vertexCount + 7) / 8; -streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH; -} +for (uint32_t i = 0; i < SimdWidth; ++i) +{ +gatherOffsets[i] = srcVertexStride * i; } +auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]); -uint32_t vertexPrimitiveStride; -uint32_t vertexInstanceStride; +uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; +uint32_t remainingVerts = numVerts; -uint32_t cutPrimitiveStride; -uint32_t cutInstanceStride; +for (uint32_t s = 0; s < numSimd; ++s) +{ +uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth; +uint8_t* pDstBase = pDst + s * dstVertexStride; -uint32_t streamCutPrimitiveStride; -uint32_t streamCutInstanceStride; -}; +// Compute mask to prevent src overflow +uint32_t mask = std::min(remainingVerts, SimdWidth); +mask = GenMask(mask); +auto vMask = SIMD_T::vmask_ps(mask); +auto viMask = SIMD_T::castps_si(vMask); + +for (uint32_t a = 0; a < numAttribs; ++a) +{ +auto attribGatherX = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); +auto attribGatherY = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask); +auto attribGatherZ = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask); +auto attribGatherW = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask); + +SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX); +SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, 
attribGatherY); +SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ); +SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW); + +pSrcBase += sizeof(float) * 4; +pDstBase += sizeof(typename SIMD_T::Float) * 4; +} +remainingVerts -= SimdWidth; +}
[Mesa-dev] [PATCH 6/9] swr/rast: Slightly more efficient blend jit
--- .../drivers/swr/rasterizer/jitter/blend_jit.cpp| 30 -- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp index f2e6e53..3258639 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -581,13 +581,13 @@ struct BlendJit : public Builder // load src1 src1[i] = LOAD(pSrc1, { i }); } -Value* currentMask = VIMMED1(-1); +Value* currentSampleMask = VIMMED1(-1); if (state.desc.alphaToCoverageEnable) { Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f); uint32_t bits = (1 << state.desc.numSamples) - 1; -currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); -currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty); +currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); +currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty); } // alpha test @@ -766,34 +766,24 @@ struct BlendJit : public Builder assert(!(state.desc.alphaToCoverageEnable)); // load current mask Value* oMask = LOAD(ppoMask); -Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum)); -oMask = AND(oMask, sampleMasked); -currentMask = AND(oMask, currentMask); +currentSampleMask = AND(oMask, currentSampleMask); } if(state.desc.sampleMaskEnable) { Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask}); -Value* sampleMasked = SHL(C(1), sampleNum); -sampleMask = AND(sampleMask, sampleMasked); -sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0))); -sampleMask = S_EXT(sampleMask, mSimdInt32Ty); -currentMask = AND(sampleMask, currentMask); -} - -if (state.desc.alphaToCoverageEnable) -{ -Value* sampleMasked = SHL(C(1), sampleNum); -currentMask = AND(currentMask, VBROADCAST(sampleMasked)); +currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask); } if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable || 
state.desc.oMaskEnable) { -// load coverage mask +// load coverage mask and mask off any lanes with no samples Value* pMask = LOAD(ppMask); -currentMask = S_EXT(ICMP_UGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty); -Value* outputMask = AND(pMask, currentMask); +Value* sampleMasked = SHL(C(1), sampleNum); +currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked)); +currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty); +Value* outputMask = AND(pMask, currentSampleMask); // store new mask STORE(outputMask, GEP(ppMask, C(0))); } -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] swr/rast: remove llvm fence/atomics from generated files
We currently don't use these instructions. Because their API changed in llvm-5.0, having them in the autogen files broke the Mesa release tarballs, which ship with generated autogen files. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102847 CC: mesa-sta...@lists.freedesktop.org --- src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py | 8 1 file changed, 8 insertions(+) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index 025d38a..ce892a9 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -140,6 +140,14 @@ def parse_ir_builder(input_file): ignore = False +# The following functions need to be ignored in openswr. +# API change in llvm-5.0 breaks baked autogen files +if ( +(func_name == 'CreateFence' or + func_name == 'CreateAtomicCmpXchg' or + func_name == 'CreateAtomicRMW')): +ignore = True + # The following functions need to be ignored. if (func_name == 'CreateInsertNUWNSWBinOp' or func_name == 'CreateMaskedIntrinsic' or -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 08/10] swr/rast: Missed conversion to SIMD_T
--- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index a6713e8..e08e489 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -1195,7 +1195,7 @@ void BinPostSetupPointsImpl( } OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH]; -_simd16_store_ps(reinterpret_cast(aPointSize), vPointSize); +SIMD_T::store_ps(reinterpret_cast(aPointSize), vPointSize); uint32_t *pPrimID = (uint32_t *)&primID; -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 01/10] swr/rast: Add new API SwrStallBE
SwrStallBE stalls the backend threads until all work submitted before the stall has finished. The frontend threads can continue to make forward progress. --- src/gallium/drivers/swr/rasterizer/core/api.cpp | 9 + src/gallium/drivers/swr/rasterizer/core/api.h | 8 2 files changed, 17 insertions(+) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index ccb6dfb..6323098 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -458,6 +458,14 @@ void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint AR_API_END(APISync, 1); } +void SwrStallBE(HANDLE hContext) +{ +SWR_CONTEXT* pContext = GetContext(hContext); +DRAW_CONTEXT* pDC = GetDrawContext(pContext); + +pDC->dependent = true; +} + void SwrWaitForIdle(HANDLE hContext) { SWR_CONTEXT *pContext = GetContext(hContext); @@ -1672,6 +1680,7 @@ void SwrGetInterface(SWR_INTERFACE &out_funcs) out_funcs.pfnSwrSaveState = SwrSaveState; out_funcs.pfnSwrRestoreState = SwrRestoreState; out_funcs.pfnSwrSync = SwrSync; +out_funcs.pfnSwrStallBE = SwrStallBE; out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle; out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE; out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers; diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index a394205..577cfb1 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -263,6 +263,13 @@ SWR_FUNC(void, SwrSync, uint64_t userData3); // +/// @brief Stall cmd. Stalls the backend until all previous work has been completed. +///Frontend work can continue to make progress +/// @param hContext - Handle passed back from SwrCreateContext +SWR_FUNC(void, SwrStallBE, +HANDLE hContext); + +// /// @brief Blocks until all rendering has been completed. 
/// @param hContext - Handle passed back from SwrCreateContext SWR_FUNC(void, SwrWaitForIdle, @@ -709,6 +716,7 @@ struct SWR_INTERFACE PFNSwrSaveState pfnSwrSaveState; PFNSwrRestoreState pfnSwrRestoreState; PFNSwrSync pfnSwrSync; +PFNSwrStallBE pfnSwrStallBE; PFNSwrWaitForIdle pfnSwrWaitForIdle; PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE; PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers; -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 07/10] swr/rast: whitespace changes
--- src/gallium/drivers/swr/rasterizer/jitter/jit_api.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h index 9f69669..e589d2c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h @@ -51,6 +51,7 @@ struct ShaderInfo; + // /// Jit Compile Info Input // @@ -63,6 +64,7 @@ struct JIT_COMPILE_INPUT size_t irLength; bool enableJitSampler; + }; extern "C" -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 10/10] swr/rast: Fetch compile state changes
Add InstanceStrideEnable field and rename InstanceDataStepRate to InstanceAdvancementState in INPUT_ELEMENT_DESC structure. Add stubs for handling InstanceStrideEnable in FetchJit::JitLoadVertices() and FetchJit::JitGatherVertices() and assert if they are triggered. --- src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 12 ++-- src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h | 7 --- src/gallium/drivers/swr/swr_state.cpp | 2 +- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 761c58c..f3a4b27 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -360,7 +360,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* str Value *startOffset; if(ied.InstanceEnable) { -Value* stepRate = C(ied.InstanceDataStepRate); +Value* stepRate = C(ied.InstanceAdvancementState); // prevent a div by 0 for 0 step rate Value* isNonZeroStep = ICMP_UGT(stepRate, C(0)); @@ -376,6 +376,10 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* str startOffset = startInstance; } +else if (ied.InstanceStrideEnable) +{ +SWR_ASSERT((0), "TODO: Fill out more once driver sends this down."); +} else { // offset indices by baseVertex @@ -825,7 +829,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *startOffset; if(ied.InstanceEnable) { -Value* stepRate = C(ied.InstanceDataStepRate); +Value* stepRate = C(ied.InstanceAdvancementState); // prevent a div by 0 for 0 step rate Value* isNonZeroStep = ICMP_UGT(stepRate, C(0)); @@ -841,6 +845,10 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, startOffset = startInstance; } +else if (ied.InstanceStrideEnable) +{ +SWR_ASSERT((0), "TODO: Fill out more once driver sends this down."); +} else { // offset indices by baseVertex diff 
--git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h index 4f456af..0dd6de7 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h @@ -45,16 +45,17 @@ struct INPUT_ELEMENT_DESC uint32_tFormat : 10; uint32_tStreamIndex : 6; uint32_tInstanceEnable : 1; +uint32_tInstanceStrideEnable : 1; uint32_tComponentControl0 : 3; uint32_tComponentControl1 : 3; uint32_tComponentControl2 : 3; uint32_tComponentControl3 : 3; uint32_tComponentPacking : 4; -uint32_t_reserved : 19; +uint32_t_reserved : 18; }; uint64_t bits; }; -uint32_t InstanceDataStepRate; +uint32_t InstanceAdvancementState; }; // used to set ComponentPacking @@ -124,7 +125,7 @@ struct FETCH_COMPILE_STATE { if((layout[i].bits != other.layout[i].bits) || ((layout[i].InstanceEnable == 1) && -(layout[i].InstanceDataStepRate != other.layout[i].InstanceDataStepRate))){ +(layout[i].InstanceAdvancementState != other.layout[i].InstanceAdvancementState))){ return false; } } diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp index 1491868..93108de 100644 --- a/src/gallium/drivers/swr/swr_state.cpp +++ b/src/gallium/drivers/swr/swr_state.cpp @@ -531,7 +531,7 @@ swr_create_vertex_elements_state(struct pipe_context *pipe, ? ComponentControl::StoreSrc : ComponentControl::Store1Fp; velems->fsState.layout[i].ComponentPacking = ComponentEnable::XYZW; - velems->fsState.layout[i].InstanceDataStepRate = + velems->fsState.layout[i].InstanceAdvancementState = attribs[i].instance_divisor; /* Calculate the pitch of each stream */ -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 05/10] swr/rast: Migrate memory pointers to gfxptr_t type
--- .../swr/rasterizer/codegen/gen_llvm_types.py| 2 +- src/gallium/drivers/swr/rasterizer/core/state.h | 5 +++-- .../drivers/swr/rasterizer/memory/StoreTile.h | 4 ++-- .../drivers/swr/rasterizer/memory/TilingFunctions.h | 2 +- src/gallium/drivers/swr/swr_context.cpp | 18 +- src/gallium/drivers/swr/swr_draw.cpp| 8 src/gallium/drivers/swr/swr_resource.h | 2 +- src/gallium/drivers/swr/swr_screen.cpp | 21 ++--- src/gallium/drivers/swr/swr_state.cpp | 10 +- 9 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py index 94f3f9f..ccf2bde 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py @@ -42,7 +42,7 @@ def gen_llvm_type(type, name, is_pointer, is_pointer_pointer, is_array, is_array else: if type == 'BYTE' or type == 'char' or type == 'uint8_t' or type == 'int8_t' or type == 'bool': llvm_type = 'Type::getInt8Ty(ctx)' -elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t': +elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t' or type == 'gfxptr_t': llvm_type = 'Type::getInt64Ty(ctx)' elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t': llvm_type = 'Type::getInt16Ty(ctx)' diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index b0af663..13c1d8b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -29,6 +29,7 @@ #include "common/formats.h" #include "common/intrin.h" +using gfxptr_t = unsigned long long; #include #include @@ -513,7 +514,7 @@ enum SWR_AUX_MODE // struct SWR_SURFACE_STATE { -uint8_t *pBaseAddress; +gfxptr_t xpBaseAddress; SWR_SURFACE_TYPE type; // @llvm_enum SWR_FORMAT format; // @llvm_enum uint32_t width; @@ -536,7 +537,7 @@ struct 
SWR_SURFACE_STATE uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces -uint8_t *pAuxBaseAddress; // Used for compression, append/consume counter, etc. +gfxptr_t xpAuxBaseAddress; // Used for compression, append/consume counter, etc. SWR_AUX_MODE auxMode; // @llvm_enum diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h index c3d14e9..512c338 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h @@ -1179,7 +1179,7 @@ struct StoreRasterTile resolveColor[3] *= oneOverNumSamples; // Use the resolve surface state -SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->pAuxBaseAddress; +SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress; uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress((x + rx), (y + ry), pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex, 0, pResolveSurface->lod, pResolveSurface); @@ -2390,7 +2390,7 @@ struct StoreMacroTile } } -if (pDstSurface->pAuxBaseAddress) +if (pDstSurface->xpAuxBaseAddress) { uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); // Store each raster tile from the hot tile to the destination surface. 
diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h index 9222d3e..6c801c7 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h +++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h @@ -694,5 +694,5 @@ template INLINE void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) { -return pState->pBaseAddress + ComputeSurfaceOffset(x, y, z, array, sampleNum, lod, pState); +return (void*)(pState->xpBaseAddress + ComputeSurfaceOffset(x, y, z, array, sampleNum, lod, pState)); } diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp index c058870..e95bd3b 100644 --- a/src/gallium/drivers/swr/swr_context.cpp +++ b/src/gallium/drivers/swr/swr_context.cpp @@ -152,12 +152,12 @@ swr_transfer_map(struct pipe_context *pipe, for (int y = box->y; y < box->y + box->height; y++) {
[Mesa-dev] [PATCH 04/10] swr/rast: Remove hardcoded clip/cull slot from clipper
--- src/gallium/drivers/swr/rasterizer/core/clip.h | 35 +++--- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index e0aaf81..cde5261 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -372,13 +372,15 @@ public: int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[]) { uint8_t cullMask = state.backendState.cullDistanceMask; +uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset; + typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps(); typename SIMD_T::Vec4 vClipCullDistLo[3]; typename SIMD_T::Vec4 vClipCullDistHi[3]; -pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo); -pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi); +pa.Assemble(vertexClipCullOffset, vClipCullDistLo); +pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi); DWORD index; while (_BitScanForward(&index, cullMask)) @@ -488,21 +490,22 @@ public: } // assemble user clip distances if enabled +uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset; if (state.backendState.clipDistanceMask & 0xf) { -pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector); +pa.Assemble(vertexClipCullSlot, tmpVector); for (uint32_t i = 0; i < NumVertsPerPrim; ++i) { -vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i]; +vertices[i].attrib[vertexClipCullSlot] = tmpVector[i]; } } if (state.backendState.clipDistanceMask & 0xf0) { -pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector); +pa.Assemble(vertexClipCullSlot + 1, tmpVector); for (uint32_t i = 0; i < NumVertsPerPrim; ++i) { -vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i]; +vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i]; } } @@ -613,26 +616,27 @@ public: } // transpose user clip distances if enabled +uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset; if 
(state.backendState.clipDistanceMask & 0x0f) { -pBase = reinterpret_cast(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim; +pBase = reinterpret_cast(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim; for (uint32_t c = 0; c < 4; ++c) { SIMD256::Float temp = SIMD256::template mask_i32gather_ps(SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); -transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = SimdHelper::insert_lo_ps(temp); +transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper::insert_lo_ps(temp); pBase += sizeof(typename SIMD_T::Float); } } if (state.backendState.clipDistanceMask & 0xf0) { -pBase = reinterpret_cast(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim; +pBase = reinterpret_cast(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim; for (uint32_t c = 0; c < 4; ++c) { SIMD256::Float temp = SIMD256::template mask_i32gather_ps(SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); -transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = SimdHelper::insert_lo_ps(temp); +transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper::insert_lo_ps(temp); pBase += sizeof(typename SIMD_T::Float); } } @@ -692,6 +696,7 @@ public: // OOB indices => forced to zero. typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); +vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si()); typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); viewportIdx = SIMD_T::and_si(vClearMask, vpai); @@ -822,6 +827,7 @@ private: float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4. { uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; +uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset; // compute interpol
[Mesa-dev] [PATCH 06/10] swr/rast: add graph write to jit debug output
--- src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index fc32b62..e4281f8 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -296,10 +296,10 @@ void JitManager::DumpToFile(Function *f, const char *fileName) #endif fd.flush(); -//raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text); -//WriteGraph(fd_cfg, (const Function*)f); +raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text); +WriteGraph(fd_cfg, (const Function*)f); -//fd_cfg.flush(); +fd_cfg.flush(); } } -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 03/10] swr/rast: Start to remove hardcoded clipcull_dist vertex attrib slot
Add new field in SWR_BACKEND_STATE::vertexClipCullOffset to specify the start of the clip/cull section of the vertex header. Removed use of hardcoded slot from binner. --- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 11 ++- src/gallium/drivers/swr/rasterizer/core/state.h| 9 ++--- src/gallium/drivers/swr/swr_state.cpp | 3 +++ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 19afd1f..a6713e8 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -366,16 +366,17 @@ PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzl /// @param clipDistMask - mask of enabled clip distances /// @param pUserClipBuffer - buffer to store results template -void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float *pRecipW, float* pUserClipBuffer) +void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer) { DWORD clipDist; +uint32_t clipDistMask = state.clipDistanceMask; while (_BitScanForward(&clipDist, clipDistMask)) { clipDistMask &= ~(1 << clipDist); uint32_t clipSlot = clipDist >> 2; uint32_t clipComp = clipDist & 0x3; uint32_t clipAttribSlot = clipSlot == 0 ? 
-VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT; +state.vertexClipCullOffset : state.vertexClipCullOffset + 1; simd4scalar primClipDist[3]; pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist); @@ -872,7 +873,7 @@ endBinTriangles: { uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask); desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); -ProcessUserClipDist<3>(pa, triIndex, state.backendState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer); +ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer); } for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) @@ -1248,7 +1249,7 @@ void BinPostSetupPointsImpl( desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); float dists[8]; float one = 1.0f; -ProcessUserClipDist<1>(pa, primIndex, backendState.clipDistanceMask, &one, dists); +ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists); for (uint32_t i = 0; i < numClipDist; i++) { desc.pUserClipBuffer[3 * i + 0] = 0.0f; desc.pUserClipBuffer[3 * i + 1] = 0.0f; @@ -1577,7 +1578,7 @@ void BinPostSetupLinesImpl( { uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask); desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); -ProcessUserClipDist<2>(pa, primIndex, state.backendState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer); +ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer); } MacroTileMgr *pTileMgr = pDC->pTileMgr; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 284c523..b0af663 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -1070,12 +1070,15 @@ struct SWR_BACKEND_STATE bool readRenderTargetArrayIndex;// Forward render target array index from last FE stage 
to the backend bool readViewportArrayIndex;// Read viewport array index from last FE stage during binning -// user clip/cull distance enables + // Offset to the start of the attributes of the input vertices, in simdvector units +uint32_t vertexAttribOffset; + +// User clip/cull distance enables uint8_t cullDistanceMask; uint8_t clipDistanceMask; - // Offset to the start of the attributes of the input vertices, in simdvector units -uint32_t vertexAttribOffset; +// Offset to clip/cull attrib section of the vertex, in simdvector units +uint32_t vertexClipCullOffset; }; diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp index d5b553b..69a4473 100644 --- a/src/gallium/drivers/swr/swr_state.cpp +++ b/src/gallium/drivers/swr/swr_state.cpp @@ -1766,6 +1766,9 @@ swr_update_derived(struct pipe_context *pipe, backendState.cullDistanceMask = ctx->vs->info.base.culldist_writemask << ctx->vs->info.base.num_written_clipdistance; + // Assume old layout of SGV, POSITION, CLIPCULL, ATTRIB + backendState.vertexClip
[Mesa-dev] [PATCH 09/10] swr/rast: adjust linux cpu topology identification code
Make the code more robust so that it handles strange configurations like a vmware exported 4-way numa X 1-core configuration. --- .../drivers/swr/rasterizer/core/threads.cpp| 81 ++ 1 file changed, 38 insertions(+), 43 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index b704d23..4bb395d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -169,37 +169,16 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread std::ifstream input("/proc/cpuinfo"); std::string line; char* c; -uint32_t threadId = uint32_t(-1); +uint32_t procId = uint32_t(-1); uint32_t coreId = uint32_t(-1); -uint32_t numaId = uint32_t(-1); +uint32_t physId = uint32_t(-1); while (std::getline(input, line)) { if (line.find("processor") != std::string::npos) { -if (threadId != uint32_t(-1)) -{ -// Save information. -if (out_nodes.size() <= numaId) -{ -out_nodes.resize(numaId + 1); -} - -auto& numaNode = out_nodes[numaId]; -if (numaNode.cores.size() <= coreId) -{ -numaNode.cores.resize(coreId + 1); -} - -auto& core = numaNode.cores[coreId]; -core.procGroup = coreId; -core.threadIds.push_back(threadId); - -out_numThreadsPerProcGroup++; -} - auto data_start = line.find(": ") + 2; -threadId = std::strtoul(&line.c_str()[data_start], &c, 10); +procId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } if (line.find("core id") != std::string::npos) @@ -211,29 +190,32 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread if (line.find("physical id") != std::string::npos) { auto data_start = line.find(": ") + 2; -numaId = std::strtoul(&line.c_str()[data_start], &c, 10); +physId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } +if (line.length() == 0) +{ +if (physId + 1 > out_nodes.size()) +out_nodes.resize(physId + 1); +auto& numaNode = out_nodes[physId]; +numaNode.numaId = physId; + 
+if (coreId + 1 > numaNode.cores.size()) +numaNode.cores.resize(coreId + 1); +auto& core = numaNode.cores[coreId]; +core.procGroup = coreId; +core.threadIds.push_back(procId); +} } -if (threadId != uint32_t(-1)) +out_numThreadsPerProcGroup = 0; +for (auto &node : out_nodes) { -// Save information. -if (out_nodes.size() <= numaId) +for (auto &core : node.cores) { -out_nodes.resize(numaId + 1); +out_numThreadsPerProcGroup = std::max((size_t)out_numThreadsPerProcGroup, + core.threadIds.size()); } -auto& numaNode = out_nodes[numaId]; -numaNode.numaId = numaId; -if (numaNode.cores.size() <= coreId) -{ -numaNode.cores.resize(coreId + 1); -} -auto& core = numaNode.cores[coreId]; - -core.procGroup = coreId; -core.threadIds.push_back(threadId); -out_numThreadsPerProcGroup++; } #else @@ -316,7 +298,11 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = CPU_ZERO(&cpuset); CPU_SET(threadId, &cpuset); -pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); +int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); +if (err != 0) +{ +fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err)); +} #endif } @@ -1031,7 +1017,16 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) } else { -pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.) +// numa distribution assumes workers on all nodes +bool useNuma = true; +if (numCoresPerNode * numHyperThreads == 1) +useNuma = false; + +if (useNuma) { +pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.) +} else { +pPool->numaMask = 0; +} uint32_t workerId = 0; for (uint32_t n = 0; n < numNodes; ++n) @@ -1064,7 +1059,7 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) pPool->pThreadData[workerId].workerId = workerId; pPool->pThreadData[workerId].procGroupId = core.procGrou
[Mesa-dev] [PATCH 02/10] swr/rast: Move clip/cull enables in API
Moved from SWR_RASTSTATE to SWR_BACKEND_STATE. --- .../drivers/swr/rasterizer/core/backend.cpp| 4 ++-- .../drivers/swr/rasterizer/core/backend_impl.h | 2 +- .../drivers/swr/rasterizer/core/backend_sample.cpp | 4 ++-- .../swr/rasterizer/core/backend_singlesample.cpp | 4 ++-- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 18 +- src/gallium/drivers/swr/rasterizer/core/clip.h | 22 +++--- .../drivers/swr/rasterizer/core/rasterizer.cpp | 2 +- src/gallium/drivers/swr/rasterizer/core/state.h| 8 src/gallium/drivers/swr/swr_state.cpp | 16 9 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 363349f..6282e87 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -272,9 +272,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, AR_END(BEBarycentric, 0); // interpolate user clip distance if available -if (state.rastState.clipDistanceMask) +if (state.backendState.clipDistanceMask) { -coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); +coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); } simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h index 0f430ef..593082b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h @@ -886,7 +886,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t AR_END(BESetup, 0); -PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask); 
+PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.backendState.clipDistanceMask); psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast(y))); psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast(y))); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp index bb2e9a9..04e34aa 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp @@ -128,9 +128,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ AR_END(BEBarycentric, 0); // interpolate user clip distance if available -if (state.rastState.clipDistanceMask) +if (state.backendState.clipDistanceMask) { -coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); +coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); } simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp index 18f4299..686b979 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp @@ -112,9 +112,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 AR_END(BEBarycentric, 1); // interpolate user clip distance if available -if (state.rastState.clipDistanceMask) +if (state.backendState.clipDistanceMask) { -coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center); +coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.center, psContext.vJ.center); } simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 01c2f8f..19afd1f 100644 --- a/src/gallium/drivers/swr/rasteri
[Mesa-dev] [PATCH 00/10] swr: update rasterizer
Mostly some api changes, plus making the cpu topology code a bit more robust in the face of some odd configurations seen in virtualized environments. No piglit or vtk ctest regressions. Tim Rowley (10): swr/rast: Add new API SwrStallBE swr/rast: Move clip/cull enables in API swr/rast: Start to remove hardcoded clipcull_dist vertex attrib slot swr/rast: Remove hardcoded clip/cull slot from clipper swr/rast: Migrate memory pointers to gfxptr_t type swr/rast: add graph write to jit debug output swr/rast: whitespace changes swr/rast: Missed conversion to SIMD_T swr/rast: adjust linux cpu topology identification code swr/rast: Fetch compile state changes .../swr/rasterizer/codegen/gen_llvm_types.py | 2 +- src/gallium/drivers/swr/rasterizer/core/api.cpp| 9 +++ src/gallium/drivers/swr/rasterizer/core/api.h | 8 +++ .../drivers/swr/rasterizer/core/backend.cpp| 4 +- .../drivers/swr/rasterizer/core/backend_impl.h | 2 +- .../drivers/swr/rasterizer/core/backend_sample.cpp | 4 +- .../swr/rasterizer/core/backend_singlesample.cpp | 4 +- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 25 +++ src/gallium/drivers/swr/rasterizer/core/clip.h | 57 --- .../drivers/swr/rasterizer/core/rasterizer.cpp | 2 +- src/gallium/drivers/swr/rasterizer/core/state.h| 16 +++-- .../drivers/swr/rasterizer/core/threads.cpp| 81 ++ .../drivers/swr/rasterizer/jitter/JitManager.cpp | 6 +- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 12 +++- .../drivers/swr/rasterizer/jitter/fetch_jit.h | 7 +- .../drivers/swr/rasterizer/jitter/jit_api.h| 2 + .../drivers/swr/rasterizer/memory/StoreTile.h | 4 +- .../swr/rasterizer/memory/TilingFunctions.h| 2 +- src/gallium/drivers/swr/swr_context.cpp| 18 ++--- src/gallium/drivers/swr/swr_draw.cpp | 8 +-- src/gallium/drivers/swr/swr_resource.h | 2 +- src/gallium/drivers/swr/swr_screen.cpp | 21 +++--- src/gallium/drivers/swr/swr_state.cpp | 31 + 23 files changed, 182 insertions(+), 145 deletions(-) -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org 
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 6/8] swr/rast: SIMD16 FE remove templated immediates workaround
Fixed properly in gcc-compatible fashion. --- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 110 - 1 file changed, 20 insertions(+), 90 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index e09ff7a..832c47d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -404,35 +404,6 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, } } -// WA linux compiler issue with SIMDLIB and shift immediates -#define SIMD_WA_SXXI_EPI32 1 - -#if SIMD_WA_SXXI_EPI32 -template -simdscalari simd_wa_slli_epi32(simdscalari a) -{ -return SIMD256::slli_epi32(a); -} - -template -simd16scalari simd_wa_slli_epi32(simd16scalari a) -{ -return SIMD512::slli_epi32(a); -} - -template -simdscalari simd_wa_srai_epi32(simdscalari a) -{ -return SIMD256::srai_epi32(a); -} - -template -simd16scalari simd_wa_srai_epi32(simd16scalari a) -{ -return SIMD512::srai_epi32(a); -} - -#endif INLINE void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2) { @@ -804,17 +775,10 @@ endBinTriangles: } // Convert triangle bbox to macrotile units. 
-#if SIMD_WA_SXXI_EPI32 -bbox.xmin = simd_wa_srai_epi32(bbox.xmin); -bbox.ymin = simd_wa_srai_epi32(bbox.ymin); -bbox.xmax = simd_wa_srai_epi32(bbox.xmax); -bbox.ymax = simd_wa_srai_epi32(bbox.ymax); -#else -bbox.xmin = SIMD_T::srai_epi32(bbox.xmin); -bbox.ymin = SIMD_T::srai_epi32(bbox.ymin); -bbox.xmax = SIMD_T::srai_epi32(bbox.xmax); -bbox.ymax = SIMD_T::srai_epi32(bbox.ymax); -#endif +bbox.xmin = SIMD_T::template srai_epi32(bbox.xmin); +bbox.ymin = SIMD_T::template srai_epi32(bbox.ymin); +bbox.xmax = SIMD_T::template srai_epi32(bbox.xmax); +bbox.ymax = SIMD_T::template srai_epi32(bbox.ymax); OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; @@ -1034,13 +998,8 @@ void BinPostSetupPointsImpl( primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi)); // compute macro tile coordinates -#if SIMD_WA_SXXI_EPI32 -typename SIMD_T::Integer macroX = simd_wa_srai_epi32(vXi); -typename SIMD_T::Integer macroY = simd_wa_srai_epi32(vYi); -#else -typename SIMD_T::Integer macroX = SIMD_T::srai_epi32(vXi); -typename SIMD_T::Integer macroY = SIMD_T::srai_epi32(vYi); -#endif +typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32(vXi); +typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32(vYi); OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH]; @@ -1048,30 +1007,15 @@ void BinPostSetupPointsImpl( SIMD_T::store_si(reinterpret_cast(aMacroY), macroY); // compute raster tile coordinates -#if SIMD_WA_SXXI_EPI32 -typename SIMD_T::Integer rasterX = simd_wa_srai_epi32(vXi); -typename SIMD_T::Integer rasterY = simd_wa_srai_epi32(vYi); -#else -typename SIMD_T::Integer rasterX = SIMD_T::srai_epi32(vXi); -typename SIMD_T::Integer rasterY = SIMD_T::srai_epi32(vYi); -#endif +typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32(vXi); +typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32(vYi); // compute raster tile relative x,y for coverage mask -#if SIMD_WA_SXXI_EPI32 -typename 
SIMD_T::Integer tileAlignedX = simd_wa_slli_epi32(rasterX); -typename SIMD_T::Integer tileAlignedY = simd_wa_slli_epi32(rasterY); -#else -typename SIMD_T::Integer tileAlignedX = SIMD_T::slli_epi32(rasterX); -typename SIMD_T::Integer tileAlignedY = SIMD_T::slli_epi32(rasterY); -#endif +typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32(rasterX); +typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32(rasterY); -#if SIMD_WA_SXXI_EPI32 -typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(simd_wa_srai_epi32(vXi), tileAlignedX); -typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(simd_wa_srai_epi32(vYi), tileAlignedY); -#else -typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::srai_epi32(vXi), tileAlignedX); -typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::srai_epi32(vYi), tileAlignedY); -#endif +typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32(vXi), tileAlignedX); +typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32(vYi), tileAlignedY); OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH]; OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH]; @@ -1223,17 +1167,10 @@ void BinPostSetupPointsImpl( primMask = primMask & ~maskOutsideScissor;
[Mesa-dev] [PATCH 7/8] swr/rast: Remove use of C++14 template variable
SWR rasterizer must remain C++11 compliant. --- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 6 +++--- src/gallium/drivers/swr/rasterizer/core/binner.h | 14 +++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 832c47d..01c2f8f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -502,7 +502,7 @@ void SIMDCALL BinTrianglesImpl( } // Adjust for pixel center location -typename SIMD_T::Float offset = g_pixelOffsets[rastState.pixelLocation]; +typename SIMD_T::Float offset = SwrPixelOffsets::GetOffset(rastState.pixelLocation); tri[0].x = SIMD_T::add_ps(tri[0].x, offset); tri[0].y = SIMD_T::add_ps(tri[0].y, offset); @@ -1332,7 +1332,7 @@ void BinPointsImpl( } } -typename SIMD_T::Float offset = g_pixelOffsets[rastState.pixelLocation]; +typename SIMD_T::Float offset = SwrPixelOffsets::GetOffset(rastState.pixelLocation); prim[0].x = SIMD_T::add_ps(prim[0].x, offset); prim[0].y = SIMD_T::add_ps(prim[0].y, offset); @@ -1666,7 +1666,7 @@ void SIMDCALL BinLinesImpl( } // adjust for pixel center location -typename SIMD_T::Float offset = g_pixelOffsets[rastState.pixelLocation]; +typename SIMD_T::Float offset = SwrPixelOffsets::GetOffset(rastState.pixelLocation); prim[0].x = SIMD_T::add_ps(prim[0].x, offset); prim[0].y = SIMD_T::add_ps(prim[0].y, offset); diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h b/src/gallium/drivers/swr/rasterizer/core/binner.h index e842aa6..97e113f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.h +++ b/src/gallium/drivers/swr/rasterizer/core/binner.h @@ -31,11 +31,19 @@ // /// @brief Offsets added to post-viewport vertex positions based on /// raster state. +/// +/// Can't use templated variable because we must stick with C++11 features. 
+/// Template variables were introduced with C++14 template -static const typename SIMD_T::Float g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] = +struct SwrPixelOffsets { -SIMD_T::set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER -SIMD_T::set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL +public: +INLINE static typename SIMD_T::Float GetOffset(uint32_t loc) +{ +SWR_ASSERT(loc <= 1); + +return SIMD_T::set1_ps(loc ? 0.5f : 0.0f); +} }; // -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 0/8] swr: update rasterizer
Highlight is starting to unify the simd/simd16 code, removing lots of temporary code duplication. No piglit or vtk test regressions. Tim Rowley (8): swr/rast: Allow gather of floats from fetch shader with 2-4GB offsets swr: set caps for VB 4-byte alignment swr/rast: Removed some trailing whitespace caught during review swr/rast: FE/Binner - unify SIMD8/16 functions using simdlib types swr/rast: SIMD16 PA - rename Assemble_simd16 to Assemble swr/rast: SIMD16 FE remove templated immediates workaround swr/rast: Remove use of C++14 template variable swr/rast: FE/Clipper - unify SIMD8/16 functions using simdlib types .../swr/rasterizer/codegen/gen_llvm_ir_macros.py |1 + .../codegen/templates/gen_ar_eventhandlerfile.hpp |4 +- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 2312 ++-- src/gallium/drivers/swr/rasterizer/core/binner.h | 192 +- src/gallium/drivers/swr/rasterizer/core/clip.cpp | 16 +- src/gallium/drivers/swr/rasterizer/core/clip.h | 1654 -- .../drivers/swr/rasterizer/core/conservativeRast.h |1 + src/gallium/drivers/swr/rasterizer/core/fifo.hpp |4 +- .../drivers/swr/rasterizer/core/frontend.cpp |6 +- src/gallium/drivers/swr/rasterizer/core/pa.h | 20 +- src/gallium/drivers/swr/rasterizer/core/state.h|7 + src/gallium/drivers/swr/rasterizer/core/utils.h|8 + .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|7 +- src/gallium/drivers/swr/swr_screen.cpp |9 +- 14 files changed, 1193 insertions(+), 3048 deletions(-) -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 5/8] swr/rast: SIMD16 PA - rename Assemble_simd16 to Assemble
For consistency and to support overloading. --- src/gallium/drivers/swr/rasterizer/core/clip.h | 18 +- .../drivers/swr/rasterizer/core/frontend.cpp | 6 +++--- src/gallium/drivers/swr/rasterizer/core/pa.h | 22 +++--- 3 files changed, 15 insertions(+), 31 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index ffc69c4..5238284 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -399,8 +399,8 @@ public: simd16vector vClipCullDistLo[3]; simd16vector vClipCullDistHi[3]; -pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo); -pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi); +pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo); +pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi); DWORD index; while (_BitScanForward(&index, cullMask)) @@ -680,7 +680,7 @@ public: { #if USE_SIMD16_FRONTEND simd16vector attrib_simd16[NumVertsPerPrim]; -bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16); +bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16); if (assemble) { @@ -731,7 +731,7 @@ public: // assemble pos simd16vector tmpVector[NumVertsPerPrim]; -pa.Assemble_simd16(VERTEX_POSITION_SLOT, tmpVector); +pa.Assemble(VERTEX_POSITION_SLOT, tmpVector); for (uint32_t i = 0; i < NumVertsPerPrim; ++i) { vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i]; @@ -748,7 +748,7 @@ public: maxSlot = std::max(maxSlot, mapSlot); uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot; -pa.Assemble_simd16(inputSlot, tmpVector); +pa.Assemble(inputSlot, tmpVector); // if constant interpolation enabled for this attribute, assign the provoking // vertex values to all edges @@ -771,7 +771,7 @@ public: // assemble user clip distances if enabled if (this->state.rastState.clipDistanceMask & 0xf) { -pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector); 
+pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector); for (uint32_t i = 0; i < NumVertsPerPrim; ++i) { vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i]; @@ -780,7 +780,7 @@ public: if (this->state.rastState.clipDistanceMask & 0xf0) { -pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector); +pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector); for (uint32_t i = 0; i < NumVertsPerPrim; ++i) { vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i]; @@ -919,7 +919,7 @@ public: do { simd16vector attrib[NumVertsPerPrim]; -bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib); +bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib); if (assemble) { @@ -1060,7 +1060,7 @@ public: if (state.backendState.readViewportArrayIndex) { simd16vector vpiAttrib[NumVertsPerPrim]; -pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib); +pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); // OOB indices => forced to zero. simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 406a0e0..f882869 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -929,7 +929,7 @@ static void GeometryShaderStage( #if USE_SIMD16_FRONTEND simd16vector attrib_simd16[3]; -bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16); +bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16); #else bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib); @@ -1297,7 +1297,7 @@ static void TessellationStages( AR_BEGIN(FEPAAssemble, pDC->drawId); bool assemble = #if USE_SIMD16_FRONTEND -tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16); +tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16); #else tessPa.Assemble(VERTEX_POSITION_SLOT, prim); #endif @@ -1646,7 +1646,7 @@ void ProcessDraw( simd16vector pri
[Mesa-dev] [PATCH 8/8] swr/rast: FE/Clipper - unify SIMD8/16 functions using simdlib types
--- src/gallium/drivers/swr/rasterizer/core/clip.cpp | 16 +- src/gallium/drivers/swr/rasterizer/core/clip.h | 1650 ++ src/gallium/drivers/swr/rasterizer/core/state.h |7 + 3 files changed, 465 insertions(+), 1208 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index 4b5512c..a40f077 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -32,9 +32,9 @@ #include "core/clip.h" // Temp storage used by the clipper -THREAD simdvertex tlsTempVertices[7]; +THREAD SIMDVERTEX_T tlsTempVertices[7]; #if USE_SIMD16_FRONTEND -THREAD simd16vertex tlsTempVertices_simd16[7]; +THREAD SIMDVERTEX_T tlsTempVertices_simd16[7]; #endif float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) @@ -164,7 +164,7 @@ void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvecto { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEClipTriangles, pDC->drawId); -Clipper<3> clipper(workerId, pDC); +Clipper clipper(workerId, pDC); clipper.ExecuteStage(pa, prims, primMask, primId); AR_END(FEClipTriangles, 1); } @@ -173,7 +173,7 @@ void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector pr { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEClipLines, pDC->drawId); -Clipper<2> clipper(workerId, pDC); +Clipper clipper(workerId, pDC); clipper.ExecuteStage(pa, prims, primMask, primId); AR_END(FEClipLines, 1); } @@ -182,7 +182,7 @@ void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector p { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEClipPoints, pDC->drawId); -Clipper<1> clipper(workerId, pDC); +Clipper clipper(workerId, pDC); clipper.ExecuteStage(pa, prims, primMask, primId); AR_END(FEClipPoints, 1); } @@ -195,7 +195,7 @@ void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t wor enum { VERTS_PER_PRIM = 3 }; -Clipper clipper(workerId, pDC); +Clipper 
clipper(workerId, pDC); pa.useAlternateOffset = false; clipper.ExecuteStage(pa, prims, primMask, primId); @@ -210,7 +210,7 @@ void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerI enum { VERTS_PER_PRIM = 2 }; -Clipper clipper(workerId, pDC); +Clipper clipper(workerId, pDC); pa.useAlternateOffset = false; clipper.ExecuteStage(pa, prims, primMask, primId); @@ -225,7 +225,7 @@ void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t worker enum { VERTS_PER_PRIM = 1 }; -Clipper clipper(workerId, pDC); +Clipper clipper(workerId, pDC); pa.useAlternateOffset = false; clipper.ExecuteStage(pa, prims, primMask, primId); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 5238284..d7b559b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -33,9 +33,9 @@ #include "rdtsc_core.h" // Temp storage used by the clipper -extern THREAD simdvertex tlsTempVertices[7]; +extern THREAD SIMDVERTEX_T tlsTempVertices[7]; #if USE_SIMD16_FRONTEND -extern THREAD simd16vertex tlsTempVertices_simd16[7]; +extern THREAD SIMDVERTEX_T tlsTempVertices_simd16[7]; #endif enum SWR_CLIPCODES @@ -61,29 +61,29 @@ enum SWR_CLIPCODES #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW) -INLINE -void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari const &viewportIndexes) +template +void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes) { -clipCodes = _simd_setzero_ps(); +clipCodes = SIMD_T::setzero_ps(); // -w -simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f)); +typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f)); // FRUSTUM_LEFT -simdscalar vRes = 
_simd_cmplt_ps(vertex.x, vNegW); -clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT))); +typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW); +clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT))); // FRUSTUM_TOP -vRes = _simd_cmplt_ps(vertex.y, vNegW); -clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP; +vRes = SIMD_T::cmplt_ps(vertex.y, vNegW); +clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP; // FRUSTUM_RIGHT -vRes = _simd_cmpgt_ps(vertex.x, vertex.w); -clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes,
[Mesa-dev] [PATCH 2/8] swr: set caps for VB 4-byte alignment
Needed to compensate for change to fetch jit requiring alignment. Fixes regressions in piglit: vertex-buffer-offsets and about another hundred of the vs-input*byte* tests. --- src/gallium/drivers/swr/swr_screen.cpp | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp index cc8d995..85bf765 100644 --- a/src/gallium/drivers/swr/swr_screen.cpp +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -263,6 +263,12 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_FAKE_SW_MSAA: return (swr_screen(screen)->msaa_max_count > 1) ? 0 : 1; + /* fetch jit change for 2-4GB buffers requires alignment */ + case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: + return 1; + /* unsupported features */ case PIPE_CAP_ANISOTROPIC_FILTER: case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: @@ -274,9 +280,6 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_COMPUTE: case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: - case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: case PIPE_CAP_TGSI_TEXCOORD: case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/8] swr/rast: Allow gather of floats from fetch shader with 2-4GB offsets
--- src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py | 1 + src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 7 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index 2ed2b2f..025d38a 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -45,6 +45,7 @@ intrinsics = [ ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']], +['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']], ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']], ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']], ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index dcfe897..761c58c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -1005,7 +1005,12 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *vMask = vGatherMask; // Gather a SIMD of vertices -vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); +// APIs allow a 4GB range for offsets +// However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( +// But, we know that elements must be aligned for FETCH. :) +// Right shift the offset by a bit and then scale by 2 to remove the sign extension. 
+Value* vShiftedOffsets = VPSRLI(vOffsets, C(1)); +vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2)); } else { -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/8] swr/rast: Removed some trailing whitespace caught during review
--- .../rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp | 4 ++-- src/gallium/drivers/swr/rasterizer/core/fifo.hpp | 4 ++-- src/gallium/drivers/swr/rasterizer/core/pa.h | 12 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp index 0ca9a78..d1852b3 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp @@ -23,7 +23,7 @@ * @file ${filename} * * @brief Event handler interface. auto-generated file -* +* * DO NOT EDIT * * Generation Command Line: @@ -57,7 +57,7 @@ namespace ArchRast std::stringstream outDir; outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; CreateDirectory(outDir.str().c_str(), NULL); - + // There could be multiple threads creating thread pools. We // want to make sure they are uniquly identified by adding in // the creator's thread id into the filename. 
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp index 3be72f3..43d3a83 100644 --- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp +++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp @@ -79,7 +79,7 @@ struct QUEUE long initial = InterlockedCompareExchange(&mLock, 1, 0); return (initial == 0); } - + void unlock() { mLock = 0; @@ -112,7 +112,7 @@ struct QUEUE __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH); _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc); }; - + const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4); static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T), "FIFO element size should be multiple of SIMD width."); diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index cb3470f..87dba22 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -162,7 +162,7 @@ struct PA_STATE_OPT : public PA_STATE bool isStreaming{ false }; SIMDMASK junkIndices { 0 }; // temporary index store for unused virtual function - + PA_STATE_OPT() {} PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); @@ -412,7 +412,7 @@ struct PA_STATE_CUT : public PA_STATE uint32_t vertsPerPrim{ 0 }; bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they // are ignored. 
Fetch shader sends invalid verts on cuts that should be ignored - // while the GS sends valid verts for every index + // while the GS sends valid verts for every index simdvector junkVector; // junk simdvector for unimplemented API #if ENABLE_AVX512_SIMD16 @@ -575,7 +575,7 @@ struct PA_STATE_CUT : public PA_STATE return CheckBit(this->pCutIndices[vertexIndex], vertexOffset); } -// iterates across the unprocessed verts until we hit the end or we +// iterates across the unprocessed verts until we hit the end or we // have assembled SIMD prims void ProcessVerts() { @@ -583,7 +583,7 @@ struct PA_STATE_CUT : public PA_STATE this->numRemainingVerts > 0 && this->curVertex != this->headVertex) { -// if cut index, restart topology +// if cut index, restart topology if (IsCutIndex(this->curVertex)) { if (this->processCutVerts) @@ -923,7 +923,7 @@ struct PA_STATE_CUT : public PA_STATE case 6: SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!"); AssembleTriStripAdj(); - + uint32_t nextTri[6]; if (this->reverseWinding) { @@ -939,7 +939,7 @@ struct PA_STATE_CUT : public PA_STATE nextTri[1] = this->adjExtraVert; nextTri[2] = this->vert[3]; nextTri[4] = this->vert[4]; -nextTri[5] = this->vert[0]; +nextTri[5] = this->vert[0]; } for (uint32_t i = 0; i < 6; ++i) { -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] swr: limit pipe_draw_info->restart_index usage
Only copy this value when in restart drawing mode. Eliminates valgrind errors when running trivial programs. --- src/gallium/drivers/swr/swr_draw.cpp | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/swr_draw.cpp b/src/gallium/drivers/swr/swr_draw.cpp index df1c11a..2363800 100644 --- a/src/gallium/drivers/swr/swr_draw.cpp +++ b/src/gallium/drivers/swr/swr_draw.cpp @@ -107,7 +107,10 @@ swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) } struct swr_vertex_element_state *velems = ctx->velems; - velems->fsState.cutIndex = info->restart_index; + if (info->primitive_restart) + velems->fsState.cutIndex = info->restart_index; + else + velems->fsState.cutIndex = 0; velems->fsState.bEnableCutIndex = info->primitive_restart; velems->fsState.bPartialVertexBuffer = (info->min_index > 0); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] configure: remove trailing "-a" in swr architecture test
Fixes "configure: line 27326: test: argument expected" CC: mesa-sta...@lists.freedesktop.org --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 5b12dd8..316e6a8 100644 --- a/configure.ac +++ b/configure.ac @@ -2545,7 +2545,7 @@ if test -n "$with_gallium_drivers"; then if test "x$HAVE_SWR_AVX" != xyes -a \ "x$HAVE_SWR_AVX2" != xyes -a \ "x$HAVE_SWR_KNL" != xyes -a \ -"x$HAVE_SWR_SKX" != xyes -a; then +"x$HAVE_SWR_SKX" != xyes; then AC_MSG_ERROR([swr enabled but no swr architectures selected]) fi -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] swr/rast: [rasterizer core] fix invalid casting for calls to Interlocked* functions
CID: 1416243, 1416244, 1416255 CC: mesa-sta...@lists.freedesktop.org --- src/gallium/drivers/swr/rasterizer/core/api.cpp | 2 +- src/gallium/drivers/swr/rasterizer/core/context.h | 8 src/gallium/drivers/swr/rasterizer/core/threads.cpp | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 8dc9ac2..ccb6dfb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -189,7 +189,7 @@ void QueueWork(SWR_CONTEXT *pContext) if (IsDraw) { -InterlockedIncrement((volatile long*)&pContext->drawsOutstandingFE); +InterlockedIncrement(&pContext->drawsOutstandingFE); } _ReadWriteBarrier(); diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 131b3cb..bcd5801 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -409,12 +409,12 @@ struct DRAW_CONTEXT booldependent; // Backend work is dependent on all previous BE boolisCompute; // Is this DC a compute context? boolcleanupState; // True if this is the last draw using an entry in the state ring. -volatile bool doneFE; // Is FE work done for this draw? FE_WORK FeWork; +volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? volatile OSALIGNLINE(uint32_t) FeLock; -volatile int32_tthreadsDone; +volatile OSALIGNLINE(uint32_t) threadsDone; SYNC_DESC retireCallback; // Call this func when this DC is retired. }; @@ -503,9 +503,9 @@ struct SWR_CONTEXT // Scratch space for workers. 
uint8_t** ppScratch; -volatile int32_t drawsOutstandingFE; +volatile OSALIGNLINE(uint32_t) drawsOutstandingFE; -CachingAllocator cachingArenaAllocator; +OSALIGNLINE(CachingAllocator) cachingArenaAllocator; uint32_t frameCount; uint32_t lastFrameChecked; diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 70bde02..b704d23 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -393,7 +393,7 @@ INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONT // inlined-only version INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC) { -int32_t result = InterlockedDecrement((volatile long*)&pDC->threadsDone); +int32_t result = static_cast(InterlockedDecrement(&pDC->threadsDone)); SWR_ASSERT(result >= 0); AR_FLUSH(pDC->drawId); @@ -639,7 +639,7 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEX _mm_mfence(); pDC->doneFE = true; -InterlockedDecrement((volatile long*)&pContext->drawsOutstandingFE); +InterlockedDecrement(&pContext->drawsOutstandingFE); } void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE) -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 17/17] swr/rast: fix core / knights split of AVX512 intrinsics
Move AVX512BW specific intrinsics to be Core-only. Move some AVX512F intrinsics back to common implementation file. --- .../drivers/swr/rasterizer/common/simdlib.hpp | 2 + .../swr/rasterizer/common/simdlib_512_avx512.inl | 53 + .../rasterizer/common/simdlib_512_avx512_core.inl | 54 ++ .../common/simdlib_512_avx512_knights.inl | 15 -- 4 files changed, 69 insertions(+), 55 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp index 22d7da4..500cf8a 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp @@ -214,6 +214,8 @@ struct SIMDBase : Traits::IsaImpl using Vec4 = typename Traits::Vec4; using Mask = typename Traits::Mask; +static const size_t VECTOR_BYTES = sizeof(Float); + // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes . static SIMDINLINE void vec4_load1_ps(Vec4& r, const float *p) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl index 1dbfff8..95e4c31 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl @@ -158,6 +158,11 @@ private: return _mm512_maskz_set1_epi32(m, -1); } +static SIMDINLINE Integer vmask(__mmask8 m) +{ +return _mm512_maskz_set1_epi64(m, -1LL); +} + public: //--- // Single precision floating point arithmetic operations @@ -187,8 +192,8 @@ static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps 0xff) ? 0xff : (a + b) (uint8) +//SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) +//SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? 
a : b (int32) @@ -202,7 +207,7 @@ SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) SIMD_IWRAPPER_2(mullo_epi32); SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) -SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) +//SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) //--- // Logical operations @@ -276,7 +281,7 @@ static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a(i return _mm512_cvtepi32_ps(a); } -SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a(uint8 --> int16) +//SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a(uint8 --> int16) SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a(uint8 --> int32) SIMD_IWRAPPER_1_8(cvtepu16_epi32);// return (int32)a(uint16 --> int32) SIMD_IWRAPPER_1_4(cvtepu16_epi64);// return (int64)a(uint16 --> int64) @@ -317,20 +322,6 @@ static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps(a, b); } template -static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b) -{ -// Legacy vector mask generator -__mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast(CmpTypeT)); -return vmask(result); -} -template -static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b) -{ -// Legacy vector mask generator -__mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast(CmpTypeT)); -return vmask(result); -} -template static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b) { // Legacy vector mask generator @@ -345,12 +336,12 @@ static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b) return vmask(result); } -SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8);// return a == b (int8) -SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16); // return a == b (int16) +//SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8);// return a == b (int8) +//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16); // return a == b (int16) SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32); // return a == b (int32) 
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64); // return a == b (int64) -SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8);// return a > b (int8) -SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16); // return a > b (int16) +//SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8);// return a > b (int8) +//SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16); // return a > b (int16) SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32); // return a > b (int32) SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64); // return a > b (int64) SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32); // return a < b (int32) @@ -458,7 +449,7 @@ SIMD_IWRAPP
[Mesa-dev] [PATCH v2 14/17] swr/rast: gen_knobs template code style
--- src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp index e6fe165..a950643 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp @@ -203,8 +203,8 @@ GlobalKnobs g_GlobalKnobs; // GlobalKnobs::GlobalKnobs() { -% for knob in knobs: -InitKnob(${knob[0]}); +% for knob in knobs : +InitKnob(${ knob[0] }); % endfor } -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 16/17] swr/rast: simplify knob default value setup
--- .../drivers/swr/rasterizer/codegen/templates/gen_knobs.h| 13 - src/gallium/drivers/swr/rasterizer/core/knobs_init.h| 12 +++- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h index b02870b..d81f7d0 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h @@ -67,12 +67,6 @@ public: return Value(); } -protected: -Knob(T const &defaultValue) : -m_Value(expandEnvironmentVariables(defaultValue)) -{ -} - private: T m_Value; }; @@ -83,10 +77,10 @@ private: { \\ -Knob_##_name() : Knob<_type>(_default) { } \\ - static const char* Name() { return "KNOB_" #_name; }\\ +static _type DefaultValue() { return (_default); } \\ + } _name; #define GET_KNOB(_name) g_GlobalKnobs._name.Value() @@ -117,8 +111,9 @@ struct GlobalKnobs % endif % endfor -GlobalKnobs(); + std::string ToString(const char* optPerLinePrefix=""); +GlobalKnobs(); }; extern GlobalKnobs g_GlobalKnobs; diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h index ba2df22..12c2a30 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h @@ -91,16 +91,18 @@ static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValu template static inline void InitKnob(T& knob) { - -// TODO, read registry first - -// Second, read environment variables +// Read environment variables const char* pOverride = getenv(knob.Name()); if (pOverride) { -auto knobValue = knob.Value(); +auto knobValue = knob.DefaultValue(); ConvertEnvToKnob(pOverride, knobValue); knob.Value(knobValue); } +else +{ +// Set default value +knob.Value(knob.DefaultValue()); +} } -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org 
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 15/17] swr/rast: split gen_knobs templates into .h/.cpp
Switch to a 1:1 mapping template:generated for future maintenance. --- src/gallium/drivers/swr/Makefile.am| 3 +- src/gallium/drivers/swr/SConscript | 2 +- .../drivers/swr/rasterizer/codegen/gen_knobs.py| 14 +- .../swr/rasterizer/codegen/templates/gen_knobs.cpp | 108 -- .../swr/rasterizer/codegen/templates/gen_knobs.h | 157 + 5 files changed, 166 insertions(+), 118 deletions(-) create mode 100644 src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am index 73fe904..b20f128 100644 --- a/src/gallium/drivers/swr/Makefile.am +++ b/src/gallium/drivers/swr/Makefile.am @@ -115,7 +115,7 @@ rasterizer/codegen/gen_knobs.cpp: rasterizer/codegen/gen_knobs.py rasterizer/cod --output rasterizer/codegen/gen_knobs.cpp \ --gen_cpp -rasterizer/codegen/gen_knobs.h: rasterizer/codegen/gen_knobs.py rasterizer/codegen/knob_defs.py rasterizer/codegen/templates/gen_knobs.cpp rasterizer/codegen/gen_common.py +rasterizer/codegen/gen_knobs.h: rasterizer/codegen/gen_knobs.py rasterizer/codegen/knob_defs.py rasterizer/codegen/templates/gen_knobs.h rasterizer/codegen/gen_common.py $(MKDIR_GEN) $(PYTHON_GEN) \ $(srcdir)/rasterizer/codegen/gen_knobs.py \ @@ -347,5 +347,6 @@ EXTRA_DIST = \ rasterizer/codegen/templates/gen_builder.hpp \ rasterizer/codegen/templates/gen_header_init.hpp \ rasterizer/codegen/templates/gen_knobs.cpp \ + rasterizer/codegen/templates/gen_knobs.h \ rasterizer/codegen/templates/gen_llvm.hpp \ rasterizer/codegen/templates/gen_rasterizer.cpp diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript index c578d7a..b394cbc 100644 --- a/src/gallium/drivers/swr/SConscript +++ b/src/gallium/drivers/swr/SConscript @@ -54,7 +54,7 @@ env.CodeGenerate( command = python_cmd + ' $SCRIPT --output $TARGET --gen_h' ) Depends('rasterizer/codegen/gen_knobs.h', -swrroot + 'rasterizer/codegen/templates/gen_knobs.cpp') +swrroot + 'rasterizer/codegen/templates/gen_knobs.h') 
env.CodeGenerate( target = 'rasterizer/jitter/gen_state_llvm.h', diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py index 2c271c7..33f62a2 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py @@ -37,27 +37,25 @@ def main(args=sys.argv[1:]): args = parser.parse_args() cur_dir = os.path.dirname(os.path.abspath(__file__)) -template_file = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp') +template_cpp = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp') +template_h = os.path.join(cur_dir, 'templates', 'gen_knobs.h') if args.gen_h: MakoTemplateWriter.to_file( -template_file, +template_h, args.output, cmdline=sys.argv, filename='gen_knobs', -knobs=knob_defs.KNOBS, -includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip'], -gen_header=True) +knobs=knob_defs.KNOBS) if args.gen_cpp: MakoTemplateWriter.to_file( -template_file, +template_cpp, args.output, cmdline=sys.argv, filename='gen_knobs', knobs=knob_defs.KNOBS, -includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip'], -gen_header=False) +includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip']) return 0 diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp index a950643..2f4c47a 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp @@ -20,11 +20,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * -% if gen_header: -* @file ${filename}.h -% else: * @file ${filename}.cpp -% endif * * @brief Dynamic Knobs for Core. * @@ -35,105 +31,6 @@ * **/ <% calc_max_knob_len(knobs) %> -%if gen_header: -#pragma once -#include - -struct KnobBase -{ -private: -// Update the input string. 
-static void autoExpandEnvironmentVariables(std::string &text); - -protected: -// Leave input alone and return new string. -static std::string expandEnvironmentVariables(std::string const &input) -{ -std::string text = input; -autoExpandEnvironmentVariables(text); -return text; -} - -template -static T expandEnvironmentVariables(T const &input)
[Mesa-dev] [PATCH v2 06/17] swr/rast: stop using MSFT types in platform independent code
--- src/gallium/drivers/swr/rasterizer/common/os.h | 6 -- src/gallium/drivers/swr/rasterizer/core/api.cpp| 2 +- src/gallium/drivers/swr/rasterizer/core/api.h | 4 ++-- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 4 ++-- src/gallium/drivers/swr/rasterizer/core/blend.h| 2 +- src/gallium/drivers/swr/rasterizer/core/clip.h | 8 src/gallium/drivers/swr/rasterizer/core/fifo.hpp | 2 +- src/gallium/drivers/swr/rasterizer/core/format_traits.h| 4 ++-- src/gallium/drivers/swr/rasterizer/core/pa.h | 2 +- src/gallium/drivers/swr/rasterizer/core/threads.cpp| 4 ++-- src/gallium/drivers/swr/rasterizer/core/tilemgr.h | 12 ++-- src/gallium/drivers/swr/rasterizer/core/utils.h| 10 ++ src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp| 2 +- src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp | 4 ++-- 14 files changed, 35 insertions(+), 31 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index dc90fca..4ed6b88 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -220,12 +220,6 @@ void *AlignedMalloc(unsigned int size, unsigned int alignment) return ret; } -inline -unsigned char _bittest(const LONG *a, LONG b) -{ -return ((*(unsigned *)(a) & (1 << b)) != 0); -} - static inline void AlignedFree(void* p) { diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 855d133..8dc9ac2 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -189,7 +189,7 @@ void QueueWork(SWR_CONTEXT *pContext) if (IsDraw) { -InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE); +InterlockedIncrement((volatile long*)&pContext->drawsOutstandingFE); } _ReadWriteBarrier(); diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index 236e0fc..a394205 100644 --- 
a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -697,8 +697,8 @@ SWR_FUNC(void, SwrStoreHotTileToSurface, SWR_FUNC(void, SwrStoreHotTileClear, SWR_SURFACE_STATE *pDstSurface, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - UINT x, - UINT y, + uint32_t x, + uint32_t y, uint32_t renderTargetArrayIndex, const float* pClearColor); diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index de6691b..c1f0f07 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -64,7 +64,7 @@ INLINE void ProcessAttributes( static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT"); const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; // Conservative Rasterization requires degenerate tris to have constant attribute interpolation -LONG constantInterpMask = IsDegenerate::value ? 0x : backendState.constantInterpolationMask; +uint32_t constantInterpMask = IsDegenerate::value ? 
0x : backendState.constantInterpolationMask; const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex; const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology; @@ -93,7 +93,7 @@ INLINE void ProcessAttributes( if (HasConstantInterpT::value || IsDegenerate::value) { -if (_bittest(&constantInterpMask, i)) +if (CheckBit(constantInterpMask, i)) { uint32_t vid; uint32_t adjustedTriIndex; diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h index 1b98e44..c89c476 100644 --- a/src/gallium/drivers/swr/rasterizer/core/blend.h +++ b/src/gallium/drivers/swr/rasterizer/core/blend.h @@ -278,7 +278,7 @@ INLINE void Clamp(simdvector &src) } template -void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, BYTE *pDst, simdvector &result) +void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, uint8_t *pDst, simdvector &result) { // load render target simdvector dst; diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index bf16792..ca6596e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -464,7 +464,7 @@ public: // input/output vertex store for clipper
[Mesa-dev] [PATCH v2 08/17] swr/rast: rename frontend pVertexStore
Rename pVertexStore to gpVertexStore so that the name reflects that the variable is a global. --- src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 15 +-- 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index f9eda83..e51f967 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1332,7 +1332,7 @@ static void TessellationStages( TSDestroyCtx(tsCtx); } -THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr; +THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr; THREAD uint32_t gVertexStoreSize = 0; // @@ -1459,19 +1459,22 @@ void ProcessDraw( // grow the vertex store for the PA as necessary if (gVertexStoreSize < vertexStoreSize) { -if (pVertexStore != nullptr) +if (gpVertexStore != nullptr) { -AlignedFree(pVertexStore); +AlignedFree(gpVertexStore); +gpVertexStore = nullptr; } -pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64)); +SWR_ASSERT(gpVertexStore == nullptr); + +gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64)); gVertexStoreSize = vertexStoreSize; -SWR_ASSERT(pVertexStore != nullptr); +SWR_ASSERT(gpVertexStore != nullptr); } // choose primitive assembler -PA_FACTORY paFactory(pDC, state.topology, work.numVerts, pVertexStore, numVerts, state.frontendState.vsVertexSize); +PA_FACTORY paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize); PA_STATE& pa = paFactory.GetPA(); #if USE_SIMD16_FRONTEND -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 13/17] swr/rast: switch gen_knobs.cpp license
Unintentionally added with an apache2 license; relicense to match the rest of the tree. --- .../swr/rasterizer/codegen/templates/gen_knobs.cpp | 29 +- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp index 06b93bd..e6fe165 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp +++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp @@ -1,19 +1,24 @@ /** +* Copyright (C) 2015-2017 Intel Corporation. All Rights Reserved. * -* Copyright 2015-2017 -* Intel Corporation +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: * -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. * -* http ://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. * % if gen_header: * @file ${filename}.h -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 12/17] swr/rast: fix scons gen_knobs.h dependency
A copy/paste error duplicated the Depends() rule for gen_knobs.cpp; the rule that follows the --gen_h step should name gen_knobs.h instead. --- src/gallium/drivers/swr/SConscript | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript index a32807d..c578d7a 100644 --- a/src/gallium/drivers/swr/SConscript +++ b/src/gallium/drivers/swr/SConscript @@ -53,7 +53,7 @@ env.CodeGenerate( source = '', command = python_cmd + ' $SCRIPT --output $TARGET --gen_h' ) -Depends('rasterizer/codegen/gen_knobs.cpp', +Depends('rasterizer/codegen/gen_knobs.h', swrroot + 'rasterizer/codegen/templates/gen_knobs.cpp') env.CodeGenerate( -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 09/17] swr/rast: vmask() implementations for KNL
--- .../swr/rasterizer/common/simdlib_512_avx512_knights.inl | 14 ++ 1 file changed, 14 insertions(+) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl index 17001be..2ee7639 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl @@ -132,6 +132,20 @@ } #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) +private: +static SIMDINLINE Integer vmask(__mmask8 m) +{ +return _mm512_mask_set1_epi64(_mm512_setzero_si512(), m, -1LL); +} +static SIMDINLINE Integer vmask(__mmask32 m) +{ +return _mm512_mask_set1_epi16(_mm512_setzero_si512(), m, -1); +} +static SIMDINLINE Integer vmask(__mmask64 m) +{ +return _mm512_mask_set1_epi8(_mm512_setzero_si512(), m, -1); +} + public: SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int) SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32);// return (~a) & b(float treated as int) -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev