[Mesa-dev] [PATCH 4/7] swr/rast: Add ConcatLists()

2018-05-04 Thread George Kyriazis
for concatenating lists
---
 src/gallium/drivers/swr/rasterizer/codegen/gen_common.py | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
index 44a0cc8..60b749d 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
@@ -32,6 +32,12 @@ from mako.template import Template
 from mako.exceptions import RichTraceback
 
 #==
+def ConcatLists(list_of_lists):
+output = []
+for l in list_of_lists: output += l
+return output
+
+#==
 def MakeTmpDir(suffix=''):
 '''
 Create temporary directory for use in codegen scripts.
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 6/7] swr/rast: Add Builder::GetVectorType()

2018-05-04 Thread George Kyriazis
---
 .../drivers/swr/rasterizer/jitter/builder.cpp  | 44 ++
 .../drivers/swr/rasterizer/jitter/builder.h|  1 +
 2 files changed, 45 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 3248735..e1c5d80 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -170,4 +170,48 @@ namespace SwrJit
 return (pGenIntrin->getMetadata("is_evaluate") != nullptr);
 }
 
+//
+/// @brief Packetizes the type. Assumes SOA conversion.
+Type* Builder::GetVectorType(Type* pType)
+{
+if (pType->isVectorTy())
+{
+return pType;
+}
+
+// [N x float] should packetize to [N x <8 x float>]
+if (pType->isArrayTy())
+{
+uint32_t arraySize = pType->getArrayNumElements();
+Type* pArrayType = pType->getArrayElementType();
+Type* pVecArrayType = GetVectorType(pArrayType);
+Type* pVecType = ArrayType::get(pVecArrayType, arraySize);
+return pVecType;
+}
+
+// {float,int} should packetize to {<8 x float>, <8 x int>}
+if (pType->isAggregateType())
+{
+uint32_t numElems = pType->getStructNumElements();
+SmallVector vecTypes;
+for (uint32_t i = 0; i < numElems; ++i)
+{
+Type* pElemType = pType->getStructElementType(i);
+Type* pVecElemType = GetVectorType(pElemType);
+vecTypes.push_back(pVecElemType);
+}
+Type* pVecType = StructType::get(JM()->mContext, vecTypes);
+return pVecType;
+}
+
+// [N x float]* should packetize to [N x <8 x float>]*
+if (pType->isPointerTy() && 
pType->getPointerElementType()->isArrayTy())
+{
+return 
PointerType::get(GetVectorType(pType->getPointerElementType()), 
pType->getPointerAddressSpace());
+}
+
+// <ty> should packetize to <8 x <ty>>
+Type* vecType = VectorType::get(pType, JM()->mVWidth);
+return vecType;
+}
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 82c5f8c..6ca128d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -123,6 +123,7 @@ namespace SwrJit
 bool IsTempAlloca(Value* inst);
 bool SetTexelMaskEvaluate(Instruction* inst);
 bool IsTexelMaskEvaluate(Instruction* inst);
+Type* GetVectorType(Type* pType);
 
 #include "gen_builder.hpp"
 #include "gen_builder_meta.hpp"
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 0/7] SWR changes

2018-05-04 Thread George Kyriazis
Misc OpenSWR changes

George Kyriazis (7):
  swr/rast: Change formatting
  swr/rast: Use binner topology to assemble backend attributes
  swr/rast: Add constant initializer for uint64_t
  swr/rast: Add ConcatLists()
  swr/rast: Prepend the console output with a newline
  swr/rast: Add Builder::GetVectorType()
  swr/rast: Thread locked tiles improvement

 .../drivers/swr/rasterizer/codegen/gen_common.py   |   6 ++
 .../codegen/templates/gen_ar_eventhandlerfile.hpp  |   2 +-
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  18 +++-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |   2 +-
 src/gallium/drivers/swr/rasterizer/core/context.h  |   2 +-
 .../drivers/swr/rasterizer/core/threads.cpp|   5 +-
 src/gallium/drivers/swr/rasterizer/core/threads.h  |   2 +-
 .../drivers/swr/rasterizer/core/tilemgr.cpp|  31 +++---
 src/gallium/drivers/swr/rasterizer/core/tilemgr.h  |  20 ++--
 src/gallium/drivers/swr/rasterizer/core/tileset.h  | 105 +
 .../drivers/swr/rasterizer/jitter/builder.cpp  |  44 +
 .../drivers/swr/rasterizer/jitter/builder.h|   1 +
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp |   5 +
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |   1 +
 14 files changed, 217 insertions(+), 27 deletions(-)
 create mode 100644 src/gallium/drivers/swr/rasterizer/core/tileset.h

-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 5/7] swr/rast: Prepend the console output with a newline

2018-05-04 Thread George Kyriazis
It can get jumbled with output from other threads.
---
 .../swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp| 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git 
a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
index ceded82..79612f3 100644
--- 
a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
+++ 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
@@ -59,7 +59,7 @@ namespace ArchRast
 outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << 
std::ends;
 mOutputDir = outDir.str();
 if (CreateDirectory(mOutputDir.c_str(), NULL)) {
-std::cout << "ArchRast Dir:  " << mOutputDir << std::endl 
<< std::endl << std::flush;
+std::cout << std::endl << "ArchRast Dir:   " << mOutputDir 
<< std::endl << std::endl << std::flush;
 }
 
 // There could be multiple threads creating thread pools. We
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/7] swr/rast: Add constant initializer for uint64_t

2018-05-04 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp | 5 +
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h   | 1 +
 2 files changed, 6 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 619a67b..231fa94 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -182,6 +182,11 @@ namespace SwrJit
 return ConstantInt::get(IRB()->getInt32Ty(), i);
 }
 
+Constant *Builder::C(uint64_t i)
+{
+return ConstantInt::get(IRB()->getInt64Ty(), i);
+}
+
 Constant *Builder::C(float i)
 {
 return ConstantFP::get(IRB()->getFloatTy(), i);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index a51aad0..d7732ef 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -34,6 +34,7 @@ Constant *C(char i);
 Constant *C(uint8_t i);
 Constant *C(int i);
 Constant *C(int64_t i);
+Constant *C(uint64_t i);
 Constant *C(uint16_t i);
 Constant *C(uint32_t i);
 Constant *C(float i);
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/7] swr/rast: Use binner topology to assemble backend attributes

2018-05-04 Thread George Kyriazis
Previously the draw topology was used, which may change if GS or Tess
are active. This only affected attributes marked with constant
interpolation, which limited the impact.
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 9f8dc88..7b9c20e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -81,7 +81,7 @@ INLINE void ProcessAttributes(
 // Conservative Rasterization requires degenerate tris to have constant 
attribute interpolation
 uint32_t constantInterpMask = IsDegenerate::value ? 0x : 
backendState.constantInterpolationMask;
 const uint32_t provokingVertex = 
pDC->pState->state.frontendState.topologyProvokingVertex;
-const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
+const PRIMITIVE_TOPOLOGY topo = pa.binTopology;
 
 static const float constTable[3][4] = {
 { 0.0f, 0.0f, 0.0f, 0.0f },
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/7] swr/rast: Change formatting

2018-05-04 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index a2ee85d..3458793 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -475,7 +475,12 @@ void SetupDefaultState(SWR_CONTEXT *pContext)
 pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f;
 }
 
-void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, 
uint64_t userData2, uint64_t userData3)
+void SWR_API SwrSync(
+HANDLE hContext,
+PFN_CALLBACK_FUNC pfnFunc,
+uint64_t userData,
+uint64_t userData2,
+uint64_t userData3)
 {
 SWR_ASSERT(pfnFunc != nullptr);
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 7/7] swr/rast: Thread locked tiles improvement

2018-05-04 Thread George Kyriazis
- Change tilemgr TILE_ID encoding to use Morton-order (Z-order).
- Change locked tiles set to bitset.  Makes clear, set, get much faster.
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  11 ++-
 src/gallium/drivers/swr/rasterizer/core/context.h  |   2 +-
 .../drivers/swr/rasterizer/core/threads.cpp|   5 +-
 src/gallium/drivers/swr/rasterizer/core/threads.h  |   2 +-
 .../drivers/swr/rasterizer/core/tilemgr.cpp|  31 +++---
 src/gallium/drivers/swr/rasterizer/core/tilemgr.h  |  20 ++--
 src/gallium/drivers/swr/rasterizer/core/tileset.h  | 105 +
 7 files changed, 152 insertions(+), 24 deletions(-)
 create mode 100644 src/gallium/drivers/swr/rasterizer/core/tileset.h

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 3458793..47f3633 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -42,6 +42,7 @@
 #include "core/tilemgr.h"
 #include "core/clip.h"
 #include "core/utils.h"
+#include "core/tileset.h"
 
 #include "common/os.h"
 
@@ -139,6 +140,11 @@ HANDLE SwrCreateContext(
 BindApiThread(pContext, 0);
 }
 
+if (pContext->threadInfo.SINGLE_THREADED)
+{
+pContext->pSingleThreadLockedTiles = new TileSet();
+}
+
 pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
 pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * 
pContext->NumWorkerThreads, 64);
 
@@ -245,7 +251,7 @@ void QueueWork(SWR_CONTEXT *pContext)
 {
 uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, 
pContext->pCurDrawContext->drawId };
 WorkOnFifoFE(pContext, 0, curDraw[0]);
-WorkOnFifoBE(pContext, 0, curDraw[1], 
pContext->singleThreadLockedTiles, 0, 0);
+WorkOnFifoBE(pContext, 0, curDraw[1], 
*pContext->pSingleThreadLockedTiles, 0, 0);
 }
 else
 {
@@ -427,7 +433,8 @@ void SwrDestroyContext(HANDLE hContext)
 delete[] pContext->ppScratch;
 AlignedFree(pContext->pStats);
 
-delete(pContext->pHotTileMgr);
+delete pContext->pHotTileMgr;
+delete pContext->pSingleThreadLockedTiles;
 
 pContext->~SWR_CONTEXT();
 AlignedFree(GetContext(hContext));
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h 
b/src/gallium/drivers/swr/rasterizer/core/context.h
index af8f4b8..2cd61e4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -516,7 +516,7 @@ struct SWR_CONTEXT
 
 uint32_t lastFrameChecked;
 uint64_t lastDrawChecked;
-TileSet singleThreadLockedTiles;
+TileSet* pSingleThreadLockedTiles;
 
 // ArchRast thread contexts.
 HANDLE* pArContext;
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp 
b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 9e16246..f77ae22 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -49,6 +49,7 @@
 #include "rasterizer.h"
 #include "rdtsc_core.h"
 #include "tilemgr.h"
+#include "tileset.h"
 
 
 
@@ -587,7 +588,7 @@ bool WorkOnFifoBE(
 }
 
 // can only work on this draw if it's not in use by other threads
-if (lockedTiles.find(tileID) != lockedTiles.end())
+if (lockedTiles.get(tileID))
 {
 continue;
 }
@@ -645,7 +646,7 @@ bool WorkOnFifoBE(
 else
 {
 // This tile is already locked. So let's add it to our locked 
tiles set. This way we don't try locking this one again.
-lockedTiles.insert(tileID);
+lockedTiles.set(tileID);
 }
 }
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h 
b/src/gallium/drivers/swr/rasterizer/core/threads.h
index cb918dd..0489a3c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -62,7 +62,7 @@ struct THREAD_POOL
 THREAD_DATA *pApiThreadData;
 };
 
-typedef std::unordered_set TileSet;
+struct TileSet;
 
 void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp 
b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 28fa787..1bdef4b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -33,8 +33,6 @@
 #include "core/multisample.h"
 #include "rdtsc_core.h"
 
-#define TILE_ID(x,y) ((x << 16 | y))
-
 MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
 {
 }
@@ -50,26 +48,35 @@ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK 
*pWork)
 return;
 }
 
-uint32_t id = TILE_ID(x, y);
+uint32_t id = getTileId(x, y);
+
+if (id >= mTiles.size())
+  

[Mesa-dev] [PATCH 12/21] swr/rast: Fix init in EventHandlerWorkerStats

2018-04-25 Thread George Kyriazis
Make sure we initialize variables.
---
 src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp 
b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
index 871db79..ff7bdc3 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
+++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
@@ -121,7 +121,10 @@ namespace ArchRast
 class EventHandlerWorkerStats : public EventHandlerFile
 {
 public:
-EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), 
mNeedFlush(false) {}
+EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), 
mNeedFlush(false)
+{
+memset(mShaderStats, 0, sizeof(mShaderStats));
+}
 
 virtual void Handle(const EarlyDepthStencilInfoSingleSample& event)
 {
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 15/21] swr/rast: Fix regressions.

2018-04-25 Thread George Kyriazis
Bump jit cache revision number to force recompile.
---
 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index bfc3e42..3b4c3f5 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -474,7 +474,7 @@ struct JitCacheFileHeader
 uint64_t GetObjectCRC() const { return m_objCRC; }
 
 private:
-static const uint64_t   JC_MAGIC_NUMBER = 0xfedcba9876543211ULL + 3;
+static const uint64_t   JC_MAGIC_NUMBER = 0xfedcba9876543211ULL + 4;
 static const size_t JC_STR_MAX_LEN = 32;
 static const uint32_t   JC_PLATFORM_KEY =
 (LLVM_VERSION_MAJOR << 24)  |
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 11/21] swr/rast: Fix return type of VCVTPS2PH.

2018-04-25 Thread George Kyriazis
expecting <8xi16> return.
---
 src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index bced657..2e7f1a8 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -53,7 +53,7 @@ intrinsics = [
 ['VPERMPS', ['idx', 'a'], 'a'],
 ['VCVTPD2PS',   ['a'], 'VectorType::get(mFP32Ty, 
a->getType()->getVectorNumElements())'],
 ['VCVTPH2PS',   ['a'], 'VectorType::get(mFP32Ty, 
a->getType()->getVectorNumElements())'],
-['VCVTPS2PH',   ['a', 'round'], 'mSimdFP16Ty'],
+['VCVTPS2PH',   ['a', 'round'], 'mSimdInt16Ty'],
 ['VHSUBPS', ['a', 'b'], 'a'],
 ['VPTESTC', ['a', 'b'], 'mInt32Ty'],
 ['VPTESTZ', ['a', 'b'], 'mInt32Ty'],
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 03/21] swr/rast: Fix wrong type allocation

2018-04-25 Thread George Kyriazis
ALLOCA pointer elements, not pointers.
---
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 09590b7..a43c787 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1014,7 +1014,7 @@ template Value* 
FetchJit::GetSimdValidIndicesHelper(Value* pIndices,
 
 {
 // store 0 index on stack to be used to conditionally load from if 
index address is OOB
-Value* pZeroIndex = ALLOCA(Ty);
+Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
 STORE(C((T)0), pZeroIndex);
 
 // Load a SIMD of index pointers
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 18/21] swr/rast: Output rasterizer dir to console since it's process specific

2018-04-25 Thread George Kyriazis
---
 .../swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git 
a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
index 54d2486..4f87e0c 100644
--- 
a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
+++ 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
@@ -36,6 +36,7 @@
 #include "${event_header}"
 #include 
 #include 
+#include 
 #include 
 
 namespace ArchRast
@@ -57,7 +58,9 @@ namespace ArchRast
 std::stringstream outDir;
 outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << 
std::ends;
 mOutputDir = outDir.str();
-CreateDirectory(mOutputDir.c_str(), NULL);
+if (CreateDirectory(mOutputDir.c_str(), NULL)) {
+std::cout << "Rasterizer Dir:  " << mOutputDir << 
std::endl << std::endl << std::flush;
+}
 
 // There could be multiple threads creating thread pools. We
 // want to make sure they are uniquly identified by adding in
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 17/21] swr/rast: Add TranslateGfxAddress for shader

2018-04-25 Thread George Kyriazis
Also add GFX_MEM_CLIENT_SHADER
---
 .../drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp   |  2 +-
 .../drivers/swr/rasterizer/jitter/builder_gfx_mem.h | 17 -
 src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h |  3 ++-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
index 9b70716..03e34db 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
@@ -201,7 +201,7 @@ namespace SwrJit
 return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, 
usage);
 }
 
-Value* BuilderGfxMem::TranslateGfxAddress(Value* xpGfxAddress, Type* 
PtrTy, const Twine )
+Value* BuilderGfxMem::TranslateGfxAddress(Value* xpGfxAddress, Type* 
PtrTy, const Twine , JIT_MEM_CLIENT /* usage */)
 {
 if (PtrTy == nullptr)
 {
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
index effbe05..d1a25c4 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
@@ -57,7 +57,22 @@ namespace SwrJit
 
 virtual Value *GATHERDD(Value* src, Value* pBase, Value* indices, 
Value* mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
-Value* TranslateGfxAddress(Value* xpGfxAddress, Type* PtrTy = nullptr, 
const Twine  = "");
+Value* TranslateGfxAddress(Value* xpGfxAddress, Type* PtrTy = nullptr, 
const Twine  = "", JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+template 
+Value* TranslateGfxAddress(Value* xpGfxBaseAddress, const 
std::initializer_list , Type* PtrTy = nullptr, const Twine  = 
"", JIT_MEM_CLIENT usage = GFX_MEM_CLIENT_SHADER)
+{
+AssertGFXMemoryParams(xpGfxBaseAddress, usage);
+SWR_ASSERT(xpGfxBaseAddress->getType()->isPointerTy() == false);
+
+if (!PtrTy)
+{
+PtrTy = mInt8PtrTy;
+}
+
+Value* ptr = INT_TO_PTR(xpGfxBaseAddress, PtrTy);
+ptr = GEP(ptr, offset);
+return TranslateGfxAddress(PTR_TO_INT(ptr, mInt64Ty), PtrTy, Name, 
usage);
+}
 
 
 protected:
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
index 9ccac4f..3823a13 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
@@ -35,7 +35,8 @@ typedef enum _JIT_MEM_CLIENT
 {
 MEM_CLIENT_INTERNAL,
 GFX_MEM_CLIENT_FETCH,
-GFX_MEM_CLIENT_SAMPLER
+GFX_MEM_CLIENT_SAMPLER,
+GFX_MEM_CLIENT_SHADER,
 } JIT_MEM_CLIENT;
 
 protected:
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 06/21] swr/rast: Internal core change

2018-04-25 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/core/utils.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h 
b/src/gallium/drivers/swr/rasterizer/core/utils.h
index d6cbf24..7769e05 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "common/os.h"
 #include "common/intrin.h"
 #include "common/swr_assert.h"
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 05/21] swr/rast: Fix x86 lowering 64-bit float handling

2018-04-25 Thread George Kyriazis
- 64-bit cvt-to-float needs to be explicitly handled
- gathers need the right parameter types to work with doubles

Fixes draw-vertices piglit tests
---
 .../drivers/swr/rasterizer/jitter/builder_misc.h   | 12 ++
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 50 +++---
 2 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index bd4be9f..a51aad0 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -55,6 +55,18 @@ Constant *CA(LLVMContext& ctx, ArrayRef constList)
 return ConstantDataArray::get(ctx, constList);
 }
 
+template
+Constant *CInc(uint32_t base, uint32_t count)
+{
+std::vector vConsts;
+
+for(uint32_t i = 0; i < count; i++) {
+vConsts.push_back(C((Ty)base));
+base++;
+}
+return ConstantVector::get(vConsts);
+}
+
 Constant *PRED(bool pred);
 
 Value *VIMMED1(int i);
diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index baf3ab5..eac0549 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -115,7 +115,7 @@ namespace SwrJit
 {"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
-{"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
+{"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,
  DOUBLE},NO_EMU}},
 {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256, 
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VROUND",  {{Intrinsic::x86_avx_round_ps_256,  
  DOUBLE},NO_EMU}},
 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256,   
  DOUBLE},NO_EMU}},
@@ -166,10 +166,18 @@ namespace SwrJit
 // across all intrinsics, and will have to be rethought. Probably need 
something
 // similar to llvm's getDeclaration() utility to map a set of inputs 
to a specific typed
 // intrinsic.
-void GetRequestedWidthAndType(CallInst* pCallInst, TargetWidth* 
pWidth, Type** pTy)
+void GetRequestedWidthAndType(CallInst* pCallInst, const StringRef 
intrinName, TargetWidth* pWidth, Type** pTy)
 {
 uint32_t vecWidth;
 Type* pVecTy = pCallInst->getType();
+
+// Check for intrinsic specific types
+// VCVTPD2PS type comes from src, not dst
+if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
+{
+pVecTy = pCallInst->getOperand(0)->getType();
+}
+
 if (!pVecTy->isVectorTy())
 {
 for (auto& op : pCallInst->arg_operands())
@@ -231,7 +239,7 @@ namespace SwrJit
 auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
 TargetWidth vecWidth;
 Type* pElemTy;
-GetRequestedWidthAndType(pCallInst, , );
+GetRequestedWidthAndType(pCallInst, pFunc->getName(), , 
);
 
 // Check if there is a native intrinsic for this instruction
 Intrinsic::ID id = intrinsic.intrin[vecWidth];
@@ -460,7 +468,9 @@ namespace SwrJit
 // Double pump 4-wide for 64bit elements
 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
 {
-auto v64Mask = B->S_EXT(pThis->VectorMask(vi1Mask), 
B->mInt64Ty);
+auto v64Mask = pThis->VectorMask(vi1Mask);
+v64Mask = B->S_EXT(v64Mask,
+   VectorType::get(B->mInt64Ty, 
v64Mask->getType()->getVectorNumElements()));
 v64Mask = B->BITCAST(v64Mask, vSrc->getType());
 
 Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({ 0, 1, 2, 3 
}));
@@ -472,10 +482,15 @@ namespace SwrJit
 Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 0, 1, 
2, 3 }));
 Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 4, 5, 
6, 7 }));
 
+src0 = B->BITCAST(src0, VectorType::get(B->mInt64Ty, 

[Mesa-dev] [PATCH 20/21] swr/rast: Small editorial changes

2018-04-25 Thread George Kyriazis
---
 .../swr/rasterizer/jitter/builder_gfx_mem.cpp  | 33 ++
 .../swr/rasterizer/jitter/builder_gfx_mem.h|  1 +
 .../rasterizer/jitter/functionpasses/lower_x86.cpp |  2 +-
 3 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
index 03e34db..c6d0619 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
@@ -55,6 +55,7 @@ namespace SwrJit
 SWR_ASSERT(!(ptr->getType() == mInt64Ty && usage == 
MEM_CLIENT_INTERNAL), "Internal memory should not be gfxptr_t.");
 }
 
+
 //
 /// @brief Generate a masked gather operation in LLVM IR.  If not  
 /// supported on the underlying platform, emulate it with loads
@@ -63,17 +64,15 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *BuilderGfxMem::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
+Value* BuilderGfxMem::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
 {
-Value *vGather;
-
 // address may be coming in as 64bit int now so get the pointer
 if (pBase->getType() == mInt64Ty)
 {
 pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));
 }
 
-vGather = Builder::GATHERPS(vSrc, pBase, vIndices, vMask, scale);
+Value* vGather = Builder::GATHERPS(vSrc, pBase, vIndices, vMask, 
scale);
 return vGather;
 }
 
@@ -85,10 +84,8 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *BuilderGfxMem::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
+Value* BuilderGfxMem::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
 {
-Value* vGather = VIMMED1(0.0f);
-
 
 // address may be coming in as 64bit int now so get the pointer
 if (pBase->getType() == mInt64Ty)
@@ -96,7 +93,7 @@ namespace SwrJit
 pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));
 }
 
-vGather = Builder::GATHERDD(vSrc, pBase, vIndices, vMask, scale);
+Value* vGather = Builder::GATHERDD(vSrc, pBase, vIndices, vMask, 
scale);
 return vGather;
 }
 
@@ -106,31 +103,31 @@ namespace SwrJit
 return ADD(base, offset);
 }
 
-Value *BuilderGfxMem::GEP(Value *Ptr, Value *Idx, Type *Ty, const Twine 
)
+Value* BuilderGfxMem::GEP(Value* Ptr, Value* Idx, Type *Ty, const Twine 
)
 {
 Ptr = TranslationHelper(Ptr, Ty);
 return Builder::GEP(Ptr, Idx, nullptr, Name);
 }
 
-Value *BuilderGfxMem::GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine 
)
+Value* BuilderGfxMem::GEP(Type *Ty, Value* Ptr, Value* Idx, const Twine 
)
 {
 Ptr = TranslationHelper(Ptr, Ty);
 return Builder::GEP(Ty, Ptr, Idx, Name);
 }
 
-Value *BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list 
, Type *Ty)
+Value* BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list 
, Type *Ty)
 {
 Ptr = TranslationHelper(Ptr, Ty);
 return Builder::GEP(Ptr, indexList);
 }
 
-Value *BuilderGfxMem::GEP(Value* Ptr, const 
std::initializer_list , Type *Ty)
+Value* BuilderGfxMem::GEP(Value* Ptr, const 
std::initializer_list , Type *Ty)
 {
 Ptr = TranslationHelper(Ptr, Ty);
 return Builder::GEP(Ptr, indexList);
 }
 
-Value* BuilderGfxMem::TranslationHelper(Value *Ptr, Type *Ty)
+Value* BuilderGfxMem::TranslationHelper(Value* Ptr, Type *Ty)
 {
 SWR_ASSERT(!(Ptr->getType() == mInt64Ty && Ty == nullptr), "Access of 
GFX pointers must have non-null type specified.");
 
@@ -144,7 +141,7 @@ namespace SwrJit
 return Ptr;
 }
 
-LoadInst* BuilderGfxMem::LOAD(Value *Ptr, const char *Name, Type *Ty, 
JIT_MEM_CLIENT usage)
+LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char *Name, Type *Ty, 
JIT_MEM_CLIENT usage)
 {
 AssertGFXMemoryParams(Ptr, usage);
 
@@ -152,7 +149,7 @@ namespace SwrJit
 return Builder::LOAD(Ptr, Name);
 }
 
-LoadInst* BuilderGfxMem::LOAD(Value *Ptr, const Twine , Type *Ty, 
JIT_MEM_CLIENT usage)
+LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine , Type *Ty, 
JIT_MEM_CLIENT usage)
 {
 AssertGFXMemoryParams(Ptr, 

[Mesa-dev] [PATCH 16/21] swr/rast: jit PRINT improvements.

2018-04-25 Thread George Kyriazis
Sign-extend integer types to 32 bits when specifying "%d", and add a new "%u"
specifier which zero-extends to 32 bits. Improves printing of sub-32-bit
integer types (i1 specifically).
---
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp| 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index f893693..619a67b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -416,9 +416,20 @@ namespace SwrJit
 {
 tempStr.insert(pos, std::string("%d "));
 pos += 3;
-printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+printCallArgs.push_back(S_EXT(VEXTRACT(pArg, C(i)), 
Type::getInt32Ty(JM()->mContext)));
+}
+printCallArgs.push_back(S_EXT(VEXTRACT(pArg, C(i)), 
Type::getInt32Ty(JM()->mContext)));
+}
+else if ((tempStr[pos + 1] == 'u') && 
(pContainedType->isIntegerTy()))
+{
+uint32_t i = 0;
+for (; i < (pArg->getType()->getVectorNumElements()) - 1; 
i++)
+{
+tempStr.insert(pos, std::string("%d "));
+pos += 3;
+printCallArgs.push_back(Z_EXT(VEXTRACT(pArg, C(i)), 
Type::getInt32Ty(JM()->mContext)));
 }
-printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+printCallArgs.push_back(Z_EXT(VEXTRACT(pArg, C(i)), 
Type::getInt32Ty(JM()->mContext)));
 }
 }
 else
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 07/21] swr/rast: Add support for TexelMask evaluation

2018-04-25 Thread George Kyriazis
---
 .../drivers/swr/rasterizer/jitter/builder.cpp  | 42 ++
 .../drivers/swr/rasterizer/jitter/builder.h|  2 ++
 2 files changed, 44 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index bd81560..3248735 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -128,4 +128,46 @@ namespace SwrJit
 
 return (pAlloca->getMetadata("is_temp_alloca") != nullptr);
 }
+
+// Returns true if able to find an intrinsic to mark
+bool Builder::SetTexelMaskEvaluate(Instruction* inst)
+{
+CallInst* pGenIntrin = dyn_cast(inst);
+if (pGenIntrin)
+{
+MDNode* N = MDNode::get(JM()->mContext, 
MDString::get(JM()->mContext, "is_evaluate"));
+pGenIntrin->setMetadata("is_evaluate", N);
+return true;
+}
+else
+{
+// Follow use def chain back up
+for (Use& u : inst->operands())
+{
+Instruction* srcInst = dyn_cast(u.get());
+if (srcInst)
+{
+if (SetTexelMaskEvaluate(srcInst))
+{
+return true;
+}
+}
+}
+}
+
+return false;
+}
+
+bool Builder::IsTexelMaskEvaluate(Instruction* genSampleOrLoadIntrinsic)
+{
+CallInst* pGenIntrin = dyn_cast(genSampleOrLoadIntrinsic);
+
+if (!pGenIntrin)
+{
+return false;
+}
+
+return (pGenIntrin->getMetadata("is_evaluate") != nullptr);
+}
+
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index e2ad1e8..82c5f8c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -121,6 +121,8 @@ namespace SwrJit
 void SetTargetWidth(uint32_t width);
 void SetTempAlloca(Value* inst);
 bool IsTempAlloca(Value* inst);
+bool SetTexelMaskEvaluate(Instruction* inst);
+bool IsTexelMaskEvaluate(Instruction* inst);
 
 #include "gen_builder.hpp"
 #include "gen_builder_meta.hpp"
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 21/21] swr/rast: No need to export GetSimdValidIndicesGfx

2018-04-25 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 4 
 1 file changed, 4 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 48f0961..7b0b80a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -985,10 +985,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 }
 }
 
-typedef void*(*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va);
-extern "C" void GetSimdValid8bitIndicesGfx(gfxptr_t indices, gfxptr_t 
lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* 
pdc, uint32_t* outIndices);
-extern "C" void GetSimdValid16bitIndicesGfx(gfxptr_t indices, gfxptr_t 
lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* 
pdc, uint32_t* outIndices);
-
 template Value* FetchJit::GetSimdValidIndicesHelper(Value* 
pIndices, Value* pLastIndex)
 {
 SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == 
mInt64Ty, "Function expects gfxptr_t for both input parameters.");
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 19/21] swr/rast: Use new processor detection mechanism

2018-04-25 Thread George Kyriazis
Use a specific AVX512 selection mechanism based on the avx512er bit instead of
getHostCPUName(). LLVM 6.0.0 has a bug that reports the wrong string for KNL
(fixed in 6.0.1).
---
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   | 50 +-
 .../drivers/swr/rasterizer/jitter/JitManager.h |  2 +
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 3b4c3f5..28aadc6 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -80,7 +80,55 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, 
const char* core)
 
 StringRef hostCPUName;
 
-hostCPUName = sys::getHostCPUName();
+// force JIT to use the same CPU arch as the rest of swr
+if(mArch.AVX512F())
+{
+#if USE_SIMD16_SHADERS
+if(mArch.AVX512ER())
+{
+hostCPUName = StringRef("knl");
+}
+else
+{
+hostCPUName = StringRef("skylake-avx512");
+}
+mUsingAVX512 = true;
+#else
+hostCPUName = StringRef("core-avx2");
+#endif
+if (mVWidth == 0)
+{
+mVWidth = 8;
+}
+}
+else if(mArch.AVX2())
+{
+hostCPUName = StringRef("core-avx2");
+if (mVWidth == 0)
+{
+mVWidth = 8;
+}
+}
+else if(mArch.AVX())
+{
+if (mArch.F16C())
+{
+hostCPUName = StringRef("core-avx-i");
+}
+else
+{
+hostCPUName = StringRef("corei7-avx");
+}
+if (mVWidth == 0)
+{
+mVWidth = 8;
+}
+}
+else
+{
+SWR_INVALID("Jitting requires at least AVX ISA support");
+}
+
 
 auto optLevel = CodeGenOpt::Aggressive;
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index c15e0d1..54a25d8 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -69,6 +69,7 @@ public:
 
 bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); }
 bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : 
InstructionSet::AVX512F(); }
+bool AVX512ER(void) { return (bForceAVX | bForceAVX2) ? 0 : 
InstructionSet::AVX512ER(); }
 bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); }
 
 private:
@@ -142,6 +143,7 @@ struct JitManager
 
 uint32_tmVWidth;
 
+boolmUsingAVX512 = false;
 
 // fetch shader types
 llvm::FunctionType* mFetchShaderTy;
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 10/21] swr/rast: WIP Translation handling

2018-04-25 Thread George Kyriazis
---
 .../swr/rasterizer/jitter/builder_gfx_mem.cpp  | 41 +-
 .../swr/rasterizer/jitter/builder_gfx_mem.h|  3 +-
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
index 6ecd969..9b70716 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
@@ -160,14 +160,6 @@ namespace SwrJit
 return Builder::LOAD(Ptr, Name);
 }
 
-LoadInst* BuilderGfxMem::LOAD(Type *Ty, Value *Ptr, const Twine , 
JIT_MEM_CLIENT usage)
-{
-AssertGFXMemoryParams(Ptr, usage);
-
-Ptr = TranslationHelper(Ptr, Ty);
-return Builder::LOAD(Ty, Ptr, Name);
-}
-
 LoadInst* BuilderGfxMem::LOAD(Value *Ptr, bool isVolatile, const Twine 
, Type *Ty, JIT_MEM_CLIENT usage)
 {
 AssertGFXMemoryParams(Ptr, usage);
@@ -180,12 +172,25 @@ namespace SwrJit
 {
 AssertGFXMemoryParams(BasePtr, usage);
 
-// This call is just a pass through to the base class.
-// It needs to be here to compile due to the combination of virtual 
overrides and signature overloads.
-// It doesn't do anything meaningful because the implementation in the 
base class is going to call 
-// another version of LOAD inside itself where the actual per offset 
translation will take place 
-// and we can't just translate the BasePtr once, each address needs 
individual translation.
-return Builder::LOAD(BasePtr, offset, name, Ty, usage);
+bool bNeedTranslation = false;
+if (BasePtr->getType() == mInt64Ty)
+{
+SWR_ASSERT(Ty);
+BasePtr = INT_TO_PTR(BasePtr, Ty, name);
+bNeedTranslation = true;
+}
+std::vector valIndices;
+for (auto i : offset)
+{
+valIndices.push_back(C(i));
+}
+BasePtr = Builder::GEPA(BasePtr, valIndices, name);
+if (bNeedTranslation)
+{
+BasePtr = PTR_TO_INT(BasePtr, mInt64Ty, name);
+}
+
+return LOAD(BasePtr, name, Ty, usage);
 }
 
 CallInst* BuilderGfxMem::MASKED_LOAD(Value *Ptr, unsigned Align, Value 
*Mask, Value *PassThru, const Twine , Type *Ty, JIT_MEM_CLIENT usage)
@@ -196,8 +201,12 @@ namespace SwrJit
 return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, 
usage);
 }
 
-Value* BuilderGfxMem::TranslateGfxAddress(Value* xpGfxAddress)
+Value* BuilderGfxMem::TranslateGfxAddress(Value* xpGfxAddress, Type* 
PtrTy, const Twine )
 {
-return INT_TO_PTR(xpGfxAddress, PointerType::get(mInt8Ty, 0));
+if (PtrTy == nullptr)
+{
+PtrTy = mInt8PtrTy;
+}
+return INT_TO_PTR(xpGfxAddress, PtrTy, Name);
 }
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
index f8ec0ac..effbe05 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
@@ -48,7 +48,6 @@ namespace SwrJit
 
 virtual LoadInst* LOAD(Value *Ptr, const char *Name, Type *Ty = 
nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 virtual LoadInst* LOAD(Value *Ptr, const Twine  = "", Type *Ty = 
nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-virtual LoadInst* LOAD(Type *Ty, Value *Ptr, const Twine  = "", 
JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 virtual LoadInst* LOAD(Value *Ptr, bool isVolatile, const Twine  
= "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 virtual LoadInst* LOAD(Value *BasePtr, const 
std::initializer_list , const llvm::Twine& Name = "", Type *Ty 
= nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
@@ -58,7 +57,7 @@ namespace SwrJit
 
 virtual Value *GATHERDD(Value* src, Value* pBase, Value* indices, 
Value* mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
-Value* TranslateGfxAddress(Value* xpGfxAddress);
+Value* TranslateGfxAddress(Value* xpGfxAddress, Type* PtrTy = nullptr, 
const Twine  = "");
 
 
 protected:
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 02/21] swr: touch generated files to update timestamp

2018-04-25 Thread George Kyriazis
A previous change to the generators necessitates this change.
---
 src/gallium/drivers/swr/Makefile.am | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/src/gallium/drivers/swr/Makefile.am 
b/src/gallium/drivers/swr/Makefile.am
index c22f09e..8b31502 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -104,6 +104,7 @@ gen_swr_context_llvm.h: 
rasterizer/codegen/gen_llvm_types.py rasterizer/codegen/
$(srcdir)/rasterizer/codegen/gen_llvm_types.py \
--input $(srcdir)/swr_context.h \
--output ./gen_swr_context_llvm.h
+   $(AM_V_GEN)touch $@
 
 rasterizer/codegen/gen_knobs.cpp: rasterizer/codegen/gen_knobs.py 
rasterizer/codegen/knob_defs.py rasterizer/codegen/templates/gen_knobs.cpp 
rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
@@ -111,6 +112,7 @@ rasterizer/codegen/gen_knobs.cpp: 
rasterizer/codegen/gen_knobs.py rasterizer/cod
$(srcdir)/rasterizer/codegen/gen_knobs.py \
--output rasterizer/codegen/gen_knobs.cpp \
--gen_cpp
+   $(AM_V_GEN)touch $@
 
 rasterizer/codegen/gen_knobs.h: rasterizer/codegen/gen_knobs.py 
rasterizer/codegen/knob_defs.py rasterizer/codegen/templates/gen_knobs.h 
rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
@@ -118,6 +120,7 @@ rasterizer/codegen/gen_knobs.h: 
rasterizer/codegen/gen_knobs.py rasterizer/codeg
$(srcdir)/rasterizer/codegen/gen_knobs.py \
--output rasterizer/codegen/gen_knobs.h \
--gen_h
+   $(AM_V_GEN)touch $@
 
 rasterizer/jitter/gen_state_llvm.h: rasterizer/codegen/gen_llvm_types.py 
rasterizer/codegen/templates/gen_llvm.hpp rasterizer/core/state.h 
rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
@@ -125,6 +128,7 @@ rasterizer/jitter/gen_state_llvm.h: 
rasterizer/codegen/gen_llvm_types.py rasteri
$(srcdir)/rasterizer/codegen/gen_llvm_types.py \
--input $(srcdir)/rasterizer/core/state.h \
--output rasterizer/jitter/gen_state_llvm.h
+   $(AM_V_GEN)touch $@
 
 rasterizer/jitter/gen_builder.hpp: rasterizer/codegen/gen_llvm_ir_macros.py 
rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
@@ -133,6 +137,7 @@ rasterizer/jitter/gen_builder.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py rast
--input $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h \
--output rasterizer/jitter \
--gen_h
+   $(AM_V_GEN)touch $@
 
 rasterizer/jitter/gen_builder_meta.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py 
rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
@@ -140,6 +145,7 @@ rasterizer/jitter/gen_builder_meta.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py
$(srcdir)/rasterizer/codegen/gen_llvm_ir_macros.py \
--output rasterizer/jitter \
--gen_meta_h
+   $(AM_V_GEN)touch $@
 
 rasterizer/jitter/gen_builder_intrin.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py 
rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
@@ -147,6 +153,7 @@ rasterizer/jitter/gen_builder_intrin.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.
$(srcdir)/rasterizer/codegen/gen_llvm_ir_macros.py \
--output rasterizer/jitter \
--gen_intrin_h
+   $(AM_V_GEN)touch $@
 
 rasterizer/archrast/gen_ar_event.hpp: rasterizer/codegen/gen_archrast.py 
rasterizer/codegen/templates/gen_ar_event.hpp rasterizer/archrast/events.proto 
rasterizer/archrast/events_private.proto rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
@@ -156,6 +163,7 @@ rasterizer/archrast/gen_ar_event.hpp: 
rasterizer/codegen/gen_archrast.py rasteri
--proto_private 
$(srcdir)/rasterizer/archrast/events_private.proto \
--output rasterizer/archrast/gen_ar_event.hpp \
--gen_event_hpp
+   $(AM_V_GEN)touch $@
 
 rasterizer/archrast/gen_ar_event.cpp: rasterizer/codegen/gen_archrast.py 
rasterizer/codegen/templates/gen_ar_event.cpp rasterizer/archrast/events.proto 
rasterizer/archrast/events_private.proto rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
@@ -165,6 +173,7 @@ rasterizer/archrast/gen_ar_event.cpp: 
rasterizer/codegen/gen_archrast.py rasteri
--proto_private 
$(srcdir)/rasterizer/archrast/events_private.proto \
--output rasterizer/archrast/gen_ar_event.cpp \
--gen_event_cpp
+   $(AM_V_GEN)touch $@
 
 rasterizer/archrast/gen_ar_eventhandler.hpp: 
rasterizer/codegen/gen_archrast.py 
rasterizer/codegen/templates/gen_ar_eventhandler.hpp 
rasterizer/archrast/events.proto rasterizer/archrast/events_private.proto 
rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
@@ -174,6 +183,7 @@ rasterizer/archrast/gen_ar_eventhandler.hpp: 
rasterizer/codegen/gen_archrast.py
--proto_private 

[Mesa-dev] [PATCH 00/21] OpenSWR batch change

2018-04-25 Thread George Kyriazis
Misc changes.  Include:
- fix KNL behavior with LLVM 6.0.0
- fix byte offset for non-indexed draws
- fix 64-bit float handling with code generator
- misc cleanup

George Kyriazis (21):
  swr/rast: Fix byte offset for non-indexed draws
  swr: touch generated files to update timestamp
  swr/rast: Fix wrong type allocation
  swr/rast: Add some SIMD_T utility functors
  swr/rast: Fix x86 lowering 64-bit float handling
  swr/rast: Internal core change
  swr/rast: Add support for TexelMask evaluation
  swr/rast: Silence warnings
  swr/rast: Use different handling for stream masks
  swr/rast: WIP Translation handling
  swr/rast: Fix return type of VCVTPS2PH.
  swr/rast: Fix init in EventHandlerWorkerStats
  swr/rast: Package events.proto with core output
  swr/rast: Cleanup old windows cruft.
  swr/rast: Fix regressions.
  swr/rast: jit PRINT improvements.
  swr/rast: Add TranslateGfxAddress for shader
  swr/rast: Output rasterizer dir to console since it's process specific
  swr/rast: Use new processor detection mechanism
  swr/rast: Small editorial changes
  swr/rast: No need to export GetSimdValidIndicesGfx

 src/gallium/drivers/swr/Makefile.am| 11 
 .../drivers/swr/rasterizer/archrast/archrast.cpp   | 35 +-
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  2 +-
 .../codegen/templates/gen_ar_eventhandlerfile.hpp  |  7 +-
 src/gallium/drivers/swr/rasterizer/common/os.h |  3 +
 .../drivers/swr/rasterizer/common/simdlib.hpp  | 66 +++
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  4 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   | 12 ++--
 src/gallium/drivers/swr/rasterizer/core/state.h|  2 +-
 src/gallium/drivers/swr/rasterizer/core/utils.h|  1 +
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   | 66 ++-
 .../drivers/swr/rasterizer/jitter/JitManager.h |  2 +
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp|  2 -
 .../drivers/swr/rasterizer/jitter/builder.cpp  | 42 
 .../drivers/swr/rasterizer/jitter/builder.h|  2 +
 .../swr/rasterizer/jitter/builder_gfx_mem.cpp  | 74 --
 .../swr/rasterizer/jitter/builder_gfx_mem.h| 19 +-
 .../drivers/swr/rasterizer/jitter/builder_mem.h|  3 +-
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 15 -
 .../drivers/swr/rasterizer/jitter/builder_misc.h   | 12 
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  7 +-
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 55 +---
 .../swr/rasterizer/jitter/streamout_jit.cpp|  2 +
 23 files changed, 361 insertions(+), 83 deletions(-)

-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 01/21] swr/rast: Fix byte offset for non-indexed draws

2018-04-25 Thread George Kyriazis
for the case when USE_SIMD16_SHADERS == FALSE
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 9630afa..6e2bab3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1729,15 +1729,15 @@ void ProcessDraw(
 fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
 uint32_t offset;
 offset = std::min(endVertex-i, (uint32_t) 
KNOB_SIMD16_WIDTH);
-#if USE_SIMD16_SHADERS
 offset *= 4; // convert from index to address
+#if USE_SIMD16_SHADERS
 fetchInfo_lo.xpLastIndex += offset;
 #else
-fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t) 
KNOB_SIMD_WIDTH) * 4; // * 4 for converting index to address
+fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t) 
KNOB_SIMD_WIDTH);
 uint32_t offset2 = std::min(offset, (uint32_t) 
KNOB_SIMD16_WIDTH)-KNOB_SIMD_WIDTH;
 assert(offset >= 0);
 fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
-fetchInfo_hi.xpLastIndex += offset2 * 4; // * 4 for 
converting index to address
+fetchInfo_hi.xpLastIndex += offset2;
 #endif
 }
 // 1. Execute FS/VS for a single SIMD.
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 14/21] swr/rast: Cleanup old windows cruft.

2018-04-25 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 16 ++--
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 284eb27..bfc3e42 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -82,13 +82,6 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, 
const char* core)
 
 hostCPUName = sys::getHostCPUName();
 
-#if defined(_WIN32)
-// Needed for MCJIT on windows
-Triple hostTriple(sys::getProcessTriple());
-hostTriple.setObjectFormat(Triple::COFF);
-mpCurrentModule->setTargetTriple(hostTriple.getTriple());
-#endif // _WIN32
-
 auto optLevel = CodeGenOpt::Aggressive;
 
 if (KNOB_JIT_OPTIMIZATION_LEVEL >= CodeGenOpt::None &&
@@ -97,6 +90,7 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, 
const char* core)
 optLevel = CodeGenOpt::Level(KNOB_JIT_OPTIMIZATION_LEVEL);
 }
 
+mpCurrentModule->setTargetTriple(sys::getProcessTriple());
 mpExec = EngineBuilder(std::move(newModule))
 .setTargetOptions(tOpts)
 .setOptLevel(optLevel)
@@ -163,13 +157,7 @@ void JitManager::SetupNewModule()
 
 std::unique_ptr newModule(new Module("", mContext));
 mpCurrentModule = newModule.get();
-#if defined(_WIN32)
-// Needed for MCJIT on windows
-Triple hostTriple(sys::getProcessTriple());
-hostTriple.setObjectFormat(Triple::COFF);
-newModule->setTargetTriple(hostTriple.getTriple());
-#endif // _WIN32
-
+mpCurrentModule->setTargetTriple(sys::getProcessTriple());
 mpExec->addModule(std::move(newModule));
 mIsModuleFinalized = false;
 }
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 09/21] swr/rast: Use different handling for stream masks

2018-04-25 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/common/os.h  | 3 +++
 src/gallium/drivers/swr/rasterizer/core/api.cpp | 4 ++--
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp| 6 +++---
 src/gallium/drivers/swr/rasterizer/core/state.h | 2 +-
 src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp | 2 ++
 5 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h 
b/src/gallium/drivers/swr/rasterizer/common/os.h
index 5cfd12f..e779562 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -209,6 +209,9 @@ unsigned char _BitScanReverse(unsigned int *Index, unsigned 
int Mask)
 return (Mask != 0);
 }
 
+#define _BitScanForward64 _BitScanForward
+#define _BitScanReverse64 _BitScanReverse
+
 inline
 void *AlignedMalloc(size_t size, size_t alignment)
 {
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index e37e2e4..a2ee85d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -976,14 +976,14 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
 
 if (pState->state.soState.soEnable)
 {
-uint32_t streamMasks = 0;
+uint64_t streamMasks = 0;
 for (uint32_t i = 0; i < 4; ++i)
 {
 streamMasks |= pState->state.soState.streamMasks[i];
 }
 
 DWORD maxAttrib;
-if (_BitScanReverse(, streamMasks))
+if (_BitScanReverse64(, streamMasks))
 {
 pState->state.feNumAttributes = 
std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1));
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 6e2bab3..1847c3e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -528,10 +528,10 @@ static void StreamOut(
 for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
 {
 DWORD slot = 0;
-uint32_t soMask = soState.streamMasks[streamIndex];
+uint64_t soMask = soState.streamMasks[streamIndex];
 
 // Write all entries into primitive data buffer for SOS.
-while (_BitScanForward(, soMask))
+while (_BitScanForward64(, soMask))
 {
 simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM];// prim attribs 
(always 4 wide)
 uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
@@ -551,7 +551,7 @@ static void StreamOut(
 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
 }
 
-soMask &= ~(1 << slot);
+soMask &= ~(uint64_t(1) << slot);
 }
 
 // Update pPrimData pointer 
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index 217cf44..f160913 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -702,7 +702,7 @@ struct SWR_STREAMOUT_STATE
 // The stream masks specify which attributes are sent to which streams.
 // These masks help the FE to setup the pPrimData buffer that is passed
 // the Stream Output Shader (SOS) function.
-uint32_t streamMasks[MAX_SO_STREAMS];
+uint64_t streamMasks[MAX_SO_STREAMS];
 
 // Number of attributes, including position, per vertex that are streamed 
out.
 // This should match number of bits in stream mask.
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 15a6bc4..f804900 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -313,6 +313,7 @@ struct StreamOutJit : public Builder
 
 JitManager::DumpToFile(soFunc, "SoFunc_optimized");
 
+
 return soFunc;
 }
 };
@@ -333,6 +334,7 @@ PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE 
hFunc)
 
 pJitMgr->DumpAsm(func, "SoFunc_optimized");
 
+
 return pfnStreamOut;
 }
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 04/21] swr/rast: Add some SIMD_T utility functors

2018-04-25 Thread George Kyriazis
Adds the SIMDVecEqual and SIMDVecHash functors.
---
 .../drivers/swr/rasterizer/common/simdlib.hpp  | 66 ++
 1 file changed, 66 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp 
b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
index 4114645..24cf27d 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -580,3 +580,69 @@ template  using Double = typename 
SIMD_T::Double;
 template  using Integer= typename SIMD_T::Integer;
 template  using Vec4   = typename SIMD_T::Vec4;
 template  using Mask   = typename SIMD_T::Mask;
+
+template 
+struct SIMDVecEqual
+{
+INLINE bool operator () (Integer a, Integer b) const
+{
+Integer c = SIMD_T::xor_si(a, b);
+return SIMD_T::testz_si(c, c);
+}
+
+INLINE bool operator () (Float a, Float b) const
+{
+return this->operator()(SIMD_T::castps_si(a), SIMD_T::castps_si(b));
+}
+
+INLINE bool operator () (Double a, Double b) const
+{
+return this->operator()(SIMD_T::castpd_si(a), SIMD_T::castpd_si(b));
+}
+};
+
+template 
+struct SIMDVecHash
+{
+INLINE uint32_t operator ()(Integer val) const
+{
+#if defined(_WIN64) || !defined(_WIN32) // assume non-Windows is always 64-bit
+static_assert(sizeof(void*) == 8, "This path only meant for 64-bit 
code");
+
+uint64_t crc32 = 0;
+const uint64_t *pData = reinterpret_cast();
+static const uint32_t loopIterations = sizeof(val) / sizeof(void*);
+static_assert(loopIterations * sizeof(void*) == sizeof(val), "bad 
vector size");
+
+for (uint32_t i = 0; i < loopIterations; ++i)
+{
+crc32 = _mm_crc32_u64(crc32, pData[i]);
+}
+
+return static_cast(crc32);
+#else
+static_assert(sizeof(void*) == 4, "This path only meant for 32-bit 
code");
+
+uint32_t crc32 = 0;
+const uint32_t *pData = reinterpret_cast();
+static const uint32_t loopIterations = sizeof(val) / sizeof(void*);
+static_assert(loopIterations * sizeof(void*) == sizeof(val), "bad 
vector size");
+
+for (uint32_t i = 0; i < loopIterations; ++i)
+{
+crc32 = _mm_crc32_u32(crc32, pData[i]);
+}
+
+return crc32;
+#endif
+};
+
+INLINE uint32_t operator ()(Float val) const
+{
+return operator()(SIMD_T::castps_si(val));
+};
+INLINE uint32_t operator ()(Double val) const
+{
+return operator()(SIMD_T::castpd_si(val));
+}
+};
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 08/21] swr/rast: Silence warnings

2018-04-25 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp| 2 --
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp| 1 -
 src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp | 3 ++-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 58fdb7f..72bf900 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -558,8 +558,6 @@ struct BlendJit : public Builder
 ppoMask->setName("ppoMask");
 Value* ppMask = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_pMask });
 ppMask->setName("pMask");
-Value* AlphaTest1 = LOAD(pBlendContext, { 0, 
SWR_BLEND_CONTEXT_isAlphaBlended });
-ppMask->setName("AlphaTest1");
 
 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, 
"Unsupported hot tile format");
 Value* dst[4];
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index a43c787..48f0961 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1070,7 +1070,6 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* 
pIndices, Value* pLastIndex)
 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
 {
 DataLayout dL(JM()->mpCurrentModule);
-unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
 Value* iLastIndex = pLastIndex; 
 Value* iIndices = pIndices;
 
diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index eac0549..b8c3296 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -168,7 +168,6 @@ namespace SwrJit
 // intrinsic.
 void GetRequestedWidthAndType(CallInst* pCallInst, const StringRef 
intrinName, TargetWidth* pWidth, Type** pTy)
 {
-uint32_t vecWidth;
 Type* pVecTy = pCallInst->getType();
 
 // Check for intrinsic specific types
@@ -210,6 +209,7 @@ namespace SwrJit
 {
 case W256: numElem = 8; break;
 case W512: numElem = 16; break;
+   default: SWR_ASSERT(false, "Unhandled vector width type %d\n", 
width);
 }
 
 return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
@@ -222,6 +222,7 @@ namespace SwrJit
 {
 case W256: mask = B->C((uint8_t)-1); break;
 case W512: mask = B->C((uint16_t)-1); break;
+   default: SWR_ASSERT(false, "Unhandled vector width type %d\n", 
width);
 }
 return mask;
 }
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 13/21] swr/rast: Package events.proto with core output

2018-04-25 Thread George Kyriazis
events.proto is packaged only if the file exists in DEBUG_OUTPUT_DIR. The
expectation is that the AR rasterizerLauncher will start placing it there when
launching a workload (which comes in a subsequent checkin).
---
 .../drivers/swr/rasterizer/archrast/archrast.cpp   | 30 +-
 .../codegen/templates/gen_ar_eventhandlerfile.hpp  |  4 ++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp 
b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
index ff7bdc3..285d1ac 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
+++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
@@ -93,7 +93,35 @@ namespace ArchRast
 class EventHandlerApiStats : public EventHandlerFile
 {
 public:
-EventHandlerApiStats(uint32_t id) : EventHandlerFile(id) {}
+EventHandlerApiStats(uint32_t id) : EventHandlerFile(id) {
+#if defined(_WIN32)
+// Attempt to copy the events.proto file to the ArchRasty output 
dir. It's common for tools to place the events.proto file
+// in the DEBUG_OUTPUT_DIR when launching AR. If it exists, this 
will attempt to copy it the first time we get here to package
+// it with the stats. Otherwise, the user would need to specify 
the events.proto location when parsing the stats in post.
+std::stringstream eventsProtoSrcFilename, eventsProtoDstFilename;
+eventsProtoSrcFilename << KNOB_DEBUG_OUTPUT_DIR << 
"\\events.proto" << std::ends;
+eventsProtoDstFilename << mOutputDir.substr(0, mOutputDir.size() - 
1) << "\\events.proto" << std::ends;
+
+// If event.proto already exists, we're done; else do the copy
+struct stat buf; // Use a Posix stat for file existence check
+if (!stat(eventsProtoDstFilename.str().c_str(), ) == 0) {
+// Now check to make sure the events.proto source exists
+if (stat(eventsProtoSrcFilename.str().c_str(), ) == 0) {
+std::ifstream srcFile;
+srcFile.open(eventsProtoSrcFilename.str().c_str(), 
std::ios::binary);
+if (srcFile.is_open())
+{
+// Just do a binary buffer copy
+std::ofstream dstFile;
+dstFile.open(eventsProtoDstFilename.str().c_str(), 
std::ios::binary);
+dstFile << srcFile.rdbuf();
+dstFile.close();
+}
+srcFile.close();
+}
+}
+#endif
+}
 
 virtual void Handle(const DrawInstancedEvent& event)
 {
diff --git 
a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
index d1852b3..54d2486 100644
--- 
a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
+++ 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
@@ -56,7 +56,8 @@ namespace ArchRast
 const char* pBaseName = strrchr(procname, '\\');
 std::stringstream outDir;
 outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << 
std::ends;
-CreateDirectory(outDir.str().c_str(), NULL);
+mOutputDir = outDir.str();
+CreateDirectory(mOutputDir.c_str(), NULL);
 
 // There could be multiple threads creating thread pools. We
 // want to make sure they are uniquly identified by adding in
@@ -152,6 +153,7 @@ namespace ArchRast
 }
 
 std::string mFilename;
+std::string mOutputDir;
 
 static const uint32_t mBufferSize = 1024;
 uint8_t mBuffer[mBufferSize];
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 32/45] swr/rast: Fix alloca usage in jitter

2018-04-17 Thread George Kyriazis
Fix issue where temporary allocas were getting hoisted to function entry
unnecessarily. We now explicitly mark temporary allocas and skip hoisting
during the hoist pass. Should reduce stack usage.
---
 src/gallium/drivers/swr/rasterizer/jitter/builder.cpp   | 17 +
 src/gallium/drivers/swr/rasterizer/jitter/builder.h |  2 ++
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp   |  1 +
 3 files changed, 20 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 53947c3..bd81560 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -111,4 +111,21 @@ namespace SwrJit
 mSimdVectorIntTy = ArrayType::get(mSimdInt32Ty, 4);
 mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
 }
+
+/// @brief Mark this alloca as temporary to avoid hoisting later on
+void Builder::SetTempAlloca(Value* inst)
+{
+AllocaInst* pAlloca = dyn_cast(inst);
+SWR_ASSERT(pAlloca, "Unexpected non-alloca instruction");
+MDNode* N = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, 
"is_temp_alloca"));
+pAlloca->setMetadata("is_temp_alloca", N);
+}
+
+bool Builder::IsTempAlloca(Value* inst)
+{
+AllocaInst* pAlloca = dyn_cast(inst);
+SWR_ASSERT(pAlloca, "Unexpected non-alloca instruction");
+
+return (pAlloca->getMetadata("is_temp_alloca") != nullptr);
+}
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 4c79bab..27a32bc 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -96,6 +96,8 @@ namespace SwrJit
 Type*mSimd32Int8Ty;
 
 void SetTargetWidth(uint32_t width);
+void SetTempAlloca(Value* inst);
+bool IsTempAlloca(Value* inst);
 
 #include "gen_builder.hpp"
 #include "gen_builder_meta.hpp"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index c5f0b2b..eccf0ad 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -229,6 +229,7 @@ namespace SwrJit
 
 // store vSrc on the stack.  this way we can select between a 
valid load address and the vSrc address
 Value* vSrcPtr = ALLOCA(vSrc->getType());
+SetTempAlloca(vSrcPtr);
 STORE(vSrc, vSrcPtr);
 
 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 25/45] swr/rast: Enable generalized fetch jit

2018-04-17 Thread George Kyriazis
Enable generalized fetch jit with 8 or 16 wide SIMD target. Still some
work needed to remove some simd8 double pumping for 16-wide target.

Also removed unused non-gather load vertices path.
---
 .../drivers/swr/rasterizer/jitter/builder.cpp  |   26 +-
 .../drivers/swr/rasterizer/jitter/builder.h|4 +-
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  |   69 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 1197 +++-
 .../drivers/swr/rasterizer/jitter/fetch_jit.h  |6 +-
 5 files changed, 169 insertions(+), 1133 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 625f132..53947c3 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -66,16 +66,7 @@ namespace SwrJit
 mSimd4FP64Ty = VectorType::get(mDoubleTy, 4);
 
 // Built in types: target simd
-
-mSimdInt1Ty = VectorType::get(mInt1Ty,  mVWidth);
-mSimdInt16Ty= VectorType::get(mInt16Ty, mVWidth);
-mSimdInt32Ty= VectorType::get(mInt32Ty, mVWidth);
-mSimdInt64Ty= VectorType::get(mInt64Ty, mVWidth);
-mSimdFP16Ty = VectorType::get(mFP16Ty,  mVWidth);
-mSimdFP32Ty = VectorType::get(mFP32Ty,  mVWidth);
-mSimdVectorTy   = ArrayType::get(mSimdFP32Ty, 4);
-mSimdVectorIntTy= ArrayType::get(mSimdInt32Ty, 4);
-mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
+SetTargetWidth(pJitMgr->mVWidth);
 
 // Built in types: simd16
 
@@ -105,4 +96,19 @@ namespace SwrJit
 mSimd16IntPtrTy = mSimd16Int64Ty;
 }
 }
+
+void Builder::SetTargetWidth(uint32_t width)
+{
+mVWidth = width;
+
+mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth);
+mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
+mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
+mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
+mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
+mSimdVectorIntTy = ArrayType::get(mSimdInt32Ty, 4);
+mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
+}
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 6b2c9f0..4c79bab 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -46,7 +46,7 @@ namespace SwrJit
 JitManager *mpJitMgr;
 IRBuilder<> *mpIRBuilder;
 
-uint32_t mVWidth;   // vector width simd8
+uint32_t mVWidth;   // vector width target simd
 uint32_t mVWidth16; // vector width simd16
 
 // Built in types: scalar
@@ -95,6 +95,8 @@ namespace SwrJit
 
 Type*mSimd32Int8Ty;
 
+void SetTargetWidth(uint32_t width);
+
 #include "gen_builder.hpp"
 #include "gen_builder_meta.hpp"
 #include "gen_builder_intrin.hpp"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index adb9296..a67cb9b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -38,6 +38,7 @@ namespace SwrJit
 {
 void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
 {
+SWR_ASSERT(ptr->getType() != mInt64Ty, "Address appears to be GFX 
access.  Requires translation through BuilderGfxMem.");
 }
 
 Value *Builder::GEP(Value* ptr, const std::initializer_list 
)
@@ -175,78 +176,14 @@ namespace SwrJit
 {
 AssertMemoryUsageParams(pBase, usage);
 
-Value* vGather;
-
-// use avx2 gather instruction if available
-if (JM()->mArch.AVX2())
-{
-vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
-}
-else
-{
-Value* pStack = STACKSAVE();
-
-// store vSrc on the stack.  this way we can select between a 
valid load address and the vSrc address
-Value* vSrcPtr = ALLOCA(vSrc->getType());
-STORE(vSrc, vSrcPtr);
-
-vGather = VUNDEF_I();
-Value *vScaleVec = VIMMED1((uint32_t)scale);
-Value *vOffsets = MUL(vIndices, vScaleVec);
-for (uint32_t i = 0; i < mVWidth; ++i)
-{
-// single component byte index
-Value *offset = VEXTRACT(vOffsets, C(i));
-// byte pointer to component
-Value *loadAddress = GEP(pBase, offset);
-loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 
0));
-// pointer to the value to load if we're masking off a 

[Mesa-dev] [PATCH v2 30/45] swr/rast: Fix byte offset for non-indexed draws

2018-04-17 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 25d1073..2076859 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1729,13 +1729,14 @@ void ProcessDraw(
 uint32_t offset;
 offset = std::min(endVertex-i, (uint32_t) 
KNOB_SIMD16_WIDTH);
 #if USE_SIMD16_SHADERS
+offset *= 4; // convert from index to address
 fetchInfo_lo.pLastIndex += offset;
 #else
-fetchInfo_lo.pLastIndex += std::min(offset, (uint32_t) 
KNOB_SIMD_WIDTH);
+fetchInfo_lo.pLastIndex += std::min(offset, (uint32_t) 
KNOB_SIMD_WIDTH) * 4; // * 4 for converting index to address
 uint32_t offset2 = std::min(offset, (uint32_t) 
KNOB_SIMD16_WIDTH)-KNOB_SIMD_WIDTH;
 assert(offset >= 0);
 fetchInfo_hi.pLastIndex = fetchInfo_hi.pIndices;
-fetchInfo_hi.pLastIndex += offset2;
+fetchInfo_hi.pLastIndex += offset2 * 4; // * 4 for 
converting index to address
 #endif
 }
 // 1. Execute FS/VS for a single SIMD.
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 41/45] swr/rast: Refactor to improve code sharing.

2018-04-17 Thread George Kyriazis
---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 79 ++
 1 file changed, 36 insertions(+), 43 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 767866f..af97b83 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -63,6 +63,7 @@ struct FetchJit : public BuilderGfxMem
 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
+template Value* GetSimdValidIndicesHelper(Value* pIndices, 
Value* pLastIndex);
 
 // package up Shuffle*bpcGatherd args into a tuple for convenience
 typedef std::tuplegetType() == mInt64Ty && pLastIndex->getType() == 
mInt64Ty, "Function expects gfxptr_t for both input parameters.");
 
+Type* Ty = nullptr;
+
+static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == 
sizeof(uint8_t), "Unsupported type for use with GetSimdValidIndicesHelper");
+constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
+if (bSize)
+{
+Ty = mInt16PtrTy;
+}
+else if (sizeof(T) == sizeof(uint8_t))
+{
+Ty = mInt8PtrTy;
+}
+else
+{
+SWR_ASSERT(false, "This should never happen as per static_assert 
above.");
+}
+
 Value* vIndices = VUNDEF_I();
 
 {
 // store 0 index on stack to be used to conditionally load from if 
index address is OOB
-Value* pZeroIndex = ALLOCA(mInt8Ty);
-STORE(C((uint8_t)0), pZeroIndex);
+Value* pZeroIndex = ALLOCA(Ty);
+STORE(C((T)0), pZeroIndex);
 
 // Load a SIMD of index pointers
 for (int64_t lane = 0; lane < mVWidth; lane++)
 {
 // Calculate the address of the requested index
-Value *pIndex = GEP(pIndices, C(lane), mInt8PtrTy);
+Value *pIndex = GEP(pIndices, C(lane), Ty);
 
-pLastIndex = INT_TO_PTR(pLastIndex, mInt8PtrTy);
+pLastIndex = INT_TO_PTR(pLastIndex, Ty);
 
 // check if the address is less than the max index, 
 Value* mask = ICMP_ULT(pIndex, pLastIndex);
 
 // if valid, load the index. if not, load 0 from the stack
 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
-Value *index = LOAD(pValid, "valid index", 
PointerType::get(mInt8Ty, 0), GFX_MEM_CLIENT_FETCH);
+Value *index = LOAD(pValid, "valid index", Ty, 
GFX_MEM_CLIENT_FETCH);
 
 // zero extended index to 32 bits and insert into the correct simd 
lane
 index = Z_EXT(index, mInt32Ty);
@@ -1028,43 +1040,24 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* 
pIndices, Value* pLastIndex)
 
 //
 /// @brief Loads a simd of valid indices. OOB indices are set to 0
+/// *Note* have to do 8bit index checking in scalar until we have AVX-512
+/// support
+/// @param pIndices - pointer to 8 bit indices
+/// @param pLastIndex - pointer to last valid index
+Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
+{
+return GetSimdValidIndicesHelper(pIndices, pLastIndex);
+}
+
+//
+/// @brief Loads a simd of valid indices. OOB indices are set to 0
 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
 /// support
 /// @param pIndices - pointer to 16 bit indices
 /// @param pLastIndex - pointer to last valid index
 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
 {
-SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == 
mInt64Ty, "Function expects gfxptr_t for both input parameters.");
-
-

[Mesa-dev] [PATCH v2 27/45] swr/rast: Fix codegen for typedef types

2018-04-17 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
index a127976..d8863c0 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
@@ -143,6 +143,7 @@ def gen_llvm_types(input_file, output_file):
 is_llvm_typedef = re.search(r'@llvm_typedef', line)
 if is_llvm_typedef is not None:
 is_llvm_typedef = True
+continue
 else:
 is_llvm_typedef = False
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 20/45] swr/rast: Start refactoring of builder/packetizer.

2018-04-17 Thread George Kyriazis
Move x86 intrinsic lowering to a separate pass. Builder now instantiates
generic intrinsics for features not supported by llvm. The separate x86
lowering pass is responsible for lowering to valid x86 for the target
SIMD architecture. Currently it's a port of existing code to get it
up and running quickly. Will eventually support optimized x86 for AVX,
AVX2 and AVX512.
---
 src/gallium/drivers/swr/Makefile.am|   6 +-
 src/gallium/drivers/swr/Makefile.sources   |   3 +-
 src/gallium/drivers/swr/SConscript |   4 +-
 src/gallium/drivers/swr/meson.build|   3 +-
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  58 +--
 .../drivers/swr/rasterizer/codegen/meson.build |   2 +-
 .../rasterizer/codegen/templates/gen_builder.hpp   |  11 +-
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp|   3 +
 .../drivers/swr/rasterizer/jitter/builder.cpp  |   4 +
 .../drivers/swr/rasterizer/jitter/builder.h|   6 +-
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  |   5 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|   3 +
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 455 +
 .../swr/rasterizer/jitter/functionpasses/passes.h  |  37 ++
 .../drivers/swr/rasterizer/jitter/meson.build  |   8 +-
 .../swr/rasterizer/jitter/streamout_jit.cpp|   3 +
 16 files changed, 565 insertions(+), 46 deletions(-)
 create mode 100644 
src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
 create mode 100644 
src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h

diff --git a/src/gallium/drivers/swr/Makefile.am 
b/src/gallium/drivers/swr/Makefile.am
index 32dd9e5..c22f09e 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -80,7 +80,7 @@ BUILT_SOURCES = \
rasterizer/codegen/gen_knobs.h \
rasterizer/jitter/gen_state_llvm.h \
rasterizer/jitter/gen_builder.hpp \
-   rasterizer/jitter/gen_builder_x86.hpp \
+   rasterizer/jitter/gen_builder_meta.hpp \
rasterizer/jitter/gen_builder_intrin.hpp \
rasterizer/archrast/gen_ar_event.hpp \
rasterizer/archrast/gen_ar_event.cpp \
@@ -134,12 +134,12 @@ rasterizer/jitter/gen_builder.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py rast
--output rasterizer/jitter \
--gen_h
 
-rasterizer/jitter/gen_builder_x86.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py 
rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
+rasterizer/jitter/gen_builder_meta.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py 
rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
$(PYTHON_GEN) \
$(srcdir)/rasterizer/codegen/gen_llvm_ir_macros.py \
--output rasterizer/jitter \
-   --gen_x86_h
+   --gen_meta_h
 
 rasterizer/jitter/gen_builder_intrin.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py 
rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
diff --git a/src/gallium/drivers/swr/Makefile.sources 
b/src/gallium/drivers/swr/Makefile.sources
index 4924da1..a7fcba8 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -152,7 +152,8 @@ JITTER_CXX_SOURCES := \
rasterizer/jitter/JitManager.h \
rasterizer/jitter/streamout_jit.cpp \
rasterizer/jitter/streamout_jit.h \
-   rasterizer/jitter/shader_lib/DebugOutput.cpp
+   rasterizer/jitter/shader_lib/DebugOutput.cpp \
+   rasterizer/jitter/functionpasses/lower_x86.cpp
 
 MEMORY_CXX_SOURCES := \
rasterizer/memory/ClearTile.cpp \
diff --git a/src/gallium/drivers/swr/SConscript 
b/src/gallium/drivers/swr/SConscript
index 5097be6..528cfac 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -76,10 +76,10 @@ Depends('rasterizer/jitter/gen_builder.hpp',
 swrroot + 'rasterizer/codegen/templates/gen_builder.hpp')
 
 env.CodeGenerate(
-target = 'rasterizer/jitter/gen_builder_x86.hpp',
+target = 'rasterizer/jitter/gen_builder_meta.hpp',
 script = swrroot + 'rasterizer/codegen/gen_llvm_ir_macros.py',
 source = '',
-command = python_cmd + ' $SCRIPT --output ' + bldroot + 
'/rasterizer/jitter --gen_x86_h'
+command = python_cmd + ' $SCRIPT --output ' + bldroot + 
'/rasterizer/jitter --gen_meta_h'
 )
 Depends('rasterizer/jitter/gen_builder.hpp',
 swrroot + 'rasterizer/codegen/templates/gen_builder.hpp')
diff --git a/src/gallium/drivers/swr/meson.build 
b/src/gallium/drivers/swr/meson.build
index 3848232..949f582 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -80,6 +80,7 @@ files_swr_mesa = files(
   'rasterizer/jitter/streamout_jit.cpp',
   'rasterizer/jitter/streamout_jit.h',
   'rasterizer/jitter/shader_lib/DebugOutput.cpp',
+  

[Mesa-dev] [PATCH v2 40/45] swr/rast: minimize codegen redundant work

2018-04-17 Thread George Kyriazis
Move filtering of redundant codegen operations into gen scripts themselves
---
 .../drivers/swr/rasterizer/codegen/gen_archrast.py | 111 +
 .../drivers/swr/rasterizer/codegen/gen_backends.py |  97 +--
 .../drivers/swr/rasterizer/codegen/gen_common.py   | 131 +++--
 .../drivers/swr/rasterizer/codegen/gen_knobs.py|  53 ++---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  42 +--
 .../swr/rasterizer/codegen/gen_llvm_types.py   |  29 -
 6 files changed, 335 insertions(+), 128 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py
index aa09f22..c5842aa 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py
@@ -24,7 +24,7 @@ from __future__ import print_function
 import os
 import sys
 import re
-from gen_common import ArgumentParser, MakoTemplateWriter
+from gen_common import *
 
 def parse_event_fields(lines, idx, event_dict):
 field_names = []
@@ -144,6 +144,10 @@ def main():
 print('Error: Could not find private proto file %s' % 
proto_private_filename, file=sys.stderr)
 return 1
 
+final_output_dir = output_dir
+MakeDir(final_output_dir)
+output_dir = MakeTmpDir('_codegen')
+
 protos = {}
 protos['events'] = {}   # event dictionary containing events with 
their fields
 protos['event_names'] = []  # needed to keep events in order parsed. dict 
is not ordered.
@@ -153,53 +157,64 @@ def main():
 parse_protos(protos, proto_filename)
 parse_protos(protos, proto_private_filename)
 
-# Generate event header
-if args.gen_event_hpp:
-curdir = os.path.dirname(os.path.abspath(__file__))
-template_file = os.sep.join([curdir, 'templates', 'gen_ar_event.hpp'])
-output_fullpath = os.sep.join([output_dir, output_filename])
-
-MakoTemplateWriter.to_file(template_file, output_fullpath,
-cmdline=sys.argv,
-filename=output_filename,
-protos=protos)
-
-# Generate event implementation
-if args.gen_event_cpp:
-curdir = os.path.dirname(os.path.abspath(__file__))
-template_file = os.sep.join([curdir, 'templates', 'gen_ar_event.cpp'])
-output_fullpath = os.sep.join([output_dir, output_filename])
-
-MakoTemplateWriter.to_file(template_file, output_fullpath,
-cmdline=sys.argv,
-filename=output_filename,
-protos=protos)
-
-# Generate event handler header
-if args.gen_eventhandler_hpp:
-curdir = os.path.dirname(os.path.abspath(__file__))
-template_file = os.sep.join([curdir, 'templates', 
'gen_ar_eventhandler.hpp'])
-output_fullpath = os.sep.join([output_dir, output_filename])
-
-MakoTemplateWriter.to_file(template_file, output_fullpath,
-cmdline=sys.argv,
-filename=output_filename,
-event_header='gen_ar_event.hpp',
-protos=protos)
-
-# Generate event handler header
-if args.gen_eventhandlerfile_hpp:
-curdir = os.path.dirname(os.path.abspath(__file__))
-template_file = os.sep.join([curdir, 'templates', 
'gen_ar_eventhandlerfile.hpp'])
-output_fullpath = os.sep.join([output_dir, output_filename])
-
-MakoTemplateWriter.to_file(template_file, output_fullpath,
-cmdline=sys.argv,
-filename=output_filename,
-event_header='gen_ar_eventhandler.hpp',
-protos=protos)
-
-return 0
+rval = 0
+
+try:
+# Generate event header
+if args.gen_event_hpp:
+curdir = os.path.dirname(os.path.abspath(__file__))
+template_file = os.sep.join([curdir, 'templates', 
'gen_ar_event.hpp'])
+output_fullpath = os.sep.join([output_dir, output_filename])
+
+MakoTemplateWriter.to_file(template_file, output_fullpath,
+cmdline=sys.argv,
+filename=output_filename,
+protos=protos)
+
+# Generate event implementation
+if args.gen_event_cpp:
+curdir = os.path.dirname(os.path.abspath(__file__))
+template_file = os.sep.join([curdir, 'templates', 
'gen_ar_event.cpp'])
+output_fullpath = os.sep.join([output_dir, output_filename])
+
+MakoTemplateWriter.to_file(template_file, output_fullpath,
+cmdline=sys.argv,
+filename=output_filename,
+protos=protos)
+
+# Generate event handler header
+if args.gen_eventhandler_hpp:
+curdir = os.path.dirname(os.path.abspath(__file__))
+template_file = os.sep.join([curdir, 'templates', 
'gen_ar_eventhandler.hpp'])
+output_fullpath = os.sep.join([output_dir, 

[Mesa-dev] [PATCH v2 12/45] swr/rast: Permute work for simd16

2018-04-17 Thread George Kyriazis
Fix slow permutes in PA tri lists under SIMD16 emulation on AVX

Added missing permute (interlane, immediate) to SIMDLIB
---
 .../drivers/swr/rasterizer/common/simd16intrin.h   |  1 +
 .../drivers/swr/rasterizer/common/simdintrin.h |  1 +
 .../swr/rasterizer/common/simdlib_256_avx.inl  |  6 
 .../swr/rasterizer/common/simdlib_256_avx2.inl |  7 
 .../swr/rasterizer/common/simdlib_512_avx512.inl   |  6 
 .../swr/rasterizer/common/simdlib_512_emu.inl  | 14 ++--
 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 42 +-
 7 files changed, 67 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h 
b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
index 019b26d..98a8b9b 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
@@ -138,6 +138,7 @@ typedef SIMD512 SIMD16;
 #define _simd16_cmpeq_epi8  SIMD16::cmpeq_epi8
 #define _simd16_cmpgt_epi8  SIMD16::cmpgt_epi8
 
+#define _simd16_permute_ps_i(a, i)  SIMD16::permute_ps(a)
 #define _simd16_permute_ps  SIMD16::permute_ps
 #define _simd16_permute_epi32   SIMD16::permute_epi32
 #define _simd16_sllv_epi32  SIMD16::sllv_epi32
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h 
b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index fce360d..b1471a9 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -106,6 +106,7 @@ typedef SIMD256 SIMD;
 #define _simd_cmpgt_epi16   SIMD::cmpgt_epi16
 #define _simd_cmpeq_epi16   SIMD::cmpeq_epi16
 #define _simd_movemask_epi8 SIMD::movemask_epi8
+#define _simd_permute_ps_i(a, i)SIMD::permute_ps(a)
 #define _simd_permute_psSIMD::permute_ps
 #define _simd_permute_epi32 SIMD::permute_epi32
 #define _simd_srlv_epi32SIMD::srlv_epi32
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
index 42b4552..00c094a 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
@@ -479,6 +479,12 @@ SIMD_EMU_IWRAPPER_2(packs_epi32);   // See documentation 
for _mm256_packs_epi32
 SIMD_EMU_IWRAPPER_2(packus_epi16);  // See documentation for 
_mm256_packus_epi16 and _mm512_packus_epi16
 SIMD_EMU_IWRAPPER_2(packus_epi32);  // See documentation for 
_mm256_packus_epi32 and _mm512_packus_epi32
 
+template
+static SIMDINLINE Float SIMDCALL permute_ps(Float const )
+{
+return _mm256_permute_ps(a, ImmT);
+}
+
 static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const , Integer 
const ) // return a[swiz[i]] for each 32-bit lane i (int32)
 {
 Integer result;
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
index 9cd0a64..96c24ff 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
@@ -174,6 +174,13 @@ SIMD_IWRAPPER_2(packs_epi16);   // See documentation for 
_mm256_packs_epi16 and
 SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 
and _mm512_packs_epi32
 SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 
and _mm512_packus_epi16
 SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 
and _mm512_packus_epi32
+
+template
+static SIMDINLINE Float SIMDCALL permute_ps(Float const )
+{
+return _mm256_permute_ps(a, ImmT);
+}
+
 SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
 
 static SIMDINLINE Float SIMDCALL permute_ps(Float const , Integer const 
)// return a[swiz[i]] for each 32-bit lane i (float)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
index f3a58f9..dfe19d3 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -433,6 +433,12 @@ static SIMDINLINE Integer SIMDCALL insert_si(Integer a, 
SIMD256Impl::Integer b)
 // SIMD_IWRAPPER_2(packus_epi16);  // See documentation for 
_mm512_packus_epi16 and _mm512_packus_epi16
 // SIMD_IWRAPPER_2(packus_epi32);  // See documentation for 
_mm512_packus_epi32 and _mm512_packus_epi32
 
+template
+static SIMDINLINE Float SIMDCALL permute_ps(Float const )
+{
+return _mm512_permute_ps(a, ImmT);
+}
+
 static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)
// return a[swiz[i]] for each 32-bit lane i (float)
 {
 return 

[Mesa-dev] [PATCH v2 14/45] swr/rast: Add "Num Instructions Executed" stats intrinsic.

2018-04-17 Thread George Kyriazis
Added a SWR_SHADER_STATS structure which is passed to each shader. The
stats pass will instrument the shader to populate this.
---
 src/gallium/drivers/swr/rasterizer/core/state.h | 28 ++---
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index 22acbe0..47ffacf 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -214,6 +214,15 @@ struct SIMDVERTEX_T
 };
 
 //
+/// SWR_SHADER_STATS
+/// @brief Structure passed to shader for stats collection.
+/
+struct SWR_SHADER_STATS
+{
+uint32_t numInstExecuted; // This is roughly the API instructions executed 
and not x86.
+};
+
+//
 /// SWR_VS_CONTEXT
 /// @brief Input to vertex shader
 /
@@ -232,6 +241,7 @@ struct SWR_VS_CONTEXT
 simd16scalari VertexID16;   // IN: Vertex ID (16-wide)
 #endif
 #endif
+SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
 };
 
 /
@@ -281,6 +291,7 @@ struct SWR_HS_CONTEXT
 simdscalari mask;   // IN: Active mask for shader
 ScalarPatch* pCPout;// OUT: Output control point patch
 // SIMD-sized-array of SCALAR patches
+SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
 };
 
 //
@@ -298,6 +309,7 @@ struct SWR_DS_CONTEXT
 simdscalar* pDomainV;   // IN: (SIMD) Domain Point V coords
 simdscalari mask;   // IN: Active mask for shader
 simdscalar* pOutputData;// OUT: (SIMD) Vertex Attributes (2D array 
of vectors, one row per attribute-component)
+SWR_SHADER_STATS stats; // OUT: shader statistics used for 
archrast.
 };
 
 //
@@ -312,6 +324,7 @@ struct SWR_GS_CONTEXT
 uint32_t InstanceID;// IN: input instance ID
 simdscalari mask;   // IN: Active mask for shader
 uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains 
vertices for all output streams)
+SWR_SHADER_STATS stats; // OUT: shader statistics used for 
archrast.
 };
 
 struct PixelPositions
@@ -358,6 +371,8 @@ struct SWR_PS_CONTEXT
 uint32_t rasterizerSampleCount; // IN: sample count used by the 
rasterizer
 
 uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; // IN: Pointers to render 
target hottiles
+
+SWR_SHADER_STATS stats; // OUT: shader statistics used for 
archrast.
 };
 
 //
@@ -391,14 +406,13 @@ struct SWR_CS_CONTEXT
 // Dispatch dimensions used by shader to compute system values from the 
tile counter.
 uint32_t dispatchDims[3];
 
-uint8_t* pTGSM;  // Thread Group Shared Memory pointer.
-
-uint8_t* pSpillFillBuffer;  // Spill/fill buffer for barrier support
-
-uint8_t* pScratchSpace; // Pointer to scratch space buffer used by the 
shader, shader is responsible
-// for subdividing scratch space per 
instance/simd
-
+uint8_t* pTGSM;   // Thread Group Shared Memory pointer.
+uint8_t* pSpillFillBuffer;// Spill/fill buffer for barrier support
+uint8_t* pScratchSpace;   // Pointer to scratch space buffer used by 
the shader, shader is responsible
+  // for subdividing scratch space per 
instance/simd
 uint32_t scratchSpacePerSimd; // Scratch space per work item x SIMD_WIDTH
+
+SWR_SHADER_STATS stats;   // OUT: shader statistics used for archrast.
 };
 
 // enums
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 29/45] swr/rast: Add support for setting optimization level

2018-04-17 Thread George Kyriazis
for JIT compilation
---
 .../drivers/swr/rasterizer/codegen/knob_defs.py| 35 ++
 .../swr/rasterizer/common/simdlib_512_emu.inl  |  2 +-
 src/gallium/drivers/swr/rasterizer/core/state.h| 13 
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   | 10 +--
 .../drivers/swr/rasterizer/jitter/JitManager.h |  1 -
 .../swr/rasterizer/jitter/builder_gfx_mem.cpp  |  4 +--
 .../swr/rasterizer/jitter/builder_gfx_mem.h|  3 +-
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  |  1 -
 .../drivers/swr/rasterizer/jitter/builder_mem.h|  4 ---
 9 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py 
b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
index d4bf193..c9d1f5d 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
@@ -193,6 +193,41 @@ KNOBS = [
 'category'  : 'debug_adv',
 }],
 
+['JIT_OPTIMIZATION_LEVEL', {
+'type'  : 'int',
+'default'   : '-1',
+'desc'  : ['JIT compile optimization level:',],
+'category'  : 'debug',
+'control'   : 'dropdown',
+'choices' : [
+{
+'name'  : 'Automatic',
+'desc'  : 'Automatic based on other KNOB and build settings',
+'value' : -1,
+},
+{
+'name'  : 'Debug',
+'desc'  : 'No optimization: -O0',
+'value' : 0,
+},
+{
+'name'  : 'Less',
+'desc'  : 'Some optimization: -O1',
+'value' : 1,
+},
+{
+'name'  : 'Optimize',
+'desc'  : 'Default Clang / LLVM optimizations: -O2',
+'value' : 2,
+},
+{
+'name'  : 'Aggressive',
+'desc'  : 'Maximum optimization: -O3',
+'value' : 3,
+},
+],
+}],
+
 ['JIT_CACHE_DIR', {
 'type'  : 'std::string',
 'default'   : r'%TEMP%\SWR\JitCache' if sys.platform == 'win32' else 
'${HOME}/.swr/jitcache',
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
index 5d5120a..55981dc 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -426,7 +426,7 @@ static SIMDINLINE bool SIMDCALL testz_ps(Float const , 
Float const )  // ret
   SIMD256T::testz_ps(a.v8[1], b.v8[1]));
 }
 
-static SIMDINLINE int SIMDCALL testz_si(Integer const , Integer const )  
// return all_lanes_zero(a & b) ? 1 : 0 (int)
+static SIMDINLINE bool SIMDCALL testz_si(Integer const , Integer const )  
// return all_lanes_zero(a & b) ? 1 : 0 (int)
 {
 return  0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
   SIMD256T::testz_si(a.v8[1], b.v8[1]));
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index 084ca54..9233446 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -1,5 +1,5 @@
 /
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -526,6 +526,11 @@ enum SWR_AUX_MODE
 AUX_MODE_DEPTH,
 };
 
+struct SWR_LOD_OFFSETS
+{
+uint32_t offsets[2][15];
+};
+
 //
 /// SWR_SURFACE_STATE
 //
@@ -866,11 +871,9 @@ enum SWR_MULTISAMPLE_COUNT
 SWR_MULTISAMPLE_TYPE_COUNT
 };
 
-INLINE uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount) // 
@llvm_func_start
+static INLINE uint32_t GetNumSamples(/* SWR_SAMPLE_COUNT */ int 
sampleCountEnum) // @llvm_func_start
 {
-static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_COUNT] {1, 2, 4, 
8, 16};
-assert(sampleCount < SWR_MULTISAMPLE_TYPE_COUNT);
-return sampleCountLUT[sampleCount];
+return uint32_t(1) << sampleCountEnum;
 } // @llvm_func_end
 
 struct SWR_BLEND_STATE
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 9080964..7f9c9dd 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -66,6 +66,7 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, 
const char* core)
 

[Mesa-dev] [PATCH v2 02/45] swr/rast: Introduce JIT_MEM_CLIENT

2018-04-17 Thread George Kyriazis
Add assert for correct usage of memory accesses

v2: reworded commit message; renamed enum more appropriately
---
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  | 58 ++
 .../drivers/swr/rasterizer/jitter/builder_mem.h| 47 --
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  6 +--
 3 files changed, 71 insertions(+), 40 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index 6fa60a1..278113e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -36,6 +36,9 @@
 
 namespace SwrJit
 {
+void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
+{
+}
 
 Value *Builder::GEP(Value* ptr, const std::initializer_list 
)
 {
@@ -69,28 +72,33 @@ namespace SwrJit
 return IN_BOUNDS_GEP(ptr, indices);
 }
 
-LoadInst* Builder::LOAD(Value *Ptr, const char *Name)
+LoadInst* Builder::LOAD(Value *Ptr, const char *Name, JIT_MEM_CLIENT usage)
 {
+AssertMemoryUsageParams(Ptr, usage);
 return IRB()->CreateLoad(Ptr, Name);
 }
 
-LoadInst* Builder::LOAD(Value *Ptr, const Twine )
+LoadInst* Builder::LOAD(Value *Ptr, const Twine , JIT_MEM_CLIENT 
usage)
 {
+AssertMemoryUsageParams(Ptr, usage);
 return IRB()->CreateLoad(Ptr, Name);
 }
 
-LoadInst* Builder::LOAD(Type *Ty, Value *Ptr, const Twine )
+LoadInst* Builder::LOAD(Type *Ty, Value *Ptr, const Twine , 
JIT_MEM_CLIENT usage)
 {
+AssertMemoryUsageParams(Ptr, usage);
 return IRB()->CreateLoad(Ty, Ptr, Name);
 }
 
-LoadInst* Builder::LOAD(Value *Ptr, bool isVolatile, const Twine )
+LoadInst* Builder::LOAD(Value *Ptr, bool isVolatile, const Twine , 
JIT_MEM_CLIENT usage)
 {
+AssertMemoryUsageParams(Ptr, usage);
 return IRB()->CreateLoad(Ptr, isVolatile, Name);
 }
 
-LoadInst *Builder::LOAD(Value *basePtr, const 
std::initializer_list , const llvm::Twine& name)
+LoadInst *Builder::LOAD(Value *basePtr, const 
std::initializer_list , const llvm::Twine& name, 
JIT_MEM_CLIENT usage)
 {
+AssertMemoryUsageParams(basePtr, usage);
 std::vector valIndices;
 for (auto i : indices)
 valIndices.push_back(C(i));
@@ -158,8 +166,10 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value 
*vMask, uint8_t scale)
+Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value 
*vMask, uint8_t scale, JIT_MEM_CLIENT usage)
 {
+AssertMemoryUsageParams(pBase, usage);
+
 Value *vGather;
 Value *pBasePtr = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));
 
@@ -204,8 +214,10 @@ namespace SwrJit
 return vGather;
 }
 
-Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale)
+Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
 {
+AssertMemoryUsageParams(pBase, usage);
+
 Value *vGather = VUNDEF_F_16();
 
 // use AVX512F gather instruction if available
@@ -244,8 +256,10 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
+Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
 {
+AssertMemoryUsageParams(pBase, usage);
+
 Value* vGather;
 
 // use avx2 gather instruction if available
@@ -286,8 +300,10 @@ namespace SwrJit
 return vGather;
 }
 
-Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale)
+Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
 {
+AssertMemoryUsageParams(pBase, usage);
+
 Value *vGather = VUNDEF_I_16();
 
 // use AVX512F gather instruction if available
@@ -380,21 +396,21 @@ namespace SwrJit
 }
 
 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* 
byteOffsets,
-Value* mask, Value* vGatherComponents[], bool bPackedOutput)
+Value* mask, Value* vGatherComponents[], bool bPackedOutput, 
JIT_MEM_CLIENT usage)
 {
 const SWR_FORMAT_INFO  = GetFormatInfo(format);
 if (info.type[0] == 

[Mesa-dev] [PATCH v2 37/45] swr/rast: Add shader stats infrastructure (WIP)

2018-04-17 Thread George Kyriazis
---
 .../drivers/swr/rasterizer/archrast/archrast.cpp   | 64 +++--
 .../drivers/swr/rasterizer/archrast/events.proto   | 65 --
 .../swr/rasterizer/archrast/events_private.proto   | 30 ++
 .../drivers/swr/rasterizer/jitter/builder.h| 23 
 4 files changed, 148 insertions(+), 34 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp 
b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
index 2184673..871db79 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
+++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
@@ -61,7 +61,7 @@ namespace ArchRast
 //@todo:: Change this to numPatches. Assumed: 1 patch per prim. If 
holds, its fine.
 };
 
-struct GSInfo
+struct GSStateInfo
 {
 uint32_t inputPrimCount;
 uint32_t primGeneratedCount;
@@ -155,7 +155,7 @@ namespace ArchRast
 mDSSampleRate.earlyStencilTestFailCount += 
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
 
 //earlyZ test single and multi sample
-mDSCombined.earlyZTestPassCount  += 
_mm_popcnt_u32(event.data.depthPassMask);
+mDSCombined.earlyZTestPassCount += 
_mm_popcnt_u32(event.data.depthPassMask);
 mDSCombined.earlyZTestFailCount += 
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
 
 //earlyStencil test single and multi sample
@@ -257,11 +257,51 @@ namespace ArchRast
 mClipper.trivialAcceptCount += _mm_popcnt_u32(event.data.validMask 
& ~event.data.clipMask);
 }
 
+struct ShaderStats
+{
+uint32_t numInstExecuted;
+};
+
+virtual void Handle(const VSStats& event)
+{
+mShaderStats[SHADER_VERTEX].numInstExecuted += 
event.data.numInstExecuted;
+}
+
+virtual void Handle(const GSStats& event)
+{
+mShaderStats[SHADER_GEOMETRY].numInstExecuted += 
event.data.numInstExecuted;
+}
+
+virtual void Handle(const DSStats& event)
+{
+mShaderStats[SHADER_DOMAIN].numInstExecuted += 
event.data.numInstExecuted;
+}
+
+virtual void Handle(const HSStats& event)
+{
+mShaderStats[SHADER_HULL].numInstExecuted += 
event.data.numInstExecuted;
+}
+
+virtual void Handle(const PSStats& event)
+{
+mShaderStats[SHADER_PIXEL].numInstExecuted += 
event.data.numInstExecuted;
+mNeedFlush = true;
+}
+
+virtual void Handle(const CSStats& event)
+{
+mShaderStats[SHADER_COMPUTE].numInstExecuted += 
event.data.numInstExecuted;
+mNeedFlush = true;
+}
+
 // Flush cached events for this draw
 virtual void FlushDraw(uint32_t drawId)
 {
 if (mNeedFlush == false) return;
 
+EventHandlerFile::Handle(PSInfo(drawId, 
mShaderStats[SHADER_PIXEL].numInstExecuted));
+EventHandlerFile::Handle(CSInfo(drawId, 
mShaderStats[SHADER_COMPUTE].numInstExecuted));
+
 //singleSample
 EventHandlerFile::Handle(EarlyZSingleSample(drawId, 
mDSSingleSample.earlyZTestPassCount, mDSSingleSample.earlyZTestFailCount));
 EventHandlerFile::Handle(LateZSingleSample(drawId, 
mDSSingleSample.lateZTestPassCount, mDSSingleSample.lateZTestFailCount));
@@ -297,7 +337,7 @@ namespace ArchRast
 
 // Primitive Culling
 EventHandlerFile::Handle(CullEvent(drawId, 
mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount));
-
+
 mDSSingleSample = {};
 mDSSampleRate = {};
 mDSCombined = {};
@@ -307,6 +347,10 @@ namespace ArchRast
 rastStats = {};
 mCullStats = {};
 mAlphaStats = {};
+
+mShaderStats[SHADER_PIXEL] = {};
+mShaderStats[SHADER_COMPUTE] = {};
+
 mNeedFlush = false;
 }
 
@@ -323,6 +367,16 @@ namespace ArchRast
 EventHandlerFile::Handle(GSPrimsGen(event.data.drawId, 
mGS.primGeneratedCount));
 EventHandlerFile::Handle(GSVertsInput(event.data.drawId, 
mGS.vertsInput));
 
+EventHandlerFile::Handle(VSInfo(event.data.drawId, 
mShaderStats[SHADER_VERTEX].numInstExecuted));
+EventHandlerFile::Handle(HSInfo(event.data.drawId, 
mShaderStats[SHADER_HULL].numInstExecuted));
+EventHandlerFile::Handle(DSInfo(event.data.drawId, 
mShaderStats[SHADER_DOMAIN].numInstExecuted));
+EventHandlerFile::Handle(GSInfo(event.data.drawId, 
mShaderStats[SHADER_GEOMETRY].numInstExecuted));
+
+mShaderStats[SHADER_VERTEX] = {};
+mShaderStats[SHADER_HULL] = {};
+mShaderStats[SHADER_DOMAIN] = {};
+mShaderStats[SHADER_GEOMETRY] = {};
+
 //Reset Internal Counters
 mClipper = {};
 mTS = {};
@@ 

[Mesa-dev] [PATCH v2 18/45] swr/rast: Move CallPrint() to a separate file

2018-04-17 Thread George Kyriazis
Preparatory work needed for debugging JIT code.
---
 src/gallium/drivers/swr/Makefile.sources   |  3 +-
 src/gallium/drivers/swr/meson.build|  1 +
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 22 +-
 .../rasterizer/jitter/shader_lib/DebugOutput.cpp   | 51 ++
 4 files changed, 56 insertions(+), 21 deletions(-)
 create mode 100644 
src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp

diff --git a/src/gallium/drivers/swr/Makefile.sources 
b/src/gallium/drivers/swr/Makefile.sources
index cbf7395..4924da1 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -151,7 +151,8 @@ JITTER_CXX_SOURCES := \
rasterizer/jitter/JitManager.cpp \
rasterizer/jitter/JitManager.h \
rasterizer/jitter/streamout_jit.cpp \
-   rasterizer/jitter/streamout_jit.h
+   rasterizer/jitter/streamout_jit.h \
+   rasterizer/jitter/shader_lib/DebugOutput.cpp
 
 MEMORY_CXX_SOURCES := \
rasterizer/memory/ClearTile.cpp \
diff --git a/src/gallium/drivers/swr/meson.build 
b/src/gallium/drivers/swr/meson.build
index b28abd6..3848232 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -79,6 +79,7 @@ files_swr_mesa = files(
   'rasterizer/jitter/JitManager.h',
   'rasterizer/jitter/streamout_jit.cpp',
   'rasterizer/jitter/streamout_jit.h',
+  'rasterizer/jitter/shader_lib/DebugOutput.cpp',
 )
 
 files_swr_arch = files(
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index c266018..54987c7 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -33,10 +33,10 @@
 
 #include 
 
+extern "C" void CallPrint(const char* fmt, ...);
+
 namespace SwrJit
 {
-void __cdecl CallPrint(const char* fmt, ...);
-
 //
 /// @brief Convert an IEEE 754 32-bit single precision float to an
 ///16 bit float with 5 exponent bits and a variable
@@ -846,24 +846,6 @@ namespace SwrJit
 /// @brief C functions called by LLVM IR
 //
 
-//
-/// @brief called in JIT code, inserted by PRINT
-/// output to both stdout and visual studio debug console
-void __cdecl CallPrint(const char* fmt, ...)
-{
-va_list args;
-va_start(args, fmt);
-vprintf(fmt, args);
-
-#if defined( _WIN32 )
-char strBuf[1024];
-vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
-OutputDebugStringA(strBuf);
-#endif
-
-va_end(args);
-}
-
 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 {
 bool flag = !imm8->isZeroValue();
diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp
new file mode 100644
index 000..54d45e6
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp
@@ -0,0 +1,51 @@
+/
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file DebugOutput.cpp
+*
+* @brief Shader support library implementation for printed Debug output
+*
+* Notes:
+*
+**/
+#include 
+#include "common/os.h"
+
+
+//
+/// @brief called in JIT code, inserted by PRINT
+/// output to both stdout and visual studio debug console
+extern "C" void CallPrint(const char* 

[Mesa-dev] [PATCH v2 36/45] swr/rast: Type-check TemplateArgUnroller

2018-04-17 Thread George Kyriazis
Allows direct use of enum values in conversion to template args.
---
 src/gallium/drivers/swr/rasterizer/core/utils.h | 39 +
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h 
b/src/gallium/drivers/swr/rasterizer/core/utils.h
index c926f6a..d6cbf24 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -268,12 +268,15 @@ public:
 };
 
 // Ranged integer argument for TemplateArgUnroller
-template 
-struct IntArg
+template 
+struct RangedArg
 {
-uint32_t val;
+T val;
 };
 
+template 
+using IntArg = RangedArg;
+
 // Recursive template used to auto-nest conditionals.  Converts dynamic 
boolean function
 // arguments to static template arguments.
 template 
@@ -307,49 +310,49 @@ struct TemplateArgUnroller
 }
 
 //-
-// Integer value (within specified range)
+// Ranged value (within specified range)
 //-
 
 // Last Arg Terminator
-template 
-static typename TermT::FuncType GetFunc(IntArg iArg)
+template 
+static typename TermT::FuncType GetFunc(RangedArg iArg)
 {
 if (iArg.val == TMax)
 {
-return TermT::template GetFunc>();
+return TermT::template GetFunc>();
 }
 if (TMax > TMin)
 {
-return TemplateArgUnroller::GetFunc(IntArg{iArg.val});
+return TemplateArgUnroller::GetFunc(RangedArg{iArg.val});
 }
 SWR_ASSUME(false); return nullptr;
 }
-template 
-static typename TermT::FuncType GetFunc(IntArg iArg)
+template 
+static typename TermT::FuncType GetFunc(RangedArg iArg)
 {
 SWR_ASSERT(iArg.val == TVal);
-return TermT::template GetFunc>();
+return TermT::template GetFunc>();
 }
 
 // Recursively parse args
-template 
-static typename TermT::FuncType GetFunc(IntArg iArg, TArgsT... 
remainingArgs)
+template 
+static typename TermT::FuncType GetFunc(RangedArg iArg, 
TArgsT... remainingArgs)
 {
 if (iArg.val == TMax)
 {
-return TemplateArgUnroller>::GetFunc(remainingArgs...);
+return TemplateArgUnroller>::GetFunc(remainingArgs...);
 }
 if (TMax > TMin)
 {
-return TemplateArgUnroller::GetFunc(IntArg{iArg.val}, remainingArgs...);
+return TemplateArgUnroller::GetFunc(RangedArg{iArg.val}, remainingArgs...);
 }
 SWR_ASSUME(false); return nullptr;
 }
-template 
-static typename TermT::FuncType GetFunc(IntArg iArg, TArgsT... 
remainingArgs)
+template 
+static typename TermT::FuncType GetFunc(RangedArg iArg, 
TArgsT... remainingArgs)
 {
 SWR_ASSERT(iArg.val == TVal);
-return TemplateArgUnroller>::GetFunc(remainingArgs...);
+return TemplateArgUnroller>::GetFunc(remainingArgs...);
 }
 };
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 16/45] swr/rast: Add some archrast counters

2018-04-17 Thread George Kyriazis
Hook up archrast counters for shader stats: instructions executed.
---
 .../drivers/swr/rasterizer/archrast/archrast.cpp   |  4 +--
 .../drivers/swr/rasterizer/archrast/events.proto   | 30 ++
 .../drivers/swr/rasterizer/core/backend.cpp|  1 +
 .../drivers/swr/rasterizer/core/backend_impl.h |  4 +++
 .../drivers/swr/rasterizer/core/backend_sample.cpp |  5 +++-
 .../swr/rasterizer/core/backend_singlesample.cpp   |  5 +++-
 .../drivers/swr/rasterizer/core/frontend.cpp   |  8 ++
 7 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp 
b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
index 12dfc0e..2184673 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
+++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
@@ -61,7 +61,7 @@ namespace ArchRast
 //@todo:: Change this to numPatches. Assumed: 1 patch per prim. If 
holds, its fine.
 };
 
-struct GSStats
+struct GSInfo
 {
 uint32_t inputPrimCount;
 uint32_t primGeneratedCount;
@@ -369,7 +369,7 @@ namespace ArchRast
 DepthStencilStats mDSOmZ = {};
 CStats mClipper = {};
 TEStats mTS = {};
-GSStats mGS = {};
+GSInfo mGS = {};
 RastStats rastStats = {};
 CullStats mCullStats = {};
 AlphaStats mAlphaStats = {};
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/events.proto 
b/src/gallium/drivers/swr/rasterizer/archrast/events.proto
index deb0373..f924b57 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/events.proto
+++ b/src/gallium/drivers/swr/rasterizer/archrast/events.proto
@@ -115,6 +115,36 @@ event FrontendStatsEvent
 uint64_t SoNumPrimsWritten3;
 };
 
+event VSStats
+{
+uint32_t numInstExecuted;
+};
+
+event HSStats
+{
+uint32_t numInstExecuted;
+};
+
+event DSStats
+{
+uint32_t numInstExecuted;
+};
+
+event GSStats
+{
+uint32_t numInstExecuted;
+};
+
+event PSStats
+{
+uint32_t numInstExecuted;
+};
+
+event CSStats
+{
+uint32_t numInstExecuted;
+};
+
 event BackendStatsEvent
 {
 uint32_t drawId;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index ccc7150..1e0769a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -81,6 +81,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, 
uint32_t threadGroup
 state.pfnCsFunc(GetPrivateState(pDC), );
 
 UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
+AR_EVENT(CSStats(csContext.stats.numInstExecuted));
 
 RDTSC_END(BEDispatch, 1);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h 
b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
index dd349a1..20b2ec5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -968,6 +968,10 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t 
workerId, uint32_t x, uint32_t
 UPDATE_STAT_BE(PsInvocations, 
_mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
 RDTSC_END(BEPixelShader, 0);
 
+// update stats
+UPDATE_STAT_BE(PsInvocations, 
_mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+AR_EVENT(PSStats(psContext.stats.numInstExecuted));
+
 // update active lanes to remove any discarded or oMask'd pixels
 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, 
_simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si(;
 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
index 4982025..c7c6c533 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
@@ -163,10 +163,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t 
workerId, uint32_t x, uint32_
 
 // execute pixel shader
 RDTSC_BEGIN(BEPixelShader, pDC->drawId);
-UPDATE_STAT_BE(PsInvocations, 
_mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
 state.psState.pfnPixelShader(GetPrivateState(pDC), 
);
 RDTSC_END(BEPixelShader, 0);
 
+// update stats
+UPDATE_STAT_BE(PsInvocations, 
_mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
+AR_EVENT(PSStats(psContext.stats.numInstExecuted));
+
 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
 // late-Z
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
index 452fba1..26d5a75 100644
--- 

[Mesa-dev] [PATCH v2 35/45] swr/rast: Add vgather to x86 lowering pass.

2018-04-17 Thread George Kyriazis
Add support for generic VGATHERPD intrinsic in x86 lowering pass.
---
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 85 +++---
 1 file changed, 76 insertions(+), 9 deletions(-)

diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index b27335f..9423b28 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -72,7 +72,6 @@ namespace SwrJit
 // Map of intrinsics that haven't been moved to the new mechanism yet. If 
used, these get the previous behavior of
 // mapping directly to avx/avx2 intrinsics.
 static std::map intrinsicMap = {
-{"meta.intrinsic.VGATHERPD",   
Intrinsic::x86_avx2_gather_d_pd_256},
 {"meta.intrinsic.VROUND",  Intrinsic::x86_avx_round_ps_256},
 {"meta.intrinsic.BEXTR_32",Intrinsic::x86_bmi_bextr_32},
 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
@@ -98,6 +97,7 @@ namespace SwrJit
 {"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VPERM_EMU}},
 {"meta.intrinsic.VPERMD",  {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VPERM_EMU}},
+{"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
@@ -107,6 +107,7 @@ namespace SwrJit
 {"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps,   
  Intrinsic::not_intrinsic},  VPERM_EMU}},
 {"meta.intrinsic.VPERMD",  {{Intrinsic::x86_avx2_permd,
  Intrinsic::not_intrinsic},  VPERM_EMU}},
+{"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
@@ -116,6 +117,7 @@ namespace SwrJit
 {"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx512_rcp14_ps_256,   
  Intrinsic::x86_avx512_rcp14_ps_512},NO_EMU}},
 {"meta.intrinsic.VPERMPS", 
{{Intrinsic::x86_avx512_mask_permvar_sf_256,  
Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
 {"meta.intrinsic.VPERMD",  
{{Intrinsic::x86_avx512_mask_permvar_si_256,  
Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
+{"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VCVTPD2PS",   
{{Intrinsic::x86_avx512_mask_cvtpd2ps_256,
Intrinsic::x86_avx512_mask_cvtpd2ps_512 },  NO_EMU}},
@@ -207,6 +209,13 @@ namespace SwrJit
 return mask;
 }
 
+// Convert <N x i1> mask to <N x i32> x86 mask
+Value* VectorMask(Value* vi1Mask)
+{
+uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
+return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
+}
+
 Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
 {
 Function* pFunc = pCallInst->getCalledFunction();
@@ -406,17 +415,33 @@ namespace SwrJit
 }
 else if (arch == AVX2 || (arch == AVX512 && width == W256))
 {
-Function* pX86IntrinFunc = srcTy == 

[Mesa-dev] [PATCH v2 31/45] swr/rast: Change gfx pointers to gfxptr_t

2018-04-17 Thread George Kyriazis
Change the type of the index pointers to gfxptr_t, with related changes to
the fetch and memory builder code.
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  4 +-
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  8 +-
 src/gallium/drivers/swr/rasterizer/core/context.h  |  2 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   | 40 -
 src/gallium/drivers/swr/rasterizer/core/state.h|  6 +-
 .../swr/rasterizer/jitter/builder_gfx_mem.cpp  | 80 +++--
 .../swr/rasterizer/jitter/builder_gfx_mem.h| 24 --
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  | 35 ++--
 .../drivers/swr/rasterizer/jitter/builder_mem.h| 23 +++--
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 99 --
 src/gallium/drivers/swr/swr_state.cpp  |  2 +-
 11 files changed, 220 insertions(+), 103 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index bdd785a..2636e60 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -162,7 +162,9 @@ def parse_ir_builder(input_file):
 if (func_name == 'CreateInsertNUWNSWBinOp' or
 func_name == 'CreateMaskedIntrinsic' or
 func_name == 'CreateAlignmentAssumptionHelper' or
-func_name == 'CreateLoad'):
+func_name == 'CreateGEP' or
+func_name == 'CreateLoad' or
+func_name == 'CreateMaskedLoad'):
 ignore = True
 
 # Convert CamelCase to CAMEL_CASE
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 53bd2d2..3141db6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1321,8 +1321,8 @@ void DrawIndexedInstance(
 }
 
 int draw = 0;
-uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
-pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
+gfxptr_t xpIB = pState->indexBuffer.xpIndices;
+xpIB += (uint64_t)indexOffset * (uint64_t)indexSize;
 
 pState->topology = topology;
 pState->forceFront = false;
@@ -1360,7 +1360,7 @@ void DrawIndexedInstance(
 pDC->pState->pfnProcessPrims != nullptr);
 pDC->FeWork.desc.draw.pDC = pDC;
 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
-pDC->FeWork.desc.draw.pIB = (int*)pIB;
+pDC->FeWork.desc.draw.xpIB = xpIB;
 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
 
 pDC->FeWork.desc.draw.numInstances = numInstances;
@@ -1376,7 +1376,7 @@ void DrawIndexedInstance(
 AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, 
numIndicesForDraw, indexOffset, baseVertex,
 numInstances, startInstance, pState->tsState.tsEnable, 
pState->gsState.gsEnable, pState->soState.soEnable, 
pState->gsState.outputTopology, draw));
 
-pIB += maxIndicesPerDraw * indexSize;
+xpIB += maxIndicesPerDraw * indexSize;
 remainingIndices -= numIndicesForDraw;
 draw++;
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h 
b/src/gallium/drivers/swr/rasterizer/core/context.h
index 489aa78..7bc69f5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -176,7 +176,7 @@ struct DRAW_WORK
 };
 union
 {
-const int32_t* pIB;// DrawIndexed: App supplied indices
+gfxptr_t   xpIB;  // DrawIndexed: App supplied int32 
indices 
 uint32_t   startVertex;// Draw: Starting vertex in VB to render 
from.
 };
 int32_tbaseVertex;
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 2076859..30c2e7b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1527,28 +1527,24 @@ void ProcessDraw(
 uint32_t indexSize = 0;
 uint32_t endVertex = work.numVerts;
 
-const int32_t* pLastRequestedIndex = nullptr;
+gfxptr_t xpLastRequestedIndex = 0;
 if (IsIndexedT::value)
 {
 switch (work.type)
 {
 case R32_UINT:
 indexSize = sizeof(uint32_t);
-pLastRequestedIndex = &(work.pIB[endVertex]);
 break;
 case R16_UINT:
 indexSize = sizeof(uint16_t);
-// nasty address offset to last index
-pLastRequestedIndex = 
(int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
 break;
 case R8_UINT:
 indexSize = sizeof(uint8_t);
-// nasty address offset to last index
-pLastRequestedIndex = 

[Mesa-dev] [PATCH v2 24/45] swr/rast: Add builder_gfx_mem.{h|cpp}

2018-04-17 Thread George Kyriazis
Abstract usage scenarios for memory accesses into builder_gfx_mem.
Builder_gfx_mem will convert gfxptr_t from 64-bit int to regular pointer
types for use by builder_mem.

v2: reworded commit message; renamed enum more appropriately
---
 src/gallium/drivers/swr/Makefile.sources   |   2 +
 src/gallium/drivers/swr/meson.build|   2 +
 .../swr/rasterizer/jitter/builder_gfx_mem.cpp  | 136 +
 .../swr/rasterizer/jitter/builder_gfx_mem.h|  67 ++
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|   7 +-
 5 files changed, 210 insertions(+), 4 deletions(-)
 create mode 100644 
src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
 create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h

diff --git a/src/gallium/drivers/swr/Makefile.sources 
b/src/gallium/drivers/swr/Makefile.sources
index a7fcba8..dd815dc 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -142,6 +142,8 @@ JITTER_CXX_SOURCES := \
rasterizer/jitter/builder_math.h \
rasterizer/jitter/builder_mem.cpp \
rasterizer/jitter/builder_mem.h \
+   rasterizer/jitter/builder_gfx_mem.cpp \
+   rasterizer/jitter/builder_gfx_mem.h \
rasterizer/jitter/builder_misc.cpp \
rasterizer/jitter/builder_misc.h \
rasterizer/jitter/fetch_jit.cpp \
diff --git a/src/gallium/drivers/swr/meson.build 
b/src/gallium/drivers/swr/meson.build
index 949f582..1cb40f8 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -70,6 +70,8 @@ files_swr_mesa = files(
   'rasterizer/jitter/builder_math.h',
   'rasterizer/jitter/builder_mem.cpp',
   'rasterizer/jitter/builder_mem.h',
+  'rasterizer/jitter/builder_gfx_mem.cpp',
+  'rasterizer/jitter/builder_gfx_mem.h',
   'rasterizer/jitter/builder_misc.cpp',
   'rasterizer/jitter/builder_misc.h',
   'rasterizer/jitter/fetch_jit.cpp',
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
new file mode 100644
index 000..e097bd1
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
@@ -0,0 +1,136 @@
+/
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder_gfx_mem.cpp
+*
+* @brief Definition of the gfx mem builder
+*
+* Notes:
+*
+**/
+#include "jit_pch.hpp"
+#include "builder.h"
+#include "common/rdtsc_buckets.h"
+#include "builder_gfx_mem.h"
+
+
+namespace SwrJit
+{
+using namespace llvm;
+
+BuilderGfxMem::BuilderGfxMem(JitManager* pJitMgr) :
+Builder(pJitMgr)
+{
+mpfnTranslateGfxAddress = nullptr;
+mpParamSimDC = nullptr;
+}
+
+void BuilderGfxMem::NotifyPrivateContextSet()
+{
+}
+
+void BuilderGfxMem::AssertGFXMemoryParams(Value* ptr, 
Builder::JIT_MEM_CLIENT usage)
+{
+SWR_ASSERT(ptr->getType() == mInt64Ty, "GFX addresses must be gfxptr_t 
and not converted to system pointers.");
+SWR_ASSERT(usage != MEM_CLIENT_INTERNAL, "Internal memory should not 
go through the translation path and should not be gfxptr_t.");
+}
+
+//
+/// @brief Generate a masked gather operation in LLVM IR.  If not  
+/// supported on the underlying platform, emulate it with loads
+/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
+/// @param pBase - Int8* base VB address pointer value
+/// @param vIndices - SIMD wide value of VB byte offsets
+/// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
+/// @param 

[Mesa-dev] [PATCH v2 15/45] swr/rast: Code cleanup

2018-04-17 Thread George Kyriazis
Removing some code that doesn't seem to do anything meaningful.
---
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 5c8d813..5971a52 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -156,14 +156,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
 mpFetchInfo->setName("fetchInfo");
 Value*pVtxOut = &*argitr;
 pVtxOut->setName("vtxOutput");
-// this is just shorthand to tell LLVM to get a pointer to the base 
address of simdvertex
-// index 0(just the pointer to the simdvertex structure
-// index 1(which element of the simdvertex structure to offset to(in this 
case 0)
-// so the indices being i32's doesn't matter
-// TODO: generated this GEP with a VECTOR structure type so this makes 
sense
-std::vectorvtxInputIndices(2, C(0));
-// GEP
-pVtxOut = GEP(pVtxOut, C(0));
+
 #if USE_SIMD16_SHADERS
 #if 0// USE_SIMD16_BUILDER
 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, 
mVWidth16), 0));
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 11/45] swr/rast: WIP builder rewrite (2)

2018-04-17 Thread George Kyriazis
Finish up the remaining explicit intrinsic uses. At this point all
explicit Intrinsic::getDeclaration() usage has been replaced with auto
generated macros generated with gen_llvm_ir_macros.py. Going forward,
make sure to only use the intrinsics here, adding new ones as needed.

Next step is to remove all references to x86 intrinsics to keep the
builder target-independent. Any x86 lowering will be handled by a
separate pass.
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py| 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 9dfc1e7..0245584 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -61,8 +61,9 @@ intrinsics = [
 ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b']],
 ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c']],
 ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a']],
-['INTERRUPT', 'x86_int', ['a']],
 ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b']],
+['PDEP32', 'x86_bmi_pdep_32', ['a', 'b']],
+['RDTSC', 'x86_rdtsc', []],
 ]
 
 llvm_intrinsics = [
@@ -74,7 +75,11 @@ llvm_intrinsics = [
 ['VMINPS', 'minnum', ['a', 'b'], ['a']],
 ['VMAXPS', 'maxnum', ['a', 'b'], ['a']],
 ['DEBUGTRAP', 'debugtrap', [], []],
-['POPCNT', 'ctpop', ['a'], ['a']]
+['POPCNT', 'ctpop', ['a'], ['a']],
+['LOG2', 'log2', ['a'], ['a']],
+['FABS', 'fabs', ['a'], ['a']],
+['EXP2', 'exp2', ['a'], ['a']],
+['POW', 'pow', ['a', 'b'], ['a', 'b']]
 ]
 
 this_dir = os.path.dirname(os.path.abspath(__file__))
@@ -225,10 +230,14 @@ def generate_x86_h(output_dir):
 functions = []
 for inst in intrinsics:
 #print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], 
len(inst[2])))
-declargs = 'Value* ' + ', Value* '.join(inst[2])
+if len(inst[2]) != 0:
+declargs = 'Value* ' + ', Value* '.join(inst[2])
+decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], 
declargs)
+else:
+decl = 'Value* %s(const llvm::Twine& name = "")' % (inst[0])
 
 functions.append({
-'decl'  : 'Value* %s(%s, const llvm::Twine& name = "")' % 
(inst[0], declargs),
+'decl'  : decl,
 'intrin': inst[1],
 'args'  : inst[2],
 })
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 43/45] swr/rast: Optimize late/bindless JIT of samplers

2018-04-17 Thread George Kyriazis
Add per-worker thread private data to all shader calls
Add per-worker sampler cache and jit context
Add late LoadTexel JIT support
Add per-worker-thread Sampler / LoadTexel JIT
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  7 ++-
 src/gallium/drivers/swr/rasterizer/core/api.h  | 47 +++
 .../drivers/swr/rasterizer/core/backend.cpp|  9 +--
 src/gallium/drivers/swr/rasterizer/core/backend.h  |  4 +-
 .../drivers/swr/rasterizer/core/backend_clear.cpp  | 19 +++---
 .../drivers/swr/rasterizer/core/backend_impl.h |  7 ++-
 .../drivers/swr/rasterizer/core/backend_sample.cpp |  5 +-
 .../swr/rasterizer/core/backend_singlesample.cpp   |  6 +-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  2 +-
 src/gallium/drivers/swr/rasterizer/core/context.h  |  3 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   | 29 +
 .../drivers/swr/rasterizer/core/rasterizer.cpp |  4 +-
 .../drivers/swr/rasterizer/core/rasterizer_impl.h  | 15 ++---
 src/gallium/drivers/swr/rasterizer/core/state.h| 18 +++---
 .../drivers/swr/rasterizer/core/threads.cpp| 68 +-
 src/gallium/drivers/swr/rasterizer/core/threads.h  |  5 +-
 .../drivers/swr/rasterizer/core/tilemgr.cpp| 21 +++
 src/gallium/drivers/swr/rasterizer/core/tilemgr.h  |  4 +-
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   | 16 ++---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  8 ++-
 .../drivers/swr/rasterizer/memory/ClearTile.cpp|  1 +
 .../drivers/swr/rasterizer/memory/LoadTile.cpp |  1 +
 .../drivers/swr/rasterizer/memory/StoreTile.cpp|  1 +
 src/gallium/drivers/swr/swr_memory.h   |  9 ++-
 src/gallium/drivers/swr/swr_shader.cpp |  9 +++
 25 files changed, 213 insertions(+), 105 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 3141db6..e37e2e4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1,5 +1,5 @@
 /
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -122,6 +122,11 @@ HANDLE SwrCreateContext(
 pContext->apiThreadInfo.numAPIThreadsPerCore= 1;
 }
 
+if (pCreateInfo->pWorkerPrivateState)
+{
+pContext->workerPrivateState = *pCreateInfo->pWorkerPrivateState;
+}
+
 memset(>WaitLock, 0, sizeof(pContext->WaitLock));
 memset(>FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
 new (>WaitLock) std::mutex();
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h
index 7247fa4..b171188 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -1,5 +1,5 @@
 /
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -115,7 +115,8 @@ struct SWR_RECT
 /// @param x - destination x coordinate
 /// @param y - destination y coordinate
 /// @param pDstHotTile - pointer to the hot tile surface
-typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT 
dstFormat,
+typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, HANDLE 
hWorkerPrivateData,
+SWR_FORMAT dstFormat,
 SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t 
*pDstHotTile);
 
@@ -127,7 +128,8 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE 
hPrivateContext, SWR_FORMAT dstForma
 /// @param x - destination x coordinate
 /// @param y - destination y coordinate
 /// @param pSrcHotTile - pointer to the hot tile surface
-typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT 
srcFormat,
+typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, HANDLE 
hWorkerPrivateData,
+SWR_FORMAT srcFormat,
 SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t 
*pSrcHotTile);
 
@@ -139,7 +141,7 @@ typedef void(SWR_API *PFN_STORE_TILE)(HANDLE 
hPrivateContext, SWR_FORMAT srcForm
 /// @param y - destination y coordinate
 /// @param renderTargetArrayIndex - render target array offset from arrayIndex
 /// @param pClearColor - pointer to the hot tile's clear value
-typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext,
+typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, HANDLE 

[Mesa-dev] [PATCH v2 26/45] swr: add x86 lowering pass to fragment shader

2018-04-17 Thread George Kyriazis
Needed because some FP paths (namely stipple) use gather intrinsics
that now need to be lowered to x86.

v2: fix typo in commit message
---
 src/gallium/drivers/swr/swr_shader.cpp | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/src/gallium/drivers/swr/swr_shader.cpp 
b/src/gallium/drivers/swr/swr_shader.cpp
index 477fa7f..6ea021a 100644
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -27,11 +27,13 @@
 #include "JitManager.h"
 #include "llvm-c/Core.h"
 #include "llvm/Support/CBindingWrapping.h"
+#include "llvm/IR/LegacyPassManager.h"
 #pragma pop_macro("DEBUG")
 
 #include "state.h"
 #include "gen_state_llvm.h"
 #include "builder.h"
+#include "functionpasses/passes.h"
 
 #include "tgsi/tgsi_strings.h"
 #include "util/u_format.h"
@@ -1389,6 +1391,11 @@ BuilderSWR::CompileFS(struct swr_context *ctx, 
swr_jit_fs_key )
 
gallivm_compile_module(gallivm);
 
+   // after the gallivm passes, we have to lower the core's intrinsics
+   llvm::legacy::FunctionPassManager lowerPass(JM()->mpCurrentModule);
+   lowerPass.add(createLowerX86Pass(mpJitMgr, this));
+   lowerPass.run(*pFunction);
+
PFN_PIXEL_KERNEL kernel =
   (PFN_PIXEL_KERNEL)gallivm_jit_function(gallivm, wrap(pFunction));
debug_printf("frag shader  %p\n", kernel);
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 45/45] swr/rast: Fix VGATHERPD lowering

2018-04-17 Thread George Kyriazis
Also implement VHSUBPS in the x86 lowering pass.
---
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 82 ++
 1 file changed, 69 insertions(+), 13 deletions(-)

diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index 856d67d..baf3ab5 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -75,7 +75,6 @@ namespace SwrJit
 {"meta.intrinsic.BEXTR_32",Intrinsic::x86_bmi_bextr_32},
 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
 {"meta.intrinsic.VCVTPS2PH",   Intrinsic::x86_vcvtps2ph_256},
-{"meta.intrinsic.VHSUBPS", Intrinsic::x86_avx_hsub_ps_256},
 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
 {"meta.intrinsic.VFMADDPS",Intrinsic::x86_fma_vfmadd_ps_256},
@@ -89,6 +88,7 @@ namespace SwrJit
 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
 Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
 Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
+Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
 
 Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst, Intrinsic::ID intrin);
 
@@ -106,6 +106,7 @@ namespace SwrJit
 {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256, 
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VROUND",  {{Intrinsic::x86_avx_round_ps_256,  
  DOUBLE},NO_EMU}},
+{"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256,   
  DOUBLE},NO_EMU}},
 },
 {   // AVX2
 {"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  DOUBLE},NO_EMU}},
@@ -117,6 +118,7 @@ namespace SwrJit
 {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256, 
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VROUND",  {{Intrinsic::x86_avx_round_ps_256,  
  DOUBLE},NO_EMU}},
+{"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256,   
  DOUBLE},NO_EMU}},
 },
 {   // AVX512
 {"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx512_rcp14_ps_256,   
  Intrinsic::x86_avx512_rcp14_ps_512},NO_EMU}},
@@ -127,7 +129,8 @@ namespace SwrJit
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VCVTPD2PS",   
{{Intrinsic::x86_avx512_mask_cvtpd2ps_256,
Intrinsic::x86_avx512_mask_cvtpd2ps_512 },  NO_EMU}},
 {"meta.intrinsic.VCVTPH2PS",   
{{Intrinsic::x86_avx512_mask_vcvtph2ps_256,   
Intrinsic::x86_avx512_mask_vcvtph2ps_512 }, NO_EMU}},
-{"meta.intrinsic.VROUND",  {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic }, VROUND_EMU}},
+{"meta.intrinsic.VROUND",  {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VROUND_EMU}},
+{"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VHSUB_EMU}},
 }
 };
 
@@ -454,21 +457,45 @@ namespace SwrJit
 }
 else if (width == W512)
 {
-// Double pump 8-wide
-auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), 
vSrc->getType());
-Value *src0 = B->EXTRACT_16(vSrc, 0);
-Value *src1 = B->EXTRACT_16(vSrc, 1);
+// Double pump 4-wide for 64bit elements
+if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
+{
+auto v64Mask = B->S_EXT(pThis->VectorMask(vi1Mask), 
B->mInt64Ty);
+v64Mask = B->BITCAST(v64Mask, vSrc->getType());
+
+Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({ 0, 1, 2, 3 
}));
+Value* src1 = B->VSHUFFLE(vSrc, vSrc, 

[Mesa-dev] [PATCH v2 44/45] swr/rast: Replace x86 VMOVMSK with llvm-only implementation

2018-04-17 Thread George Kyriazis
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  1 -
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  |  2 +-
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 25 --
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  2 ++
 .../rasterizer/jitter/functionpasses/lower_x86.cpp |  1 -
 5 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 9c1e9e0..bced657 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -58,7 +58,6 @@ intrinsics = [
 ['VPTESTC', ['a', 'b'], 'mInt32Ty'],
 ['VPTESTZ', ['a', 'b'], 'mInt32Ty'],
 ['VFMADDPS',['a', 'b', 'c'], 'a'],
-['VMOVMSKPS',   ['a'], 'mInt32Ty'],
 ['VPHADDD', ['a', 'b'], 'a'],
 ['PDEP32',  ['a', 'b'], 'a'],
 ['RDTSC',   [], 'mInt64Ty'],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index f0cd441..5b70b29 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -608,7 +608,7 @@ namespace SwrJit
 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, 
PointerType::get(mInt32Ty, 0));
 
-Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
+Value* pMask = VMOVMSK(vMask);
 
 // Setup loop basic block
 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, 
"Scatter_Loop", pFunc);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index aa9e2dd..f893693 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -525,6 +525,28 @@ namespace SwrJit
 return S_EXT(mask, mSimd16Int32Ty);
 }
 
+/// @brief Convert <N x i1> llvm mask to integer
+Value *Builder::VMOVMSK(Value* mask)
+{
+SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty);
+uint32_t numLanes = mask->getType()->getVectorNumElements();
+Value* i32Result;
+if (numLanes == 8)
+{
+i32Result = BITCAST(mask, mInt8Ty);
+}
+else if (numLanes == 16)
+{
+i32Result = BITCAST(mask, mInt16Ty);
+}
+else
+{
+SWR_ASSERT("Unsupported vector width");
+i32Result = BITCAST(mask, mInt8Ty);
+}
+return Z_EXT(i32Result, mInt32Ty);
+}
+
 //
 /// @brief Generate a VPSHUFB operation in LLVM IR.  If not  
 /// supported on the underlying platform, emulate it
@@ -768,8 +790,7 @@ namespace SwrJit
 /// @brief pop count on vector mask (e.g. <8 x i1>)
 Value* Builder::VPOPCNT(Value* a)
 {
-Value* b = BITCAST(VMASK(a), mSimdFP32Ty);
-return POPCNT(VMOVMSKPS(b));
+return POPCNT(VMOVMSK(a));
 }
 
 //
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 7308821..bd4be9f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -102,6 +102,8 @@ Value *MASK_16(Value *vmask);
 Value *VMASK(Value *mask);
 Value *VMASK_16(Value *mask);
 
+Value *VMOVMSK(Value *mask);
+
 //
 /// @brief functions that build IR to call x86 intrinsics directly, or
 /// emulate them with other instructions if not available on the host
diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index 7cfa772..856d67d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -79,7 +79,6 @@ namespace SwrJit
 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
 {"meta.intrinsic.VFMADDPS",Intrinsic::x86_fma_vfmadd_ps_256},
-{"meta.intrinsic.VMOVMSKPS",   Intrinsic::x86_avx_movmsk_ps_256},
 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
 {"meta.intrinsic.PDEP32",  Intrinsic::x86_bmi_pdep_32},
 {"meta.intrinsic.RDTSC",   Intrinsic::x86_rdtsc},
-- 
2.7.4

___
mesa-dev mailing list

[Mesa-dev] [PATCH v2 38/45] swr/rast: Fix 64bit float loads in x86 lowering pass

2018-04-17 Thread George Kyriazis
---
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  | 39 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 31 +
 2 files changed, 25 insertions(+), 45 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index c791278..f0cd441 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -201,44 +201,7 @@ namespace SwrJit
 /// @param scale - value to scale indices by
 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
 {
-Value* vGather;
-
-// use avx2 gather instruction if available
-if (JM()->mArch.AVX2())
-{
-vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 
2)), VectorType::get(mDoubleTy, mVWidth / 2));
-vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
-}
-else
-{
-Value* pStack = STACKSAVE();
-
-// store vSrc on the stack.  this way we can select between a 
valid load address and the vSrc address
-Value* vSrcPtr = ALLOCA(vSrc->getType());
-SetTempAlloca(vSrcPtr);
-STORE(vSrc, vSrcPtr);
-
-vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
-Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
-Value *vOffsets = MUL(vIndices, vScaleVec);
-for (uint32_t i = 0; i < mVWidth / 2; ++i)
-{
-// single component byte index
-Value *offset = VEXTRACT(vOffsets, C(i));
-// byte pointer to component
-Value *loadAddress = GEP(pBase, offset);
-loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 
0));
-// pointer to the value to load if we're masking off a 
component
-Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
-Value *selMask = VEXTRACT(vMask, C(i));
-// switch in a safe address to load if we're trying to access 
a vertex
-Value *validAddress = SELECT(selMask, loadAddress, 
maskLoadAddress);
-Value *val = LOAD(validAddress);
-vGather = VINSERT(vGather, val, C(i));
-}
-STACKRESTORE(pStack);
-}
-return vGather;
+return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
 }
 
 //
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index cdfddf3..767866f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -230,7 +230,6 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
 }
 
 // Fetch attributes from memory and output to a simdvertex struct
-// since VGATHER has a perf penalty on HSW vs BDW, allow client to choose 
which fetch method to use
 JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
 
 RET_VOID();
@@ -763,13 +762,31 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 // if we need to gather the component
 if (compCtrl[i] == StoreSrc)
 {
-Value *vMaskLo = VSHUFFLE(vGatherMask, 
VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
-Value *vMaskHi = VSHUFFLE(vGatherMask, 
VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
+Value* vShufLo;
+Value* vShufHi;
+Value* vShufAll;
 
-Value *vOffsetsLo = VEXTRACTI128(vOffsets, 
C(0));
-Value *vOffsetsHi = VEXTRACTI128(vOffsets, 
C(1));
+if (mVWidth == 8)
+{
+vShufLo = C({ 0, 1, 2, 3 });
+vShufHi = C({ 4, 5, 6, 7 });
+vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+}
+else
+{
+SWR_ASSERT(mVWidth == 16);
+vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 
});
+vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15 });
+}
+
+Value *vMaskLo = VSHUFFLE(vGatherMask, 
vGatherMask, vShufLo);
+Value *vMaskHi = 

[Mesa-dev] [PATCH v2 13/45] swr/rast: Add MEM_ADD helper function to Builder.

2018-04-17 Thread George Kyriazis
mem[offset] += value

This function will be heavily used by all stats intrinsics.
---
 src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp | 7 +++
 src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h   | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index a825434..dee08b8 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -134,6 +134,13 @@ namespace SwrJit
 return GEP(base, offset);
 }
 
+Value* Builder::MEM_ADD(Value* i32Incr, Value* basePtr, const 
std::initializer_list , const llvm::Twine& name)
+{
+Value* i32Value = LOAD(GEP(basePtr, indices), name);
+Value* i32Result = ADD(i32Value, i32Incr);
+return STORE(i32Result, GEP(basePtr, indices));
+}
+
 //
 /// @brief Generate a masked gather operation in LLVM IR.  If not  
 /// supported on the underlying platform, emulate it with loads
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
index b538342..59b45c1 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
@@ -60,6 +60,8 @@ LoadInst *LOADV(Value *BasePtr, const 
std::initializer_list , con
 StoreInst *STORE(Value *Val, Value *BasePtr, const 
std::initializer_list );
 StoreInst *STOREV(Value *Val, Value *BasePtr, const 
std::initializer_list );
 
+Value* MEM_ADD(Value* i32Incr, Value* basePtr, const 
std::initializer_list<uint32_t> &indices, const llvm::Twine& name = "");
+
 void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
 Value* mask, Value* vGatherComponents[], bool bPackedOutput, 
JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 04/45] swr/rast: Add debug type info for i128

2018-04-17 Thread George Kyriazis
Help support debug info in 16 wide shaders.
---
 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 0cefa43..bfb1d2e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -288,6 +288,7 @@ DIType* JitManager::GetDebugIntegerType(Type* pTy)
 case 16: return builder.createBasicType("int16", 16, 
dwarf::DW_ATE_signed); break;
 case 32: return builder.createBasicType("int", 32, dwarf::DW_ATE_signed); 
break;
 case 64: return builder.createBasicType("int64", 64, 
dwarf::DW_ATE_signed); break;
+case 128: return builder.createBasicType("int128", 128, 
dwarf::DW_ATE_signed); break;
 default: SWR_ASSERT(false, "Unimplemented integer bit width");
 }
 return nullptr;
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 08/45] swr/rast: LLVM 6 fix

2018-04-17 Thread George Kyriazis
for getting masked gather intrinsic (also compatible with LLVM 4)
---
 src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index 278113e..cc0f897 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -390,7 +390,7 @@ namespace SwrJit
 /// @param pVecPassthru - SIMD wide vector of values to load when lane is 
inactive
 Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* 
pVecPassthru)
 {
-Function* pMaskedGather = 
llvm::Intrinsic::getDeclaration(JM()->mpCurrentModule, 
Intrinsic::masked_gather, { pVecPassthru->getType() });
+Function* pMaskedGather = 
llvm::Intrinsic::getDeclaration(JM()->mpCurrentModule, 
Intrinsic::masked_gather, { pVecPassthru->getType(), pVecSrcPtr->getType() });
 
 return CALL(pMaskedGather, { pVecSrcPtr, C(0), pVecMask, pVecPassthru 
});
 }
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 33/45] swr/rast: add cvt instructions in x86 lowering pass

2018-04-17 Thread George Kyriazis
Support generic VCVTPD2PS and VCVTPH2PS in x86 lowering pass.
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   | 70 --
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  | 14 -
 .../drivers/swr/rasterizer/jitter/builder_mem.h|  3 -
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  6 +-
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 14 ++---
 5 files changed, 48 insertions(+), 59 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 2636e60..4a7d2e9 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -42,28 +42,26 @@ inst_aliases = {
 }
 
 intrinsics = [
-['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 
'mask', 'scale'], 'mSimd4FP64Ty'],
-['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale'], 'mSimdFP32Ty'],
-['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 
'mask', 'scale'], 'mSimd16FP32Ty'],
-['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale'], 'mSimdInt32Ty'],
-['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 
'mask', 'scale'], 'mSimd16Int32Ty'],
-['VRCPPS', 'x86_avx_rcp_ps_256', ['a'], 'mSimdFP32Ty'],
-['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding'], 'mSimdFP32Ty'],
-['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control'], 'mInt32Ty'],
-['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b'], 'mSimd32Int8Ty'],
-['VPERMD', 'x86_avx2_permd', ['a', 'idx'], 'mSimdInt32Ty'],
-['VPERMPS', 'x86_avx2_permps', ['idx', 'a'], 'mSimdFP32Ty'],
-['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a'], 'mSimdFP32Ty'],
-['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a'], 'mSimdFP32Ty'],
-['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round'], 'mSimdFP16Ty'],
-['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b'], 'mSimdFP32Ty'],
-['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b'], 'mInt32Ty'],
-['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b'], 'mInt32Ty'],
-['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c'], 'mSimdFP32Ty'],
-['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a'], 'mInt32Ty'],
-['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b'], 'mSimdInt32Ty'],
-['PDEP32', 'x86_bmi_pdep_32', ['a', 'b'], 'mInt32Ty'],
-['RDTSC', 'x86_rdtsc', [], 'mInt64Ty'],
+['VGATHERPD',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
+['VGATHERPS',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
+['VGATHERDD',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
+['VRCPPS',  ['a'], 'a'],
+['VROUND',  ['a', 'rounding'], 'a'],
+['BEXTR_32',['src', 'control'], 'src'],
+['VPSHUFB', ['a', 'b'], 'a'],
+['VPERMD',  ['a', 'idx'], 'a'],
+['VPERMPS', ['idx', 'a'], 'a'],
+['VCVTPD2PS',   ['a'], 'VectorType::get(mFP32Ty, 
a->getType()->getVectorNumElements())'],
+['VCVTPH2PS',   ['a'], 'VectorType::get(mFP32Ty, 
a->getType()->getVectorNumElements())'],
+['VCVTPS2PH',   ['a', 'round'], 'mSimdFP16Ty'],
+['VHSUBPS', ['a', 'b'], 'a'],
+['VPTESTC', ['a', 'b'], 'mInt32Ty'],
+['VPTESTZ', ['a', 'b'], 'mInt32Ty'],
+['VFMADDPS',['a', 'b', 'c'], 'a'],
+['VMOVMSKPS',   ['a'], 'mInt32Ty'],
+['VPHADDD', ['a', 'b'], 'a'],
+['PDEP32',  ['a', 'b'], 'a'],
+['RDTSC',   [], 'mInt64Ty'],
 ]
 
 llvm_intrinsics = [
@@ -231,19 +229,31 @@ def generate_meta_h(output_dir):
 
 functions = []
 for inst in intrinsics:
+name = inst[0]
+args = inst[1]
+ret = inst[2]
+
 #print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], 
len(inst[2])))
-if len(inst[2]) != 0:
-declargs = 'Value* ' + ', Value* '.join(inst[2])
-decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], 
declargs)
+if len(args) != 0:
+declargs = 'Value* ' + ', Value* '.join(args)
+decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (name, 
declargs)
 else:
-decl = 'Value* %s(const llvm::Twine& name = "")' % (inst[0])
+decl = 'Value* %s(const llvm::Twine& name = "")' % (name)
+
+# determine the return type of the intrinsic. It can either be:
+# - type of one of the input arguments
+# - snippet of code to set the return type
+
+if ret in args:
+returnTy = ret + '->getType()'
+else:
+returnTy = ret
 
 functions.append({
 'decl'  : decl,
-'name'  : inst[0],
-'intrin': inst[1],
-'args'  : inst[2],
-'returnType': inst[3]
+'name'  : name,
+'args'  : args,
+'returnType': returnTy
 })
 
 

[Mesa-dev] [PATCH v2 22/45] swr/rast: Cleanup of JitManager convenience types

2018-04-17 Thread George Kyriazis
Small cleanup. Remove convenience types from JitManager and standardize
on the Builder's convenience types.
---
 .../drivers/swr/rasterizer/jitter/JitManager.cpp | 19 ---
 .../drivers/swr/rasterizer/jitter/JitManager.h   | 20 
 .../drivers/swr/rasterizer/jitter/builder.cpp|  7 +++
 src/gallium/drivers/swr/rasterizer/jitter/builder.h  |  3 ++-
 4 files changed, 5 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index bfb1d2e..9080964 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -110,11 +110,6 @@ JitManager::JitManager(uint32_t simdWidth, const char 
*arch, const char* core)
 mpExec->RegisterJITEventListener(vTune);
 #endif
 
-mFP32Ty = Type::getFloatTy(mContext);   // float type
-mInt8Ty = Type::getInt8Ty(mContext);
-mInt32Ty = Type::getInt32Ty(mContext);   // int type
-mInt64Ty = Type::getInt64Ty(mContext);   // int type
-
 // fetch function signature
 #if USE_SIMD16_SHADERS
 // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, 
simd16vertex& out);
@@ -135,20 +130,6 @@ JitManager::JitManager(uint32_t simdWidth, const char 
*arch, const char* core)
 
 mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, 
false);
 
-mSimtFP32Ty = VectorType::get(mFP32Ty, mVWidth);
-mSimtInt32Ty = VectorType::get(mInt32Ty, mVWidth);
-
-mSimdVectorTy = ArrayType::get(mSimtFP32Ty, 4);
-mSimdVectorInt32Ty = ArrayType::get(mSimtInt32Ty, 4);
-
-#if USE_SIMD16_SHADERS
-mSimd16FP32Ty = ArrayType::get(mSimtFP32Ty, 2);
-mSimd16Int32Ty = ArrayType::get(mSimtInt32Ty, 2);
-
-mSimd16VectorFP32Ty = ArrayType::get(mSimd16FP32Ty, 4);
-mSimd16VectorInt32Ty = ArrayType::get(mSimd16Int32Ty, 4);
-
-#endif
 #if defined(_WIN32)
 // explicitly instantiate used symbols from potentially staticly linked 
libs
 sys::DynamicLibrary::AddSymbol("exp2f", &exp2f);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index 3660249..86e6758 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -143,26 +143,6 @@ struct JitManager
 uint32_tmVWidth;
 
 
-// Built in types.
-llvm::Type* mInt8Ty;
-llvm::Type* mInt32Ty;
-llvm::Type* mInt64Ty;
-llvm::Type* mFP32Ty;
-
-llvm::Type* mSimtFP32Ty;
-llvm::Type* mSimtInt32Ty;
-
-llvm::Type* mSimdVectorInt32Ty;
-llvm::Type* mSimdVectorTy;
-
-#if USE_SIMD16_SHADERS
-llvm::Type* mSimd16FP32Ty;
-llvm::Type* mSimd16Int32Ty;
-
-llvm::Type* mSimd16VectorFP32Ty;
-llvm::Type* mSimd16VectorInt32Ty;
-
-#endif
 // fetch shader types
 llvm::FunctionType* mFetchShaderTy;
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 260daab..625f132 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -42,10 +42,8 @@ namespace SwrJit
 : mpJitMgr(pJitMgr),
   mpPrivateContext(nullptr)
 {
-SWR_ASSERT(pJitMgr->mVWidth == 8);
-
 mVWidth = pJitMgr->mVWidth;
-mVWidth16 = pJitMgr->mVWidth * 2;
+mVWidth16 = 16;
 
 mpIRBuilder = &pJitMgr->mBuilder;
 
@@ -67,7 +65,7 @@ namespace SwrJit
 
 mSimd4FP64Ty = VectorType::get(mDoubleTy, 4);
 
-// Built in types: simd8
+// Built in types: target simd
 
 mSimdInt1Ty = VectorType::get(mInt1Ty,  mVWidth);
 mSimdInt16Ty= VectorType::get(mInt16Ty, mVWidth);
@@ -76,6 +74,7 @@ namespace SwrJit
 mSimdFP16Ty = VectorType::get(mFP16Ty,  mVWidth);
 mSimdFP32Ty = VectorType::get(mFP32Ty,  mVWidth);
 mSimdVectorTy   = ArrayType::get(mSimdFP32Ty, 4);
+mSimdVectorIntTy= ArrayType::get(mSimdInt32Ty, 4);
 mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
 
 // Built in types: simd16
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 0b57fbf..6b2c9f0 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -68,7 +68,7 @@ namespace SwrJit
 
 Type*mSimd4FP64Ty;
 
-// Built in types: simd8
+// Built in types: target SIMD
 
 Type*mSimdFP16Ty;
 Type*mSimdFP32Ty;
@@ -79,6 +79,7 @@ namespace SwrJit
 Type*mSimdIntPtrTy;
 Type*

[Mesa-dev] [PATCH v2 28/45] swr/rast: Adding translate call to builder_gfx_mem.

2018-04-17 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp | 5 +
 src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h   | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
index e097bd1..38ac825 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
@@ -133,4 +133,9 @@ namespace SwrJit
 return Builder::LOAD(BasePtr, offset, name);
 }
 
+Value* BuilderGfxMem::TranlsateGfxAddress(Value* xpGfxAddress)
+{
+return INT_TO_PTR(xpGfxAddress, PointerType::get(mInt8Ty, 0));
+}
+
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
index 837de44..a1c5f46 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
@@ -51,6 +51,8 @@ namespace SwrJit
 
 virtual Value *GATHERDD(Value* src, Value* pBase, Value* indices, 
Value* mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
+Value* TranlsateGfxAddress(Value* xpGfxAddress);
+
 protected:
 
 void AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage);
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 39/45] swr/rast: double-pump in x86 lowering pass

2018-04-17 Thread George Kyriazis
Add support for double-pumping a smaller SIMD width intrinsic.
---
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 30 ++
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index 9423b28..983b227 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -90,11 +90,14 @@ namespace SwrJit
 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, 
CallInst* pCallInst);
 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
 Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
+Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst, Intrinsic::ID intrin);
+
+static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
 
 static std::map intrinsicMap2[] = {
 //  256 wide   
 512 wide
 {   // AVX
-{"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
+{"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  DOUBLE},NO_EMU}},
 {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VPERM_EMU}},
 {"meta.intrinsic.VPERMD",  {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VPERM_EMU}},
 {"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
@@ -104,7 +107,7 @@ namespace SwrJit
 {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256, 
  Intrinsic::not_intrinsic},  NO_EMU}},
 },
 {   // AVX2
-{"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
+{"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  DOUBLE},NO_EMU}},
 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps,   
  Intrinsic::not_intrinsic},  VPERM_EMU}},
 {"meta.intrinsic.VPERMD",  {{Intrinsic::x86_avx2_permd,
  Intrinsic::not_intrinsic},  VPERM_EMU}},
 {"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
@@ -226,7 +229,15 @@ namespace SwrJit
 
 // Check if there is a native intrinsic for this instruction
 Intrinsic::ID id = intrinsic.intrin[vecWidth];
-if (id != Intrinsic::not_intrinsic)
+if (id == DOUBLE)
+{
+// Double pump the next smaller SIMD intrinsic
+SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD 
width.");
+Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
+SWR_ASSERT(id2 != Intrinsic::not_intrinsic, "Cannot find 
intrinsic to double pump.");
+return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
+}
+else if (id != Intrinsic::not_intrinsic)
 {
 Function* pIntrin = 
Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
 SmallVector args;
@@ -488,28 +499,25 @@ namespace SwrJit
 return cast<Instruction>(v32Gather);
 }
 
-#if 0
 // Double pump input using Intrin template arg. This blindly extracts 
lower and upper 256 from each vector argument and
 // calls the 256 wide intrinsic, then merges the results to 512 wide
-template
-Value* EMU_512(LowerX86* pThis, TargetArch arch, TargetWidth width, 
CallInst* pCallInst)
+Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst, Intrinsic::ID intrin)
 {
 auto B = pThis->B;
 SWR_ASSERT(width == W512);
 Value* result[2];
-Function* pX86IntrinFunc = 
Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrin);
+Function* pX86IntrinFunc = 
Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
 for (uint32_t i = 0; i < 2; ++i)
 {
 SmallVector args;
 for (auto& arg : pCallInst->arg_operands())
 {
-args.push_back(arg.get()->getType()->isVectorTy ? 
B->EXTRACT_16(arg.get(), i) : arg.get());
+args.push_back(arg.get()->getType()->isVectorTy() ? 

[Mesa-dev] [PATCH v2 34/45] swr/rast: fix comment

2018-04-17 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 8d659d0..cdfddf3 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -970,7 +970,7 @@ extern "C" void GetSimdValid16bitIndicesGfx(gfxptr_t 
indices, gfxptr_t lastIndex
 
 //
 /// @brief Loads a simd of valid indices. OOB indices are set to 0
-/// *Note* have to do 16bit index checking in scalar until we have AVX-512
+/// *Note* have to do 8bit index checking in scalar until we have AVX-512
 /// support
 /// @param pIndices - pointer to 8 bit indices
 /// @param pLastIndex - pointer to last valid index
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 42/45] swr/rast: Implement VROUND intrinsic in x86 lowering pass

2018-04-17 Thread George Kyriazis
---
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 38 +-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index 983b227..7cfa772 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -72,7 +72,6 @@ namespace SwrJit
 // Map of intrinsics that haven't been moved to the new mechanism yet. If 
used, these get the previous behavior of
 // mapping directly to avx/avx2 intrinsics.
 static std::map<std::string, Intrinsic::ID> intrinsicMap = {
-{"meta.intrinsic.VROUND",  Intrinsic::x86_avx_round_ps_256},
 {"meta.intrinsic.BEXTR_32",Intrinsic::x86_bmi_bextr_32},
 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
 {"meta.intrinsic.VCVTPS2PH",   Intrinsic::x86_vcvtps2ph_256},
@@ -90,6 +89,8 @@ namespace SwrJit
 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, 
CallInst* pCallInst);
 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
 Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
+Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
+
 Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst, Intrinsic::ID intrin);
 
 static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
@@ -105,6 +106,7 @@ namespace SwrJit
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256, 
  Intrinsic::not_intrinsic},  NO_EMU}},
+{"meta.intrinsic.VROUND",  {{Intrinsic::x86_avx_round_ps_256,  
  DOUBLE},NO_EMU}},
 },
 {   // AVX2
 {"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  DOUBLE},NO_EMU}},
@@ -115,6 +117,7 @@ namespace SwrJit
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256, 
  Intrinsic::not_intrinsic},  NO_EMU}},
+{"meta.intrinsic.VROUND",  {{Intrinsic::x86_avx_round_ps_256,  
  DOUBLE},NO_EMU}},
 },
 {   // AVX512
 {"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx512_rcp14_ps_256,   
  Intrinsic::x86_avx512_rcp14_ps_512},NO_EMU}},
@@ -125,6 +128,7 @@ namespace SwrJit
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VCVTPD2PS",   
{{Intrinsic::x86_avx512_mask_cvtpd2ps_256,
Intrinsic::x86_avx512_mask_cvtpd2ps_512 },  NO_EMU}},
 {"meta.intrinsic.VCVTPH2PS",   
{{Intrinsic::x86_avx512_mask_vcvtph2ps_256,   
Intrinsic::x86_avx512_mask_vcvtph2ps_512 }, NO_EMU}},
+{"meta.intrinsic.VROUND",  {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic }, VROUND_EMU}},
 }
 };
 
@@ -499,6 +503,38 @@ namespace SwrJit
 return cast<Instruction>(v32Gather);
 }
 
+// No support for vroundps in avx512 (it is available in kncni), so 
emulate with avx instructions
+Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst)
+{
+SWR_ASSERT(arch == AVX512);
+
+auto B = pThis->B;
+auto vf32Src = pCallInst->getOperand(0);
+auto i8Round = pCallInst->getOperand(1);
+auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, 
Intrinsic::x86_avx_round_ps_256);
+
+if (width == W256)
+{
+return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
+}
+else if (width == W512)
+{
+auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
+auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
+
+auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
+auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
+
+return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
+}
+

[Mesa-dev] [PATCH v2 09/45] swr/rast: WIP builder rewrite.

2018-04-17 Thread George Kyriazis
Start removing avx2 macros for functionality that exists in llvm.
---
 src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py | 5 -
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h | 9 -
 2 files changed, 14 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 113c616..3e1fbfe 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -53,12 +53,7 @@ intrinsics = [
 ['VMINPS', 'x86_avx_min_ps_256', ['a', 'b']],
 ['VMAXPS', 'x86_avx_max_ps_256', ['a', 'b']],
 ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding']],
-['VCMPPS', 'x86_avx_cmp_ps_256', ['a', 'b', 'cmpop']],
-['VBLENDVPS', 'x86_avx_blendv_ps_256', ['a', 'b', 'mask']],
 ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control']],
-['VMASKLOADD', 'x86_avx2_maskload_d_256', ['src', 'mask']],
-['VMASKMOVPS', 'x86_avx_maskload_ps_256', ['src', 'mask']],
-['VMASKSTOREPS', 'x86_avx_maskstore_ps_256', ['src', 'mask', 'val']],
 ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b']],
 ['VPERMD', 'x86_avx2_permd', ['a', 'idx']],
 ['VPERMPS', 'x86_avx2_permps', ['idx', 'a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 9660bc6..549f328 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -96,15 +96,6 @@ CallInst *CALL(Value *Callee, Value* arg);
 CallInst *CALL2(Value *Callee, Value* arg1, Value* arg2);
 CallInst *CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3);
 
-Value *VCMPPS_EQ(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_EQ_OQ)); }
-Value *VCMPPS_LT(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_LT_OQ)); }
-Value *VCMPPS_LE(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_LE_OQ)); }
-Value *VCMPPS_ISNAN(Value* a, Value* b) { return VCMPPS(a, b, 
C((uint8_t)_CMP_UNORD_Q)); }
-Value *VCMPPS_NEQ(Value* a, Value* b)   { return VCMPPS(a, b, 
C((uint8_t)_CMP_NEQ_OQ)); }
-Value *VCMPPS_GE(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_GE_OQ)); }
-Value *VCMPPS_GT(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_GT_OQ)); }
-Value *VCMPPS_NOTNAN(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_ORD_Q)); }
-
 Value *MASK(Value *vmask);
 Value *MASK_16(Value *vmask);
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 23/45] swr/rast: Lower VGATHERPS and VGATHERPS_16 to x86.

2018-04-17 Thread George Kyriazis
Some more work to do before we can support simultaneous 8-wide and
16-wide and remove the VGATHERPS_16 version.
---
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  | 69 +-
 1 file changed, 2 insertions(+), 67 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index d8ec885..adb9296 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -153,79 +153,14 @@ namespace SwrJit
 {
 AssertMemoryUsageParams(pBase, usage);
 
-Value *vGather;
-Value *pBasePtr = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));
-
-// use avx2 gather instruction if available
-if (JM()->mArch.AVX2())
-{
-vGather = VGATHERPS(vSrc, pBasePtr, vIndices, vMask, C(scale));
-}
-else
-{
-Value* pStack = STACKSAVE();
-
-// store vSrc on the stack.  this way we can select between a 
valid load address and the vSrc address
-Value* vSrcPtr = ALLOCA(vSrc->getType());
-STORE(vSrc, vSrcPtr);
-
-vGather = VUNDEF_F();
-Value *vScaleVec = VIMMED1((uint32_t)scale);
-Value *vOffsets = MUL(vIndices, vScaleVec);
-for (uint32_t i = 0; i < mVWidth; ++i)
-{
-// single component byte index
-Value *offset = VEXTRACT(vOffsets, C(i));
-// byte pointer to component
-Value *loadAddress = GEP(pBasePtr, offset);
-loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 
0));
-// pointer to the value to load if we're masking off a 
component
-Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
-Value *selMask = VEXTRACT(vMask, C(i));
-// switch in a safe address to load if we're trying to access 
a vertex 
-Value *validAddress = SELECT(selMask, loadAddress, 
maskLoadAddress);
-Value *val = LOAD(validAddress);
-vGather = VINSERT(vGather, val, C(i));
-}
-
-STACKRESTORE(pStack);
-}
-
-return vGather;
+return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
 }
 
 Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
 {
 AssertMemoryUsageParams(pBase, usage);
 
-Value *vGather = VUNDEF_F_16();
-
-// use AVX512F gather instruction if available
-if (JM()->mArch.AVX512F())
-{
-// force mask to , required by vgather2
-Value *mask = BITCAST(vMask, mInt16Ty);
-
-vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
-}
-else
-{
-Value *src0 = EXTRACT_16(vSrc, 0);
-Value *src1 = EXTRACT_16(vSrc, 1);
-
-Value *indices0 = EXTRACT_16(vIndices, 0);
-Value *indices1 = EXTRACT_16(vIndices, 1);
-
-Value *mask0 = EXTRACT_16(vMask, 0);
-Value *mask1 = EXTRACT_16(vMask, 1);
-
-Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
-Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
-
-vGather = JOIN_16(gather0, gather1);
-}
-
-return vGather;
+return VGATHERPS_16(vSrc, pBase, vIndices, vMask, C(scale));
 }
 
 //
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 10/45] swr/rast: Add autogen of helper llvm intrinsics.

2018-04-17 Thread George Kyriazis
Replace sqrt, maskload, fp min/max, cttz, ctlz with llvm equivalent.
Replace AVX maskedstore intrinsic with LLVM intrinsic. Add helper llvm
macros for stacksave, stackrestore, popcnt.
---
 src/gallium/drivers/swr/Makefile.am|   8 ++
 src/gallium/drivers/swr/SConscript |   9 ++
 src/gallium/drivers/swr/meson.build|   2 +-
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   | 100 ++---
 .../rasterizer/codegen/templates/gen_builder.hpp   |  20 -
 .../drivers/swr/rasterizer/jitter/builder.h|   1 +
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  |  50 +--
 .../drivers/swr/rasterizer/jitter/builder_mem.h|   5 --
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp |  13 ---
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  11 ---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|   8 +-
 .../drivers/swr/rasterizer/jitter/meson.build  |  11 +++
 .../swr/rasterizer/jitter/streamout_jit.cpp|  18 ++--
 13 files changed, 130 insertions(+), 126 deletions(-)

diff --git a/src/gallium/drivers/swr/Makefile.am 
b/src/gallium/drivers/swr/Makefile.am
index 5ec9213..32dd9e5 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -81,6 +81,7 @@ BUILT_SOURCES = \
rasterizer/jitter/gen_state_llvm.h \
rasterizer/jitter/gen_builder.hpp \
rasterizer/jitter/gen_builder_x86.hpp \
+   rasterizer/jitter/gen_builder_intrin.hpp \
rasterizer/archrast/gen_ar_event.hpp \
rasterizer/archrast/gen_ar_event.cpp \
rasterizer/archrast/gen_ar_eventhandler.hpp \
@@ -140,6 +141,13 @@ rasterizer/jitter/gen_builder_x86.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py
--output rasterizer/jitter \
--gen_x86_h
 
+rasterizer/jitter/gen_builder_intrin.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py 
rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
+   $(MKDIR_GEN)
+   $(PYTHON_GEN) \
+   $(srcdir)/rasterizer/codegen/gen_llvm_ir_macros.py \
+   --output rasterizer/jitter \
+   --gen_intrin_h
+
 rasterizer/archrast/gen_ar_event.hpp: rasterizer/codegen/gen_archrast.py 
rasterizer/codegen/templates/gen_ar_event.hpp rasterizer/archrast/events.proto 
rasterizer/archrast/events_private.proto rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
$(PYTHON_GEN) \
diff --git a/src/gallium/drivers/swr/SConscript 
b/src/gallium/drivers/swr/SConscript
index cc4025b..5097be6 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -85,6 +85,15 @@ Depends('rasterizer/jitter/gen_builder.hpp',
 swrroot + 'rasterizer/codegen/templates/gen_builder.hpp')
 
 env.CodeGenerate(
+target = 'rasterizer/jitter/gen_builder_intrin.hpp',
+script = swrroot + 'rasterizer/codegen/gen_llvm_ir_macros.py',
+source = '',
+command = python_cmd + ' $SCRIPT --output ' + bldroot + 
'/rasterizer/jitter --gen_intrin_h'
+)
+Depends('rasterizer/jitter/gen_builder.hpp',
+swrroot + 'rasterizer/codegen/templates/gen_builder.hpp')
+
+env.CodeGenerate(
 target = './gen_swr_context_llvm.h',
 script = swrroot + 'rasterizer/codegen/gen_llvm_types.py',
 source = 'swr_context.h',
diff --git a/src/gallium/drivers/swr/meson.build 
b/src/gallium/drivers/swr/meson.build
index 4bcd4f4..b28abd6 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -296,7 +296,7 @@ endif
 libmesaswr = static_library(
   'mesaswr',
   [files_swr_mesa, files_swr_common, gen_knobs_h, gen_knobs_cpp,
-   gen_builder_hpp, gen_builder_x86_hpp],
+   gen_builder_hpp, gen_builder_x86_hpp, gen_builder_intrin_hpp],
   cpp_args : [cpp_vis_args, swr_cpp_args, swr_avx_args, swr_arch_defines],
   include_directories : [inc_common, swr_incs],
   dependencies : dep_llvm,
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 3e1fbfe..9dfc1e7 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -42,32 +42,40 @@ inst_aliases = {
 }
 
 intrinsics = [
-['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
-['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
-['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
-['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
-['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
-['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
-['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']],
-['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
-['VMINPS', 'x86_avx_min_ps_256', 

[Mesa-dev] [PATCH v2 07/45] swr/rast: Changes to allow jitter to compile with LLVM5

2018-04-17 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp 
b/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
index 031bced..b1d6076 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
@@ -1,5 +1,5 @@
 /
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2017-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -112,6 +112,22 @@ using PassManager = llvm::legacy::PassManager;
 #include "llvm/ExecutionEngine/JITEventListener.h"
 #endif
 
+#if LLVM_VERSION_MAJOR >= 5
+static const auto Sync_CrossThread = llvm::SyncScope::System;
+static const auto Attrib_FunctionIndex = llvm::AttributeList::FunctionIndex;
+static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, 
const llvm::AttrBuilder )
+{
+return llvm::AttributeSet::get(ctx, b);
+}
+#else
+static const auto Sync_CrossThread = llvm::SynchronizationScope::CrossThread;
+static const auto Attrib_FunctionIndex = llvm::AttributeSet::FunctionIndex;
+static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, 
const llvm::AttrBuilder )
+{
+return llvm::AttributeSet::get(ctx, Attrib_FunctionIndex, b);
+}
+#endif
+
 #pragma pop_macro("DEBUG")
 
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 21/45] swr/rast: Lower PERMD and PERMPS to x86.

2018-04-17 Thread George Kyriazis
Add support for providing an emulation callback function for arch/width
combinations that don't map cleanly to an x86 intrinsic.
---
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  |  8 +--
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 70 --
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  2 -
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 20 +++
 4 files changed, 14 insertions(+), 86 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index 68695c4..d8ec885 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -555,7 +555,7 @@ namespace SwrJit
 // 256i - 01234567
 //       
 
-Value* vi128XY = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 
5, 2, 3, 6, 7 })), v128bitTy);
+Value* vi128XY = BITCAST(VPERMD(vShufResult, C({ 0, 1, 4, 
5, 2, 3, 6, 7 })), v128bitTy);
 // after PERMD: move and pack xy components into each 128bit lane
 // 256i - 01234567
 //       
@@ -565,7 +565,7 @@ namespace SwrJit
 if (info.numComps > 2)
 {
 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], 
v32x8Ty), vConstMask), vGatherTy);
-vi128ZW = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 
2, 3, 6, 7 })), v128bitTy);
+vi128ZW = BITCAST(VPERMD(vShufResult, C({ 0, 1, 4, 5, 
2, 3, 6, 7 })), v128bitTy);
 }
 
 for (uint32_t i = 0; i < 4; i++)
@@ -644,7 +644,7 @@ namespace SwrJit
 // 256i - 01234567
 //       
 
-Value* vi128XY = BITCAST(PERMD(vShufResult, C({ 0, 4, 0, 
0, 1, 5, 0, 0 })), v128Ty);
+Value* vi128XY = BITCAST(VPERMD(vShufResult, C({ 0, 4, 0, 
0, 1, 5, 0, 0 })), v128Ty);
 // after PERMD: move and pack xy and zw components in low 64 bits 
of each 128bit lane
 // 256i - 01234567
 //  dcdc dcdc   dcdc dcdc (dc - don't care)
@@ -653,7 +653,7 @@ namespace SwrJit
 Value* vi128ZW = nullptr;
 if (info.numComps > 2)
 {
-vi128ZW = BITCAST(PERMD(vShufResult, C({ 2, 6, 0, 0, 
3, 7, 0, 0 })), v128Ty);
+vi128ZW = BITCAST(VPERMD(vShufResult, C({ 2, 6, 0, 0, 
3, 7, 0, 0 })), v128Ty);
 }
 
 // sign extend all enabled components. If we have a fill 
vVertexElements, output to current simdvertex
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 54987c7..aa9e2dd 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -602,76 +602,6 @@ namespace SwrJit
 }
 
 //
-/// @brief Generate a VPERMD operation (shuffle 32 bit integer values 
-/// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
-/// platform, emulate it
-/// @param a - 256bit SIMD lane(8x32bit) of integer values.
-/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-Value *Builder::PERMD(Value* a, Value* idx)
-{
-Value* res;
-// use avx2 permute instruction if available
-if(JM()->mArch.AVX2())
-{
-res = VPERMD(a, idx);
-}
-else
-{
-if (isa(idx))
-{
-res = VSHUFFLE(a, a, idx);
-}
-else
-{
-res = VUNDEF_I();
-for (uint32_t l = 0; l < JM()->mVWidth; ++l)
-{
-Value* pIndex = VEXTRACT(idx, C(l));
-Value* pVal = VEXTRACT(a, pIndex);
-res = VINSERT(res, pVal, C(l));
-}
-}
-}
-return res;
-}
-
-//
-/// @brief Generate a VPERMPS operation (shuffle 32 bit float values 
-/// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
-/// platform, emulate it
-/// @param a - 256bit SIMD lane(8x32bit) of float values.
-/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-Value *Builder::PERMPS(Value* a, Value* idx)
-{
-Value* res;
-// use avx2 permute instruction if available
-if (JM()->mArch.AVX2())
-{
-// llvm 3.6.0 swapped the order of the args to vpermd
-res = VPERMPS(idx, a);
-  

[Mesa-dev] [PATCH v2 17/45] swr/rast: Fix name mangling for LLVM pow intrinsic

2018-04-17 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 0245584..324f24a 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -79,7 +79,7 @@ llvm_intrinsics = [
 ['LOG2', 'log2', ['a'], ['a']],
 ['FABS', 'fabs', ['a'], ['a']],
 ['EXP2', 'exp2', ['a'], ['a']],
-['POW', 'pow', ['a', 'b'], ['a', 'b']]
+['POW', 'pow', ['a', 'b'], ['a']]
 ]
 
 this_dir = os.path.dirname(os.path.abspath(__file__))
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 19/45] swr/rast: Simplify #define usage in gen source file

2018-04-17 Thread George Kyriazis
Removed preprocessor defines from structures passed to LLVM jitted code.

The Python scripts do not understand the preprocessor defines and ignore
them. So for fields that are compiled out due to a preprocessor define
the LLVM script accounts for them anyway because it doesn't know what
the defines are set to. The sanitize defines for open source are fine
in that they're safely used.
---
 src/gallium/drivers/swr/rasterizer/core/state.h | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index 47ffacf..084ca54 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -234,13 +234,12 @@ struct SWR_VS_CONTEXT
 uint32_t InstanceID;// IN: Instance ID, constant across all verts 
of the SIMD
 simdscalari VertexID;   // IN: Vertex ID
 simdscalari mask;   // IN: Active mask for shader
-#if USE_SIMD16_FRONTEND
+
+// SIMD16 Frontend fields.
 uint32_t AlternateOffset;   // IN: amount to offset for interleaving 
even/odd simd8 in simd16vertex output
-#if USE_SIMD16_VS
 simd16scalari mask16;   // IN: Active mask for shader (16-wide)
 simd16scalari VertexID16;   // IN: Vertex ID (16-wide)
-#endif
-#endif
+
 SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
 };
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 00/45] OpenSWR driver misc changes

2018-04-17 Thread George Kyriazis

Lots of SWR-specific changes, including:
- work for 16-wide simd operation across all avx flavors
- separate AVX intrinsics into a separate x86 lowering pass
- stats work
- misc other cleanup

v2: mostly editorial changes

George Kyriazis (45):
  swr/rast: Add some instructions to jitter
  swr/rast: Introduce JIT_MEM_CLIENT
  swr/rast: Use blend context struct to pass params
  swr/rast: Add debug type info for i128
  swr/rast: Silence some unused variable warnings
  swr/rast: Add some archrast stats
  swr/rast: Changes to allow jitter to compile with LLVM5
  swr/rast: LLVM 6 fix
  swr/rast: WIP builder rewrite.
  swr/rast: Add autogen of helper llvm intrinsics.
  swr/rast: WIP builder rewrite (2)
  swr/rast: Permute work for simd16
  swr/rast: Add MEM_ADD helper function to Builder.
  swr/rast: Add "Num Instructions Executed" stats intrinsic.
  swr/rast: Code cleanup
  swr/rast: Add some archrast counters
  swr/rast: Fix name mangling for LLVM pow intrinsic
  swr/rast: Move CallPrint() to a separate file
  swr/rast: Simplify #define usage in gen source file
  swr/rast: Start refactoring of builder/packetizer.
  swr/rast: Lower PERMD and PERMPS to x86.
  swr/rast: Cleanup of JitManager convenience types
  swr/rast: Lower VGATHERPS and VGATHERPS_16 to x86.
  swr/rast: Add builder_gfx_mem.{h|cpp}
  swr/rast: Enable generalized fetch jit
  swr: add x86 lowering pass to fragment shader
  swr/rast: Fix codegen for typedef types
  swr/rast: Adding translate call to builder_gfx_mem.
  swr/rast: Add support for setting optimization level
  swr/rast: Fix byte offset for non-indexed draws
  swr/rast: Change gfx pointers to gfxptr_t
  swr/rast: Fix alloca usage in jitter
  swr/rast: add cvt instructions in x86 lowering pass
  swr/rast: fix comment
  swr/rast: Add vgather to x86 lowering pass.
  swr/rast: Type-check TemplateArgUnroller
  swr/rast: Add shader stats infrastructure (WIP)
  swr/rast: Fix 64bit float loads in x86 lowering pass
  swr/rast: double-pump in x86 lowering pass
  swr/rast: minimize codegen redundant work
  swr/rast: Refactor to improve code sharing.
  swr/rast: Implement VROUND intrinsic in x86 lowering pass
  swr/rast: Optimize late/bindless JIT of samplers
  swr/rast: Replace x86 VMOVMSK with llvm-only implementation
  swr/rast: Fix VGATHERPD lowering

 src/gallium/drivers/swr/Makefile.am|   14 +-
 src/gallium/drivers/swr/Makefile.sources   |6 +-
 src/gallium/drivers/swr/SConscript |   13 +-
 src/gallium/drivers/swr/meson.build|6 +-
 .../drivers/swr/rasterizer/archrast/archrast.cpp   |   97 +-
 .../drivers/swr/rasterizer/archrast/events.proto   |   54 +
 .../swr/rasterizer/archrast/events_private.proto   |   45 +
 .../drivers/swr/rasterizer/codegen/gen_archrast.py |  111 +-
 .../drivers/swr/rasterizer/codegen/gen_backends.py |   97 +-
 .../drivers/swr/rasterizer/codegen/gen_common.py   |  131 +-
 .../drivers/swr/rasterizer/codegen/gen_knobs.py|   53 +-
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  180 ++-
 .../swr/rasterizer/codegen/gen_llvm_types.py   |   30 +-
 .../drivers/swr/rasterizer/codegen/knob_defs.py|   35 +
 .../drivers/swr/rasterizer/codegen/meson.build |2 +-
 .../rasterizer/codegen/templates/gen_builder.hpp   |   29 +-
 .../drivers/swr/rasterizer/common/simd16intrin.h   |1 +
 .../drivers/swr/rasterizer/common/simdintrin.h |1 +
 .../swr/rasterizer/common/simdlib_256_avx.inl  |6 +
 .../swr/rasterizer/common/simdlib_256_avx2.inl |7 +
 .../swr/rasterizer/common/simdlib_512_avx512.inl   |6 +
 .../swr/rasterizer/common/simdlib_512_emu.inl  |   16 +-
 src/gallium/drivers/swr/rasterizer/core/api.cpp|   15 +-
 src/gallium/drivers/swr/rasterizer/core/api.h  |   47 +-
 .../drivers/swr/rasterizer/core/backend.cpp|   10 +-
 src/gallium/drivers/swr/rasterizer/core/backend.h  |4 +-
 .../drivers/swr/rasterizer/core/backend_clear.cpp  |   19 +-
 .../drivers/swr/rasterizer/core/backend_impl.h |   73 +-
 .../drivers/swr/rasterizer/core/backend_sample.cpp |   14 +-
 .../swr/rasterizer/core/backend_singlesample.cpp   |   15 +-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |4 +-
 src/gallium/drivers/swr/rasterizer/core/context.h  |5 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   |   78 +-
 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp |   42 +-
 .../drivers/swr/rasterizer/core/rasterizer.cpp |4 +-
 .../drivers/swr/rasterizer/core/rasterizer_impl.h  |   15 +-
 src/gallium/drivers/swr/rasterizer/core/state.h|   91 +-
 .../drivers/swr/rasterizer/core/threads.cpp|   68 +-
 src/gallium/drivers/swr/rasterizer/core/threads.h  |5 +-
 .../drivers/swr/rasterizer/core/tilemgr.cpp|   21 +-
 src/gallium/drivers/swr/rasterizer/core/tilemgr.h  |4 +-
 src/gallium/drivers/swr/rasterizer/core/utils.h|   39 +-
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   |   44 +-
 .../drivers/swr/rasteri

[Mesa-dev] [PATCH v2 06/45] swr/rast: Add some archrast stats

2018-04-17 Thread George Kyriazis
Add stats for degenerate and backfacing primitive counts

Wire archrast stats for alpha blend and alpha test.
Pass value to jitter; upon return, have archrast event increment a value.
---
 .../drivers/swr/rasterizer/archrast/archrast.cpp   | 35 +-
 .../drivers/swr/rasterizer/archrast/events.proto   | 19 
 .../swr/rasterizer/archrast/events_private.proto   | 15 ++
 .../drivers/swr/rasterizer/core/backend_impl.h | 18 +++
 .../drivers/swr/rasterizer/core/backend_sample.cpp |  4 +--
 .../swr/rasterizer/core/backend_singlesample.cpp   |  4 +--
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  2 ++
 src/gallium/drivers/swr/rasterizer/core/state.h|  2 ++
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp| 17 +++
 9 files changed, 105 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp 
b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
index 1f87dba..12dfc0e 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
+++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
@@ -73,6 +73,18 @@ namespace ArchRast
 uint32_t rasterTiles = 0;
 };
 
+struct CullStats
+{
+uint32_t degeneratePrimCount = 0;
+uint32_t backfacePrimCount = 0;
+};
+
+struct AlphaStats
+{
+uint32_t alphaTestCount = 0;
+uint32_t alphaBlendCount = 0;
+};
+
 //
 /// @brief Event handler that handles API thread events. This is shared
 ///between the API and its caller (e.g. driver shim) but typically
@@ -280,7 +292,12 @@ namespace ArchRast
 // Rasterized Subspans
 EventHandlerFile::Handle(RasterTiles(drawId, 
rastStats.rasterTiles));
 
-//Reset Internal Counters
+// Alpha Subspans
+EventHandlerFile::Handle(AlphaEvent(drawId, 
mAlphaStats.alphaTestCount, mAlphaStats.alphaBlendCount));
+
+// Primitive Culling
+EventHandlerFile::Handle(CullEvent(drawId, 
mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount));
+
 mDSSingleSample = {};
 mDSSampleRate = {};
 mDSCombined = {};
@@ -288,6 +305,8 @@ namespace ArchRast
 mDSNullPS = {};
 
 rastStats = {};
+mCullStats = {};
+mAlphaStats = {};
 mNeedFlush = false;
 }
 
@@ -327,6 +346,18 @@ namespace ArchRast
 rastStats.rasterTiles += event.data.rasterTiles;
 }
 
+virtual void Handle(const CullInfoEvent& event)
+{
+mCullStats.degeneratePrimCount += 
_mm_popcnt_u32(event.data.validMask ^ (event.data.validMask & 
~event.data.degeneratePrimMask));
+mCullStats.backfacePrimCount   += 
_mm_popcnt_u32(event.data.validMask ^ (event.data.validMask & 
~event.data.backfacePrimMask));
+}
+
+virtual void Handle(const AlphaInfoEvent& event)
+{
+mAlphaStats.alphaTestCount  += event.data.alphaTestEnable;
+mAlphaStats.alphaBlendCount += event.data.alphaBlendEnable;
+}
+
 protected:
 bool mNeedFlush;
 // Per draw stats
@@ -340,6 +371,8 @@ namespace ArchRast
 TEStats mTS = {};
 GSStats mGS = {};
 RastStats rastStats = {};
+CullStats mCullStats = {};
+AlphaStats mAlphaStats = {};
 
 };
 
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/events.proto 
b/src/gallium/drivers/swr/rasterizer/archrast/events.proto
index 7d9a68d..deb0373 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/events.proto
+++ b/src/gallium/drivers/swr/rasterizer/archrast/events.proto
@@ -180,6 +180,7 @@ event LateStencilSampleRate
 uint64_t failCount;
 };
 
+// Total Early-Z counts, SingleSample and SampleRate
 event EarlyZ
 {
 uint32_t drawId;
@@ -187,6 +188,7 @@ event EarlyZ
 uint64_t failCount;
 }; 
 
+// Total LateZ counts, SingleSample and SampleRate
 event LateZ
 {
 uint32_t drawId;
@@ -194,6 +196,7 @@ event LateZ
 uint64_t failCount;
 };
 
+// Total EarlyStencil counts, SingleSample and SampleRate
 event EarlyStencil
 {
 uint32_t drawId; 
@@ -201,6 +204,7 @@ event EarlyStencil
 uint64_t failCount;
 };
 
+// Total LateStencil counts, SingleSample and SampleRate
 event LateStencil
 {
 uint32_t drawId; 
@@ -302,3 +306,18 @@ event ClipperEvent
 uint32_t trivialAcceptCount;
 uint32_t mustClipCount;
 };
+
+event CullEvent
+{
+uint32_t drawId;
+uint64_t backfacePrimCount;
+uint64_t degeneratePrimCount;
+};
+
+event AlphaEvent
+{
+uint32_t drawId;
+uint32_t alphaTestCount;
+uint32_t alphaBlendCount;
+};
+
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto 
b/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto
index f0a9310..37593be 100644
--- 

[Mesa-dev] [PATCH v2 05/45] swr/rast: Silence some unused variable warnings

2018-04-17 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 5feb5fa..1ee6691 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -253,7 +253,13 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
: vIndices2 = 
GetSimdValid32bitIndices(indices2, pLastIndex);
 #endif
 break; // incoming type is already 32bit int
-default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; 
break;
+default:
+SWR_INVALID("Unsupported index type");
+vIndices = nullptr;
+#if USE_SIMD16_SHADERS
+vIndices2 = nullptr;
+#endif
+break;
 }
 
 if(fetchState.bForceSequentialAccessEnable)
@@ -434,6 +440,10 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE 
, Value* str
 }
 else if (ied.InstanceStrideEnable)
 {
+// silence unused variable warnings
+startOffset = C(0);
+vCurIndices = vIndices;
+
 SWR_ASSERT((0), "TODO: Fill out more once driver sends this 
down.");
 }
 else
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 01/45] swr/rast: Add some instructions to jitter

2018-04-17 Thread George Kyriazis
VPHADDD, PMAXUD, PMINUD
---
 .../drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py |  1 +
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp   | 12 
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h |  2 ++
 3 files changed, 15 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index aab499b..113c616 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -71,6 +71,7 @@ intrinsics = [
 ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c']],
 ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a']],
 ['INTERRUPT', 'x86_int', ['a']],
+['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b']],
 ]
 
 this_dir = os.path.dirname(os.path.abspath(__file__))
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 0148d8e..704b0f2 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -756,6 +756,18 @@ namespace SwrJit
 return SELECT(cmp, a, b);
 }
 
+Value *Builder::PMAXUD(Value* a, Value* b)
+{
+Value* cmp = ICMP_UGT(a, b);
+return SELECT(cmp, a, b);
+}
+
+Value *Builder::PMINUD(Value* a, Value* b)
+{
+Value* cmp = ICMP_ULT(a, b);
+return SELECT(cmp, a, b);
+}
+
 // Helper function to create alloca in entry block of function
 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
 {
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 5195678..9660bc6 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -128,6 +128,8 @@ Value *CVTPH2PS(Value* a, const llvm::Twine& name = "");
 Value *CVTPS2PH(Value* a, Value* rounding);
 Value *PMAXSD(Value* a, Value* b);
 Value *PMINSD(Value* a, Value* b);
+Value *PMAXUD(Value* a, Value* b);
+Value *PMINUD(Value* a, Value* b);
 Value *VABSPS(Value* a);
 Value *FMADDPS(Value* a, Value* b, Value* c);
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 03/45] swr/rast: Use blend context struct to pass params

2018-04-17 Thread George Kyriazis
Stuff parameters into a blend context struct before passing down through
the PFN_BLEND_JIT_FUNC function pointer. Needed for stat changes.
---
 .../drivers/swr/rasterizer/core/backend_impl.h | 44 ++-
 src/gallium/drivers/swr/rasterizer/core/state.h| 17 ++--
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp| 50 +++---
 3 files changed, 62 insertions(+), 49 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h 
b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
index 2cfd52e..8c539e3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -724,24 +724,26 @@ INLINE void OutputMerger4x2(SWR_PS_CONTEXT , 
uint8_t* ()[SW
 
 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = 
>renderTarget[rt];
 
+SWR_BLEND_CONTEXT blendContext = { 0 };
 {
 // pfnBlendFunc may not update all channels.  Initialize with PS 
output.
 /// TODO: move this into the blend JIT.
 blendOut = psContext.shaded[rt];
 
+blendContext.pBlendState = pBlendState;
+blendContext.src = [rt];
+blendContext.src1 = [1];
+blendContext.src0alpha = reinterpret_cast([0].w);
+blendContext.sampleNum = sample;
+blendContext.pDst = (simdvector *) 
+blendContext.result = 
+blendContext.oMask = 
+blendContext.pMask = reinterpret_cast();
+
 // Blend outputs and update coverage mask for alpha test
 if(pfnBlendFunc[rt] != nullptr)
 {
-pfnBlendFunc[rt](
-pBlendState,
-psContext.shaded[rt],
-psContext.shaded[1],
-psContext.shaded[0].w,
-sample,
-pColorSample,
-blendOut,
-,
-(simdscalari*));
+pfnBlendFunc[rt]();
 }
 }
 
@@ -811,24 +813,26 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT , 
uint8_t* ()[SW
 pColorSample = nullptr;
 }
 
+SWR_BLEND_CONTEXT blendContext = { 0 };
 {
 // pfnBlendFunc may not update all channels.  Initialize with PS 
output.
 /// TODO: move this into the blend JIT.
 blendOut = psContext.shaded[rt];
 
+blendContext.pBlendState= pBlendState;
+blendContext.src= [rt];
+blendContext.src1   = [1];
+blendContext.src0alpha  = reinterpret_cast([0].w);
+blendContext.sampleNum  = sample;
+blendContext.pDst   = 
+blendContext.result = 
+blendContext.oMask  = 
+blendContext.pMask  = reinterpret_cast();
+
 // Blend outputs and update coverage mask for alpha test
 if(pfnBlendFunc[rt] != nullptr)
 {
-pfnBlendFunc[rt](
-pBlendState,
-psContext.shaded[rt],
-psContext.shaded[1],
-psContext.shaded[0].w,
-sample,
-reinterpret_cast(),
-blendOut,
-,
-reinterpret_cast());
+pfnBlendFunc[rt]();
 }
 }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index 6b108d9..8c26ec6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -876,6 +876,19 @@ struct SWR_BLEND_STATE
 };
 static_assert(sizeof(SWR_BLEND_STATE) == 36, "Invalid SWR_BLEND_STATE size");
 
+struct SWR_BLEND_CONTEXT
+{
+const SWR_BLEND_STATE*  pBlendState;
+simdvector* src;
+simdvector* src1;
+simdvector* src0alpha;
+uint32_tsampleNum;
+simdvector* pDst;
+simdvector* result;
+simdscalari*oMask;
+simdscalari*pMask;
+};
+
 //
 /// FUNCTION POINTERS FOR SHADERS
 
@@ -892,9 +905,7 @@ typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, 
SWR_CS_CONTEXT* pCsConte
 typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
 typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT 
*pContext);
 typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT 
*pContext);
-typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*,
-simdvector& vSrc, simdvector& vSrc1, simdscalar& vSrc0Alpha, uint32_t 
sample,
-uint8_t* pDst, simdvector& vResult, simdscalari* vOMask, simdscalari* 
vCoverageMask);
+typedef void(__cdecl 

[Mesa-dev] [PATCH 15/45] swr/rast: Code cleanup

2018-04-13 Thread George Kyriazis
Removing some code that doesn't seem to do anything meaningful.
---
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 5c8d813..5971a52 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -156,14 +156,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
 mpFetchInfo->setName("fetchInfo");
 Value*pVtxOut = &*argitr;
 pVtxOut->setName("vtxOutput");
-// this is just shorthand to tell LLVM to get a pointer to the base 
address of simdvertex
-// index 0(just the pointer to the simdvertex structure
-// index 1(which element of the simdvertex structure to offset to(in this 
case 0)
-// so the indices being i32's doesn't matter
-// TODO: generated this GEP with a VECTOR structure type so this makes 
sense
-std::vectorvtxInputIndices(2, C(0));
-// GEP
-pVtxOut = GEP(pVtxOut, C(0));
+
 #if USE_SIMD16_SHADERS
 #if 0// USE_SIMD16_BUILDER
 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, 
mVWidth16), 0));
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 38/45] swr/rast: Fix 64bit float loads in x86 lowering pass

2018-04-13 Thread George Kyriazis
---
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  | 39 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 31 +
 2 files changed, 25 insertions(+), 45 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index a163b8f..3c3c157 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -201,44 +201,7 @@ namespace SwrJit
 /// @param scale - value to scale indices by
 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
 {
-Value* vGather;
-
-// use avx2 gather instruction if available
-if (JM()->mArch.AVX2())
-{
-vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 
2)), VectorType::get(mDoubleTy, mVWidth / 2));
-vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
-}
-else
-{
-Value* pStack = STACKSAVE();
-
-// store vSrc on the stack.  this way we can select between a 
valid load address and the vSrc address
-Value* vSrcPtr = ALLOCA(vSrc->getType());
-SetTempAlloca(vSrcPtr);
-STORE(vSrc, vSrcPtr);
-
-vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
-Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
-Value *vOffsets = MUL(vIndices, vScaleVec);
-for (uint32_t i = 0; i < mVWidth / 2; ++i)
-{
-// single component byte index
-Value *offset = VEXTRACT(vOffsets, C(i));
-// byte pointer to component
-Value *loadAddress = GEP(pBase, offset);
-loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 
0));
-// pointer to the value to load if we're masking off a 
component
-Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
-Value *selMask = VEXTRACT(vMask, C(i));
-// switch in a safe address to load if we're trying to access 
a vertex
-Value *validAddress = SELECT(selMask, loadAddress, 
maskLoadAddress);
-Value *val = LOAD(validAddress);
-vGather = VINSERT(vGather, val, C(i));
-}
-STACKRESTORE(pStack);
-}
-return vGather;
+return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
 }
 
 //
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index cdfddf3..767866f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -230,7 +230,6 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
 }
 
 // Fetch attributes from memory and output to a simdvertex struct
-// since VGATHER has a perf penalty on HSW vs BDW, allow client to choose 
which fetch method to use
 JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
 
 RET_VOID();
@@ -763,13 +762,31 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 // if we need to gather the component
 if (compCtrl[i] == StoreSrc)
 {
-Value *vMaskLo = VSHUFFLE(vGatherMask, 
VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
-Value *vMaskHi = VSHUFFLE(vGatherMask, 
VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
+Value* vShufLo;
+Value* vShufHi;
+Value* vShufAll;
 
-Value *vOffsetsLo = VEXTRACTI128(vOffsets, 
C(0));
-Value *vOffsetsHi = VEXTRACTI128(vOffsets, 
C(1));
+if (mVWidth == 8)
+{
+vShufLo = C({ 0, 1, 2, 3 });
+vShufHi = C({ 4, 5, 6, 7 });
+vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+}
+else
+{
+SWR_ASSERT(mVWidth == 16);
+vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 
});
+vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15 });
+}
+
+Value *vMaskLo = VSHUFFLE(vGatherMask, 
vGatherMask, vShufLo);
+Value *vMaskHi = 

[Mesa-dev] [PATCH 18/45] swr/rast: Move CallPrint() to a separate file

2018-04-13 Thread George Kyriazis
Needed work for jit code debug.
---
 src/gallium/drivers/swr/Makefile.sources   |  3 +-
 src/gallium/drivers/swr/meson.build|  1 +
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 22 +-
 .../rasterizer/jitter/shader_lib/DebugOutput.cpp   | 51 ++
 4 files changed, 56 insertions(+), 21 deletions(-)
 create mode 100644 
src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp

diff --git a/src/gallium/drivers/swr/Makefile.sources 
b/src/gallium/drivers/swr/Makefile.sources
index cbf7395..4924da1 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -151,7 +151,8 @@ JITTER_CXX_SOURCES := \
rasterizer/jitter/JitManager.cpp \
rasterizer/jitter/JitManager.h \
rasterizer/jitter/streamout_jit.cpp \
-   rasterizer/jitter/streamout_jit.h
+   rasterizer/jitter/streamout_jit.h \
+   rasterizer/jitter/shader_lib/DebugOutput.cpp
 
 MEMORY_CXX_SOURCES := \
rasterizer/memory/ClearTile.cpp \
diff --git a/src/gallium/drivers/swr/meson.build 
b/src/gallium/drivers/swr/meson.build
index b28abd6..3848232 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -79,6 +79,7 @@ files_swr_mesa = files(
   'rasterizer/jitter/JitManager.h',
   'rasterizer/jitter/streamout_jit.cpp',
   'rasterizer/jitter/streamout_jit.h',
+  'rasterizer/jitter/shader_lib/DebugOutput.cpp',
 )
 
 files_swr_arch = files(
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index c266018..54987c7 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -33,10 +33,10 @@
 
 #include 
 
+extern "C" void CallPrint(const char* fmt, ...);
+
 namespace SwrJit
 {
-void __cdecl CallPrint(const char* fmt, ...);
-
 //
 /// @brief Convert an IEEE 754 32-bit single precision float to an
 ///16 bit float with 5 exponent bits and a variable
@@ -846,24 +846,6 @@ namespace SwrJit
 /// @brief C functions called by LLVM IR
 //
 
-//
-/// @brief called in JIT code, inserted by PRINT
-/// output to both stdout and visual studio debug console
-void __cdecl CallPrint(const char* fmt, ...)
-{
-va_list args;
-va_start(args, fmt);
-vprintf(fmt, args);
-
-#if defined( _WIN32 )
-char strBuf[1024];
-vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
-OutputDebugStringA(strBuf);
-#endif
-
-va_end(args);
-}
-
 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 {
 bool flag = !imm8->isZeroValue();
diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp
new file mode 100644
index 000..54d45e6
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp
@@ -0,0 +1,51 @@
+/
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file DebugOutput.cpp
+*
+* @brief Shader support library implementation for printed Debug output
+*
+* Notes:
+*
+**/
+#include 
+#include "common/os.h"
+
+
+//
+/// @brief called in JIT code, inserted by PRINT
+/// output to both stdout and visual studio debug console
+extern "C" void CallPrint(const char* 

[Mesa-dev] [PATCH 23/45] swr/rast: Lower VGATHERPS and VGATHERPS_16 to x86.

2018-04-13 Thread George Kyriazis
Some more work to do before we can support simultaneous 8-wide and
16-wide and remove the VGATHERPS_16 version.
---
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  | 69 +-
 1 file changed, 2 insertions(+), 67 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index a27f02e..0550493 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -153,79 +153,14 @@ namespace SwrJit
 {
 AssertRastyMemoryParams(pBase, usage);
 
-Value *vGather;
-Value *pBasePtr = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));
-
-// use avx2 gather instruction if available
-if (JM()->mArch.AVX2())
-{
-vGather = VGATHERPS(vSrc, pBasePtr, vIndices, vMask, C(scale));
-}
-else
-{
-Value* pStack = STACKSAVE();
-
-// store vSrc on the stack.  this way we can select between a 
valid load address and the vSrc address
-Value* vSrcPtr = ALLOCA(vSrc->getType());
-STORE(vSrc, vSrcPtr);
-
-vGather = VUNDEF_F();
-Value *vScaleVec = VIMMED1((uint32_t)scale);
-Value *vOffsets = MUL(vIndices, vScaleVec);
-for (uint32_t i = 0; i < mVWidth; ++i)
-{
-// single component byte index
-Value *offset = VEXTRACT(vOffsets, C(i));
-// byte pointer to component
-Value *loadAddress = GEP(pBasePtr, offset);
-loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 
0));
-// pointer to the value to load if we're masking off a 
component
-Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
-Value *selMask = VEXTRACT(vMask, C(i));
-// switch in a safe address to load if we're trying to access 
a vertex 
-Value *validAddress = SELECT(selMask, loadAddress, 
maskLoadAddress);
-Value *val = LOAD(validAddress);
-vGather = VINSERT(vGather, val, C(i));
-}
-
-STACKRESTORE(pStack);
-}
-
-return vGather;
+return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
 }
 
 Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
 {
 AssertRastyMemoryParams(pBase, usage);
 
-Value *vGather = VUNDEF_F_16();
-
-// use AVX512F gather instruction if available
-if (JM()->mArch.AVX512F())
-{
-// force mask to , required by vgather2
-Value *mask = BITCAST(vMask, mInt16Ty);
-
-vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
-}
-else
-{
-Value *src0 = EXTRACT_16(vSrc, 0);
-Value *src1 = EXTRACT_16(vSrc, 1);
-
-Value *indices0 = EXTRACT_16(vIndices, 0);
-Value *indices1 = EXTRACT_16(vIndices, 1);
-
-Value *mask0 = EXTRACT_16(vMask, 0);
-Value *mask1 = EXTRACT_16(vMask, 1);
-
-Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
-Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
-
-vGather = JOIN_16(gather0, gather1);
-}
-
-return vGather;
+return VGATHERPS_16(vSrc, pBase, vIndices, vMask, C(scale));
 }
 
 //
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 33/45] swr/rast: add cvt instructions in x86 lowering pass

2018-04-13 Thread George Kyriazis
Support generic VCVTPD2PS and VCVTPH2PS in x86 lowering pass.
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   | 70 --
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  | 14 -
 .../drivers/swr/rasterizer/jitter/builder_mem.h|  3 -
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  6 +-
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 14 ++---
 5 files changed, 48 insertions(+), 59 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 2636e60..4a7d2e9 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -42,28 +42,26 @@ inst_aliases = {
 }
 
 intrinsics = [
-['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 
'mask', 'scale'], 'mSimd4FP64Ty'],
-['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale'], 'mSimdFP32Ty'],
-['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 
'mask', 'scale'], 'mSimd16FP32Ty'],
-['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale'], 'mSimdInt32Ty'],
-['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 
'mask', 'scale'], 'mSimd16Int32Ty'],
-['VRCPPS', 'x86_avx_rcp_ps_256', ['a'], 'mSimdFP32Ty'],
-['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding'], 'mSimdFP32Ty'],
-['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control'], 'mInt32Ty'],
-['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b'], 'mSimd32Int8Ty'],
-['VPERMD', 'x86_avx2_permd', ['a', 'idx'], 'mSimdInt32Ty'],
-['VPERMPS', 'x86_avx2_permps', ['idx', 'a'], 'mSimdFP32Ty'],
-['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a'], 'mSimdFP32Ty'],
-['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a'], 'mSimdFP32Ty'],
-['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round'], 'mSimdFP16Ty'],
-['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b'], 'mSimdFP32Ty'],
-['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b'], 'mInt32Ty'],
-['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b'], 'mInt32Ty'],
-['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c'], 'mSimdFP32Ty'],
-['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a'], 'mInt32Ty'],
-['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b'], 'mSimdInt32Ty'],
-['PDEP32', 'x86_bmi_pdep_32', ['a', 'b'], 'mInt32Ty'],
-['RDTSC', 'x86_rdtsc', [], 'mInt64Ty'],
+['VGATHERPD',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
+['VGATHERPS',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
+['VGATHERDD',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
+['VRCPPS',  ['a'], 'a'],
+['VROUND',  ['a', 'rounding'], 'a'],
+['BEXTR_32',['src', 'control'], 'src'],
+['VPSHUFB', ['a', 'b'], 'a'],
+['VPERMD',  ['a', 'idx'], 'a'],
+['VPERMPS', ['idx', 'a'], 'a'],
+['VCVTPD2PS',   ['a'], 'VectorType::get(mFP32Ty, 
a->getType()->getVectorNumElements())'],
+['VCVTPH2PS',   ['a'], 'VectorType::get(mFP32Ty, 
a->getType()->getVectorNumElements())'],
+['VCVTPS2PH',   ['a', 'round'], 'mSimdFP16Ty'],
+['VHSUBPS', ['a', 'b'], 'a'],
+['VPTESTC', ['a', 'b'], 'mInt32Ty'],
+['VPTESTZ', ['a', 'b'], 'mInt32Ty'],
+['VFMADDPS',['a', 'b', 'c'], 'a'],
+['VMOVMSKPS',   ['a'], 'mInt32Ty'],
+['VPHADDD', ['a', 'b'], 'a'],
+['PDEP32',  ['a', 'b'], 'a'],
+['RDTSC',   [], 'mInt64Ty'],
 ]
 
 llvm_intrinsics = [
@@ -231,19 +229,31 @@ def generate_meta_h(output_dir):
 
 functions = []
 for inst in intrinsics:
+name = inst[0]
+args = inst[1]
+ret = inst[2]
+
 #print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], 
len(inst[2])))
-if len(inst[2]) != 0:
-declargs = 'Value* ' + ', Value* '.join(inst[2])
-decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], 
declargs)
+if len(args) != 0:
+declargs = 'Value* ' + ', Value* '.join(args)
+decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (name, 
declargs)
 else:
-decl = 'Value* %s(const llvm::Twine& name = "")' % (inst[0])
+decl = 'Value* %s(const llvm::Twine& name = "")' % (name)
+
+# determine the return type of the intrinsic. It can either be:
+# - type of one of the input arguments
+# - snippet of code to set the return type
+
+if ret in args:
+returnTy = ret + '->getType()'
+else:
+returnTy = ret
 
 functions.append({
 'decl'  : decl,
-'name'  : inst[0],
-'intrin': inst[1],
-'args'  : inst[2],
-'returnType': inst[3]
+'name'  : name,
+'args'  : args,
+'returnType': returnTy
 })
 
 

[Mesa-dev] [PATCH 31/45] swr/rast: Change gfx pointers to gfxptr_t

2018-04-13 Thread George Kyriazis
Changing type to gfxptr for indices and related changes to fetch and mem
builder code.
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  4 +-
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  8 +-
 src/gallium/drivers/swr/rasterizer/core/context.h  |  2 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   | 40 -
 src/gallium/drivers/swr/rasterizer/core/state.h|  6 +-
 .../swr/rasterizer/jitter/builder_gfx_mem.cpp  | 80 +++--
 .../swr/rasterizer/jitter/builder_gfx_mem.h| 24 --
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  | 35 ++--
 .../drivers/swr/rasterizer/jitter/builder_mem.h| 23 +++--
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 99 --
 src/gallium/drivers/swr/swr_state.cpp  |  2 +-
 11 files changed, 220 insertions(+), 103 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index bdd785a..2636e60 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -162,7 +162,9 @@ def parse_ir_builder(input_file):
 if (func_name == 'CreateInsertNUWNSWBinOp' or
 func_name == 'CreateMaskedIntrinsic' or
 func_name == 'CreateAlignmentAssumptionHelper' or
-func_name == 'CreateLoad'):
+func_name == 'CreateGEP' or
+func_name == 'CreateLoad' or
+func_name == 'CreateMaskedLoad'):
 ignore = True
 
 # Convert CamelCase to CAMEL_CASE
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 53bd2d2..3141db6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1321,8 +1321,8 @@ void DrawIndexedInstance(
 }
 
 int draw = 0;
-uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
-pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
+gfxptr_t xpIB = pState->indexBuffer.xpIndices;
+xpIB += (uint64_t)indexOffset * (uint64_t)indexSize;
 
 pState->topology = topology;
 pState->forceFront = false;
@@ -1360,7 +1360,7 @@ void DrawIndexedInstance(
 pDC->pState->pfnProcessPrims != nullptr);
 pDC->FeWork.desc.draw.pDC = pDC;
 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
-pDC->FeWork.desc.draw.pIB = (int*)pIB;
+pDC->FeWork.desc.draw.xpIB = xpIB;
 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
 
 pDC->FeWork.desc.draw.numInstances = numInstances;
@@ -1376,7 +1376,7 @@ void DrawIndexedInstance(
 AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, 
numIndicesForDraw, indexOffset, baseVertex,
 numInstances, startInstance, pState->tsState.tsEnable, 
pState->gsState.gsEnable, pState->soState.soEnable, 
pState->gsState.outputTopology, draw));
 
-pIB += maxIndicesPerDraw * indexSize;
+xpIB += maxIndicesPerDraw * indexSize;
 remainingIndices -= numIndicesForDraw;
 draw++;
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h 
b/src/gallium/drivers/swr/rasterizer/core/context.h
index 489aa78..7bc69f5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -176,7 +176,7 @@ struct DRAW_WORK
 };
 union
 {
-const int32_t* pIB;// DrawIndexed: App supplied indices
+gfxptr_t   xpIB;  // DrawIndexed: App supplied int32 
indices 
 uint32_t   startVertex;// Draw: Starting vertex in VB to render 
from.
 };
 int32_tbaseVertex;
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 2076859..30c2e7b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1527,28 +1527,24 @@ void ProcessDraw(
 uint32_t indexSize = 0;
 uint32_t endVertex = work.numVerts;
 
-const int32_t* pLastRequestedIndex = nullptr;
+gfxptr_t xpLastRequestedIndex = 0;
 if (IsIndexedT::value)
 {
 switch (work.type)
 {
 case R32_UINT:
 indexSize = sizeof(uint32_t);
-pLastRequestedIndex = &(work.pIB[endVertex]);
 break;
 case R16_UINT:
 indexSize = sizeof(uint16_t);
-// nasty address offset to last index
-pLastRequestedIndex = 
(int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
 break;
 case R8_UINT:
 indexSize = sizeof(uint8_t);
-// nasty address offset to last index
-pLastRequestedIndex = 

[Mesa-dev] [PATCH 24/45] swr/rast: Add builder_gfx_mem.{h|cpp}

2018-04-13 Thread George Kyriazis
Needed to support full translation.  Builder_gfx_mem will convert gfxptr_t
from 64 bit int to regular pointer types for use by builder_mem.
---
 src/gallium/drivers/swr/Makefile.sources   |   2 +
 src/gallium/drivers/swr/meson.build|   2 +
 .../swr/rasterizer/jitter/builder_gfx_mem.cpp  | 136 +
 .../swr/rasterizer/jitter/builder_gfx_mem.h|  67 ++
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|   7 +-
 5 files changed, 210 insertions(+), 4 deletions(-)
 create mode 100644 
src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
 create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h

diff --git a/src/gallium/drivers/swr/Makefile.sources 
b/src/gallium/drivers/swr/Makefile.sources
index a7fcba8..dd815dc 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -142,6 +142,8 @@ JITTER_CXX_SOURCES := \
rasterizer/jitter/builder_math.h \
rasterizer/jitter/builder_mem.cpp \
rasterizer/jitter/builder_mem.h \
+   rasterizer/jitter/builder_gfx_mem.cpp \
+   rasterizer/jitter/builder_gfx_mem.h \
rasterizer/jitter/builder_misc.cpp \
rasterizer/jitter/builder_misc.h \
rasterizer/jitter/fetch_jit.cpp \
diff --git a/src/gallium/drivers/swr/meson.build 
b/src/gallium/drivers/swr/meson.build
index 949f582..1cb40f8 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -70,6 +70,8 @@ files_swr_mesa = files(
   'rasterizer/jitter/builder_math.h',
   'rasterizer/jitter/builder_mem.cpp',
   'rasterizer/jitter/builder_mem.h',
+  'rasterizer/jitter/builder_gfx_mem.cpp',
+  'rasterizer/jitter/builder_gfx_mem.h',
   'rasterizer/jitter/builder_misc.cpp',
   'rasterizer/jitter/builder_misc.h',
   'rasterizer/jitter/fetch_jit.cpp',
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
new file mode 100644
index 000..bfb3057
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
@@ -0,0 +1,136 @@
+/
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder_gfx_mem.cpp
+*
+* @brief Definition of the gfx mem builder
+*
+* Notes:
+*
+**/
+#include "jit_pch.hpp"
+#include "builder.h"
+#include "common/rdtsc_buckets.h"
+#include "builder_gfx_mem.h"
+
+
+namespace SwrJit
+{
+using namespace llvm;
+
+BuilderGfxMem::BuilderGfxMem(JitManager* pJitMgr) :
+Builder(pJitMgr)
+{
+mpfnTranslateGfxAddress = nullptr;
+mpParamSimDC = nullptr;
+}
+
+void BuilderGfxMem::NotifyPrivateContextSet()
+{
+}
+
+void BuilderGfxMem::AssertGFXMemoryParams(Value* ptr, 
Builder::JIT_MEM_CLIENT usage)
+{
+SWR_ASSERT(ptr->getType() == mInt64Ty, "GFX addresses must be gfxptr_t 
and not converted to system pointers.");
+SWR_ASSERT(usage != MEM_CLIENT_RASTY, "Rasty memory should not go 
through the translation path and should not be gfxptr_t.");
+}
+
+//
+/// @brief Generate a masked gather operation in LLVM IR.  If not  
+/// supported on the underlying platform, emulate it with loads
+/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
+/// @param pBase - Int8* base VB address pointer value
+/// @param vIndices - SIMD wide value of VB byte offsets
+/// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
+/// @param scale - value to scale indices by
+Value *BuilderGfxMem::GATHERPS(Value *vSrc, Value *pBase, 

[Mesa-dev] [PATCH 14/45] swr/rast: Add "Num Instructions Executed" stats intrinsic.

2018-04-13 Thread George Kyriazis
Added a SWR_SHADER_STATS structure which is passed to each shader. The
stats pass will instrument the shader to populate this.
---
 src/gallium/drivers/swr/rasterizer/core/state.h | 28 ++---
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index 22acbe0..47ffacf 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -214,6 +214,15 @@ struct SIMDVERTEX_T
 };
 
 //
+/// SWR_SHADER_STATS
+/// @brief Structure passed to shader for stats collection.
+/
+struct SWR_SHADER_STATS
+{
+uint32_t numInstExecuted; // This is roughly the API instructions executed 
and not x86.
+};
+
+//
 /// SWR_VS_CONTEXT
 /// @brief Input to vertex shader
 /
@@ -232,6 +241,7 @@ struct SWR_VS_CONTEXT
 simd16scalari VertexID16;   // IN: Vertex ID (16-wide)
 #endif
 #endif
+SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
 };
 
 /
@@ -281,6 +291,7 @@ struct SWR_HS_CONTEXT
 simdscalari mask;   // IN: Active mask for shader
 ScalarPatch* pCPout;// OUT: Output control point patch
 // SIMD-sized-array of SCALAR patches
+SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
 };
 
 //
@@ -298,6 +309,7 @@ struct SWR_DS_CONTEXT
 simdscalar* pDomainV;   // IN: (SIMD) Domain Point V coords
 simdscalari mask;   // IN: Active mask for shader
 simdscalar* pOutputData;// OUT: (SIMD) Vertex Attributes (2D array 
of vectors, one row per attribute-component)
+SWR_SHADER_STATS stats; // OUT: shader statistics used for 
archrast.
 };
 
 //
@@ -312,6 +324,7 @@ struct SWR_GS_CONTEXT
 uint32_t InstanceID;// IN: input instance ID
 simdscalari mask;   // IN: Active mask for shader
 uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains 
vertices for all output streams)
+SWR_SHADER_STATS stats; // OUT: shader statistics used for 
archrast.
 };
 
 struct PixelPositions
@@ -358,6 +371,8 @@ struct SWR_PS_CONTEXT
 uint32_t rasterizerSampleCount; // IN: sample count used by the 
rasterizer
 
 uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; // IN: Pointers to render 
target hottiles
+
+SWR_SHADER_STATS stats; // OUT: shader statistics used for 
archrast.
 };
 
 //
@@ -391,14 +406,13 @@ struct SWR_CS_CONTEXT
 // Dispatch dimensions used by shader to compute system values from the 
tile counter.
 uint32_t dispatchDims[3];
 
-uint8_t* pTGSM;  // Thread Group Shared Memory pointer.
-
-uint8_t* pSpillFillBuffer;  // Spill/fill buffer for barrier support
-
-uint8_t* pScratchSpace; // Pointer to scratch space buffer used by the 
shader, shader is responsible
-// for subdividing scratch space per 
instance/simd
-
+uint8_t* pTGSM;   // Thread Group Shared Memory pointer.
+uint8_t* pSpillFillBuffer;// Spill/fill buffer for barrier support
+uint8_t* pScratchSpace;   // Pointer to scratch space buffer used by 
the shader, shader is responsible
+  // for subdividing scratch space per 
instance/simd
 uint32_t scratchSpacePerSimd; // Scratch space per work item x SIMD_WIDTH
+
+SWR_SHADER_STATS stats;   // OUT: shader statistics used for archrast.
 };
 
 // enums
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 43/45] swr/rast: Optimize late/bindless JIT of samplers

2018-04-13 Thread George Kyriazis
Add per-worker thread private data to all shader calls
Add per-worker sampler cache and jit context
Add late LoadTexel JIT support
Add per-worker-thread Sampler / LoadTexel JIT
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  7 ++-
 src/gallium/drivers/swr/rasterizer/core/api.h  | 47 +++
 .../drivers/swr/rasterizer/core/backend.cpp|  9 +--
 src/gallium/drivers/swr/rasterizer/core/backend.h  |  4 +-
 .../drivers/swr/rasterizer/core/backend_clear.cpp  | 19 +++---
 .../drivers/swr/rasterizer/core/backend_impl.h |  7 ++-
 .../drivers/swr/rasterizer/core/backend_sample.cpp |  5 +-
 .../swr/rasterizer/core/backend_singlesample.cpp   |  6 +-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  2 +-
 src/gallium/drivers/swr/rasterizer/core/context.h  |  3 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   | 29 +
 .../drivers/swr/rasterizer/core/rasterizer.cpp |  4 +-
 .../drivers/swr/rasterizer/core/rasterizer_impl.h  | 15 ++---
 src/gallium/drivers/swr/rasterizer/core/state.h| 18 +++---
 .../drivers/swr/rasterizer/core/threads.cpp| 68 +-
 src/gallium/drivers/swr/rasterizer/core/threads.h  |  5 +-
 .../drivers/swr/rasterizer/core/tilemgr.cpp| 21 +++
 src/gallium/drivers/swr/rasterizer/core/tilemgr.h  |  4 +-
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   | 16 ++---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  8 ++-
 .../drivers/swr/rasterizer/memory/ClearTile.cpp|  1 +
 .../drivers/swr/rasterizer/memory/LoadTile.cpp |  1 +
 .../drivers/swr/rasterizer/memory/StoreTile.cpp|  1 +
 src/gallium/drivers/swr/swr_memory.h   |  9 ++-
 src/gallium/drivers/swr/swr_shader.cpp |  9 +++
 25 files changed, 213 insertions(+), 105 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 3141db6..e37e2e4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1,5 +1,5 @@
 /
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -122,6 +122,11 @@ HANDLE SwrCreateContext(
 pContext->apiThreadInfo.numAPIThreadsPerCore= 1;
 }
 
+if (pCreateInfo->pWorkerPrivateState)
+{
+pContext->workerPrivateState = *pCreateInfo->pWorkerPrivateState;
+}
+
 memset(>WaitLock, 0, sizeof(pContext->WaitLock));
 memset(>FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
 new (>WaitLock) std::mutex();
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h
index 7247fa4..b171188 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -1,5 +1,5 @@
 /
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -115,7 +115,8 @@ struct SWR_RECT
 /// @param x - destination x coordinate
 /// @param y - destination y coordinate
 /// @param pDstHotTile - pointer to the hot tile surface
-typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT 
dstFormat,
+typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, HANDLE 
hWorkerPrivateData,
+SWR_FORMAT dstFormat,
 SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t 
*pDstHotTile);
 
@@ -127,7 +128,8 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE 
hPrivateContext, SWR_FORMAT dstForma
 /// @param x - destination x coordinate
 /// @param y - destination y coordinate
 /// @param pSrcHotTile - pointer to the hot tile surface
-typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT 
srcFormat,
+typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, HANDLE 
hWorkerPrivateData,
+SWR_FORMAT srcFormat,
 SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t 
*pSrcHotTile);
 
@@ -139,7 +141,7 @@ typedef void(SWR_API *PFN_STORE_TILE)(HANDLE 
hPrivateContext, SWR_FORMAT srcForm
 /// @param y - destination y coordinate
 /// @param renderTargetArrayIndex - render target array offset from arrayIndex
 /// @param pClearColor - pointer to the hot tile's clear value
-typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext,
+typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, HANDLE 

[Mesa-dev] [PATCH 09/45] swr/rast: WIP builder rewrite.

2018-04-13 Thread George Kyriazis
Start removing avx2 macros for functionality that exists in llvm.
---
 src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py | 5 -
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h | 9 -
 2 files changed, 14 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 113c616..3e1fbfe 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -53,12 +53,7 @@ intrinsics = [
 ['VMINPS', 'x86_avx_min_ps_256', ['a', 'b']],
 ['VMAXPS', 'x86_avx_max_ps_256', ['a', 'b']],
 ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding']],
-['VCMPPS', 'x86_avx_cmp_ps_256', ['a', 'b', 'cmpop']],
-['VBLENDVPS', 'x86_avx_blendv_ps_256', ['a', 'b', 'mask']],
 ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control']],
-['VMASKLOADD', 'x86_avx2_maskload_d_256', ['src', 'mask']],
-['VMASKMOVPS', 'x86_avx_maskload_ps_256', ['src', 'mask']],
-['VMASKSTOREPS', 'x86_avx_maskstore_ps_256', ['src', 'mask', 'val']],
 ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b']],
 ['VPERMD', 'x86_avx2_permd', ['a', 'idx']],
 ['VPERMPS', 'x86_avx2_permps', ['idx', 'a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 9660bc6..549f328 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -96,15 +96,6 @@ CallInst *CALL(Value *Callee, Value* arg);
 CallInst *CALL2(Value *Callee, Value* arg1, Value* arg2);
 CallInst *CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3);
 
-Value *VCMPPS_EQ(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_EQ_OQ)); }
-Value *VCMPPS_LT(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_LT_OQ)); }
-Value *VCMPPS_LE(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_LE_OQ)); }
-Value *VCMPPS_ISNAN(Value* a, Value* b) { return VCMPPS(a, b, 
C((uint8_t)_CMP_UNORD_Q)); }
-Value *VCMPPS_NEQ(Value* a, Value* b)   { return VCMPPS(a, b, 
C((uint8_t)_CMP_NEQ_OQ)); }
-Value *VCMPPS_GE(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_GE_OQ)); }
-Value *VCMPPS_GT(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_GT_OQ)); }
-Value *VCMPPS_NOTNAN(Value* a, Value* b){ return VCMPPS(a, b, 
C((uint8_t)_CMP_ORD_Q)); }
-
 Value *MASK(Value *vmask);
 Value *MASK_16(Value *vmask);
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 32/45] swr/rast: Fix alloca usage in jitter

2018-04-13 Thread George Kyriazis
Fix issue where temporary allocas were getting hoisted to function entry
unnecessarily. We now explicitly mark temporary allocas and skip hoisting
during the hoist pass. Should reduce stack usage.
---
 src/gallium/drivers/swr/rasterizer/jitter/builder.cpp   | 17 +
 src/gallium/drivers/swr/rasterizer/jitter/builder.h |  2 ++
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp   |  1 +
 3 files changed, 20 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 53947c3..bd81560 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -111,4 +111,21 @@ namespace SwrJit
 mSimdVectorIntTy = ArrayType::get(mSimdInt32Ty, 4);
 mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
 }
+
+/// @brief Mark this alloca as temporary to avoid hoisting later on
+void Builder::SetTempAlloca(Value* inst)
+{
+AllocaInst* pAlloca = dyn_cast<AllocaInst>(inst);
+SWR_ASSERT(pAlloca, "Unexpected non-alloca instruction");
+MDNode* N = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, 
"is_temp_alloca"));
+pAlloca->setMetadata("is_temp_alloca", N);
+}
+
+bool Builder::IsTempAlloca(Value* inst)
+{
+AllocaInst* pAlloca = dyn_cast<AllocaInst>(inst);
+SWR_ASSERT(pAlloca, "Unexpected non-alloca instruction");
+
+return (pAlloca->getMetadata("is_temp_alloca") != nullptr);
+}
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 4c79bab..27a32bc 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -96,6 +96,8 @@ namespace SwrJit
 Type*mSimd32Int8Ty;
 
 void SetTargetWidth(uint32_t width);
+void SetTempAlloca(Value* inst);
+bool IsTempAlloca(Value* inst);
 
 #include "gen_builder.hpp"
 #include "gen_builder_meta.hpp"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index cd9806a..5d8637e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -229,6 +229,7 @@ namespace SwrJit
 
 // store vSrc on the stack.  this way we can select between a 
valid load address and the vSrc address
 Value* vSrcPtr = ALLOCA(vSrc->getType());
+SetTempAlloca(vSrcPtr);
 STORE(vSrc, vSrcPtr);
 
 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 30/45] swr/rast: Fix byte offset for non-indexed draws

2018-04-13 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 25d1073..2076859 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1729,13 +1729,14 @@ void ProcessDraw(
 uint32_t offset;
 offset = std::min(endVertex-i, (uint32_t) 
KNOB_SIMD16_WIDTH);
 #if USE_SIMD16_SHADERS
+offset *= 4; // convert from index to address
 fetchInfo_lo.pLastIndex += offset;
 #else
-fetchInfo_lo.pLastIndex += std::min(offset, (uint32_t) 
KNOB_SIMD_WIDTH);
+fetchInfo_lo.pLastIndex += std::min(offset, (uint32_t) 
KNOB_SIMD_WIDTH) * 4; // * 4 for converting index to address
 uint32_t offset2 = std::min(offset, (uint32_t) 
KNOB_SIMD16_WIDTH)-KNOB_SIMD_WIDTH;
 assert(offset >= 0);
 fetchInfo_hi.pLastIndex = fetchInfo_hi.pIndices;
-fetchInfo_hi.pLastIndex += offset2;
+fetchInfo_hi.pLastIndex += offset2 * 4; // * 4 for 
converting index to address
 #endif
 }
 // 1. Execute FS/VS for a single SIMD.
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 42/45] swr/rast: Implement VROUND intrinsic in x86 lowering pass

2018-04-13 Thread George Kyriazis
---
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 38 +-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index 983b227..7cfa772 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -72,7 +72,6 @@ namespace SwrJit
 // Map of intrinsics that haven't been moved to the new mechanism yet. If 
used, these get the previous behavior of
 // mapping directly to avx/avx2 intrinsics.
 static std::map intrinsicMap = {
-{"meta.intrinsic.VROUND",  Intrinsic::x86_avx_round_ps_256},
 {"meta.intrinsic.BEXTR_32",Intrinsic::x86_bmi_bextr_32},
 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
 {"meta.intrinsic.VCVTPS2PH",   Intrinsic::x86_vcvtps2ph_256},
@@ -90,6 +89,8 @@ namespace SwrJit
 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, 
CallInst* pCallInst);
 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
 Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
+Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
+
 Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst, Intrinsic::ID intrin);
 
 static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
@@ -105,6 +106,7 @@ namespace SwrJit
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256, 
  Intrinsic::not_intrinsic},  NO_EMU}},
+{"meta.intrinsic.VROUND",  {{Intrinsic::x86_avx_round_ps_256,  
  DOUBLE},NO_EMU}},
 },
 {   // AVX2
 {"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  DOUBLE},NO_EMU}},
@@ -115,6 +117,7 @@ namespace SwrJit
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
 {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256, 
  Intrinsic::not_intrinsic},  NO_EMU}},
+{"meta.intrinsic.VROUND",  {{Intrinsic::x86_avx_round_ps_256,  
  DOUBLE},NO_EMU}},
 },
 {   // AVX512
 {"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx512_rcp14_ps_256,   
  Intrinsic::x86_avx512_rcp14_ps_512},NO_EMU}},
@@ -125,6 +128,7 @@ namespace SwrJit
 {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
 {"meta.intrinsic.VCVTPD2PS",   
{{Intrinsic::x86_avx512_mask_cvtpd2ps_256,
Intrinsic::x86_avx512_mask_cvtpd2ps_512 },  NO_EMU}},
 {"meta.intrinsic.VCVTPH2PS",   
{{Intrinsic::x86_avx512_mask_vcvtph2ps_256,   
Intrinsic::x86_avx512_mask_vcvtph2ps_512 }, NO_EMU}},
+{"meta.intrinsic.VROUND",  {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic }, VROUND_EMU}},
 }
 };
 
@@ -499,6 +503,38 @@ namespace SwrJit
 return cast<Instruction>(v32Gather);
 }
 
+// No support for vroundps in avx512 (it is available in kncni), so 
emulate with avx instructions
+Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst)
+{
+SWR_ASSERT(arch == AVX512);
+
+auto B = pThis->B;
+auto vf32Src = pCallInst->getOperand(0);
+auto i8Round = pCallInst->getOperand(1);
+auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, 
Intrinsic::x86_avx_round_ps_256);
+
+if (width == W256)
+{
+return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
+}
+else if (width == W512)
+{
+auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
+auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
+
+auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
+auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
+
+return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
+}
+

[Mesa-dev] [PATCH 22/45] swr/rast: Cleanup of JitManager convenience types

2018-04-13 Thread George Kyriazis
Small cleanup. Remove convenience types from JitManager and standardize
on the Builder's convenience types.
---
 .../drivers/swr/rasterizer/jitter/JitManager.cpp | 19 ---
 .../drivers/swr/rasterizer/jitter/JitManager.h   | 20 
 .../drivers/swr/rasterizer/jitter/builder.cpp|  7 +++
 src/gallium/drivers/swr/rasterizer/jitter/builder.h  |  3 ++-
 4 files changed, 5 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index bfb1d2e..9080964 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -110,11 +110,6 @@ JitManager::JitManager(uint32_t simdWidth, const char 
*arch, const char* core)
 mpExec->RegisterJITEventListener(vTune);
 #endif
 
-mFP32Ty = Type::getFloatTy(mContext);   // float type
-mInt8Ty = Type::getInt8Ty(mContext);
-mInt32Ty = Type::getInt32Ty(mContext);   // int type
-mInt64Ty = Type::getInt64Ty(mContext);   // int type
-
 // fetch function signature
 #if USE_SIMD16_SHADERS
 // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, 
simd16vertex& out);
@@ -135,20 +130,6 @@ JitManager::JitManager(uint32_t simdWidth, const char 
*arch, const char* core)
 
 mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, 
false);
 
-mSimtFP32Ty = VectorType::get(mFP32Ty, mVWidth);
-mSimtInt32Ty = VectorType::get(mInt32Ty, mVWidth);
-
-mSimdVectorTy = ArrayType::get(mSimtFP32Ty, 4);
-mSimdVectorInt32Ty = ArrayType::get(mSimtInt32Ty, 4);
-
-#if USE_SIMD16_SHADERS
-mSimd16FP32Ty = ArrayType::get(mSimtFP32Ty, 2);
-mSimd16Int32Ty = ArrayType::get(mSimtInt32Ty, 2);
-
-mSimd16VectorFP32Ty = ArrayType::get(mSimd16FP32Ty, 4);
-mSimd16VectorInt32Ty = ArrayType::get(mSimd16Int32Ty, 4);
-
-#endif
 #if defined(_WIN32)
 // explicitly instantiate used symbols from potentially staticly linked 
libs
 sys::DynamicLibrary::AddSymbol("exp2f", &exp2f);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index 3660249..86e6758 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -143,26 +143,6 @@ struct JitManager
 uint32_tmVWidth;
 
 
-// Built in types.
-llvm::Type* mInt8Ty;
-llvm::Type* mInt32Ty;
-llvm::Type* mInt64Ty;
-llvm::Type* mFP32Ty;
-
-llvm::Type* mSimtFP32Ty;
-llvm::Type* mSimtInt32Ty;
-
-llvm::Type* mSimdVectorInt32Ty;
-llvm::Type* mSimdVectorTy;
-
-#if USE_SIMD16_SHADERS
-llvm::Type* mSimd16FP32Ty;
-llvm::Type* mSimd16Int32Ty;
-
-llvm::Type* mSimd16VectorFP32Ty;
-llvm::Type* mSimd16VectorInt32Ty;
-
-#endif
 // fetch shader types
 llvm::FunctionType* mFetchShaderTy;
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 260daab..625f132 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -42,10 +42,8 @@ namespace SwrJit
 : mpJitMgr(pJitMgr),
   mpPrivateContext(nullptr)
 {
-SWR_ASSERT(pJitMgr->mVWidth == 8);
-
 mVWidth = pJitMgr->mVWidth;
-mVWidth16 = pJitMgr->mVWidth * 2;
+mVWidth16 = 16;
 
 mpIRBuilder = &pJitMgr->mBuilder;
 
@@ -67,7 +65,7 @@ namespace SwrJit
 
 mSimd4FP64Ty = VectorType::get(mDoubleTy, 4);
 
-// Built in types: simd8
+// Built in types: target simd
 
 mSimdInt1Ty = VectorType::get(mInt1Ty,  mVWidth);
 mSimdInt16Ty= VectorType::get(mInt16Ty, mVWidth);
@@ -76,6 +74,7 @@ namespace SwrJit
 mSimdFP16Ty = VectorType::get(mFP16Ty,  mVWidth);
 mSimdFP32Ty = VectorType::get(mFP32Ty,  mVWidth);
 mSimdVectorTy   = ArrayType::get(mSimdFP32Ty, 4);
+mSimdVectorIntTy= ArrayType::get(mSimdInt32Ty, 4);
 mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
 
 // Built in types: simd16
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 0b57fbf..6b2c9f0 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -68,7 +68,7 @@ namespace SwrJit
 
 Type*mSimd4FP64Ty;
 
-// Built in types: simd8
+// Built in types: target SIMD
 
 Type*mSimdFP16Ty;
 Type*mSimdFP32Ty;
@@ -79,6 +79,7 @@ namespace SwrJit
 Type*mSimdIntPtrTy;
 Type*

[Mesa-dev] [PATCH 20/45] swr/rast: Start refactoring of builder/packetizer.

2018-04-13 Thread George Kyriazis
Move x86 intrinsic lowering to a separate pass. Builder now instantiates
generic intrinsics for features not supported by llvm. The separate x86
lowering pass is responsible for lowering to valid x86 for the target
SIMD architecture. Currently it's a port of existing code to get it
up and running quickly. Will eventually support optimized x86 for AVX,
AVX2 and AVX512.
---
 src/gallium/drivers/swr/Makefile.am|   6 +-
 src/gallium/drivers/swr/Makefile.sources   |   3 +-
 src/gallium/drivers/swr/SConscript |   4 +-
 src/gallium/drivers/swr/meson.build|   3 +-
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  58 +--
 .../drivers/swr/rasterizer/codegen/meson.build |   2 +-
 .../rasterizer/codegen/templates/gen_builder.hpp   |  11 +-
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp|   3 +
 .../drivers/swr/rasterizer/jitter/builder.cpp  |   4 +
 .../drivers/swr/rasterizer/jitter/builder.h|   6 +-
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  |   5 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|   3 +
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 455 +
 .../swr/rasterizer/jitter/functionpasses/passes.h  |  37 ++
 .../drivers/swr/rasterizer/jitter/meson.build  |   8 +-
 .../swr/rasterizer/jitter/streamout_jit.cpp|   3 +
 16 files changed, 565 insertions(+), 46 deletions(-)
 create mode 100644 
src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
 create mode 100644 
src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h

diff --git a/src/gallium/drivers/swr/Makefile.am 
b/src/gallium/drivers/swr/Makefile.am
index 32dd9e5..c22f09e 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -80,7 +80,7 @@ BUILT_SOURCES = \
rasterizer/codegen/gen_knobs.h \
rasterizer/jitter/gen_state_llvm.h \
rasterizer/jitter/gen_builder.hpp \
-   rasterizer/jitter/gen_builder_x86.hpp \
+   rasterizer/jitter/gen_builder_meta.hpp \
rasterizer/jitter/gen_builder_intrin.hpp \
rasterizer/archrast/gen_ar_event.hpp \
rasterizer/archrast/gen_ar_event.cpp \
@@ -134,12 +134,12 @@ rasterizer/jitter/gen_builder.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py rast
--output rasterizer/jitter \
--gen_h
 
-rasterizer/jitter/gen_builder_x86.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py 
rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
+rasterizer/jitter/gen_builder_meta.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py 
rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
$(PYTHON_GEN) \
$(srcdir)/rasterizer/codegen/gen_llvm_ir_macros.py \
--output rasterizer/jitter \
-   --gen_x86_h
+   --gen_meta_h
 
 rasterizer/jitter/gen_builder_intrin.hpp: 
rasterizer/codegen/gen_llvm_ir_macros.py 
rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
diff --git a/src/gallium/drivers/swr/Makefile.sources 
b/src/gallium/drivers/swr/Makefile.sources
index 4924da1..a7fcba8 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -152,7 +152,8 @@ JITTER_CXX_SOURCES := \
rasterizer/jitter/JitManager.h \
rasterizer/jitter/streamout_jit.cpp \
rasterizer/jitter/streamout_jit.h \
-   rasterizer/jitter/shader_lib/DebugOutput.cpp
+   rasterizer/jitter/shader_lib/DebugOutput.cpp \
+   rasterizer/jitter/functionpasses/lower_x86.cpp
 
 MEMORY_CXX_SOURCES := \
rasterizer/memory/ClearTile.cpp \
diff --git a/src/gallium/drivers/swr/SConscript 
b/src/gallium/drivers/swr/SConscript
index 5097be6..528cfac 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -76,10 +76,10 @@ Depends('rasterizer/jitter/gen_builder.hpp',
 swrroot + 'rasterizer/codegen/templates/gen_builder.hpp')
 
 env.CodeGenerate(
-target = 'rasterizer/jitter/gen_builder_x86.hpp',
+target = 'rasterizer/jitter/gen_builder_meta.hpp',
 script = swrroot + 'rasterizer/codegen/gen_llvm_ir_macros.py',
 source = '',
-command = python_cmd + ' $SCRIPT --output ' + bldroot + 
'/rasterizer/jitter --gen_x86_h'
+command = python_cmd + ' $SCRIPT --output ' + bldroot + 
'/rasterizer/jitter --gen_meta_h'
 )
 Depends('rasterizer/jitter/gen_builder.hpp',
 swrroot + 'rasterizer/codegen/templates/gen_builder.hpp')
diff --git a/src/gallium/drivers/swr/meson.build 
b/src/gallium/drivers/swr/meson.build
index 3848232..949f582 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -80,6 +80,7 @@ files_swr_mesa = files(
   'rasterizer/jitter/streamout_jit.cpp',
   'rasterizer/jitter/streamout_jit.h',
   'rasterizer/jitter/shader_lib/DebugOutput.cpp',
+  

[Mesa-dev] [PATCH 34/45] swr/rast: fix comment

2018-04-13 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 8d659d0..cdfddf3 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -970,7 +970,7 @@ extern "C" void GetSimdValid16bitIndicesGfx(gfxptr_t 
indices, gfxptr_t lastIndex
 
 //
 /// @brief Loads a simd of valid indices. OOB indices are set to 0
-/// *Note* have to do 16bit index checking in scalar until we have AVX-512
+/// *Note* have to do 8bit index checking in scalar until we have AVX-512
 /// support
 /// @param pIndices - pointer to 8 bit indices
 /// @param pLastIndex - pointer to last valid index
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 40/45] swr/rast: minimize codegen redundant work

2018-04-13 Thread George Kyriazis
Move filtering of redundant codegen operations into gen scripts themselves
---
 .../drivers/swr/rasterizer/codegen/gen_archrast.py | 111 +
 .../drivers/swr/rasterizer/codegen/gen_backends.py |  97 +--
 .../drivers/swr/rasterizer/codegen/gen_common.py   | 131 +++--
 .../drivers/swr/rasterizer/codegen/gen_knobs.py|  53 ++---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  42 +--
 .../swr/rasterizer/codegen/gen_llvm_types.py   |  29 -
 6 files changed, 335 insertions(+), 128 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py
index aa09f22..c5842aa 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_archrast.py
@@ -24,7 +24,7 @@ from __future__ import print_function
 import os
 import sys
 import re
-from gen_common import ArgumentParser, MakoTemplateWriter
+from gen_common import *
 
 def parse_event_fields(lines, idx, event_dict):
 field_names = []
@@ -144,6 +144,10 @@ def main():
 print('Error: Could not find private proto file %s' % 
proto_private_filename, file=sys.stderr)
 return 1
 
+final_output_dir = output_dir
+MakeDir(final_output_dir)
+output_dir = MakeTmpDir('_codegen')
+
 protos = {}
 protos['events'] = {}   # event dictionary containing events with 
their fields
 protos['event_names'] = []  # needed to keep events in order parsed. dict 
is not ordered.
@@ -153,53 +157,64 @@ def main():
 parse_protos(protos, proto_filename)
 parse_protos(protos, proto_private_filename)
 
-# Generate event header
-if args.gen_event_hpp:
-curdir = os.path.dirname(os.path.abspath(__file__))
-template_file = os.sep.join([curdir, 'templates', 'gen_ar_event.hpp'])
-output_fullpath = os.sep.join([output_dir, output_filename])
-
-MakoTemplateWriter.to_file(template_file, output_fullpath,
-cmdline=sys.argv,
-filename=output_filename,
-protos=protos)
-
-# Generate event implementation
-if args.gen_event_cpp:
-curdir = os.path.dirname(os.path.abspath(__file__))
-template_file = os.sep.join([curdir, 'templates', 'gen_ar_event.cpp'])
-output_fullpath = os.sep.join([output_dir, output_filename])
-
-MakoTemplateWriter.to_file(template_file, output_fullpath,
-cmdline=sys.argv,
-filename=output_filename,
-protos=protos)
-
-# Generate event handler header
-if args.gen_eventhandler_hpp:
-curdir = os.path.dirname(os.path.abspath(__file__))
-template_file = os.sep.join([curdir, 'templates', 
'gen_ar_eventhandler.hpp'])
-output_fullpath = os.sep.join([output_dir, output_filename])
-
-MakoTemplateWriter.to_file(template_file, output_fullpath,
-cmdline=sys.argv,
-filename=output_filename,
-event_header='gen_ar_event.hpp',
-protos=protos)
-
-# Generate event handler header
-if args.gen_eventhandlerfile_hpp:
-curdir = os.path.dirname(os.path.abspath(__file__))
-template_file = os.sep.join([curdir, 'templates', 
'gen_ar_eventhandlerfile.hpp'])
-output_fullpath = os.sep.join([output_dir, output_filename])
-
-MakoTemplateWriter.to_file(template_file, output_fullpath,
-cmdline=sys.argv,
-filename=output_filename,
-event_header='gen_ar_eventhandler.hpp',
-protos=protos)
-
-return 0
+rval = 0
+
+try:
+# Generate event header
+if args.gen_event_hpp:
+curdir = os.path.dirname(os.path.abspath(__file__))
+template_file = os.sep.join([curdir, 'templates', 
'gen_ar_event.hpp'])
+output_fullpath = os.sep.join([output_dir, output_filename])
+
+MakoTemplateWriter.to_file(template_file, output_fullpath,
+cmdline=sys.argv,
+filename=output_filename,
+protos=protos)
+
+# Generate event implementation
+if args.gen_event_cpp:
+curdir = os.path.dirname(os.path.abspath(__file__))
+template_file = os.sep.join([curdir, 'templates', 
'gen_ar_event.cpp'])
+output_fullpath = os.sep.join([output_dir, output_filename])
+
+MakoTemplateWriter.to_file(template_file, output_fullpath,
+cmdline=sys.argv,
+filename=output_filename,
+protos=protos)
+
+# Generate event handler header
+if args.gen_eventhandler_hpp:
+curdir = os.path.dirname(os.path.abspath(__file__))
+template_file = os.sep.join([curdir, 'templates', 
'gen_ar_eventhandler.hpp'])
+output_fullpath = os.sep.join([output_dir, 

[Mesa-dev] [PATCH 36/45] swr/rast: Type-check TemplateArgUnroller

2018-04-13 Thread George Kyriazis
Allows direct use of enum values in conversion to template args.
---
 src/gallium/drivers/swr/rasterizer/core/utils.h | 39 +
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h 
b/src/gallium/drivers/swr/rasterizer/core/utils.h
index c926f6a..d6cbf24 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -268,12 +268,15 @@ public:
 };
 
 // Ranged integer argument for TemplateArgUnroller
-template 
-struct IntArg
+template 
+struct RangedArg
 {
-uint32_t val;
+T val;
 };
 
+template 
+using IntArg = RangedArg;
+
 // Recursive template used to auto-nest conditionals.  Converts dynamic 
boolean function
 // arguments to static template arguments.
 template 
@@ -307,49 +310,49 @@ struct TemplateArgUnroller
 }
 
 //-
-// Integer value (within specified range)
+// Ranged value (within specified range)
 //-
 
 // Last Arg Terminator
-template 
-static typename TermT::FuncType GetFunc(IntArg iArg)
+template 
+static typename TermT::FuncType GetFunc(RangedArg iArg)
 {
 if (iArg.val == TMax)
 {
-return TermT::template GetFunc>();
+return TermT::template GetFunc>();
 }
 if (TMax > TMin)
 {
-return TemplateArgUnroller::GetFunc(IntArg{iArg.val});
+return TemplateArgUnroller::GetFunc(RangedArg{iArg.val});
 }
 SWR_ASSUME(false); return nullptr;
 }
-template 
-static typename TermT::FuncType GetFunc(IntArg iArg)
+template 
+static typename TermT::FuncType GetFunc(RangedArg iArg)
 {
 SWR_ASSERT(iArg.val == TVal);
-return TermT::template GetFunc>();
+return TermT::template GetFunc>();
 }
 
 // Recursively parse args
-template 
-static typename TermT::FuncType GetFunc(IntArg iArg, TArgsT... 
remainingArgs)
+template 
+static typename TermT::FuncType GetFunc(RangedArg iArg, 
TArgsT... remainingArgs)
 {
 if (iArg.val == TMax)
 {
-return TemplateArgUnroller>::GetFunc(remainingArgs...);
+return TemplateArgUnroller>::GetFunc(remainingArgs...);
 }
 if (TMax > TMin)
 {
-return TemplateArgUnroller::GetFunc(IntArg{iArg.val}, remainingArgs...);
+return TemplateArgUnroller::GetFunc(RangedArg{iArg.val}, remainingArgs...);
 }
 SWR_ASSUME(false); return nullptr;
 }
-template 
-static typename TermT::FuncType GetFunc(IntArg iArg, TArgsT... 
remainingArgs)
+template 
+static typename TermT::FuncType GetFunc(RangedArg iArg, 
TArgsT... remainingArgs)
 {
 SWR_ASSERT(iArg.val == TVal);
-return TemplateArgUnroller>::GetFunc(remainingArgs...);
+return TemplateArgUnroller>::GetFunc(remainingArgs...);
 }
 };
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 16/45] swr/rast: Add some archrast counters

2018-04-13 Thread George Kyriazis
Hook up archrast counters for shader stats: instructions executed.
---
 .../drivers/swr/rasterizer/archrast/archrast.cpp   |  4 +--
 .../drivers/swr/rasterizer/archrast/events.proto   | 30 ++
 .../drivers/swr/rasterizer/core/backend.cpp|  1 +
 .../drivers/swr/rasterizer/core/backend_impl.h |  4 +++
 .../drivers/swr/rasterizer/core/backend_sample.cpp |  5 +++-
 .../swr/rasterizer/core/backend_singlesample.cpp   |  5 +++-
 .../drivers/swr/rasterizer/core/frontend.cpp   |  8 ++
 7 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp 
b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
index 12dfc0e..2184673 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
+++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
@@ -61,7 +61,7 @@ namespace ArchRast
 //@todo:: Change this to numPatches. Assumed: 1 patch per prim. If 
holds, its fine.
 };
 
-struct GSStats
+struct GSInfo
 {
 uint32_t inputPrimCount;
 uint32_t primGeneratedCount;
@@ -369,7 +369,7 @@ namespace ArchRast
 DepthStencilStats mDSOmZ = {};
 CStats mClipper = {};
 TEStats mTS = {};
-GSStats mGS = {};
+GSInfo mGS = {};
 RastStats rastStats = {};
 CullStats mCullStats = {};
 AlphaStats mAlphaStats = {};
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/events.proto 
b/src/gallium/drivers/swr/rasterizer/archrast/events.proto
index deb0373..f924b57 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/events.proto
+++ b/src/gallium/drivers/swr/rasterizer/archrast/events.proto
@@ -115,6 +115,36 @@ event FrontendStatsEvent
 uint64_t SoNumPrimsWritten3;
 };
 
+event VSStats
+{
+uint32_t numInstExecuted;
+};
+
+event HSStats
+{
+uint32_t numInstExecuted;
+};
+
+event DSStats
+{
+uint32_t numInstExecuted;
+};
+
+event GSStats
+{
+uint32_t numInstExecuted;
+};
+
+event PSStats
+{
+uint32_t numInstExecuted;
+};
+
+event CSStats
+{
+uint32_t numInstExecuted;
+};
+
 event BackendStatsEvent
 {
 uint32_t drawId;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index ccc7150..1e0769a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -81,6 +81,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, 
uint32_t threadGroup
 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
 
 UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
+AR_EVENT(CSStats(csContext.stats.numInstExecuted));
 
 RDTSC_END(BEDispatch, 1);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h 
b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
index dd349a1..20b2ec5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -968,6 +968,10 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t 
workerId, uint32_t x, uint32_t
 UPDATE_STAT_BE(PsInvocations, 
_mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
 RDTSC_END(BEPixelShader, 0);
 
+// update stats
+UPDATE_STAT_BE(PsInvocations, 
_mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+AR_EVENT(PSStats(psContext.stats.numInstExecuted));
+
 // update active lanes to remove any discarded or oMask'd pixels
 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, 
_simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
index 4982025..c7c6c533 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
@@ -163,10 +163,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t 
workerId, uint32_t x, uint32_
 
 // execute pixel shader
 RDTSC_BEGIN(BEPixelShader, pDC->drawId);
-UPDATE_STAT_BE(PsInvocations, 
_mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
 state.psState.pfnPixelShader(GetPrivateState(pDC), 
&psContext);
 RDTSC_END(BEPixelShader, 0);
 
+// update stats
+UPDATE_STAT_BE(PsInvocations, 
_mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
+AR_EVENT(PSStats(psContext.stats.numInstExecuted));
+
 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
 // late-Z
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
index 452fba1..26d5a75 100644
--- 

[Mesa-dev] [PATCH 25/45] swr/rast: Enable generalized fetch jit

2018-04-13 Thread George Kyriazis
Enable generalized fetch jit with 8 or 16 wide SIMD target. Still some
work needed to remove some simd8 double pumping for 16-wide target.

Also removed unused non-gather load vertices path.
---
 .../drivers/swr/rasterizer/jitter/builder.cpp  |   26 +-
 .../drivers/swr/rasterizer/jitter/builder.h|4 +-
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  |   69 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 1197 +++-
 .../drivers/swr/rasterizer/jitter/fetch_jit.h  |6 +-
 5 files changed, 169 insertions(+), 1133 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 625f132..53947c3 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -66,16 +66,7 @@ namespace SwrJit
 mSimd4FP64Ty = VectorType::get(mDoubleTy, 4);
 
 // Built in types: target simd
-
-mSimdInt1Ty = VectorType::get(mInt1Ty,  mVWidth);
-mSimdInt16Ty= VectorType::get(mInt16Ty, mVWidth);
-mSimdInt32Ty= VectorType::get(mInt32Ty, mVWidth);
-mSimdInt64Ty= VectorType::get(mInt64Ty, mVWidth);
-mSimdFP16Ty = VectorType::get(mFP16Ty,  mVWidth);
-mSimdFP32Ty = VectorType::get(mFP32Ty,  mVWidth);
-mSimdVectorTy   = ArrayType::get(mSimdFP32Ty, 4);
-mSimdVectorIntTy= ArrayType::get(mSimdInt32Ty, 4);
-mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
+SetTargetWidth(pJitMgr->mVWidth);
 
 // Built in types: simd16
 
@@ -105,4 +96,19 @@ namespace SwrJit
 mSimd16IntPtrTy = mSimd16Int64Ty;
 }
 }
+
+void Builder::SetTargetWidth(uint32_t width)
+{
+mVWidth = width;
+
+mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth);
+mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
+mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
+mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
+mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
+mSimdVectorIntTy = ArrayType::get(mSimdInt32Ty, 4);
+mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
+}
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 6b2c9f0..4c79bab 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -46,7 +46,7 @@ namespace SwrJit
 JitManager *mpJitMgr;
 IRBuilder<> *mpIRBuilder;
 
-uint32_t mVWidth;   // vector width simd8
+uint32_t mVWidth;   // vector width target simd
 uint32_t mVWidth16; // vector width simd16
 
 // Built in types: scalar
@@ -95,6 +95,8 @@ namespace SwrJit
 
 Type*mSimd32Int8Ty;
 
+void SetTargetWidth(uint32_t width);
+
 #include "gen_builder.hpp"
 #include "gen_builder_meta.hpp"
 #include "gen_builder_intrin.hpp"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index 0550493..4840fef 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -38,6 +38,7 @@ namespace SwrJit
 {
 void Builder::AssertRastyMemoryParams(Value* ptr, JIT_MEM_CLIENT usage)
 {
+SWR_ASSERT(ptr->getType() != mInt64Ty, "Address appears to be GFX access.  Requires translation through BuilderGfxMem."); // an i64-typed ptr is rejected here as an untranslated GFX address
 }
 
 Value *Builder::GEP(Value* ptr, const std::initializer_list 
)
@@ -175,78 +176,14 @@ namespace SwrJit
 {
 AssertRastyMemoryParams(pBase, usage);
 
-Value* vGather;
-
-// use avx2 gather instruction if available
-if (JM()->mArch.AVX2())
-{
-vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
-}
-else
-{
-Value* pStack = STACKSAVE();
-
-// store vSrc on the stack.  this way we can select between a 
valid load address and the vSrc address
-Value* vSrcPtr = ALLOCA(vSrc->getType());
-STORE(vSrc, vSrcPtr);
-
-vGather = VUNDEF_I();
-Value *vScaleVec = VIMMED1((uint32_t)scale);
-Value *vOffsets = MUL(vIndices, vScaleVec);
-for (uint32_t i = 0; i < mVWidth; ++i)
-{
-// single component byte index
-Value *offset = VEXTRACT(vOffsets, C(i));
-// byte pointer to component
-Value *loadAddress = GEP(pBase, offset);
-loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 
0));
-// pointer to the value to load if we're masking off a 

[Mesa-dev] [PATCH 44/45] swr/rast: Replace x86 VMOVMSK with llvm-only implementation

2018-04-13 Thread George Kyriazis
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  1 -
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  |  2 +-
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 25 --
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  2 ++
 .../rasterizer/jitter/functionpasses/lower_x86.cpp |  1 -
 5 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 9c1e9e0..bced657 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -58,7 +58,6 @@ intrinsics = [
 ['VPTESTC', ['a', 'b'], 'mInt32Ty'],
 ['VPTESTZ', ['a', 'b'], 'mInt32Ty'],
 ['VFMADDPS',['a', 'b', 'c'], 'a'],
-['VMOVMSKPS',   ['a'], 'mInt32Ty'],
 ['VPHADDD', ['a', 'b'], 'a'],
 ['PDEP32',  ['a', 'b'], 'a'],
 ['RDTSC',   [], 'mInt64Ty'],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index 3c3c157..f9f3e92 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -608,7 +608,7 @@ namespace SwrJit
 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, 
PointerType::get(mInt32Ty, 0));
 
-Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
+Value* pMask = VMOVMSK(vMask);
 
 // Setup loop basic block
 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, 
"Scatter_Loop", pFunc);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index aa9e2dd..f893693 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -525,6 +525,28 @@ namespace SwrJit
 return S_EXT(mask, mSimd16Int32Ty);
 }
 
+/// @brief Convert an LLVM <N x i1> lane mask (N = 8 or 16) to an i32 bitmask, widened via Z_EXT
+Value *Builder::VMOVMSK(Value* mask)
+{
+SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty); // input must be a vector of i1
+uint32_t numLanes = mask->getType()->getVectorNumElements();
+Value* i32Result; // mask bits at their packed width (i8 or i16); extended to i32 on return
+if (numLanes == 8)
+{
+i32Result = BITCAST(mask, mInt8Ty); // <8 x i1> -> i8
+}
+else if (numLanes == 16)
+{
+i32Result = BITCAST(mask, mInt16Ty); // <16 x i1> -> i16
+}
+else
+{
+SWR_ASSERT(false, "Unsupported vector width"); // condition must be false: a bare string literal is always true and never fires
+i32Result = BITCAST(mask, mInt8Ty); // keeps i32Result assigned on this path
+}
+return Z_EXT(i32Result, mInt32Ty);
+}
+
 //
 /// @brief Generate a VPSHUFB operation in LLVM IR.  If not  
 /// supported on the underlying platform, emulate it
@@ -768,8 +790,7 @@ namespace SwrJit
 /// @brief pop count on vector mask (e.g. <8 x i1>)
 Value* Builder::VPOPCNT(Value* a)
 {
-Value* b = BITCAST(VMASK(a), mSimdFP32Ty);
-return POPCNT(VMOVMSKPS(b));
+return POPCNT(VMOVMSK(a));
 }
 
 //
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 7308821..bd4be9f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -102,6 +102,8 @@ Value *MASK_16(Value *vmask);
 Value *VMASK(Value *mask);
 Value *VMASK_16(Value *mask);
 
+Value *VMOVMSK(Value *mask);
+
 //
 /// @brief functions that build IR to call x86 intrinsics directly, or
 /// emulate them with other instructions if not available on the host
diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index 7cfa772..856d67d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -79,7 +79,6 @@ namespace SwrJit
 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
 {"meta.intrinsic.VFMADDPS",Intrinsic::x86_fma_vfmadd_ps_256},
-{"meta.intrinsic.VMOVMSKPS",   Intrinsic::x86_avx_movmsk_ps_256},
 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
 {"meta.intrinsic.PDEP32",  Intrinsic::x86_bmi_pdep_32},
 {"meta.intrinsic.RDTSC",   Intrinsic::x86_rdtsc},
-- 
2.7.4

___
mesa-dev mailing list

[Mesa-dev] [PATCH 07/45] swr/rast: Changes to allow jitter to compile with LLVM5

2018-04-13 Thread George Kyriazis
---
 src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp 
b/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
index 031bced..b1d6076 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
@@ -1,5 +1,5 @@
 /
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2017-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -112,6 +112,22 @@ using PassManager = llvm::legacy::PassManager;
 #include "llvm/ExecutionEngine/JITEventListener.h"
 #endif
 
+#if LLVM_VERSION_MAJOR >= 5 // LLVM 5 and newer: SyncScope / AttributeList spellings
+static const auto Sync_CrossThread = llvm::SyncScope::System;
+static const auto Attrib_FunctionIndex = llvm::AttributeList::FunctionIndex;
+static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, const llvm::AttrBuilder &b) // builds an AttributeSet from b
+{
+return llvm::AttributeSet::get(ctx, b);
+}
+#else // older LLVM: same three names, defined with the pre-5 API
+static const auto Sync_CrossThread = llvm::SynchronizationScope::CrossThread;
+static const auto Attrib_FunctionIndex = llvm::AttributeSet::FunctionIndex;
+static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, const llvm::AttrBuilder &b) // same call shape; the function index is passed explicitly
+{
+return llvm::AttributeSet::get(ctx, Attrib_FunctionIndex, b);
+}
+#endif // LLVM_VERSION_MAJOR >= 5
+
 #pragma pop_macro("DEBUG")
 
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 08/45] swr/rast: LLVM 6 fix

2018-04-13 Thread George Kyriazis
for getting masked gather intrinsic (also compatible with LLVM 4)
---
 src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index ac01223..7c223d1 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -390,7 +390,7 @@ namespace SwrJit
 /// @param pVecPassthru - SIMD wide vector of values to load when lane is 
inactive
 Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* 
pVecPassthru)
 {
-Function* pMaskedGather = 
llvm::Intrinsic::getDeclaration(JM()->mpCurrentModule, 
Intrinsic::masked_gather, { pVecPassthru->getType() });
+Function* pMaskedGather = 
llvm::Intrinsic::getDeclaration(JM()->mpCurrentModule, 
Intrinsic::masked_gather, { pVecPassthru->getType(), pVecSrcPtr->getType() });
 
 return CALL(pMaskedGather, { pVecSrcPtr, C(0), pVecMask, pVecPassthru 
});
 }
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 39/45] swr/rast: double-pump in x86 lowering pass

2018-04-13 Thread George Kyriazis
Add support for double-pumping a smaller SIMD width intrinsic.
---
 .../rasterizer/jitter/functionpasses/lower_x86.cpp | 30 ++
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index 9423b28..983b227 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -90,11 +90,14 @@ namespace SwrJit
 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, 
CallInst* pCallInst);
 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
 Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst);
+Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst, Intrinsic::ID intrin);
+
+static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
 
 static std::map intrinsicMap2[] = {
 //  256 wide   
 512 wide
 {   // AVX
-{"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
+{"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  DOUBLE},NO_EMU}},
 {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VPERM_EMU}},
 {"meta.intrinsic.VPERMD",  {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VPERM_EMU}},
 {"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
@@ -104,7 +107,7 @@ namespace SwrJit
 {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256, 
  Intrinsic::not_intrinsic},  NO_EMU}},
 },
 {   // AVX2
-{"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  Intrinsic::not_intrinsic},  NO_EMU}},
+{"meta.intrinsic.VRCPPS",  {{Intrinsic::x86_avx_rcp_ps_256,
  DOUBLE},NO_EMU}},
 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps,   
  Intrinsic::not_intrinsic},  VPERM_EMU}},
 {"meta.intrinsic.VPERMD",  {{Intrinsic::x86_avx2_permd,
  Intrinsic::not_intrinsic},  VPERM_EMU}},
 {"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic, 
  Intrinsic::not_intrinsic},  VGATHER_EMU}},
@@ -226,7 +229,15 @@ namespace SwrJit
 
 // Check if there is a native intrinsic for this instruction
 Intrinsic::ID id = intrinsic.intrin[vecWidth];
-if (id != Intrinsic::not_intrinsic)
+if (id == DOUBLE)
+{
+// Double pump the next smaller SIMD intrinsic
+SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD 
width.");
+Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
+SWR_ASSERT(id2 != Intrinsic::not_intrinsic, "Cannot find 
intrinsic to double pump.");
+return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
+}
+else if (id != Intrinsic::not_intrinsic)
 {
 Function* pIntrin = 
Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
 SmallVector args;
@@ -488,28 +499,25 @@ namespace SwrJit
 return cast(v32Gather);
 }
 
-#if 0
 // Double pump input using Intrin template arg. This blindly extracts 
lower and upper 256 from each vector argument and
 // calls the 256 wide intrinsic, then merges the results to 512 wide
-template
-Value* EMU_512(LowerX86* pThis, TargetArch arch, TargetWidth width, 
CallInst* pCallInst)
+Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth 
width, CallInst* pCallInst, Intrinsic::ID intrin)
 {
 auto B = pThis->B;
 SWR_ASSERT(width == W512);
 Value* result[2];
-Function* pX86IntrinFunc = 
Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrin);
+Function* pX86IntrinFunc = 
Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
 for (uint32_t i = 0; i < 2; ++i)
 {
 SmallVector args;
 for (auto& arg : pCallInst->arg_operands())
 {
-args.push_back(arg.get()->getType()->isVectorTy ? 
B->EXTRACT_16(arg.get(), i) : arg.get());
+args.push_back(arg.get()->getType()->isVectorTy() ? 

  1   2   3   4   >