Mesa (master): swr/rast: Renamed MetaData calls

2018-05-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 98d0201577ba21223e6d9a54b1240fe49524d486
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=98d0201577ba21223e6d9a54b1240fe49524d486

Author: Alok Hota 
Date:   Fri May 25 10:19:46 2018 -0500

swr/rast: Renamed MetaData calls

Reviewed-by: Bruce Cherniak 

---

 .../drivers/swr/rasterizer/jitter/builder.cpp  | 170 ++---
 .../drivers/swr/rasterizer/jitter/builder.h|   4 +-
 2 files changed, 87 insertions(+), 87 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index e1c5d80c80..4b06aaa3ab 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -1,32 +1,32 @@
 /
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file builder.h
-* 
-* @brief Includes all the builder related functionality
-* 
-* Notes:
-* 
-**/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder.h
+ *
+ * @brief Includes all the builder related functionality
+ *
+ * Notes:
+ *
+ 
**/
 
 #include "jit_pch.hpp"
 #include "builder.h"
@@ -38,11 +38,9 @@ namespace SwrJit
 //
 /// @brief Contructor for Builder.
 /// @param pJitMgr - JitManager which contains modules, function passes, 
etc.
-Builder::Builder(JitManager *pJitMgr)
-: mpJitMgr(pJitMgr),
-  mpPrivateContext(nullptr)
+Builder::Builder(JitManager *pJitMgr) : mpJitMgr(pJitMgr), 
mpPrivateContext(nullptr)
 {
-mVWidth = pJitMgr->mVWidth;
+mVWidth   = pJitMgr->mVWidth;
 mVWidth16 = 16;
 
 mpIRBuilder = >mBuilder;
@@ -70,29 +68,29 @@ namespace SwrJit
 
 // Built in types: simd16
 
-mSimd16Int1Ty   = VectorType::get(mInt1Ty,  mVWidth16);
-mSimd16Int16Ty  = VectorType::get(mInt16Ty, mVWidth16);
-mSimd16Int32Ty  = VectorType::get(mInt32Ty, mVWidth16);
-mSimd16Int64Ty  = VectorType::get(mInt64Ty, mVWidth16);
-mSimd16FP16Ty   = VectorType::get(mFP16Ty,  mVWidth16);
-mSimd16FP32Ty   = VectorType::get(mFP32Ty,  mVWidth16);
-mSimd16VectorTy = ArrayType::get(mSimd16FP32Ty, 4);
-mSimd16VectorTRTy   = ArrayType::get(mSimd16FP32Ty, 5);
+mSimd16Int1Ty = VectorType::get(mInt1Ty, mVWidth16);
+   

Mesa (master): swr/rast: Removed superfluous JitManager argument from passes

2018-05-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: b6b114c1aeaa996a4bf8c1fd409e8141d18b120c
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=b6b114c1aeaa996a4bf8c1fd409e8141d18b120c

Author: Alok Hota 
Date:   Fri May 25 10:19:47 2018 -0500

swr/rast: Removed superfluous JitManager argument from passes

Reviewed-by: Bruce Cherniak 

---

 src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp |  2 +-
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp |  2 +-
 .../swr/rasterizer/jitter/functionpasses/lower_x86.cpp  | 17 -
 .../swr/rasterizer/jitter/functionpasses/passes.h   |  2 +-
 .../drivers/swr/rasterizer/jitter/streamout_jit.cpp |  2 +-
 src/gallium/drivers/swr/swr_shader.cpp  |  2 +-
 6 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 72bf900c85..20f2e42eec 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -819,7 +819,7 @@ struct BlendJit : public Builder
 passes.add(createSCCPPass());
 passes.add(createAggressiveDCEPass());
 
-passes.add(createLowerX86Pass(JM(), this));
+passes.add(createLowerX86Pass(this));
 
 passes.run(*blendFunc);
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 7b0b80a713..0abcd1a8d7 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -269,7 +269,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
 
 optPasses.run(*fetch);
 
-optPasses.add(createLowerX86Pass(JM(), this));
+optPasses.add(createLowerX86Pass(this));
 optPasses.run(*fetch);
 
 JitManager::DumpToFile(fetch, "opt");
diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index 5a69eaef26..f2bd8889fc 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -136,21 +136,21 @@ namespace SwrJit
 
 struct LowerX86 : public FunctionPass
 {
-LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
-: FunctionPass(ID), mpJitMgr(pJitMgr), B(b)
+LowerX86(Builder* b = nullptr)
+: FunctionPass(ID), B(b)
 {
 initializeLowerX86Pass(*PassRegistry::getPassRegistry());
 
 // Determine target arch
-if (mpJitMgr->mArch.AVX512F())
+if (JM()->mArch.AVX512F())
 {
 mTarget = AVX512;
 }
-else if (mpJitMgr->mArch.AVX2())
+else if (JM()->mArch.AVX2())
 {
 mTarget = AVX2;
 }
-else if (mpJitMgr->mArch.AVX())
+else if (JM()->mArch.AVX())
 {
 mTarget = AVX;
 
@@ -356,9 +356,8 @@ namespace SwrJit
 {
 }
 
-JitManager* JM() { return mpJitMgr; }
+JitManager* JM() { return B->JM(); }
 
-JitManager* mpJitMgr;
 Builder* B;
 
 TargetArch mTarget;
@@ -368,9 +367,9 @@ namespace SwrJit
 
 char LowerX86::ID = 0;   // LLVM uses address of ID as the actual ID.
 
-FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b)
+FunctionPass* createLowerX86Pass(Builder* b)
 {
-return new LowerX86(pJitMgr, b);
+return new LowerX86(b);
 }
 
 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, 
CallInst* pCallInst)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h 
b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
index f7373f034b..95ef4bcf01 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
@@ -33,5 +33,5 @@ namespace SwrJit
 {
 using namespace llvm;
 
-FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b);
+FunctionPass* createLowerX86Pass(Builder* b);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index f804900291..cb2e3aed61 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -307,7 +307,7 @@ struct StreamOutJit : public Builder
 passes.add(createSCCPPass());
 passes.add(createAggressiveDCEPass());
 
-passes.add(createLowerX86Pass(JM(), this));
+passes.add(createLowerX86Pass(this));
 
 passes.run(*soFunc);
 
diff --git 

Mesa (master): swr/rast: Added in-place building to SCATTERPS

2018-05-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: cfe75cc7b5acbf0692baff07a516ff4efe7fa968
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=cfe75cc7b5acbf0692baff07a516ff4efe7fa968

Author: Alok Hota 
Date:   Fri May 25 10:19:43 2018 -0500

swr/rast: Added in-place building to SCATTERPS

SCATTERPS previously assumed it was being used with an existing basic
block

Reviewed-by: Bruce Cherniak 

---

 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  | 29 +++---
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index 6e17888f83..77c2095ea9 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -617,17 +617,28 @@ namespace SwrJit
 
 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
 
-// Split current block
-BasicBlock* pPostLoop = 
pCurBB->splitBasicBlock(cast(pIsUndef)->getNextNode());
+// Split current block or create new one if building inline
+BasicBlock* pPostLoop;
+if (pCurBB->getTerminator())
+{
+pPostLoop = 
pCurBB->splitBasicBlock(cast(pIsUndef)->getNextNode());
 
-// Remove unconditional jump created by splitBasicBlock
-pCurBB->getTerminator()->eraseFromParent();
+// Remove unconditional jump created by splitBasicBlock
+pCurBB->getTerminator()->eraseFromParent();
 
-// Add terminator to end of original block
-IRB()->SetInsertPoint(pCurBB);
+// Add terminator to end of original block
+IRB()->SetInsertPoint(pCurBB);
 
-// Add conditional branch
-COND_BR(pIsUndef, pPostLoop, pLoop);
+// Add conditional branch
+COND_BR(pIsUndef, pPostLoop, pLoop);
+}
+else
+{
+pPostLoop = BasicBlock::Create(mpJitMgr->mContext, 
"PostScatter_Loop", pFunc);
+
+// Add conditional branch
+COND_BR(pIsUndef, pPostLoop, pLoop);
+}
 
 // Add loop basic block contents
 IRB()->SetInsertPoint(pLoop);
@@ -642,7 +653,7 @@ namespace SwrJit
 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
 
 // GEP to this offset in dst
-Value* pCurDst = GEP(pDst, pOffsetElem);
+Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
 STORE(pSrcElem, pCurDst);
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Moved memory init out of core swr init

2018-05-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: b3360f5c8b74906187a8801d83f2c4f73f3c025e
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=b3360f5c8b74906187a8801d83f2c4f73f3c025e

Author: Alok Hota 
Date:   Fri May 25 10:19:48 2018 -0500

swr/rast: Moved memory init out of core swr init

Added two new files for a wrapper function for initialization

v2: added missing include for single architecture builds

Reviewed-by: Bruce Cherniak 

---

 src/gallium/drivers/swr/Makefile.sources   |  4 ++-
 src/gallium/drivers/swr/meson.build|  2 ++
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  4 ---
 src/gallium/drivers/swr/rasterizer/core/state.h|  3 +-
 .../drivers/swr/rasterizer/memory/InitMemory.cpp   | 39 ++
 .../drivers/swr/rasterizer/memory/InitMemory.h | 33 ++
 src/gallium/drivers/swr/swr_loader.cpp |  8 -
 7 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/swr/Makefile.sources 
b/src/gallium/drivers/swr/Makefile.sources
index 6753d501a0..b298356079 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -177,4 +177,6 @@ MEMORY_CXX_SOURCES := \
rasterizer/memory/StoreTile_TileY2.cpp \
rasterizer/memory/StoreTile_TileY.cpp \
rasterizer/memory/TilingFunctions.h \
-   rasterizer/memory/tilingtraits.h
+   rasterizer/memory/tilingtraits.h \
+   rasterizer/memory/InitMemory.cpp \
+   rasterizer/memory/InitMemory.h
diff --git a/src/gallium/drivers/swr/meson.build 
b/src/gallium/drivers/swr/meson.build
index 9b272aaebd..b95c8bc1bf 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -151,6 +151,8 @@ files_swr_arch = files(
   'rasterizer/memory/StoreTile_TileY.cpp',
   'rasterizer/memory/TilingFunctions.h',
   'rasterizer/memory/tilingtraits.h',
+  'rasterizer/memory/InitMemory.h',
+  'rasterizer/memory/InitMemory.cpp',
 )
 
 swr_context_files = files('swr_context.h')
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 47f3633d54..c932ec0bd6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1728,10 +1728,6 @@ void InitBackendFuncTables();
 /// @brief Initialize swr backend and memory internal tables
 void SwrInit()
 {
-InitSimLoadTilesTable();
-InitSimStoreTilesTable();
-InitSimClearTilesTable();
-
 InitClearTilesTable();
 InitBackendFuncTables();
 InitRasterizerFunctions();
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index c26dabe838..9db17eeed0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -29,10 +29,11 @@
 
 #include "common/formats.h"
 #include "common/intrin.h"
-using gfxptr_t = unsigned long long;
 #include 
 #include 
 
+using gfxptr_t = unsigned long long;
+
 //
 /// PRIMITIVE_TOPOLOGY.
 //
diff --git a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp 
b/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp
new file mode 100644
index 00..bff96e17f4
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp
@@ -0,0 +1,39 @@
+/
+* Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file InitMemory.cpp
+*
+* @brief Provide access to tiles table initialization functions
+*
+**/
+#include 

Mesa (master): swr/rast: Adjusted avx512 primitive assembly for msvc codegen

2018-05-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: fb20ae0374425ae3aff2a50a498c7e2b428632a4
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=fb20ae0374425ae3aff2a50a498c7e2b428632a4

Author: Alok Hota 
Date:   Fri May 25 10:19:49 2018 -0500

swr/rast: Adjusted avx512 primitive assembly for msvc codegen

Optimize AVX-512 PA Assemble (PA_STATE_OPT). Reduced generated code by
about 4x; the MSVC compiler was going crazy making temporaries and
split-loading inputs onto the stack unless explicit AVX-512 load ops
were added.

Reviewed-by: Bruce Cherniak 

---

 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 139 +
 1 file changed, 90 insertions(+), 49 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp 
b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
index 64a90c768b..4f89e0c179 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
@@ -755,36 +755,51 @@ bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, 
simd16vector verts[])
 
 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
-#if KNOB_ARCH == KNOB_ARCH_AVX
-simd16scalar perm0 = _simd16_setzero_ps();
-simd16scalar perm1 = _simd16_setzero_ps();
-simd16scalar perm2 = _simd16_setzero_ps();
-#elif KNOB_ARCH >= KNOB_ARCH_AVX2
+#if KNOB_ARCH >= KNOB_ARCH_AVX2
 const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11,  8, 
5, 2, 15, 12,  9, 6, 3, 0);
 const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12,  9, 
6, 3,  0, 13, 10, 7, 4, 1);
 const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3,  0, 13, 10, 
7, 4,  1, 14, 11, 8, 5, 2);
+#else   // KNOB_ARCH == KNOB_ARCH_AVX
+simd16scalar perm0 = _simd16_setzero_ps();
+simd16scalar perm1 = _simd16_setzero_ps();
+simd16scalar perm2 = _simd16_setzero_ps();
 #endif
 
 const simd16vector  = PaGetSimdVector_simd16(pa, 0, slot);
 const simd16vector  = PaGetSimdVector_simd16(pa, 1, slot);
 const simd16vector  = PaGetSimdVector_simd16(pa, 2, slot);
 
-simd16vector  = verts[0];
-simd16vector  = verts[1];
-simd16vector  = verts[2];
+const simd16mask mask0 = 0x4924;
+const simd16mask mask1 = 0x2492;
+const simd16mask mask2 = 0x9249;
 
 //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
 //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
 //  v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
 
+simd16vector  = verts[0];
+simd16vector  = verts[1];
+simd16vector  = verts[2];
+
 // for simd16 x, y, z, and w
 for (int i = 0; i < 4; i += 1)
 {
-simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 
0x4924), c[i], 0x2492);
-simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 
0x9249), c[i], 0x4924);
-simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 
0x2492), c[i], 0x9249);
+simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast([i]));
+simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast([i]));
+simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast([i]));
+
+simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, 
mask0), tempc, mask1);
+simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, 
mask2), tempc, mask0);
+simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, 
mask1), tempc, mask2);
+
+#if KNOB_ARCH >= KNOB_ARCH_AVX2
+v0[i] = _simd16_permute_ps(temp0, perm0);
+v1[i] = _simd16_permute_ps(temp1, perm1);
+v2[i] = _simd16_permute_ps(temp2, perm2);
+#else   // #if KNOB_ARCH == KNOB_ARCH_AVX
+
+// the general permutes (above) are prohibitively slow to emulate on 
AVX (its scalar code)
 
-#if KNOB_ARCH == KNOB_ARCH_AVX
 temp0 = _simd16_permute_ps_i(temp0, 0x6C);  // (0, 3, 2, 1) => 
00 11 01 10 => 0x6C
 perm0 = _simd16_permute2f128_ps(temp0, temp0, 0xB1);// (1, 0, 3, 2) => 
01 00 11 10 => 0xB1
 temp0 = _simd16_blend_ps(temp0, perm0, 0x); // 0010 0010 0010 
0010
@@ -802,10 +817,6 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, 
simd16vector verts[])
 temp2 = _simd16_blend_ps(temp2, perm2, 0x); // 0100 0100 0100 
0100
 perm2 = _simd16_permute2f128_ps(temp2, temp2, 0x4E);// (2, 3, 0, 1) => 
10 11 00 01 => 0x4E
 v2[i] = _simd16_blend_ps(temp2, perm2, 0x1C1C); // 0011 1000 0011 
1000
-#elif KNOB_ARCH >= KNOB_ARCH_AVX2
-v0[i] = _simd16_permute_ps(temp0, perm0);
-v1[i] = _simd16_permute_ps(temp1, perm1);
-v2[i] = _simd16_permute_ps(temp2, perm2);
 #endif
 }
 
@@ -1056,26 +1067,31 @@ bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t 
slot, simd16vector verts[])
 const simd16vector  = PaGetSimdVector_simd16(pa, pa.prev, slot);
 const simd16vector  = 

Mesa (master): swr/rast: Check gCoreBuckets/CORE_BUCKETS equal length at compile time

2018-05-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: f09636e2e1311b24cbcd2a2d49e97f8a69702cfd
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=f09636e2e1311b24cbcd2a2d49e97f8a69702cfd

Author: Alok Hota 
Date:   Fri May 25 10:19:44 2018 -0500

swr/rast: Check gCoreBuckets/CORE_BUCKETS equal length at compile time

Reviewed-by: Bruce Cherniak 

---

 src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp 
b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
index f289a319ca..48ea397018 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
@@ -89,6 +89,7 @@ BUCKET_DESC gCoreBuckets[] = {
 { "BEStoreTiles", "", true, 0xff00 },
 { "BEEndTile", "", false, 0x },
 };
+static_assert(NumBuckets == (sizeof(gCoreBuckets) / sizeof(gCoreBuckets[0])), 
"RDTSC Bucket enum and description table size mismatched.");
 
 /// @todo bucketmanager and mapping should probably be a part of the SWR 
context
 std::vector gBucketMap;

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Use metadata to communicate between passes

2018-05-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 14b5cac0be15b2a1f6624431ae1b694f3a4487dd
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=14b5cac0be15b2a1f6624431ae1b694f3a4487dd

Author: Alok Hota 
Date:   Fri May 25 10:19:45 2018 -0500

swr/rast: Use metadata to communicate between passes

Reviewed-by: Bruce Cherniak 

---

 .../drivers/swr/rasterizer/jitter/builder.h| 28 ++
 1 file changed, 28 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 6ca128d38f..08a3a6e473 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -124,6 +124,34 @@ namespace SwrJit
 bool SetTexelMaskEvaluate(Instruction* inst);
 bool IsTexelMaskEvaluate(Instruction* inst);
 Type* GetVectorType(Type* pType);
+void SetMetadata(StringRef s, uint32_t val)
+{
+llvm::NamedMDNode *metaData = 
mpJitMgr->mpCurrentModule->getOrInsertNamedMetadata(s);
+Constant* cval = mpIRBuilder->getInt32(val);
+llvm::MDNode *mdNode = 
llvm::MDNode::get(mpJitMgr->mpCurrentModule->getContext(), 
llvm::ConstantAsMetadata::get(cval));
+if (metaData->getNumOperands())
+{
+metaData->setOperand(0, mdNode);
+}
+else
+{
+metaData->addOperand(mdNode);
+}
+}
+uint32_t GetMetadata(StringRef s)
+{
+NamedMDNode* metaData = 
mpJitMgr->mpCurrentModule->getNamedMetadata(s);
+if (metaData)
+{
+MDNode* mdNode = metaData->getOperand(0);
+Metadata* val = mdNode->getOperand(0);
+return mdconst::dyn_extract(val)->getZExtValue();
+}
+else
+{
+return 0;
+}
+}
 
 #include "gen_builder.hpp"
 #include "gen_builder_meta.hpp"

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: SIMD16 builder - cleanup naming (simd2 -> simd16)

2018-01-10 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: e14b48e00e56b59de4bb916be994756295d7b685
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=e14b48e00e56b59de4bb916be994756295d7b685

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Dec 19 13:39:09 2017 -0600

swr/rast: SIMD16 builder - cleanup naming (simd2 -> simd16)

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/builder.cpp  |  76 +-
 .../drivers/swr/rasterizer/jitter/builder.h|  45 +++---
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 133 
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  50 +++---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 168 +++--
 5 files changed, 239 insertions(+), 233 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 4b83a3204c..c46159a35a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -40,52 +40,56 @@ namespace SwrJit
 Builder::Builder(JitManager *pJitMgr)
 : mpJitMgr(pJitMgr)
 {
+SWR_ASSERT(pJitMgr->mVWidth == 8);
+
 mVWidth = pJitMgr->mVWidth;
-#if USE_SIMD16_BUILDER
-mVWidth2 = pJitMgr->mVWidth * 2;
-#endif
+mVWidth16 = pJitMgr->mVWidth * 2;
 
 mpIRBuilder = >mBuilder;
 
-mVoidTy = Type::getVoidTy(pJitMgr->mContext);
-mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
-mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
-mFP32PtrTy = PointerType::get(mFP32Ty, 0);
-mDoubleTy = Type::getDoubleTy(pJitMgr->mContext);
-mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
-mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
-mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
-mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
-mInt8PtrTy = PointerType::get(mInt8Ty, 0);
+// Built in types: scalar
+
+mVoidTy = Type::getVoidTy(pJitMgr->mContext);
+mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
+mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
+mFP32PtrTy  = PointerType::get(mFP32Ty, 0);
+mDoubleTy   = Type::getDoubleTy(pJitMgr->mContext);
+mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
+mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
+mInt16Ty= Type::getInt16Ty(pJitMgr->mContext);
+mInt32Ty= Type::getInt32Ty(pJitMgr->mContext);
+mInt8PtrTy  = PointerType::get(mInt8Ty, 0);
 mInt16PtrTy = PointerType::get(mInt16Ty, 0);
 mInt32PtrTy = PointerType::get(mInt32Ty, 0);
-mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
-mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth);
-mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
-mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
-mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
-mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
-mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
-mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
+mInt64Ty= Type::getInt64Ty(pJitMgr->mContext);
+
+// Built in types: simd8
+
+mSimdInt1Ty = VectorType::get(mInt1Ty,  mVWidth);
+mSimdInt16Ty= VectorType::get(mInt16Ty, mVWidth);
+mSimdInt32Ty= VectorType::get(mInt32Ty, mVWidth);
+mSimdInt64Ty= VectorType::get(mInt64Ty, mVWidth);
+mSimdFP16Ty = VectorType::get(mFP16Ty,  mVWidth);
+mSimdFP32Ty = VectorType::get(mFP32Ty,  mVWidth);
+mSimdVectorTy   = ArrayType::get(mSimdFP32Ty, 4);
 mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
-#if USE_SIMD16_BUILDER
-mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2);
-mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2);
-mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2);
-mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2);
-mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2);
-mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2);
-mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4);
-mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5);
-#endif
+
+// Built in types: simd16
+
+mSimd16Int1Ty   = VectorType::get(mInt1Ty,  mVWidth16);
+mSimd16Int16Ty  = VectorType::get(mInt16Ty, mVWidth16);
+mSimd16Int32Ty  = VectorType::get(mInt32Ty, mVWidth16);
+mSimd16Int64Ty  = VectorType::get(mInt64Ty, mVWidth16);
+mSimd16FP16Ty   = VectorType::get(mFP16Ty,  mVWidth16);
+mSimd16FP32Ty   = VectorType::get(mFP32Ty,  mVWidth16);
+mSimd16VectorTy = ArrayType::get(mSimd16FP32Ty, 4);
+mSimd16VectorTRTy   = ArrayType::get(mSimd16FP32Ty, 5);
 
 if (sizeof(uint3

Mesa (master): swr/rast: SIMD16 fetch shader jitter cleanup

2018-01-10 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 04d0bfde3962ee76ee7310b3dee5e0f72d2b4c17
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=04d0bfde3962ee76ee7310b3dee5e0f72d2b4c17

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Fri Dec 22 13:58:08 2017 -0600

swr/rast: SIMD16 fetch shader jitter cleanup

Bake in USE_SIMD16_BUILDER code paths (for USE_SIMD16_SHADER defined),
remove USE_SIMD16_BUILDER define, remove deprecated pseudo-SIMD16 code
paths.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 1118 +++-
 1 file changed, 383 insertions(+), 735 deletions(-)

Diff:   
http://cgit.freedesktop.org/mesa/mesa/diff/?id=04d0bfde3962ee76ee7310b3dee5e0f72d2b4c17
___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: autogenerate named structs instead of literal structs

2018-01-10 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: f5f1bbcb5c66c55a45e47c71685ca6709b714390
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=f5f1bbcb5c66c55a45e47c71685ca6709b714390

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Dec 28 17:56:03 2017 -0600

swr/rast: autogenerate named structs instead of literal structs

Results in far smaller and more useful IR output.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../swr/rasterizer/codegen/templates/gen_llvm.hpp  | 23 ++
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
index 18ea781713..574ee5aaa7 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
@@ -40,15 +40,22 @@ namespace SwrJit
 INLINE static StructType *Gen_${type['name']}(JitManager* pJitMgr)
 {
 LLVMContext& ctx = pJitMgr->mContext;
-std::vector<Type*> members;
-<%
-(max_type_len, max_name_len) = calc_max_len(type['members'])
-%>
-%for member in type['members']:
-/* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ 
members.push_back( ${member['type']} );
-%endfor
 
-return StructType::get(ctx, members, false);
+StructType* pRetType = 
pJitMgr->mpCurrentModule->getTypeByName("${type['name']}");
+if (pRetType == nullptr)
+{
+std::vector<Type*> members;
+<%
+(max_type_len, max_name_len) = calc_max_len(type['members'])
+%>
+%for member in type['members']:
+/* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ 
members.push_back(${ member['type'] });
+%endfor
+
+pRetType = StructType::create(members, "${type['name']}", false);
+}
+
+return pRetType;
 }
 
 %for member in type['members']:

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: shuffle header files for msvc pre-compiled header usage

2018-01-10 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: d3a4c8057dfd31b562a8007a511f1de88a153528
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=d3a4c8057dfd31b562a8007a511f1de88a153528

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Dec 21 11:01:37 2017 -0600

swr/rast: shuffle header files for msvc pre-compiled header usage

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/Makefile.sources   |   1 +
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   |  36 +-
 .../drivers/swr/rasterizer/jitter/JitManager.h |  46 +--
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp|   3 +-
 .../drivers/swr/rasterizer/jitter/builder.cpp  |   1 +
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp |   1 +
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|   3 +-
 .../drivers/swr/rasterizer/jitter/jit_api.h|   1 -
 .../drivers/swr/rasterizer/jitter/jit_pch.hpp  | 134 +
 .../swr/rasterizer/jitter/streamout_jit.cpp|   5 +-
 10 files changed, 143 insertions(+), 88 deletions(-)

diff --git a/src/gallium/drivers/swr/Makefile.sources 
b/src/gallium/drivers/swr/Makefile.sources
index 53f8bf011b..cd2040e137 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -145,6 +145,7 @@ JITTER_CXX_SOURCES := \
rasterizer/jitter/fetch_jit.cpp \
rasterizer/jitter/fetch_jit.h \
rasterizer/jitter/jit_api.h \
+   rasterizer/jitter/jit_pch.hpp \
rasterizer/jitter/JitManager.cpp \
rasterizer/jitter/JitManager.h \
rasterizer/jitter/streamout_jit.cpp \
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 59672bb545..883ac77482 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -27,41 +27,7 @@
 * Notes:
 * 
 **/
-#if defined(_WIN32)
-#pragma warning(disable: 4800 4146 4244 4267 4355 4996)
-#endif
-
-#pragma push_macro("DEBUG")
-#undef DEBUG
-
-#if defined(_WIN32)
-#include "llvm/ADT/Triple.h"
-#endif
-#include "llvm/IR/Function.h"
-
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/SourceMgr.h"
-
-#include "llvm/Analysis/CFGPrinter.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Config/llvm-config.h"
-
-#if LLVM_VERSION_MAJOR < 4
-#include "llvm/Bitcode/ReaderWriter.h"
-#else
-#include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/Bitcode/BitcodeReader.h"
-#endif
-
-#if LLVM_USE_INTEL_JITEVENTS
-#include "llvm/ExecutionEngine/JITEventListener.h"
-#endif
-
-#pragma pop_macro("DEBUG")
+#include "jit_pch.hpp"
 
 #include "JitManager.h"
 #include "jit_api.h"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index c30a807222..9e5e4cf2b6 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -29,52 +29,9 @@
 **/
 #pragma once
 
-#if defined(_WIN32)
-#pragma warning(disable : 4146 4244 4267 4800 4996)
-#endif
-
-// llvm 3.7+ reuses "DEBUG" as an enum value
-#pragma push_macro("DEBUG")
-#undef DEBUG
-
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/ExecutionEngine/ObjectCache.h"
-
-#include "llvm/Config/llvm-config.h"
-
-#include "llvm/IR/Verifier.h"
-#include "llvm/ExecutionEngine/MCJIT.h"
-#include "llvm/Support/FileSystem.h"
-#define LLVM_F_NONE sys::fs::F_None
-
-#include "llvm/Analysis/Passes.h"
-
-#include "llvm/IR/LegacyPassManager.h"
-using FunctionPassManager = llvm::legacy::FunctionPassManager;
-using PassManager = llvm::legacy::PassManager;
-
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Support/DynamicLibrary.h"
-
-
-#includ

Mesa (master): swr/rast: don't use 32-bit gathers for elements < 32-bits in size

2018-01-10 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 3d4d34e380f33e9daa86ff3aa4c06a56c5fa1318
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=3d4d34e380f33e9daa86ff3aa4c06a56c5fa1318

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Jan  3 11:58:50 2018 -0600

swr/rast: don't use 32-bit gathers for elements < 32-bits in size

Using a gather for elements less than 32-bits in size can cause
pagefaults when loading the last elements in a page-aligned-sized
buffer.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 61 +-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 99a936d176..ad70cbe95d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -741,7 +741,66 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, 
Value* pMask, Value* pB
 // only works if pixel size is <= 32bits
 SWR_ASSERT(info.bpp <= 32);
 
-Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+Value *pGather;
+if (info.bpp == 32)
+{
+pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+}
+else
+{
+// Can't use 32-bit gather for items less than 32-bits, could cause 
page faults.
+Value *pMem = ALLOCA(mSimdInt32Ty);
+STORE(VIMMED1(0u), pMem);
+
+pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
+Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
+
+for (uint32_t lane = 0; lane < mVWidth; ++lane)
+{
+// Get index
+Value* index = VEXTRACT(pOffsets, C(lane));
+Value* mask = VEXTRACT(pMask, C(lane));
+switch (info.bpp)
+{
+case 8:
+{
+Value* pDst = BITCAST(GEP(pDstMem, C(lane)), 
PointerType::get(mInt8Ty, 0));
+Value* pSrc = BITCAST(GEP(pBase, index), 
PointerType::get(mInt8Ty, 0));
+STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+break;
+}
+
+case 16:
+{
+Value* pDst = BITCAST(GEP(pDstMem, C(lane)), 
PointerType::get(mInt16Ty, 0));
+Value* pSrc = BITCAST(GEP(pBase, index), 
PointerType::get(mInt16Ty, 0));
+STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+break;
+}
+break;
+
+case 24:
+{
+// First 16-bits of data
+Value* pDst = BITCAST(GEP(pDstMem, C(lane)), 
PointerType::get(mInt16Ty, 0));
+Value* pSrc = BITCAST(GEP(pBase, index), 
PointerType::get(mInt16Ty, 0));
+STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+
+// Last 8-bits of data
+pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
+pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
+STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+break;
+}
+
+default:
+SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
+break;
+}
+}
+
+pGather = LOAD(pMem);
+}
 
 for (uint32_t comp = 0; comp < 4; ++comp)
 {

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: switch win32 jit format to COFF

2018-01-10 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: c259888c52a3cd9f6dd39cc33e919540435e5f5a
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=c259888c52a3cd9f6dd39cc33e919540435e5f5a

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Jan  4 10:27:13 2018 -0600

swr/rast: switch win32 jit format to COFF

Allows for call-stack and exception handling for jitted functions.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 883ac77482..508bc027dd 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -92,7 +92,7 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, 
const char* core)
 #if defined(_WIN32)
 // Needed for MCJIT on windows
 Triple hostTriple(sys::getProcessTriple());
-hostTriple.setObjectFormat(Triple::ELF);
+hostTriple.setObjectFormat(Triple::COFF);
 mpCurrentModule->setTargetTriple(hostTriple.getTriple());
 #endif // _WIN32
 
@@ -486,4 +486,4 @@ std::unique_ptr 
JitCache::getObject(const llvm::Module* M)
 fclose(fpIn);
 
 return pBuf;
-}
+}
\ No newline at end of file

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: fix invalid sign masks in avx512 simdlib code

2018-01-04 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 396c006d907b023f9b187db618ee2a6e4e1b8a85
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=396c006d907b023f9b187db618ee2a6e4e1b8a85

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Jan  4 10:08:48 2018 -0600

swr/rast: fix invalid sign masks in avx512 simdlib code

Should be 0x80000000 instead of 0x8000000.

Cc: mesa-sta...@lists.freedesktop.org
Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl | 2 +-
 src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl | 2 +-
 src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
index 66e8309610..b70a7691e2 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
@@ -270,7 +270,7 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float 
old, float const* p, In
 {
 __mmask16 m = 0xf;
 m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
-_mm512_set1_epi32(0x8000000));
+_mm512_set1_epi32(0x80000000));
 return __conv(_mm512_mask_i32gather_ps(
 __conv(old),
 m,
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
index 3f93cfbd7f..3fcfd250f9 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
@@ -271,7 +271,7 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float 
old, float const* p, In
 {
 __mmask16 m = 0xff;
 m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
-_mm512_set1_epi32(0x8000000));
+_mm512_set1_epi32(0x80000000));
 return __conv(_mm512_mask_i32gather_ps(
 __conv(old),
 m,
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
index c13b9f616a..8de62f2a7e 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -540,7 +540,7 @@ static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
 }
 static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
 {
-__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x8000000));
+__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000));
 return static_cast(m);
 }
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: fix MemoryBuffer build break for llvm-6

2018-01-03 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: ad218754c79e0af61d5ba225a4b195cb55c2cac9
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=ad218754c79e0af61d5ba225a4b195cb55c2cac9

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Jan  2 10:48:21 2018 -0600

swr/rast: fix MemoryBuffer build break for llvm-6

LLVM api change.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104381
Tested-by: Laurent Carlier <lordhea...@gmail.com>
Reviewed-By: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 3f0772c942..59672bb545 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -498,7 +498,11 @@ std::unique_ptr 
JitCache::getObject(const llvm::Module* M)
 break;
 }
 
+#if LLVM_VERSION_MAJOR < 6
 pBuf = 
llvm::MemoryBuffer::getNewUninitMemBuffer(size_t(header.GetBufferSize()));
+#else
+pBuf = 
llvm::WritableMemoryBuffer::getNewUninitMemBuffer(size_t(header.GetBufferSize()));
+#endif
 if (!fread(const_cast<char*>(pBuf->getBufferStart()), 
header.GetBufferSize(), 1, fpIn))
 {
 pBuf = nullptr;

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Binner fixes for viewport index offset handling

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 0e9e24768785a4e09a785c1f3ab9c0117e82da4e
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=0e9e24768785a4e09a785c1f3ab9c0117e82da4e

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Nov 29 10:46:49 2017 -0600

swr/rast: Binner fixes for viewport index offset handling

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 9 -
 src/gallium/drivers/swr/rasterizer/core/clip.h | 5 -
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 9d1f0d8799..52375f8956 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -470,6 +470,10 @@ void SIMDCALL BinTrianglesImpl(
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
 }
+else
+{
+viewportIdx = vpai;
+}
 
 if (feState.vpTransformDisable)
 {
@@ -1326,6 +1330,10 @@ void BinPointsImpl(
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
 }
+else
+{
+viewportIdx = vpai;
+}
 
 if (!feState.vpTransformDisable)
 {
@@ -1647,7 +1655,6 @@ void SIMDCALL BinLinesImpl(
 if (state.backendState.readViewportArrayIndex)
 {
 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
-
 vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
 }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 0d3d78057f..9d8bbc19e6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -694,7 +694,6 @@ public:
 if (state.backendState.readViewportArrayIndex)
 {
 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
-
 vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
 }
 
@@ -707,6 +706,10 @@ public:
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
 }
+else
+{
+viewportIdx = vpai;
+}
 
 ComputeClipCodes(prim, viewportIdx);
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Pass prim to ClipSimd

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: fbc27ff0279c76542fd8e3c61562ca69fa539272
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=fbc27ff0279c76542fd8e3c61562ca69fa539272

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Dec  7 17:54:40 2017 -0600

swr/rast: Pass prim to ClipSimd

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/clip.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 148f661ab4..8b947668d3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -437,7 +437,7 @@ public:
 return SIMD_T::movemask_ps(vClipCullMask);
 }
 
-void ClipSimd(const typename SIMD_T::Float , const typename 
SIMD_T::Float , PA_STATE , const typename SIMD_T::Integer 
, const typename SIMD_T::Integer )
+void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename 
SIMD_T::Float , const typename SIMD_T::Float , PA_STATE 
, const typename SIMD_T::Integer , const typename SIMD_T::Integer 
)
 {
 // input/output vertex store for clipper
 SIMDVERTEX_T vertices[7]; // maximum 7 verts generated per 
triangle
@@ -452,10 +452,9 @@ public:
 
 // assemble pos
 typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
-pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
-vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
+vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
 }
 
 // assemble attribs
@@ -568,7 +567,8 @@ public:
 SIMDVERTEX_T transposedPrims[2];
 
 #endif
-for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
+uint32_t numInputPrims = pa.NumPrims();
+for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
 {
 uint32_t numEmittedVerts = pVertexCount[inputPrim];
 if (numEmittedVerts < NumVertsPerPrim)
@@ -716,7 +716,7 @@ public:
 AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
 // we have to clip tris, execute the clipper, which will also
 // call the binner
-ClipSimd(SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), 
pa, primId, viewportIdx);
+ClipSimd(prim, SIMD_T::vmask_ps(primMask), 
SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx);
 AR_END(FEGuardbandClip, 1);
 }
 else if (validMask)

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Remove unneeded copy of gather mask

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: f2e3900a1e7b48b640bd9fa32d2e1285e397fad0
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=f2e3900a1e7b48b640bd9fa32d2e1285e397fad0

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Nov 21 11:05:08 2017 -0600

swr/rast: Remove unneeded copy of gather mask

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 22 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 80 ++
 2 files changed, 23 insertions(+), 79 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 8ffe05b41c..0221106664 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -1107,23 +1107,19 @@ namespace SwrJit
 }
 
 void Builder::GATHER4PS(const SWR_FORMAT_INFO , Value* pSrcBase, 
Value* byteOffsets, 
-Value* mask, Value* vGatherComponents[], bool 
bPackedOutput)
+Value* vMask, Value* vGatherComponents[], bool 
bPackedOutput)
 {
 switch(info.bpp / info.numComps)
 {
 case 16: 
 {
 Value* vGatherResult[2];
-Value *vMask;
 
 // TODO: vGatherMaskedVal
 Value* vGatherMaskedVal = VIMMED1((float)0);
 
 // always have at least one component out of x or y to 
fetch
 
-// save mask as it is zero'd out after each gather
-vMask = mask;
-
 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask);
 // e.g. result of first 8x32bit integer gather for 16bit 
components
 // 256i - 01234567
@@ -1135,7 +1131,6 @@ namespace SwrJit
 {
 // offset base to the next components(zw) in the 
vertex to gather
 pSrcBase = GEP(pSrcBase, C((char)4));
-vMask = mask;
 
 vGatherResult[1] =  GATHERPS(vGatherMaskedVal, 
pSrcBase, byteOffsets, vMask);
 // e.g. result of second 8x32bit integer gather for 
16bit components
@@ -1164,9 +1159,6 @@ namespace SwrJit
 {
 uint32_t swizzleIndex = info.swizzle[i];
 
-// save mask as it is zero'd out after each gather
-Value *vMask = mask;
-
 // Gather a SIMD of components
 vGatherComponents[swizzleIndex] = 
GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
 
@@ -1182,14 +1174,14 @@ namespace SwrJit
 }
 
 void Builder::GATHER4DD(const SWR_FORMAT_INFO , Value* pSrcBase, 
Value* byteOffsets,
-Value* mask, Value* vGatherComponents[], bool 
bPackedOutput)
+Value* vMask, Value* vGatherComponents[], bool 
bPackedOutput)
 {
 switch (info.bpp / info.numComps)
 {
 case 8:
 {
 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
-Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, 
byteOffsets, mask);
+Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask);
 // e.g. result of an 8x32bit integer gather for 8bit components
 // 256i - 01234567
 //xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 
@@ -1200,16 +1192,12 @@ namespace SwrJit
 case 16:
 {
 Value* vGatherResult[2];
-Value *vMask;
 
 // TODO: vGatherMaskedVal
 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
 
 // always have at least one component out of x or y to fetch
 
-// save mask as it is zero'd out after each gather
-vMask = mask;
-
 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask);
 // e.g. result of first 8x32bit integer gather for 16bit 
components
 // 256i - 01234567
@@ -1221,7 +1209,6 @@ namespace SwrJit
 {
 // offset base to the next components(zw) in the vertex to 
gather
 pSrcBase = GEP(pSrcBase, C((char)4));
-vMask = mask;
 
 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask);
 // e.g. result of second 8x32bit integer gather for 16bit 
components
@@ -1251,9 +1238,6 @@ namespace SwrJit
 {
 uint32_t swizzleIndex = info.swizzle[i];
 
-

Mesa (master): swr/rast: WIP - Widen fetch shader to SIMD16

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 36e276b6b03da852c78e314640b3822be263def2
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=36e276b6b03da852c78e314640b3822be263def2

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Sun Dec  3 18:49:29 2017 -0600

swr/rast: WIP - Widen fetch shader to SIMD16

Widen vertex gather/storage to SIMD16 for all component types.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 716 -
 1 file changed, 689 insertions(+), 27 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 337bb7f660..6c0e658e68 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -70,6 +70,9 @@ struct FetchJit : public Builder
 #else
 void Shuffle8bpcGatherd(Shuffle8bpcArgs );
 #endif
+#if USE_SIMD16_BUILDER
+void Shuffle8bpcGatherd2(Shuffle8bpcArgs );
+#endif
 
 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const 
ConversionType,
 uint32_t&, uint32_t&, const ComponentEnable, const 
ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
@@ -78,6 +81,9 @@ struct FetchJit : public Builder
 #else
 void Shuffle16bpcGather(Shuffle16bpcArgs );
 #endif
+#if USE_SIMD16_BUILDER
+void Shuffle16bpcGather2(Shuffle16bpcArgs );
+#endif
 
 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const 
uint32_t numEltsToStore, Value* ()[4]);
 #if USE_SIMD16_BUILDER
@@ -726,7 +732,7 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, 
Value* pMask, Value* pB
 // only works if pixel size is <= 32bits
 SWR_ASSERT(info.bpp <= 32);
 
-   Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
 
 for (uint32_t comp = 0; comp < 4; ++comp)
 {
@@ -825,6 +831,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 Value* vVertexElements[4];
 #if USE_SIMD16_GATHERS
 Value* vVertexElements2[4];
+#if USE_SIMD16_BUILDER
+Value *pVtxSrc2[4];
+#endif
 #endif
 
 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
@@ -961,6 +970,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 #if USE_SIMD16_GATHERS
 // override cur indices with 0 if pitch is 0
 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
+vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
 vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
 
 // are vertices partially OOB?
@@ -983,7 +993,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 
 // only fetch lanes that pass both tests
 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
-vGatherMask2 = AND(vMaxGatherMask, vMinGatherMask2);
+vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
 }
 else
 {
@@ -1074,15 +1084,32 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 {
 if (isComponentEnabled(compMask, c))
 {
-vVertexElements[currentVertexElement] = pResults[c];
+#if USE_SIMD16_BUILDER
+// pack adjacent pairs of SIMD8s into SIMD16s
+pVtxSrc2[currentVertexElement] = VUNDEF2_F();
+pVtxSrc2[currentVertexElement] = 
INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c],  0);
+pVtxSrc2[currentVertexElement] = 
INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
+
+#else
+vVertexElements[currentVertexElement]  = pResults[c];
 vVertexElements2[currentVertexElement] = pResults2[c];
-currentVertexElement++;
+
+#endif
+currentVertexElement += 1;
 
 if (currentVertexElement > 3)
 {
+#if USE_SIMD16_BUILDER
+// store SIMD16s
+Value *pVtxOut2 = BITCAST(pVtxOut, 
PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
+
+#else
 StoreVertexElements(pVtxOut, outputElt, 4, 
vVertexElements);
 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, 
vVertexElements2);
 
+#endif
 outputElt += 1;
 
 // reset to the next vVertexElement to output
@@ -1113,9 +1140,12 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 else if(info.type[0] == SWR_TYPE_FLOAT)
 {
 ///@todo: support 64 bit vb accesses
-Value* gatherSrc = VIMMED1(0.0f);
+Value *gatherSrc = VIMMED1(0.0f);
 #if

Mesa (master): swr/rast: SIMD16 Fetch - Fully widen 16-bit float vertex components

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: b38ac9dca1536062d5167e6c3c1f587a27ea3d58
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=b38ac9dca1536062d5167e6c3c1f587a27ea3d58

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Fri Dec  8 13:59:19 2017 -0600

swr/rast: SIMD16 Fetch - Fully widen 16-bit float vertex components

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 55 +++---
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 2065db3475..c960dc77fb 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1277,6 +1277,43 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 case 16:
 {
 #if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+Value *gatherResult[2];
+
+// if we have at least one component out of x or y to fetch
+if (isComponentEnabled(compMask, 0) || 
isComponentEnabled(compMask, 1))
+{
+gatherResult[0] = GATHERPS_16(gatherSrc16, 
pStreamBase, vOffsets16, vGatherMask16);
+
+// e.g. result of first 8x32bit integer gather for 
16bit components
+// 256i - 01234567
+//xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+//
+}
+else
+{
+gatherResult[0] = VUNDEF2_I();
+}
+
+// if we have at least one component out of z or w to fetch
+if (isComponentEnabled(compMask, 2) || 
isComponentEnabled(compMask, 3))
+{
+// offset base to the next components(zw) in the 
vertex to gather
+pStreamBase = GEP(pStreamBase, C((char)4));
+
+gatherResult[1] = GATHERPS_16(gatherSrc16, 
pStreamBase, vOffsets16, vGatherMask16);
+
+// e.g. result of second 8x32bit integer gather for 
16bit components
+// 256i - 01234567
+//zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
+//
+}
+else
+{
+gatherResult[1] = VUNDEF2_I();
+}
+
+#else
 Value *vGatherResult[2];
 Value *vGatherResult2[2];
 
@@ -1315,10 +1352,13 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 vGatherResult2[1] = VUNDEF_I();
 }
 
+#endif
 // if we have at least one component to shuffle into place
 if (compMask)
 {
 #if USE_SIMD16_BUILDER
+#if USE_SIMD16_BUILDER
+#else
 Value *gatherResult[2];
 
 gatherResult[0] = VUNDEF2_I();
@@ -1330,6 +1370,7 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 gatherResult[1] = INSERT2_I(gatherResult[1], 
vGatherResult[1],  0);
 gatherResult[1] = INSERT2_I(gatherResult[1], 
vGatherResult2[1], 1);
 
+#endif
 Value *pVtxOut2 = BITCAST(pVtxOut, 
PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
 
 Shuffle16bpcArgs args = 
std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, 
CONVERT_NONE,
@@ -1511,21 +1552,21 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 // if we need to gather the component
 if (compCtrl[i] == StoreSrc)
 {
-Value *vMaskLo  = VSHUFFLE(vGatherMask, 
VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
+Value *vMaskLo  = VSHUFFLE(vGatherMask,  
VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
 Value *vMaskLo2 = VSHUFFLE(vGatherMask2, 
VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
-Value *vMaskHi  = VSHUFFLE(vGatherMask, 
VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
+Value *vMaskHi  = VSHUFFLE(vGatherMask,  
VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
 Value *vMaskHi2 = VSHUFFLE(vGatherMask2, 
VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
 
-Value *vOffsetsLo  = VEXTRACTI128(vOffsets, 
C(0));
+Value *vOffsetsLo  = VEXTRACTI128(vOffsets,  
C(0));
 Value *vOffsetsLo2 = VEXTRACTI128(vOf

Mesa (master): swr/rast: SIMD16 Fetch - Fully widen 32-bit integer vertex components

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 01a57c11cb7fe85196b9cb4b5a1555e6eb239297
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=01a57c11cb7fe85196b9cb4b5a1555e6eb239297

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Sun Dec 10 23:54:30 2017 -0600

swr/rast: SIMD16 Fetch - Fully widen 32-bit integer vertex components

Also widen the 16-bit and 8-bit integer vertex component gathers to SIMD16.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  1 +
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 36 +
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  3 +
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 86 +-
 4 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index ac8b3badf6..8bbf36d9b8 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -46,6 +46,7 @@ intrinsics = [
 ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
 ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
+['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
 ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
 ['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']],
 ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 3a486e4c1e..684c9fac54 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -723,6 +723,42 @@ namespace SwrJit
 return vGather;
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale)
+{
+Value *vGather = VUNDEF2_F();
+
+// use avx512 gather instruction if available
+if (JM()->mArch.AVX512F())
+{
+// force mask to a 16-bit integer, required by vgather2
+Value *mask = BITCAST(vMask, mInt16Ty);
+
+vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
+}
+else
+{
+Value *src0 = EXTRACT2_F(vSrc, 0);
+Value *src1 = EXTRACT2_F(vSrc, 1);
+
+Value *indices0 = EXTRACT2_I(vIndices, 0);
+Value *indices1 = EXTRACT2_I(vIndices, 1);
+
+Value *vmask16 = VMASK2(vMask);
+
+Value *mask0 = MASK(EXTRACT2_I(vmask16, 0));  // TODO: do this 
better..
+Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
+
+Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
+Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
+
+vGather = JOIN2(gather0, gather1);
+}
+
+return vGather;
+}
+
+#endif
 //
 /// @brief Generate a masked gather operation in LLVM IR.  If not
 /// supported on the underlying platform, emulate it with loads
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 231bd6ad85..6c883d8f52 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -135,6 +135,9 @@ void GATHER4PS(const SWR_FORMAT_INFO , Value* 
pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
 
 Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t 
scale = 1);
+#if USE_SIMD16_BUILDER
+Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, 
uint8_t scale = 1);
+#endif
 void GATHER4DD(const SWR_FORMAT_INFO , Value* pSrcBase, Value* 
byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index e0a0770560..ec3b5eafcc 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1349,14 +1349,6 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 if (compMask)
 {
 #if USE_SIMD16_BUILDER
-#if USE_SIMD16_BUILDER
-#else
-Value *gatherResult[2];
-
-gatherResult[0] = JOIN2(vGatherResult[0], 
v

Mesa (master): swr/rast: Move GatherScissors to header

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: f88289168470873ba47a51b331178cf265c155e5
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=f88289168470873ba47a51b331178cf265c155e5

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Dec  6 12:07:59 2017 -0600

swr/rast: Move GatherScissors to header

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 127 -
 src/gallium/drivers/swr/rasterizer/core/binner.h   | 127 +
 2 files changed, 127 insertions(+), 127 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 8a5356b168..22996c5a5d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -212,133 +212,6 @@ INLINE void ProcessAttributes(
 }
 }
 
-//
-/// @brief  Gather scissor rect data based on per-prim viewport indices.
-/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
-/// @param pViewportIndex - array of per-primitive vewport indexes.
-/// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
-/// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
-/// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
-/// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
-//
-/// @todo:  Look at speeding this up -- weigh against corresponding costs in 
rasterizer.
-static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const 
uint32_t *pViewportIndex,
-simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, 
simdscalari &scisYmax)
-{
-scisXmin = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[7]].xmin,
-pScissorsInFixedPoint[pViewportIndex[6]].xmin,
-pScissorsInFixedPoint[pViewportIndex[5]].xmin,
-pScissorsInFixedPoint[pViewportIndex[4]].xmin,
-pScissorsInFixedPoint[pViewportIndex[3]].xmin,
-pScissorsInFixedPoint[pViewportIndex[2]].xmin,
-pScissorsInFixedPoint[pViewportIndex[1]].xmin,
-pScissorsInFixedPoint[pViewportIndex[0]].xmin);
-scisYmin = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[7]].ymin,
-pScissorsInFixedPoint[pViewportIndex[6]].ymin,
-pScissorsInFixedPoint[pViewportIndex[5]].ymin,
-pScissorsInFixedPoint[pViewportIndex[4]].ymin,
-pScissorsInFixedPoint[pViewportIndex[3]].ymin,
-pScissorsInFixedPoint[pViewportIndex[2]].ymin,
-pScissorsInFixedPoint[pViewportIndex[1]].ymin,
-pScissorsInFixedPoint[pViewportIndex[0]].ymin);
-scisXmax = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[7]].xmax,
-pScissorsInFixedPoint[pViewportIndex[6]].xmax,
-pScissorsInFixedPoint[pViewportIndex[5]].xmax,
-pScissorsInFixedPoint[pViewportIndex[4]].xmax,
-pScissorsInFixedPoint[pViewportIndex[3]].xmax,
-pScissorsInFixedPoint[pViewportIndex[2]].xmax,
-pScissorsInFixedPoint[pViewportIndex[1]].xmax,
-pScissorsInFixedPoint[pViewportIndex[0]].xmax);
-scisYmax = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[7]].ymax,
-pScissorsInFixedPoint[pViewportIndex[6]].ymax,
-pScissorsInFixedPoint[pViewportIndex[5]].ymax,
-pScissorsInFixedPoint[pViewportIndex[4]].ymax,
-pScissorsInFixedPoint[pViewportIndex[3]].ymax,
-pScissorsInFixedPoint[pViewportIndex[2]].ymax,
-pScissorsInFixedPoint[pViewportIndex[01]].ymax,
-pScissorsInFixedPoint[pViewportIndex[00]].ymax);
-}
-
-static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const 
uint32_t *pViewportIndex,
-simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, 
simd16scalari &scisYmax)
-{
-scisXmin = _simd16_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[15]].xmin,
-pScissorsInFixedPoint[pViewportIndex[14]].xmin,
-pScissorsInFixedPoint[pViewportIndex[13]].xmin,
-pScissorsInFixedPoint[pViewportIndex[12]].xmin,
-pScissorsInFixedPoint[pViewportIndex[11]].xmin,
-pScissorsInFixedPoint[pViewportIndex[10]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 9]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 8]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 7]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 6]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 5]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 4]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 3]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 2]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 1]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 0]].xmin);
-
-scisYmin = _simd16_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[15]].ymin,
-pScissorsInFixedPoint[pViewportInd

Mesa (master): swr/rast: Rework thread binding parameters for machine partitioning

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 20f9006603139a479b756c593c04a540041e3471
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=20f9006603139a479b756c593c04a540041e3471

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Dec 11 17:45:58 2017 -0600

swr/rast: Rework thread binding parameters for machine partitioning

Add BASE_NUMA_NODE, BASE_CORE, BASE_THREAD parameters to
SwrCreateContext.

Add optional SWR_API_THREADING_INFO parameter to SwrCreateContext to
control reservation of API threads.

Add SwrBindApiThread() function to allow binding of API threads to
reserved HW threads.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/codegen/knob_defs.py|  29 +-
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  40 ++-
 src/gallium/drivers/swr/rasterizer/core/api.h  |  33 +++
 src/gallium/drivers/swr/rasterizer/core/context.h  |   1 +
 .../drivers/swr/rasterizer/core/threads.cpp| 299 +++--
 src/gallium/drivers/swr/rasterizer/core/threads.h  |   4 +
 .../drivers/swr/rasterizer/core/tilemgr.cpp|   4 +-
 7 files changed, 322 insertions(+), 88 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py 
b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
index 09e3124602..30803927e3 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
@@ -62,15 +62,33 @@ KNOBS = [
 'category'  : 'perf',
 }],
 
-['MAX_NUMA_NODES', {
+['BASE_NUMA_NODE', {
 'type'  : 'uint32_t',
 'default'   : '0',
+'desc'  : ['Starting NUMA node index to use when allocating 
compute resources.',
+   'Setting this to a non-zero value will reduce the 
maximum # of NUMA nodes used.'],
+'category'  : 'perf',
+'advanced'  : True,
+}],
+
+['MAX_NUMA_NODES', {
+'type'  : 'uint32_t',
+'default'   : '1' if sys.platform == 'win32' else '0',
 'desc'  : ['Maximum # of NUMA-nodes per system used for worker 
threads',
'  0 == ALL NUMA-nodes in the system',
'  N == Use at most N NUMA-nodes for rendering'],
 'category'  : 'perf',
 }],
 
+['BASE_CORE', {
+'type'  : 'uint32_t',
+'default'   : '0',
+'desc'  : ['Starting core index to use when allocating compute 
resources.',
+   'Setting this to a non-zero value will reduce the 
maximum # of cores used.'],
+'category'  : 'perf',
+'advanced'  : True,
+}],
+
 ['MAX_CORES_PER_NUMA_NODE', {
 'type'  : 'uint32_t',
 'default'   : '0',
@@ -80,6 +98,15 @@ KNOBS = [
 'category'  : 'perf',
 }],
 
+['BASE_THREAD', {
+'type'  : 'uint32_t',
+'default'   : '0',
+'desc'  : ['Starting thread index to use when allocating compute 
resources.',
+   'Setting this to a non-zero value will reduce the 
maximum # of threads used.'],
+'category'  : 'perf',
+'advanced'  : True,
+}],
+
 ['MAX_THREADS_PER_CORE', {
 'type'  : 'uint32_t',
 'default'   : '1',
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 9265440904..25a3f34841 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -95,16 +95,32 @@ HANDLE SwrCreateContext(
 pContext->dsRing[dc].pArena = new 
CachingArena(pContext->cachingArenaAllocator);
 }
 
-pContext->threadInfo.MAX_WORKER_THREADS= KNOB_MAX_WORKER_THREADS;
-pContext->threadInfo.MAX_NUMA_NODES= KNOB_MAX_NUMA_NODES;
-pContext->threadInfo.MAX_CORES_PER_NUMA_NODE   = 
KNOB_MAX_CORES_PER_NUMA_NODE;
-pContext->threadInfo.MAX_THREADS_PER_CORE  = KNOB_MAX_THREADS_PER_CORE;
-pContext->threadInfo.SINGLE_THREADED   = KNOB_SINGLE_THREADED;
-
 if (pCreateInfo->pThreadInfo)
 {
 pContext->threadInfo = *pCreateInfo->pThreadInfo;
 }
+else
+{
+pContext->threadInfo.MAX_WORKER_THREADS = 
KNOB_MAX_WORKER_THREADS;
+pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE;
+pContext->threadInfo.BASE_CORE  = KNOB_BASE_CORE;
+pContext->threadInfo.BASE_THREAD= KNOB_BASE_THREAD;
+pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
+pContext->threadInfo.MAX_CORES_PER_NUMA_NODE= 
KNOB_MAX_CORES_PER_NUMA_NODE;
+pContext->threadInfo.MAX_THREADS_PER_CORE   = 
KNOB_MAX_THREADS_PER_CORE;
+pContext->threadInfo.SINGLE_THREADED= KNOB_SINGLE_THREADED;
+}
+
+if (pCreateInfo->pApiThreadInfo)
+{
+pCo

Mesa (master): swr/rast: Remove no-op VBROADCAST of vID

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: ca59b2e75ccb0de2ef7f72751a52b035d060d1bc
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=ca59b2e75ccb0de2ef7f72751a52b035d060d1bc

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Dec 11 08:38:46 2017 -0600

swr/rast: Remove no-op VBROADCAST of vID

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index ec3b5eafcc..1312ac0009 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -3101,7 +3101,7 @@ Value* FetchJit::GenerateCompCtrlVector(const 
ComponentControl ctrl)
 #else
 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, 
SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
 #endif
-return VBROADCAST(pId);
+return pId;
 }
 case StoreInstanceId:
 {
@@ -3129,7 +3129,7 @@ Value* FetchJit::GenerateCompCtrlVector2(const 
ComponentControl ctrl)
 
 Value *pId = JOIN2(pId_lo, pId_hi);
 
-return VBROADCAST2(pId);
+return pId;
 }
 case StoreInstanceId:
 {

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Move more RTAI handling out of binner

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: f475ac3c40c6204ef73ad5d07d9ae6932822cc2f
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=f475ac3c40c6204ef73ad5d07d9ae6932822cc2f

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Dec 14 13:49:56 2017 -0600

swr/rast: Move more RTAI handling out of binner

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 13 +
 src/gallium/drivers/swr/rasterizer/core/clip.h |  1 +
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 7ef87c4443..9aa9f9e79b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -1023,18 +1023,7 @@ void BinPostSetupPointsImpl(
 SIMD_T::store_si(reinterpret_cast(aMTBottom), bbox.ymax);
 
 // store render target array index
-OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
-if (state.backendState.readRenderTargetArrayIndex)
-{
-typename SIMD_T::Vec4 vRtai[2];
-pa.Assemble(VERTEX_SGV_SLOT, vRtai);
-typename SIMD_T::Integer vRtaii = 
SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
-SIMD_T::store_si(reinterpret_cast(aRTAI), vRtaii);
-}
-else
-{
-SIMD_T::store_si(reinterpret_cast(aRTAI), SIMD_T::setzero_si());
-}
+const uint32_t *aRTAI = reinterpret_cast();
 
 OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
 SIMD_T::store_ps(reinterpret_cast(aPointSize), vPointSize);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index e5e00d49b0..592c9bfa73 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -646,6 +646,7 @@ public:
 
 PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast([0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, 
NumVertsPerPrim, clipTopology);
 clipPA.viewportArrayActive = pa.viewportArrayActive;
+clipPA.rtArrayActive = pa.rtArrayActive;
 
 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 
0x1f, 0x3f, 0x7f };
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Fix cache of API thread event manager

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 12adf2c8152d0500b2e1149ad0f5397c4955df86
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=12adf2c8152d0500b2e1149ad0f5397c4955df86

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Dec 13 17:52:52 2017 -0600

swr/rast: Fix cache of API thread event manager

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/api.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 25a3f34841..09b482dcc0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -166,7 +166,7 @@ HANDLE SwrCreateContext(
 
 #if defined(KNOB_ENABLE_AR)
 // cache the API thread event manager, for use with sim layer
-pCreateInfo->hArEventManager = pContext->pArContext[16];
+pCreateInfo->hArEventManager = 
pContext->pArContext[pContext->NumWorkerThreads + 1];
 #endif
 
 // State setup AFTER context is fully initialized

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Pull most of the VPAI manipulation out of the binner/clipper

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 8b069207965b8cbfcb9de0e06ff03dadc8dbd291
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=8b069207965b8cbfcb9de0e06ff03dadc8dbd291

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Dec  7 11:59:45 2017 -0600

swr/rast: Pull most of the VPAI manipulation out of the binner/clipper

Move out of binner/clipper; hand them down from the frontend code instead.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 124 ++---
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   |  25 ++---
 src/gallium/drivers/swr/rasterizer/core/clip.h |  58 +++---
 src/gallium/drivers/swr/rasterizer/core/context.h  |   4 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   | 112 ++-
 src/gallium/drivers/swr/rasterizer/core/frontend.h |   8 +-
 src/gallium/drivers/swr/rasterizer/core/pa.h   |   4 +-
 7 files changed, 177 insertions(+), 158 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 22996c5a5d..a664ed812f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -307,7 +307,8 @@ void SIMDCALL BinTrianglesImpl(
 uint32_t workerId,
 typename SIMD_T::Vec4 tri[3],
 uint32_t triMask,
-typename SIMD_T::Integer const &primID)
+typename SIMD_T::Integer const &primID,
+typename SIMD_T::Integer const &viewportIdx)
 {
 SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -323,31 +324,6 @@ void SIMDCALL BinTrianglesImpl(
 typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
 typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
 
-typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
-typename SIMD_T::Vec4 vpiAttrib[3];
-typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
-
-if (state.backendState.readViewportArrayIndex)
-{
-pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
-
-vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-}
-
-
-if (state.backendState.readViewportArrayIndex) // VPAIOffsets are 
guaranteed 0-15 -- no OOB issues if they are offsets from 0 
-{
-// OOB indices => forced to zero.
-vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
-typename SIMD_T::Integer vNumViewports = 
SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
-viewportIdx = SIMD_T::and_si(vClearMask, vpai);
-}
-else
-{
-viewportIdx = vpai;
-}
-
 if (feState.vpTransformDisable)
 {
 // RHW is passed in directly when VP transform is disabled
@@ -375,7 +351,7 @@ void SIMDCALL BinTrianglesImpl(
 tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
 
 // Viewport transform to screen space coords
-if (state.backendState.readViewportArrayIndex)
+if (pa.viewportArrayActive)
 {
 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
 }
@@ -568,8 +544,8 @@ void SIMDCALL BinTrianglesImpl(
 /// @todo:  Look at speeding this up -- weigh against corresponding costs 
in rasterizer.
 {
 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
+if (pa.viewportArrayActive)
 
-if (state.backendState.readViewportArrayIndex)
 {
 GatherScissors([0], pViewportIndex, 
scisXmin, scisYmin, scisXmax, scisYmax);
 }
@@ -786,9 +762,10 @@ void BinTriangles(
 uint32_t workerId,
 simdvector tri[3],
 uint32_t triMask,
-simdscalari const &primID)
+simdscalari const &primID,
+simdscalari const &viewportIdx)
 {
-BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, 
triMask, primID);
+BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, 
triMask, primID, viewportIdx);
 }
 
 #if USE_SIMD16_FRONTEND
@@ -799,9 +776,10 @@ void SIMDCALL BinTriangles_simd16(
 uint32_t workerId,
 simd16vector tri[3],
 uint32_t triMask,
-simd16scalari const &primID)
+simd16scalari const &primID,
+simd16scalari const &viewportIdx)
 {
-BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, 
triMask, primID);
+BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, 
triMask, primID, viewportIdx);
 }
 
 #endif
@@ -1026,7 +1004,7 @@ void BinPostSetupPointsImpl(
 {
 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
 
-if (state.backendState.readViewportArrayIndex)
+if (pa.viewportArrayActive)
 {
 GatherScissors([0], pViewportIndex, 
scisXmin, scisYmin, scisXmax, scisYmax);
 }
@@ -1176,38 +1154,13 @@ void BinPointsImpl(
 uint32_t workerId,
 typename SIMD_T::Vec4 prim[3],
 uint32_t primMask,
-typename SIMD_T::Int

Mesa (master): swr/rast: Replace VPSRL with LSHR

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: c68b2d5c79239e721d8825e373a02fc843d15f6a
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=c68b2d5c79239e721d8825e373a02fc843d15f6a

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Dec 12 14:23:50 2017 -0600

swr/rast: Replace VPSRL with LSHR

Replace use of x86 intrinsic with general llvm IR instruction.

Generates the same final assembly.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  2 --
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 30 --
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  5 
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  8 +++---
 4 files changed, 4 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 8bbf36d9b8..9544353eb9 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -47,8 +47,6 @@ intrinsics = [
 ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
 ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
-['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
-['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']],
 ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
 ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']],
 ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 684c9fac54..bdcafd28a3 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -809,36 +809,6 @@ namespace SwrJit
 }
 
 #if USE_SIMD16_BUILDER
-Value *Builder::PSRLI(Value *a, Value *imm)
-{
-return VPSRLI(a, imm);
-}
-
-Value *Builder::PSRLI_16(Value *a, Value *imm)
-{
-Value *result = VUNDEF2_I();
-
-// use avx512 shift right instruction if available
-if (JM()->mArch.AVX512F())
-{
-result = VPSRLI_16(a, imm);
-}
-else
-{
-Value *a0 = EXTRACT2_I(a, 0);
-Value *a1 = EXTRACT2_I(a, 1);
-
-Value *result0 = PSRLI(a0, imm);
-Value *result1 = PSRLI(a1, imm);
-
-result = JOIN2(result0, result1);
-}
-
-return result;
-}
-
-#endif
-#if USE_SIMD16_BUILDER
 //
 /// @brief
 Value *Builder::EXTRACT2_F(Value *a2, uint32_t imm)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 6c883d8f52..98bc563351 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -143,11 +143,6 @@ void GATHER4DD(const SWR_FORMAT_INFO , Value* 
pSrcBase, Value* byteOffsets,
 
 Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t 
scale = 1);
 
-#if USE_SIMD16_BUILDER
-Value *PSRLI(Value *a, Value *imm);
-Value *PSRLI_16(Value *a, Value *imm);
-
-#endif
 void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
 
 void Shuffle8bpcGather4(const SWR_FORMAT_INFO , Value* vGatherInput, 
Value* vGatherOutput[], bool bPackedOutput);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 1312ac0009..8d97ddfdc9 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1422,12 +1422,12 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 // But, we know that elements must be aligned 
for FETCH. :)
 // Right shift the offset by a bit and then 
scale by 2 to remove the sign extension.
 #if USE_SIMD16_BUILDER
-Value *shiftedOffsets = VPSRLI_16(vOffsets16, 
C(1));
+Value *shiftedOffsets = LSHR(vOffsets16, 1);
 pVtxSrc2[currentVertexElement] = 
GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets, vGatherMask16, 2);
 
 #else
-Value *vShiftedOffsets = VPSRLI(vOffsets, 
C(1));
-Value *vShiftedOffsets2 = VPSRLI(vOffsets2, 
C(1));
+Value *vShiftedOffsets = LSHR(vOffsets, 1);
+Value *vShiftedOffsets2 = LS

Mesa (master): swr/rast: Corrections to multi-scissor handling

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 6d5275498a9a8e571048ca3dd6c99f693b49a7ed
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=6d5275498a9a8e571048ca3dd6c99f693b49a7ed

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Nov 29 15:14:20 2017 -0600

swr/rast: Corrections to multi-scissor handling

binner's GatherScissors() will be turned into a real gather in the not
too distant future.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 176 ++---
 1 file changed, 88 insertions(+), 88 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 52375f8956..8a5356b168 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -226,117 +226,117 @@ static void GatherScissors(const SWR_RECT 
*pScissorsInFixedPoint, const uint32_t
 simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, 
simdscalari &scisYmax)
 {
 scisXmin = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[0]].xmin,
-pScissorsInFixedPoint[pViewportIndex[1]].xmin,
-pScissorsInFixedPoint[pViewportIndex[2]].xmin,
-pScissorsInFixedPoint[pViewportIndex[3]].xmin,
-pScissorsInFixedPoint[pViewportIndex[4]].xmin,
-pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+pScissorsInFixedPoint[pViewportIndex[7]].xmin,
 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
-pScissorsInFixedPoint[pViewportIndex[7]].xmin);
+pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+pScissorsInFixedPoint[pViewportIndex[4]].xmin,
+pScissorsInFixedPoint[pViewportIndex[3]].xmin,
+pScissorsInFixedPoint[pViewportIndex[2]].xmin,
+pScissorsInFixedPoint[pViewportIndex[1]].xmin,
+pScissorsInFixedPoint[pViewportIndex[0]].xmin);
 scisYmin = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[0]].ymin,
-pScissorsInFixedPoint[pViewportIndex[1]].ymin,
-pScissorsInFixedPoint[pViewportIndex[2]].ymin,
-pScissorsInFixedPoint[pViewportIndex[3]].ymin,
-pScissorsInFixedPoint[pViewportIndex[4]].ymin,
-pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+pScissorsInFixedPoint[pViewportIndex[7]].ymin,
 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
-pScissorsInFixedPoint[pViewportIndex[7]].ymin);
+pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+pScissorsInFixedPoint[pViewportIndex[4]].ymin,
+pScissorsInFixedPoint[pViewportIndex[3]].ymin,
+pScissorsInFixedPoint[pViewportIndex[2]].ymin,
+pScissorsInFixedPoint[pViewportIndex[1]].ymin,
+pScissorsInFixedPoint[pViewportIndex[0]].ymin);
 scisXmax = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[0]].xmax,
-pScissorsInFixedPoint[pViewportIndex[1]].xmax,
-pScissorsInFixedPoint[pViewportIndex[2]].xmax,
-pScissorsInFixedPoint[pViewportIndex[3]].xmax,
-pScissorsInFixedPoint[pViewportIndex[4]].xmax,
-pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+pScissorsInFixedPoint[pViewportIndex[7]].xmax,
 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
-pScissorsInFixedPoint[pViewportIndex[7]].xmax);
+pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+pScissorsInFixedPoint[pViewportIndex[4]].xmax,
+pScissorsInFixedPoint[pViewportIndex[3]].xmax,
+pScissorsInFixedPoint[pViewportIndex[2]].xmax,
+pScissorsInFixedPoint[pViewportIndex[1]].xmax,
+pScissorsInFixedPoint[pViewportIndex[0]].xmax);
 scisYmax = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[0]].ymax,
-pScissorsInFixedPoint[pViewportIndex[1]].ymax,
-pScissorsInFixedPoint[pViewportIndex[2]].ymax,
-pScissorsInFixedPoint[pViewportIndex[3]].ymax,
-pScissorsInFixedPoint[pViewportIndex[4]].ymax,
-pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+pScissorsInFixedPoint[pViewportIndex[7]].ymax,
 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
-pScissorsInFixedPoint[pViewportIndex[7]].ymax);
+pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+pScissorsInFixedPoint[pViewportIndex[4]].ymax,
+pScissorsInFixedPoint[pViewportIndex[3]].ymax,
+pScissorsInFixedPoint[pViewportIndex[2]].ymax,
+pScissorsInFixedPoint[pViewportIndex[01]].ymax,
+pScissorsInFixedPoint[pViewportIndex[00]].ymax);
 }
 
 static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const 
uint32_t *pViewportIndex,
 simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, 
simd16scalari &scisYmax)
 {
 scisXmin = _simd16_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[0]].xmin,
-pScissorsInFixedPoint[pViewportIndex[1]].xmin,
-pScissorsInFixedPoint[pViewportIndex[2]].xmin,
-pScissorsInFixedPoint[pViewportIn

Mesa (master): swr/rast: Rewrite Shuffle8bpcGatherd using shuffle

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: cdb61d45cd0ca80c3545c1942933abdfbcf7683b
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=cdb61d45cd0ca80c3545c1942933abdfbcf7683b

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Dec  6 10:37:41 2017 -0600

swr/rast: Rewrite Shuffle8bpcGatherd using shuffle

Ease future code maintenance, prepare for folding simd8 and simd16 versions.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 244 ++---
 1 file changed, 62 insertions(+), 182 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 67a4a04072..a847cb74da 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -2014,206 +2014,86 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs 
)
 const uint32_t ()[4] = std::get<9>(args);
 
 // cast types
-Type* vGatherTy = mSimdInt32Ty;
 Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is 
units of 32 bits
 
-// have to do extra work for sign extending
-if ((extendType == Instruction::CastOps::SExt) || (extendType == 
Instruction::CastOps::SIToFP)){
-Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints 
in a 128bit lane
-Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 
128), mVWidth / 4); // vwidth is units of 32 bits
-
-// shuffle mask, including any swizzling
-const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
-const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
-Value* vConstMask = C({char(x), char(x+4), char(x+8), char(x+12),
-char(y), char(y+4), char(y+8), char(y+12),
-char(z), char(z+4), char(z+8), char(z+12),
-char(w), char(w+4), char(w+8), char(w+12),
-char(x), char(x+4), char(x+8), char(x+12),
-char(y), char(y+4), char(y+8), char(y+12),
-char(z), char(z+4), char(z+8), char(z+12),
-char(w), char(w+4), char(w+8), char(w+12)});
-
-Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), 
vConstMask), vGatherTy);
-// after pshufb: group components together in each 128bit lane
-// 256i - 01234567
-//       
-
-Value* vi128XY = nullptr;
-if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
-vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 
0, 0})), v128Ty);
-// after PERMD: move and pack xy and zw components in low 64 bits 
of each 128bit lane
-// 256i - 01234567
-//  dcdc dcdc   dcdc dcdc (dc - don't care)
-}
-
-// do the same for zw components
-Value* vi128ZW = nullptr;
-if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
-vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 
0, 0})), v128Ty);
-}
-
-// init denormalize variables if needed
-Instruction::CastOps fpCast;
-Value* conversionFactor;
-
-switch (conversionType)
-{
-case CONVERT_NORMALIZED:
-fpCast = Instruction::CastOps::SIToFP;
-conversionFactor = VIMMED1((float)(1.0 / 127.0));
-break;
-case CONVERT_SSCALED:
-fpCast = Instruction::CastOps::SIToFP;
-conversionFactor = VIMMED1((float)(1.0));
-break;
-case CONVERT_USCALED:
-SWR_INVALID("Type should not be sign extended!");
-conversionFactor = nullptr;
-break;
-default:
-SWR_ASSERT(conversionType == CONVERT_NONE);
-conversionFactor = nullptr;
-break;
-}
+for (uint32_t i = 0; i < 4; i++)
+{
+if (!isComponentEnabled(compMask, i))
+continue;
 
-// sign extend all enabled components. If we have a fill 
vVertexElements, output to current simdvertex
-for (uint32_t i = 0; i < 4; i++)
+if (compCtrl[i] == ComponentControl::StoreSrc)
 {
-if (isComponentEnabled(compMask, i))
-{
-if (compCtrl[i] == ComponentControl::StoreSrc)
-{
-// if x or z, extract 128bits from lane 0, else for y or 
w, extract from lane 1
-uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
-// if x or y, use vi128XY permute result, else use vi128ZW
-Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
-// sign extend
-vVertexEl

Mesa (master): swr/rast: SIMD16 Fetch - Fully widen 32-bit float vertex components

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: df54678ba0733380961947d25830ae9695c77d7e
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=df54678ba0733380961947d25830ae9695c77d7e

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Dec  7 18:37:07 2017 -0600

swr/rast: SIMD16 Fetch - Fully widen 32-bit float vertex components

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |   3 +-
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp |  41 -
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |   7 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 175 ++---
 4 files changed, 194 insertions(+), 32 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 44fc857371..ac8b3badf6 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -44,9 +44,10 @@ inst_aliases = {
 intrinsics = [
 ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
-['VGATHERPS2', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
+['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
 ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
+['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']],
 ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
 ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']],
 ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 04092541e5..b2210db717 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -639,7 +639,7 @@ namespace SwrJit
 }
 
 #if USE_SIMD16_BUILDER
-Value *Builder::GATHERPS2(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale)
+Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale)
 {
 Value *vGather = VUNDEF2_F();
 
@@ -649,7 +649,7 @@ namespace SwrJit
 // force mask to , required by vgather2
 Value *mask = BITCAST(vMask, mInt16Ty);
 
-vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
+vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
 }
 else
 {
@@ -659,8 +659,10 @@ namespace SwrJit
 Value *indices0 = EXTRACT2_I(vIndices, 0);
 Value *indices1 = EXTRACT2_I(vIndices, 1);
 
-Value *mask0 = EXTRACT2_I(vMask, 0);
-Value *mask1 = EXTRACT2_I(vMask, 1);
+Value *vmask16 = VMASK2(vMask);
+
+Value *mask0 = MASK(EXTRACT2_I(vmask16, 0));  // TODO: do this 
better..
+Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
 
 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
@@ -772,6 +774,37 @@ namespace SwrJit
 }
 
 #if USE_SIMD16_BUILDER
+Value *Builder::PSRLI(Value *a, Value *imm)
+{
+return VPSRLI(a, imm);
+}
+
+Value *Builder::PSRLI_16(Value *a, Value *imm)
+{
+Value *result = VUNDEF2_I();
+
+// use avx512 shift right instruction if available
+if (JM()->mArch.AVX512F())
+{
+result = VPSRLI_16(a, imm);
+}
+else
+{
+Value *a0 = EXTRACT2_I(a, 0);
+Value *a1 = EXTRACT2_I(a, 1);
+
+Value *result0 = PSRLI(a0, imm);
+Value *result1 = PSRLI(a1, imm);
+
+result = INSERT2_I(result, result0, 0);
+result = INSERT2_I(result, result1, 1);
+}
+
+return result;
+}
+
+#endif
+#if USE_SIMD16_BUILDER
 //
 /// @brief
 Value *Builder::EXTRACT2_F(Value *a2, uint32_t imm)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index d858a827db..62360a3ad7 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -130,7 +130,7 @@ void Gather4(const SWR_FORMAT format, Value* pSrcBase, 
Value* byteOffsets,
 
 Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t 
scale = 1);
 #if USE_SIMD16_BUILDER
-Value *GATHERPS2(Value *src, Value *pBase, Value 

Mesa (master): swr/rast: Pull of RTAI gather & offset out of clip/bin code

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 182cc51a50492926ebf72d4cd38f1e574c768e72
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=182cc51a50492926ebf72d4cd38f1e574c768e72

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Dec 11 15:51:46 2017 -0600

swr/rast: Pull of RTAI gather & offset out of clip/bin code

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 118 +++-
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   |  30 ++--
 src/gallium/drivers/swr/rasterizer/core/clip.h |  35 +++--
 src/gallium/drivers/swr/rasterizer/core/context.h  |   4 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   | 153 +++--
 src/gallium/drivers/swr/rasterizer/core/frontend.h |   8 +-
 src/gallium/drivers/swr/rasterizer/core/pa.h   |   1 +
 7 files changed, 203 insertions(+), 146 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index a664ed812f..7ef87c4443 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -45,7 +45,8 @@ void BinPostSetupLinesImpl(
 typename SIMD_T::Float recipW[],
 uint32_t primMask,
 typename SIMD_T::Integer const ,
-typename SIMD_T::Integer const );
+typename SIMD_T::Integer const ,
+typename SIMD_T::Integer const );
 
 template 
 void BinPostSetupPointsImpl(
@@ -55,7 +56,8 @@ void BinPostSetupPointsImpl(
 typename SIMD_T::Vec4 prim[],
 uint32_t primMask,
 typename SIMD_T::Integer const ,
-typename SIMD_T::Integer const );
+typename SIMD_T::Integer const ,
+typename SIMD_T::Integer const );
 
 //
 /// @brief Processes attributes for the backend based on linkage mask and
@@ -308,9 +310,11 @@ void SIMDCALL BinTrianglesImpl(
 typename SIMD_T::Vec4 tri[3],
 uint32_t triMask,
 typename SIMD_T::Integer const ,
-typename SIMD_T::Integer const )
+typename SIMD_T::Integer const ,
+typename SIMD_T::Integer const )
 {
 SWR_CONTEXT *pContext = pDC->pContext;
+const uint32_t *aRTAI = reinterpret_cast();
 
 AR_BEGIN(FEBinTriangles, pDC->drawId);
 
@@ -604,21 +608,21 @@ endBinTriangles:
 recipW[0] = vRecipW0;
 recipW[1] = vRecipW1;
 
-BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx);
+BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx, rtIdx);
 
 line[0] = tri[1];
 line[1] = tri[2];
 recipW[0] = vRecipW1;
 recipW[1] = vRecipW2;
 
-BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx);
+BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx, rtIdx);
 
 line[0] = tri[2];
 line[1] = tri[0];
 recipW[0] = vRecipW2;
 recipW[1] = vRecipW0;
 
-BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx);
+BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx, rtIdx);
 
 AR_END(FEBinTriangles, 1);
 return;
@@ -626,9 +630,9 @@ endBinTriangles:
 else if (rastState.fillMode == SWR_FILLMODE_POINT)
 {
 // Bin 3 points
-BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, [0], 
triMask, primID, viewportIdx);
-BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, [1], 
triMask, primID, viewportIdx);
-BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, [2], 
triMask, primID, viewportIdx);
+BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, [0], 
triMask, primID, viewportIdx, rtIdx);
+BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, [1], 
triMask, primID, viewportIdx, rtIdx);
+BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, [2], 
triMask, primID, viewportIdx, rtIdx);
 
 AR_END(FEBinTriangles, 1);
 return;
@@ -659,22 +663,6 @@ endBinTriangles:
 TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
 TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
 
-// store render target array index
-OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
-if (state.backendState.readRenderTargetArrayIndex)
-{
-typename SIMD_T::Vec4 vRtai[3];
-pa.Assemble(VERTEX_SGV_SLOT, vRtai);
-typename SIMD_T::Integer vRtaii;
-vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
-SIMD_T::store_si(reinterpret_cast(aRTAI), 
vRtaii);
-}
-else
-{
-SIMD_T::store_si(reinterpret_

Mesa (master): swr/rast: Replace INSERT2 vextract/vinsert with JOIN2 vshuffle

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: fa3105cdb54415d7b93be932351966d3108511e4
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=fa3105cdb54415d7b93be932351966d3108511e4

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Fri Dec  8 17:33:23 2017 -0600

swr/rast: Replace INSERT2 vextract/vinsert with JOIN2 vshuffle

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 38 ++---
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  5 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 92 ++
 3 files changed, 30 insertions(+), 105 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index b2210db717..3a486e4c1e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -667,8 +667,7 @@ namespace SwrJit
 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
 
-vGather = INSERT2_F(vGather, gather0, 0);
-vGather = INSERT2_F(vGather, gather1, 1);
+vGather = JOIN2(gather0, gather1);
 }
 
 return vGather;
@@ -796,8 +795,7 @@ namespace SwrJit
 Value *result0 = PSRLI(a0, imm);
 Value *result1 = PSRLI(a1, imm);
 
-result = INSERT2_I(result, result0, 0);
-result = INSERT2_I(result, result1, 1);
+result = JOIN2(result0, result1);
 }
 
 return result;
@@ -835,37 +833,13 @@ namespace SwrJit
 return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty);
 }
 
-//
-/// @brief
-Value *Builder::INSERT2_F(Value *a2, Value *b, uint32_t imm)
+Value *Builder::JOIN2(Value *a, Value *b)
 {
-const uint32_t i0 = (imm > 0) ? mVWidth : 0;
-
-Value *result = BITCAST(a2, mSimd2FP32Ty);
-
-for (uint32_t i = 0; i < mVWidth; i += 1)
-{
-#if 1
-if (!b->getType()->getScalarType()->isFloatTy())
-{
-b = BITCAST(b, mSimdFP32Ty);
-}
-
-#endif
-Value *temp = VEXTRACT(b, C(i));
-
-result = VINSERT(result, temp, C(i0 + i));
-}
-
-return result;
+return VSHUFFLE(a, b,
+{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15});
 }
-
-Value *Builder::INSERT2_I(Value *a2, Value *b, uint32_t imm)
-{
-return BITCAST(INSERT2_F(a2, b, imm), mSimd2Int32Ty);
-}
-
 #endif
+
 //
 /// @brief convert x86  mask to llvm  mask
 Value *Builder::MASK(Value *vmask)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 62360a3ad7..231bd6ad85 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -119,10 +119,9 @@ Value *VMASK2(Value *mask);
 #if USE_SIMD16_BUILDER
 Value *EXTRACT2_F(Value *a2, uint32_t imm);
 Value *EXTRACT2_I(Value *a2, uint32_t imm);
-Value *INSERT2_F(Value *a2, Value *b, uint32_t imm);
-Value *INSERT2_I(Value *a2, Value *b, uint32_t imm);
-
+Value *JOIN2(Value *a, Value *b);
 #endif
+
 Value *MASKLOADD(Value* src, Value* mask);
 
 void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index c960dc77fb..e0a0770560 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -960,10 +960,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 // offset indices by baseVertex
 #if USE_SIMD16_GATHERS
 #if USE_SIMD16_BUILDER
-Value *vIndices16 = VUNDEF2_I();
-
-vIndices16 = INSERT2_I(vIndices16, vIndices,  0);
-vIndices16 = INSERT2_I(vIndices16, vIndices2, 1);
+Value *vIndices16 = JOIN2(vIndices, vIndices2);
 
 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
 #else
@@ -982,10 +979,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 // offset indices by baseVertex
 #if USE_SIMD16_GATHERS
 #if USE_SIMD16_BUILDER
-Value *vIndices16 = VUNDEF2_I();
-
-vIndices16 = INSERT2_I(vIndices16, vIndices,  0);
-vIndices16 = INSERT2_I(vIndices16, vIndices2, 1);
+Value *vIndices16 = JOIN2(vIndices, vIndices2);
 
 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
 #else
@@ -1206,9 +1200,7 @@ void FetchJit::JitGatherVertices

Mesa (master): swr/rast: EXTRACT2 changed from vextract/vinsert to vshuffle

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 11a9d4f9b53722a491d9f23e848a02b741febd44
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=11a9d4f9b53722a491d9f23e848a02b741febd44

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Dec 14 13:39:29 2017 -0600

swr/rast: EXTRACT2 changed from vextract/vinsert to vshuffle

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 60 ++
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  3 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 30 +--
 3 files changed, 32 insertions(+), 61 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index bdcafd28a3..0774889af1 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -653,16 +653,14 @@ namespace SwrJit
 }
 else
 {
-Value *src0 = EXTRACT2_F(vSrc, 0);
-Value *src1 = EXTRACT2_F(vSrc, 1);
+Value *src0 = EXTRACT2(vSrc, 0);
+Value *src1 = EXTRACT2(vSrc, 1);
 
-Value *indices0 = EXTRACT2_I(vIndices, 0);
-Value *indices1 = EXTRACT2_I(vIndices, 1);
+Value *indices0 = EXTRACT2(vIndices, 0);
+Value *indices1 = EXTRACT2(vIndices, 1);
 
-Value *vmask16 = VMASK2(vMask);
-
-Value *mask0 = MASK(EXTRACT2_I(vmask16, 0));  // TODO: do this 
better..
-Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
+Value *mask0 = EXTRACT2(vMask, 0);
+Value *mask1 = EXTRACT2(vMask, 1);
 
 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
@@ -738,16 +736,14 @@ namespace SwrJit
 }
 else
 {
-Value *src0 = EXTRACT2_F(vSrc, 0);
-Value *src1 = EXTRACT2_F(vSrc, 1);
-
-Value *indices0 = EXTRACT2_I(vIndices, 0);
-Value *indices1 = EXTRACT2_I(vIndices, 1);
+Value *src0 = EXTRACT2(vSrc, 0);
+Value *src1 = EXTRACT2(vSrc, 1);
 
-Value *vmask16 = VMASK2(vMask);
+Value *indices0 = EXTRACT2(vIndices, 0);
+Value *indices1 = EXTRACT2(vIndices, 1);
 
-Value *mask0 = MASK(EXTRACT2_I(vmask16, 0));  // TODO: do this 
better..
-Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
+Value *mask0 = EXTRACT2(vMask, 0);
+Value *mask1 = EXTRACT2(vMask, 1);
 
 Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
 Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
@@ -809,34 +805,12 @@ namespace SwrJit
 }
 
 #if USE_SIMD16_BUILDER
-//
-/// @brief
-Value *Builder::EXTRACT2_F(Value *a2, uint32_t imm)
-{
-const uint32_t i0 = (imm > 0) ? mVWidth : 0;
-
-Value *result = VUNDEF_F();
-
-for (uint32_t i = 0; i < mVWidth; i += 1)
-{
-#if 1
-if (!a2->getType()->getScalarType()->isFloatTy())
-{
-a2 = BITCAST(a2, mSimd2FP32Ty);
-}
-
-#endif
-Value *temp = VEXTRACT(a2, C(i0 + i));
-
-result = VINSERT(result, temp, C(i));
-}
-
-return result;
-}
-
-Value *Builder::EXTRACT2_I(Value *a2, uint32_t imm)
+Value *Builder::EXTRACT2(Value *x, uint32_t imm)
 {
-return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty);
+if (imm == 0)
+return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 
5, 6, 7});
+else
+return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 
12, 13, 14, 15});
 }
 
 Value *Builder::JOIN2(Value *a, Value *b)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 98bc563351..646ed0efb2 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -117,8 +117,7 @@ Value *VMASK2(Value *mask);
 //
 
 #if USE_SIMD16_BUILDER
-Value *EXTRACT2_F(Value *a2, uint32_t imm);
-Value *EXTRACT2_I(Value *a2, uint32_t imm);
+Value *EXTRACT2(Value *x, uint32_t imm);
 Value *JOIN2(Value *a, Value *b);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 8d97ddfdc9..aa911b58f3 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1078,14 +1078,12 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
   

Mesa (master): swr/rast: Convert gather masks to Nx1bit

2017-12-15 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 3ec98ab5d4fc9d53948fc9280caac83c70d9dc09
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=3ec98ab5d4fc9d53948fc9280caac83c70d9dc09

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Dec  4 15:16:13 2017 -0600

swr/rast: Convert gather masks to Nx1bit

Simplifies calling code, gets gather function interface closer to llvm's
masked_gather.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 20 +
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 34 +-
 2 files changed, 14 insertions(+), 40 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 0221106664..04092541e5 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -602,7 +602,7 @@ namespace SwrJit
 if(JM()->mArch.AVX2())
 {
 // force mask to <N x float>, required by vgather
-Value *mask = BITCAST(vMask, mSimdFP32Ty);
+Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
 
 vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
 }
@@ -617,7 +617,6 @@ namespace SwrJit
 vGather = VUNDEF_F();
 Value *vScaleVec = VIMMED1((uint32_t)scale);
 Value *vOffsets = MUL(vIndices,vScaleVec);
-Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth; ++i)
 {
 // single component byte index
@@ -627,7 +626,7 @@ namespace SwrJit
 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
 // pointer to the value to load if we're masking off a 
component
 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
-Value *selMask = VEXTRACT(mask,C(i));
+Value *selMask = VEXTRACT(vMask,C(i));
 // switch in a safe address to load if we're trying to access 
a vertex 
 Value *validAddress = SELECT(selMask, loadAddress, 
maskLoadAddress);
 Value *val = LOAD(validAddress);
@@ -648,7 +647,7 @@ namespace SwrJit
 if (JM()->mArch.AVX512F())
 {
 // force mask to , required by vgather2
-Value *mask = BITCAST(MASK2(vMask), mInt16Ty);
+Value *mask = BITCAST(vMask, mInt16Ty);
 
 vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
 }
@@ -689,7 +688,7 @@ namespace SwrJit
 // use avx2 gather instruction if available
 if(JM()->mArch.AVX2())
 {
-vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
+vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
 }
 else
 {
@@ -702,7 +701,6 @@ namespace SwrJit
 vGather = VUNDEF_I();
 Value *vScaleVec = VIMMED1((uint32_t)scale);
 Value *vOffsets = MUL(vIndices, vScaleVec);
-Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth; ++i)
 {
 // single component byte index
@@ -712,7 +710,7 @@ namespace SwrJit
 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 
0));
 // pointer to the value to load if we're masking off a 
component
 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
-Value *selMask = VEXTRACT(mask, C(i));
+Value *selMask = VEXTRACT(vMask, C(i));
 // switch in a safe address to load if we're trying to access 
a vertex 
 Value *validAddress = SELECT(selMask, loadAddress, 
maskLoadAddress);
 Value *val = LOAD(validAddress, C(0));
@@ -739,6 +737,7 @@ namespace SwrJit
 // use avx2 gather instruction if available
 if(JM()->mArch.AVX2())
 {
+vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, 
mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
 }
 else
@@ -752,7 +751,6 @@ namespace SwrJit
 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
 Value *vOffsets = MUL(vIndices,vScaleVec);
-Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth/2; ++i)
 {
 // single component byte index
@@ -762,7 +760,7 @@ namespace SwrJit
 loadAddress = 
BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
 // pointer to the value to load if we're masking off a 
component
 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
-Value *selMask = VEXTRACT(mask,C(i));
+

Mesa (master): swr/rast: Add alignment to transpose targets

2017-11-20 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 44025def06a8b8d1c019f611079a003964ea7511
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=44025def06a8b8d1c019f611079a003964ea7511

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Nov  7 15:24:25 2017 -0600

swr/rast: Add alignment to transpose targets

Needed to ensure alignment for avx512.

Fixes address sanitizer crash.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index b624ae69b3..9d1f0d8799 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -796,10 +796,10 @@ endBinTriangles:
 
 // transpose verts needed for backend
 /// @todo modify BE to take non-transformed verts
-simd4scalar vHorizX[SIMD_WIDTH];
-simd4scalar vHorizY[SIMD_WIDTH];
-simd4scalar vHorizZ[SIMD_WIDTH];
-simd4scalar vHorizW[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
 
 TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
 TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
@@ -1510,10 +1510,10 @@ void BinPostSetupLinesImpl(
 
 // transpose verts needed for backend
 /// @todo modify BE to take non-transformed verts
-simd4scalar vHorizX[SIMD_WIDTH];
-simd4scalar vHorizY[SIMD_WIDTH];
-simd4scalar vHorizZ[SIMD_WIDTH];
-simd4scalar vHorizW[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
 
 if (!primMask)
 {

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Enable AVX-512 targets in the jitter

2017-11-20 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 395a298fa52adf04062b9fee98258b25c0f047e9
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=395a298fa52adf04062b9fee98258b25c0f047e9

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Oct 31 16:46:59 2017 -0500

swr/rast: Enable AVX-512 targets in the jitter

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/knobs.h| 8 
 src/gallium/drivers/swr/rasterizer/jitter/JitManager.h | 2 --
 2 files changed, 10 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h 
b/src/gallium/drivers/swr/rasterizer/core/knobs.h
index fe0a044ae8..e00e2da650 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -61,18 +61,10 @@
 #define KNOB_SIMD_WIDTH 8
 #define KNOB_SIMD_BYTES 32
 #elif (KNOB_ARCH == KNOB_ARCH_AVX512)
-#if 0
-// not ready to enable this globally, enabled on the side (below)
 #define KNOB_ARCH_ISA AVX512F
 #define KNOB_ARCH_STR "AVX512"
-#define KNOB_SIMD_WIDTH 16
-#define KNOB_SIMD_BYTES 64
-#else
-#define KNOB_ARCH_ISA AVX2
-#define KNOB_ARCH_STR "AVX2"
 #define KNOB_SIMD_WIDTH 8
 #define KNOB_SIMD_BYTES 32
-#endif
 #else
 #error "Unknown architecture"
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index 46ffe276a0..c30a807222 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -102,14 +102,12 @@ public:
 bForceAVX2 = true;
 bForceAVX512 = false;
 }
-#if 0
 else if(isaRequest == "avx512")
 {
 bForceAVX = false;
 bForceAVX2 = false;
 bForceAVX512 = true;
 }
-#endif
 };
 
 bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); }

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Points with clipdistance can't go through simplepoints path

2017-11-20 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 37bb69fb88d632b4c50162c5d6b0ccd96f23d533
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=37bb69fb88d632b4c50162c5d6b0ccd96f23d533

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Oct 31 09:41:02 2017 -0500

swr/rast: Points with clipdistance can't go through simplepoints path

Fixes piglit glsl-1.20:vs-clip-vertex-primitives and
glsl-1.30:vs-clip-distance-primitives.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/frontend.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h 
b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index 5cb2f87c15..11099d6449 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -352,7 +352,8 @@ bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
 return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
 state.rastState.pointSize == 1.0f &&
 !state.rastState.pointParam &&
-!state.rastState.pointSpriteEnable);
+!state.rastState.pointSpriteEnable &&
+!state.backendState.clipDistanceMask);
 }
 
 INLINE

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Support flexible vertex layout for DS output

2017-11-20 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: e612231f20883aa31a6ed5b260872f1cdb84c223
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=e612231f20883aa31a6ed5b260872f1cdb84c223

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Oct 18 16:51:07 2017 -0500

swr/rast: Support flexible vertex layout for DS output

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 1 +
 src/gallium/drivers/swr/rasterizer/core/state.h  | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 211e9e4b07..e15b300979 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1237,6 +1237,7 @@ static void TessellationStages(
 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
+dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
 #if USE_SIMD16_FRONTEND
 dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); 
 // simd8 -> simd16
 #else
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index 2af384fd90..d11ffc69b0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -288,6 +288,7 @@ struct SWR_DS_CONTEXT
 uint32_tPrimitiveID;// IN: (SCALAR) PrimitiveID for the patch 
associated with the DS invocation
 uint32_tvectorOffset;   // IN: (SCALAR) vector index offset into 
SIMD data.
 uint32_tvectorStride;   // IN: (SCALAR) stride (in vectors) of 
output data per attribute-component
+uint32_toutVertexAttribOffset; // IN: (SCALAR) Offset to the 
attributes as processed by the next shader stage.
 ScalarPatch*pCpIn;  // IN: (SCALAR) Control patch
 simdscalar* pDomainU;   // IN: (SIMD) Domain Point U coords
 simdscalar* pDomainV;   // IN: (SIMD) Domain Point V coords
@@ -819,6 +820,7 @@ struct SWR_TS_STATE
 uint32_tnumHsOutputAttribs;
 uint32_tnumDsOutputAttribs;
 uint32_tdsAllocationSize;
+uint32_tdsOutVtxAttribOffset;
 
 // Offset to the start of the attributes of the input vertices, in 
simdvector units
 uint32_tvertexAttribOffset;

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Widen fetch shader to SIMD16

2017-11-20 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 08512c52de783233fd2292951095e2456da843a4
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=08512c52de783233fd2292951095e2456da843a4

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Oct 19 17:33:37 2017 -0500

swr/rast: Widen fetch shader to SIMD16

Widen fetch shader to SIMD16, enable SIMD16 types in the jitter,
and provide utility EXTRACT/INSERT SIMD8 <-> SIMD16 utility functions.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/builder.cpp  | 20 
 .../drivers/swr/rasterizer/jitter/builder.h| 16 ++
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 52 
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  9 
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 57 --
 5 files changed, 151 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 6a33ec265f..4b83a3204c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -41,6 +41,9 @@ namespace SwrJit
 : mpJitMgr(pJitMgr)
 {
 mVWidth = pJitMgr->mVWidth;
+#if USE_SIMD16_BUILDER
+mVWidth2 = pJitMgr->mVWidth * 2;
+#endif
 
 mpIRBuilder = >mBuilder;
 
@@ -65,17 +68,34 @@ namespace SwrJit
 mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
 mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
 mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
+#if USE_SIMD16_BUILDER
+mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2);
+mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2);
+mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2);
+mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2);
+mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2);
+mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2);
+mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4);
+mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5);
+#endif
 
 if (sizeof(uint32_t*) == 4)
 {
 mIntPtrTy = mInt32Ty;
 mSimdIntPtrTy = mSimdInt32Ty;
+#if USE_SIMD16_BUILDER
+mSimd2IntPtrTy = mSimd2Int32Ty;
+#endif
 }
 else
 {
 SWR_ASSERT(sizeof(uint32_t*) == 8);
+
 mIntPtrTy = mInt64Ty;
 mSimdIntPtrTy = mSimdInt64Ty;
+#if USE_SIMD16_BUILDER
+mSimd2IntPtrTy = mSimd2Int64Ty;
+#endif
 }
 }
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 8210e49b18..c6ab64e06e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -32,6 +32,8 @@
 #include "JitManager.h"
 #include "common/formats.h"
 
+#define USE_SIMD16_BUILDER 0
+
 namespace SwrJit
 {
 using namespace llvm;
@@ -45,6 +47,9 @@ namespace SwrJit
 IRBuilder<>* mpIRBuilder;
 
 uint32_t mVWidth;
+#if USE_SIMD16_BUILDER
+uint32_t mVWidth2;
+#endif
 
 // Built in types.
 Type*mVoidTy;
@@ -70,6 +75,17 @@ namespace SwrJit
 Type*mSimdIntPtrTy;
 Type*mSimdVectorTy;
 Type*mSimdVectorTRTy;
+#if USE_SIMD16_BUILDER
+Type*mSimd2FP16Ty;
+Type*mSimd2FP32Ty;
+Type*mSimd2Int1Ty;
+Type*mSimd2Int16Ty;
+Type*mSimd2Int32Ty;
+Type*mSimd2Int64Ty;
+Type*mSimd2IntPtrTy;
+Type*mSimd2VectorTy;
+Type*mSimd2VectorTRTy;
+#endif
 
 #include "gen_builder.hpp"
 #include "gen_builder_x86.hpp"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 9ca36b2467..daa9cb1ec1 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -231,6 +231,13 @@ namespace SwrJit
 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::VUNDEF2_F()
+{
+return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
+}
+
+#endif
 Value *Builder::VUNDEF(Type* t)
 {
 return UndefValue::get(VectorType::get(t, mVWidth));
@@ -690,6 +697,51 @@ namespace SwrJit
 return vGather;
 }
 
+#if USE_SIMD16_BUILDER
+//
+/// @brief
+Value *Builder::EXTRACT(Value *a2, uint32_t imm)
+{
+const uint32_t i0 = (imm > 0) ? mV

Mesa (master): swr/rast: Repair simd8 frontend code rot

2017-11-20 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 34838c221260f961140040416b1a84b490448ac1
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=34838c221260f961140040416b1a84b490448ac1

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Fri Nov 10 16:45:38 2017 -0600

swr/rast: Repair simd8 frontend code rot

Keep non-default simd8 frontend code running for comparison purposes.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 2fe6cfcf69..5a61dc33a0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -956,7 +956,7 @@ static void GeometryShaderStage(
 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, 
numEmittedVerts, pState->outputVertexSize, reinterpret_cast(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, 
processCutVerts, pa.numVertsPerPrim);
 
 #else
-PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, 
numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, 
numAttribs, pState->outputTopology, processCutVerts);
+PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, 
numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, 
numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);
 
 #endif
 while (gsPa.GetNextStreamOutput())

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Cache eventmanager

2017-11-20 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: bc356b0fc0839b19eadbd96018f23c486ff00e84
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=bc356b0fc0839b19eadbd96018f23c486ff00e84

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Nov  7 13:50:11 2017 -0600

swr/rast: Cache eventmanager

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/archrast/archrast.h | 1 +
 src/gallium/drivers/swr/rasterizer/core/api.cpp| 5 +
 src/gallium/drivers/swr/rasterizer/core/api.h  | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h 
b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
index fa88a4948c..c74d6ad909 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
+++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
@@ -29,6 +29,7 @@
 
 #include "common/os.h"
 #include "gen_ar_event.hpp"
+#include "eventmanager.h"
 
 namespace ArchRast
 {
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 20eeb29681..9265440904 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -143,6 +143,11 @@ HANDLE SwrCreateContext(
 #endif
 }
 
+#if defined(KNOB_ENABLE_AR)
+// cache the API thread event manager, for use with sim layer
+pCreateInfo->hArEventManager = pContext->pArContext[16];
+#endif
+
 // State setup AFTER context is fully initialized
 SetupDefaultState(pContext);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h
index 60f56c6d76..c032b0bb10 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -213,6 +213,9 @@ struct SWR_CREATECONTEXT_INFO
 // Output: size required memory passed to for SwrSaveState / 
SwrRestoreState
 size_t  contextSaveSize;
 
+// ArchRast event manager.
+HANDLE  hArEventManager;
+
 // Input (optional): Threading info that overrides any set KNOB values.
 SWR_THREADING_INFO* pThreadInfo;
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Code style change (NFC)

2017-11-20 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: d9de8f3122737517352eeaa4d1f2e79360526eff
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=d9de8f3122737517352eeaa4d1f2e79360526eff

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Oct 23 15:10:35 2017 -0500

swr/rast: Code style change (NFC)

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index e15b300979..2fe6cfcf69 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -39,6 +39,7 @@
 #include "tilemgr.h"
 #include "tessellator.h"
 #include 
+#include 
 
 //
 /// @brief Helper macro to generate a bitmask
@@ -770,6 +771,7 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, 
uint32_t numVerts, uint32_t
 }
 }
 
+
 //
 /// @brief Implements GS stage.
 /// @param pDC - pointer to draw context.
@@ -1335,8 +1337,11 @@ static void TessellationStages(
 
 SWR_ASSERT(pfnClipFunc);
 #if USE_SIMD16_FRONTEND
-tessPa.useAlternateOffset = false;
-pfnClipFunc(pDC, tessPa, workerId, prim_simd16, 
GenMask(numPrims), primID);
+
+{
+tessPa.useAlternateOffset = false;
+pfnClipFunc(pDC, tessPa, workerId, prim_simd16, 
GenMask(numPrims), primID);
+}
 #else
 pfnClipFunc(pDC, tessPa, workerId, prim,
 GenMask(tessPa.NumPrims()), 
_simd_set1_epi32(dsContext.PrimitiveID));

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Implement AVX-512 GATHERPS in SIMD16 fetch shader

2017-11-20 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 005d937e1533521e87f0119c400298c02f365bf1
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=005d937e1533521e87f0119c400298c02f365bf1

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Nov  8 19:17:24 2017 -0600

swr/rast: Implement AVX-512 GATHERPS in SIMD16 fetch shader

Disabled for now.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |   1 +
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 126 +++--
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  31 -
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  91 ---
 4 files changed, 220 insertions(+), 29 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index ce892a9abe..44fc857371 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -44,6 +44,7 @@ inst_aliases = {
 intrinsics = [
 ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
+['VGATHERPS2', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
 ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
 ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index bd3a52566d..8ffe05b41c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -211,6 +211,28 @@ namespace SwrJit
 return ConstantVector::getSplat(mVWidth, cast(C(i)));
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::VIMMED2_1(int i)
+{
+return ConstantVector::getSplat(mVWidth2, cast(C(i)));
+}
+
+Value *Builder::VIMMED2_1(uint32_t i)
+{
+return ConstantVector::getSplat(mVWidth2, cast(C(i)));
+}
+
+Value *Builder::VIMMED2_1(float i)
+{
+return ConstantVector::getSplat(mVWidth2, cast(C(i)));
+}
+
+Value *Builder::VIMMED2_1(bool i)
+{
+return ConstantVector::getSplat(mVWidth2, cast(C(i)));
+}
+
+#endif
 Value *Builder::VUNDEF_IPTR()
 {
 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
@@ -237,6 +259,11 @@ namespace SwrJit
 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
 }
 
+Value *Builder::VUNDEF2_I()
+{
+return UndefValue::get(VectorType::get(mInt32Ty, mVWidth2));
+}
+
 #endif
 Value *Builder::VUNDEF(Type* t)
 {
@@ -254,6 +281,19 @@ namespace SwrJit
 return VECTOR_SPLAT(mVWidth, src);
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::VBROADCAST2(Value *src)
+{
+// check if src is already a vector
+if (src->getType()->isVectorTy())
+{
+return src;
+}
+
+return VECTOR_SPLAT(mVWidth2, src);
+}
+
+#endif
 uint32_t Builder::IMMED(Value* v)
 {
 SWR_ASSERT(isa(v));
@@ -554,16 +594,17 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
+Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value 
*vMask, uint8_t scale)
 {
-Value* vGather;
+Value *vGather;
 
 // use avx2 gather instruction if available
 if(JM()->mArch.AVX2())
 {
 // force mask to <N x float>, required by vgather
-vMask = BITCAST(vMask, mSimdFP32Ty);
-vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale));
+Value *mask = BITCAST(vMask, mSimdFP32Ty);
+
+vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
 }
 else
 {
@@ -598,6 +639,41 @@ namespace SwrJit
 return vGather;
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::GATHERPS2(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale)
+{
+Value *vGather = VUNDEF2_F();
+
+// use avx512 gather instruction if available
+if (JM()->mArch.AVX512F())
+{
+// force mask to <N-bit Integer>, required by vgather2
+Value *mask = BITCAST(MASK2(vMask), mInt16Ty);
+
+vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
+}
+else
+{
+Value *src0 = EXTRACT2_F(vSrc, 0);

Mesa (master): swr/rast: Simplify GATHER* jit builder api

2017-11-20 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 2e244c7168a1130a18c8d8a901161db9b6cbaac3
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=2e244c7168a1130a18c8d8a901161db9b6cbaac3

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Nov  8 14:07:33 2017 -0600

swr/rast: Simplify GATHER* jit builder api

General cleanup, and prep work for possibly moving to llvm masked
gather intrinsic.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 32 ++---
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  6 +--
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 56 +++---
 src/gallium/drivers/swr/swr_shader.cpp |  2 +-
 4 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index daa9cb1ec1..bd3a52566d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -554,7 +554,7 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, Value* scale)
+Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
 {
 Value* vGather;
 
@@ -563,7 +563,7 @@ namespace SwrJit
 {
 // force mask to <N x float>, required by vgather
 vMask = BITCAST(vMask, mSimdFP32Ty);
-vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
+vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale));
 }
 else
 {
@@ -574,7 +574,7 @@ namespace SwrJit
 STORE(vSrc, vSrcPtr);
 
 vGather = VUNDEF_F();
-Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
+Value *vScaleVec = VIMMED1((uint32_t)scale);
 Value *vOffsets = MUL(vIndices,vScaleVec);
 Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth; ++i)
@@ -606,14 +606,14 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, Value* scale)
+Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
 {
 Value* vGather;
 
 // use avx2 gather instruction if available
 if(JM()->mArch.AVX2())
 {
-vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
+vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
 }
 else
 {
@@ -624,7 +624,7 @@ namespace SwrJit
 STORE(vSrc, vSrcPtr);
 
 vGather = VUNDEF_I();
-Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
+Value *vScaleVec = VIMMED1((uint32_t)scale);
 Value *vOffsets = MUL(vIndices, vScaleVec);
 Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth; ++i)
@@ -656,14 +656,14 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, Value* scale)
+Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
 {
 Value* vGather;
 
 // use avx2 gather instruction if available
 if(JM()->mArch.AVX2())
 {
-vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
+vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
 }
 else
 {
@@ -674,7 +674,7 @@ namespace SwrJit
 STORE(vSrc, vSrcPtr);
 
 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
-Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
+Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
 Value *vOffsets = MUL(vIndices,vScaleVec);
 Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth/2; ++i)
@@ -1016,7 +1016,7 @@ namespace SwrJit
 // save mask as it is zero'd out after each gather
 vMask = mask;
 
-vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask, C((char)1));
+vGatherResult[0] = GATHERPS(vGatherMaskedVal, pS

Mesa (master): swr/rast: Faster emulated simd16 permute

2017-11-14 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: d8489517a572c7e5c5405ebf510db9d20b1e2591
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=d8489517a572c7e5c5405ebf510db9d20b1e2591

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Nov 13 18:39:38 2017 -0600

swr/rast: Faster emulated simd16 permute

Speed up simd16 frontend (default) on avx/avx2 platforms;
fixes performance regression caused by switch to simdlib.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>
Cc: mesa-sta...@lists.freedesktop.org

---

 .../swr/rasterizer/common/simdlib_512_emu.inl  | 34 +++---
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
index d6af7b1c64..44eba0b126 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -521,36 +521,24 @@ SIMD_IWRAPPER_2(packus_epi32); // See documentation 
for _mm256_packus_epi32
 
 static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const , Integer 
const ) // return a[swiz[i]] for each 32-bit lane i (int32)
 {
-Integer result;
-
-// Ugly slow implementation
-uint32_t const *pA = reinterpret_cast();
-uint32_t const *pSwiz = reinterpret_cast();
-uint32_t *pResult = reinterpret_cast();
-
-for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-{
-pResult[i] = pA[0xF & pSwiz[i]];
-}
-
-return result;
+return castps_si(permute_ps(castsi_ps(a), swiz));
 }
 
 static SIMDINLINE Float SIMDCALL permute_ps(Float const , Integer const 
)// return a[swiz[i]] for each 32-bit lane i (float)
 {
-Float result;
+const auto mask = SIMD256T::set1_epi32(7);
 
-// Ugly slow implementation
-float const *pA = reinterpret_cast();
-uint32_t const *pSwiz = reinterpret_cast();
-float *pResult = reinterpret_cast();
+auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], 
mask));
+auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], 
mask));
 
-for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-{
-pResult[i] = pA[0xF & pSwiz[i]];
-}
+auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], 
mask));
+auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], 
mask));
 
-return result;
+return Float
+{
+SIMD256T::blendv_ps(lolo, lohi, 
SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
+SIMD256T::blendv_ps(hilo, hihi, 
SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
+};
 }
 
 // All of the 512-bit permute2f128_XX intrinsics do the following:

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Use gather instruction for i32gather_ps on simd16/avx512

2017-11-14 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 439904847e9c2970494c18e8c47bd6c38c0ed8ab
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=439904847e9c2970494c18e8c47bd6c38c0ed8ab

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Nov 13 15:11:21 2017 -0600

swr/rast: Use gather instruction for i32gather_ps on simd16/avx512

Speed up avx512 platforms; fixes performance regression caused
by switch to simdlib.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>
Cc: mesa-sta...@lists.freedesktop.org

---

 .../drivers/swr/rasterizer/common/simdlib_512_avx512.inl | 12 +---
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
index 95e4c31909..c13b9f616a 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -484,17 +484,7 @@ SIMD_WRAPPER_2(unpacklo_ps);
 template
 static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // 
return *(float*)(((int8*)p) + (idx * ScaleT))
 {
-uint32_t *pOffsets = (uint32_t*)
-Float vResult;
-float* pResult = (float*)
-for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-{
-uint32_t offset = pOffsets[i];
-offset = offset * static_cast(ScaleT);
-pResult[i] = *(float const*)(((uint8_t const*)p + offset));
-}
-
-return vResult;
+return _mm512_i32gather_ps(idx, p, static_cast(ScaleT));
 }
 
 static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p
(broadcast 1 value to all elements)

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): gallivm: allow arch rounding with avx512

2017-11-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 0023b5ae67255000e6de93f6e17f74895e7677e0
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=0023b5ae67255000e6de93f6e17f74895e7677e0

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Nov  1 13:22:47 2017 -0500

gallivm: allow arch rounding with avx512

Fixes piglit vs-roundeven-{float,vec[234]} with simd16 VS.

Reviewed-by: Roland Scheidegger <srol...@vmware.com>

---

 src/gallium/auxiliary/gallivm/lp_bld_arit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index cf1958b3b6..a1edd349f1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1953,7 +1953,8 @@ arch_rounding_available(const struct lp_type type)
 {
if ((util_cpu_caps.has_sse4_1 &&
(type.length == 1 || type.width*type.length == 128)) ||
-   (util_cpu_caps.has_avx && type.width*type.length == 256))
+   (util_cpu_caps.has_avx && type.width*type.length == 256) ||
+   (util_cpu_caps.has_avx512f && type.width*type.length == 512))
   return TRUE;
else if ((util_cpu_caps.has_altivec &&
 (type.width == 32 && type.length == 4)))

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Fix indentation

2017-10-19 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 04ea03d99d8810a7df5aca059ff00c26ecaa71ee
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=04ea03d99d8810a7df5aca059ff00c26ecaa71ee

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Fri Oct  6 13:50:14 2017 -0500

swr/rast: Fix indentation

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index f7c9308be0..d9450fcbd7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -820,7 +820,7 @@ struct SWR_TS_STATE
 uint32_tnumDsOutputAttribs;
 
 // Offset to the start of the attributes of the input vertices, in 
simdvector units
-uint32_t vertexAttribOffset;
+uint32_tvertexAttribOffset;
 };
 
 // output merger state

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Add api to override draws in flight

2017-10-19 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 028ffa5e1820707ef0cab52853e36a259b00c849
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=028ffa5e1820707ef0cab52853e36a259b00c849

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Oct 17 15:02:53 2017 -0500

swr/rast: Add api to override draws in flight

Allow draws in flight to be overridden via SWR_CREATECONTEXT_INFO.

Patch by Jan Zielinski.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/api.cpp| 26 +-
 src/gallium/drivers/swr/rasterizer/core/api.h  |  4 
 src/gallium/drivers/swr/rasterizer/core/context.h  |  2 ++
 .../drivers/swr/rasterizer/core/threads.cpp| 18 +++
 4 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 632309821f..20eeb29681 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -74,13 +74,19 @@ HANDLE SwrCreateContext(
 
 pContext->privateStateSize = pCreateInfo->privateStateSize;
 
-pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
-pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
+if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
+{
+pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT;
+}
+
+pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
+pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
 
-pContext->pMacroTileManagerArray = 
(MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 
64);
-pContext->pDispatchQueueArray = 
(DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 
64);
+pContext->pMacroTileManagerArray = 
(MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * 
pContext->MAX_DRAWS_IN_FLIGHT, 64);
+pContext->pDispatchQueueArray = 
(DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * 
pContext->MAX_DRAWS_IN_FLIGHT, 64);
 
-for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
+for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
 {
 pContext->dcRing[dc].pArena = new 
CachingArena(pContext->cachingArenaAllocator);
 new (>pMacroTileManagerArray[dc]) 
MacroTileMgr(*pContext->dcRing[dc].pArena);
@@ -173,7 +179,7 @@ template
 void QueueWork(SWR_CONTEXT *pContext)
 {
 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
-uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
 
 if (IsDraw)
 {
@@ -257,7 +263,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool 
isSplitDraw = false)
 }
 
 uint64_t curDraw = pContext->dcRing.GetHead();
-uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
+uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
 
 if ((pContext->frameCount - pContext->lastFrameChecked) > 2 ||
 (curDraw - pContext->lastDrawChecked) > 0x1)
@@ -273,7 +279,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool 
isSplitDraw = false)
 pContext->pCurDrawContext = pCurDrawContext;
 
 // Assign next available entry in DS ring to this DC.
-uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
+uint32_t dsIndex = pContext->curStateId % 
pContext->MAX_DRAWS_IN_FLIGHT;
 pCurDrawContext->pState = >dsRing[dsIndex];
 
 // Copy previous state to current state.
@@ -361,7 +367,7 @@ void SwrDestroyContext(HANDLE hContext)
 DestroyThreadPool(pContext, >threadPool);
 
 // free the fifos
-for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
+for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i)
 {
 AlignedFree(pContext->dcRing[i].dynState.pStats);
 delete pContext->dcRing[i].pArena;
@@ -1481,7 +1487,7 @@ void SwrDispatch(
 pTaskData->threadGroupCountZ = threadGroupCountZ;
 
 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * 
threadGroupCountZ;
-uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
 pDC->pDispatch = >pDispatchQueueArray[dcIndex];
 pDC->pDispatch->initialize(totalThreadGroups, pTaskData, 
);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h
index 577cfb157a..60f56c6d76 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -215,6 +215,10 @@ struct SWR_CREATECONTEXT_INFO
 
 // Input (optional): Threading info that overrides any set KNOB values.
 

Mesa (master): swr: knob overrides for Intel Xeon Phi

2017-10-19 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: bfda35c8dd4bc602a3b174377dfea92319438e2b
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=bfda35c8dd4bc602a3b174377dfea92319438e2b

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Oct 17 15:11:19 2017 -0500

swr: knob overrides for Intel Xeon Phi

Architecture benefits from having more threads/work outstanding.

Patch by Jan Zielinski.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/swr_context.cpp | 27 +++
 src/gallium/drivers/swr/swr_context.h   |  2 ++
 src/gallium/drivers/swr/swr_loader.cpp  |  4 
 src/gallium/drivers/swr/swr_scratch.cpp |  2 +-
 src/gallium/drivers/swr/swr_screen.h|  3 +++
 5 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/swr_context.cpp 
b/src/gallium/drivers/swr/swr_context.cpp
index 34d9a259fe..b61720cd30 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -39,6 +39,7 @@
 
 #include "api.h"
 #include "backend.h"
+#include "knobs.h"
 
 static struct pipe_surface *
 swr_create_surface(struct pipe_context *pipe,
@@ -483,6 +484,8 @@ swr_create_context(struct pipe_screen *p_screen, void 
*priv, unsigned flags)
ctx->blendJIT =
   new std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>;
 
+   ctx->max_draws_in_flight = KNOB_MAX_DRAWS_IN_FLIGHT;
+
SWR_CREATECONTEXT_INFO createInfo;
memset(&createInfo, 0, sizeof(createInfo));
createInfo.privateStateSize = sizeof(swr_draw_context);
@@ -491,6 +494,30 @@ swr_create_context(struct pipe_screen *p_screen, void 
*priv, unsigned flags)
createInfo.pfnClearTile = swr_StoreHotTileClear;
createInfo.pfnUpdateStats = swr_UpdateStats;
createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE;
+
+   SWR_THREADING_INFO threadingInfo {0};
+
+   threadingInfo.MAX_WORKER_THREADS= KNOB_MAX_WORKER_THREADS;
+   threadingInfo.MAX_NUMA_NODES= KNOB_MAX_NUMA_NODES;
+   threadingInfo.MAX_CORES_PER_NUMA_NODE   = KNOB_MAX_CORES_PER_NUMA_NODE;
+   threadingInfo.MAX_THREADS_PER_CORE  = KNOB_MAX_THREADS_PER_CORE;
+   threadingInfo.SINGLE_THREADED   = KNOB_SINGLE_THREADED;
+
+   // Use non-standard settings for KNL
+   if (swr_screen(p_screen)->is_knl)
+   {
+  if (nullptr == getenv("KNOB_MAX_THREADS_PER_CORE"))
+ threadingInfo.MAX_THREADS_PER_CORE  = 2;
+
+  if (nullptr == getenv("KNOB_MAX_DRAWS_IN_FLIGHT"))
+  {
+ ctx->max_draws_in_flight = 2048;
+ createInfo.MAX_DRAWS_IN_FLIGHT = ctx->max_draws_in_flight;
+  }
+   }
+
+   createInfo.pThreadInfo = &threadingInfo;
+
ctx->swrContext = ctx->api.pfnSwrCreateContext(&createInfo);
 
ctx->api.pfnSwrInit();
diff --git a/src/gallium/drivers/swr/swr_context.h 
b/src/gallium/drivers/swr/swr_context.h
index 8bed78f869..5c280ee365 100644
--- a/src/gallium/drivers/swr/swr_context.h
+++ b/src/gallium/drivers/swr/swr_context.h
@@ -173,6 +173,8 @@ struct swr_context {
unsigned dirty; /**< Mask of SWR_NEW_x flags */
 
SWR_INTERFACE api;
+
+   uint32_t max_draws_in_flight;
 };
 
 static INLINE struct swr_context *
diff --git a/src/gallium/drivers/swr/swr_loader.cpp 
b/src/gallium/drivers/swr/swr_loader.cpp
index e205fe2d7e..9d6f918e34 100644
--- a/src/gallium/drivers/swr/swr_loader.cpp
+++ b/src/gallium/drivers/swr/swr_loader.cpp
@@ -38,11 +38,14 @@ swr_create_screen(struct sw_winsys *winsys)
 
util_cpu_detect();
 
+   bool is_knl = false;
+
if (!strlen(filename) &&
util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512er) {
 #if HAVE_SWR_KNL
   fprintf(stderr, "KNL ");
   sprintf(filename, "%s%s%s", UTIL_DL_PREFIX, "swrKNL", UTIL_DL_EXT);
+  is_knl = true;
 #else
   fprintf(stderr, "KNL (not built) ");
 #endif
@@ -99,6 +102,7 @@ swr_create_screen(struct sw_winsys *winsys)
 
struct pipe_screen *screen = swr_create_screen_internal(winsys);
swr_screen(screen)->pfnSwrGetInterface = (PFNSwrGetInterface)pApiProc;
+   swr_screen(screen)->is_knl = is_knl;
 
return screen;
 }
diff --git a/src/gallium/drivers/swr/swr_scratch.cpp 
b/src/gallium/drivers/swr/swr_scratch.cpp
index d298a48dc0..8afe73c30e 100644
--- a/src/gallium/drivers/swr/swr_scratch.cpp
+++ b/src/gallium/drivers/swr/swr_scratch.cpp
@@ -45,7 +45,7 @@ swr_copy_to_scratch_space(struct swr_context *ctx,
   ptr = ctx->api.pfnSwrAllocDrawContextMemory(ctx->swrContext, size, 4);
} else {
   /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. */
-  unsigned int max_size_in_flight = size * KNOB_MAX_DRAWS_IN_FLIGHT;
+  uint32_t max_size_in_flight = size * ctx->max_draws_in_flight;
 
   /* Need to grow space */
   if (max_size_in_flight > space->current_size) {
diff --git a/src/gallium/drivers/swr/swr_screen.h 
b/src/gallium/drivers/swr/swr_scre

Mesa (master): swr/rast: Miscellaneous viewport array code changes

2017-10-19 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 62e2d657c868ee7c7ad6a24269c81a9827c66b8f
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=62e2d657c868ee7c7ad6a24269c81a9827c66b8f

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Fri Sep 29 14:45:16 2017 -0500

swr/rast: Miscellaneous viewport array code changes

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 45 --
 src/gallium/drivers/swr/rasterizer/core/clip.h | 14 +--
 .../drivers/swr/rasterizer/core/frontend.cpp   | 22 ++-
 src/gallium/drivers/swr/rasterizer/core/pa.h   | 24 ++--
 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp |  4 +-
 5 files changed, 71 insertions(+), 38 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index e08e4896f3..b624ae69b3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -450,16 +450,22 @@ void SIMDCALL BinTrianglesImpl(
 typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
 typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
 
-typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
+typename SIMD_T::Vec4 vpiAttrib[3];
+typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
 
 if (state.backendState.readViewportArrayIndex)
 {
-typename SIMD_T::Vec4 vpiAttrib[3];
 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
+vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+}
+
+
+if (state.backendState.readViewportArrayIndex) // VPAIOffsets are 
guaranteed 0-15 -- no OOB issues if they are offsets from 0 
+{
 // OOB indices => forced to zero.
-typename SIMD_T::Integer vpai = 
SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 typename SIMD_T::Integer vNumViewports = 
SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
@@ -815,6 +821,7 @@ endBinTriangles:
 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), 
SIMD_T::setzero_si());
 }
 
+
 // scan remaining valid triangles and bin each separately
 while (_BitScanForward(&triIndex, triMask))
 {
@@ -1299,15 +1306,22 @@ void BinPointsImpl(
 const SWR_RASTSTATE& rastState = state.rastState;
 
 // Read back viewport index if required
-typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
+typename SIMD_T::Vec4 vpiAttrib[1];
+typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
+
 if (state.backendState.readViewportArrayIndex)
 {
-typename SIMD_T::Vec4 vpiAttrib[1];
 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
+vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+}
+
+
+if (state.backendState.readViewportArrayIndex) // VPAIOffsets are 
guaranteed 0-15 -- no OOB issues if they are offsets from 0 
+{
 // OOB indices => forced to zero.
-typename SIMD_T::Integer vpai = 
SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 typename SIMD_T::Integer vNumViewports = 
SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
@@ -1626,15 +1640,22 @@ void SIMDCALL BinLinesImpl(
 
 typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), 
SIMD_T::set1_ps(1.0f) };
 
-typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
+typename SIMD_T::Vec4 vpiAttrib[2];
+typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
+
 if (state.backendState.readViewportArrayIndex)
 {
-typename SIMD_T::Vec4 vpiAttrib[2];
 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
+vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+}
+
+
+if (state.backendState.readViewportArrayIndex) // VPAIOffsets are 
guaranteed 0-15 -- no OOB issues if they are offsets from 0 
+{
 // OOB indices => forced to zero.
-typename SIMD_T::Integer vpai = 
SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 typename SIMD_T::Integer vNumViewports = 
SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCI

Mesa (master): swr/rast: Change DS memory allocation

2017-10-19 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 49090ccf54798f7c9081f9b20d0ed0d0433ec026
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=49090ccf54798f7c9081f9b20d0ed0d0433ec026

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Oct 11 16:21:21 2017 -0500

swr/rast: Change DS memory allocation

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 4 ++--
 src/gallium/drivers/swr/rasterizer/core/state.h  | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index a803512b7c..211e9e4b07 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1212,9 +1212,9 @@ static void TessellationStages(
 // Allocate DS Output memory
 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, 
KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
 #if USE_SIMD16_FRONTEND
-size_t requiredAllocSize = sizeof(simdvector) * 
RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs;  // 
simd8 -> simd16, padding
+size_t requiredAllocSize = sizeof(simdvector) * 
RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize;  // 
simd8 -> simd16, padding
 #else
-size_t requiredDSOutputVectors = requiredDSVectorInvocations * 
tsState.numDsOutputAttribs;
+size_t requiredDSOutputVectors = requiredDSVectorInvocations * 
tsState.dsAllocationSize;
 size_t requiredAllocSize = sizeof(simdvector) * 
requiredDSOutputVectors;
 #endif
 if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index d9450fcbd7..2af384fd90 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -818,6 +818,7 @@ struct SWR_TS_STATE
 uint32_tnumHsInputAttribs;
 uint32_tnumHsOutputAttribs;
 uint32_tnumDsOutputAttribs;
+uint32_tdsAllocationSize;
 
 // Offset to the start of the attributes of the input vertices, in 
simdvector units
 uint32_tvertexAttribOffset;

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): gallium: allow 512-bit vectors

2017-10-11 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 9cad9cbaf89b50ec9e15a7e0fef35fc2e4270550
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=9cad9cbaf89b50ec9e15a7e0fef35fc2e4270550

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Oct 10 11:07:11 2017 -0500

gallium: allow 512-bit vectors

Increase the max allowed vector size from 256 to 512.

No piglit llvmpipe regressions running on avx2.

Reviewed-by: Jose Fonseca <jfons...@vmware.com>
Reviewed-by: Roland Scheidegger <srol...@vmware.com>

---

 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 14 +++---
 src/gallium/auxiliary/gallivm/lp_bld_type.h |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index de18f629cd..97efc3a399 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1272,9 +1272,9 @@ emit_fetch_constant(
 /**
  * Fetch 64-bit values from two separate channels.
  * 64-bit values are stored split across two channels, like xy and zw.
- * This function creates a set of 16 floats,
+ * This function creates a set of vec_length*2 floats,
  * extracts the values from the two channels,
- * puts them in the correct place, then casts to 8 64-bits.
+ * puts them in the correct place, then casts to vec_length 64-bits.
  */
 static LLVMValueRef
 emit_fetch_64bit(
@@ -1289,9 +1289,9 @@ emit_fetch_64bit(
LLVMValueRef res;
struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
int i;
-   LLVMValueRef shuffles[16];
+   LLVMValueRef shuffles[2 * (LP_MAX_VECTOR_WIDTH/32)];
int len = bld_base->base.type.length * 2;
-   assert(len <= 16);
+   assert(len <= (2 * (LP_MAX_VECTOR_WIDTH/32)));
 
for (i = 0; i < bld_base->base.type.length * 2; i+=2) {
   shuffles[i] = lp_build_const_int32(gallivm, i / 2);
@@ -1691,7 +1691,7 @@ emit_fetch_deriv(
 }
 
 /**
- * store an array of 8 64-bit into two arrays of 8 floats
+ * store an array of vec-length 64-bit into two arrays of vec_length floats
  * i.e.
  * value is d0, d1, d2, d3 etc.
  * each 64-bit has high and low pieces x, y
@@ -1710,8 +1710,8 @@ emit_store_64bit_chan(struct lp_build_tgsi_context 
*bld_base,
struct lp_build_context *float_bld = &bld_base->base;
unsigned i;
LLVMValueRef temp, temp2;
-   LLVMValueRef shuffles[8];
-   LLVMValueRef shuffles2[8];
+   LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH/32];
+   LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32];
 
for (i = 0; i < bld_base->base.type.length; i++) {
   shuffles[i] = lp_build_const_int32(gallivm, i * 2);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h 
b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index afe8722b05..62f1f85461 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -59,7 +59,7 @@ extern unsigned lp_native_vector_width;
  * Should only be used when lp_native_vector_width isn't available,
  * i.e. sizing/alignment of non-malloced variables.
  */
-#define LP_MAX_VECTOR_WIDTH 256
+#define LP_MAX_VECTOR_WIDTH 512
 
 /**
  * Minimum vector alignment for static variable alignment
@@ -67,7 +67,7 @@ extern unsigned lp_native_vector_width;
  * It should always be a constant equal to LP_MAX_VECTOR_WIDTH/8.  An
  * expression is non-portable.
  */
-#define LP_MIN_VECTOR_ALIGN 32
+#define LP_MIN_VECTOR_ALIGN 64
 
 /**
  * Several functions can only cope with vectors of length up to this value.

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr: simd16 shaders work in progress

2017-10-11 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: e4848053528ee108755652acc9763f904677bfd3
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=e4848053528ee108755652acc9763f904677bfd3

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Oct 10 11:08:29 2017 -0500

swr: simd16 shaders work in progress

Start building vertex shaders as simd16.

Disabled by default, set USE_SIMD16_SHADERS in knobs.h to experiment.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/swr_screen.cpp |  6 ++
 src/gallium/drivers/swr/swr_screen.h   |  3 +++
 src/gallium/drivers/swr/swr_shader.cpp | 14 --
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/swr_screen.cpp 
b/src/gallium/drivers/swr/swr_screen.cpp
index 639b18f930..46b3a003c6 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -1058,6 +1058,9 @@ swr_destroy_screen(struct pipe_screen *p_screen)
swr_fence_reference(p_screen, &screen->flush_fence, NULL);
 
JitDestroyContext(screen->hJitMgr);
+#if USE_SIMD16_SHADERS
+   JitDestroyContext(screen->hJitMgr16);
+#endif
 
if (winsys->destroy)
   winsys->destroy(winsys);
@@ -1141,6 +1144,9 @@ swr_create_screen_internal(struct sw_winsys *winsys)
 
// Pass in "" for architecture for run-time determination
screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, "", "swr");
+#if USE_SIMD16_SHADERS
+   screen->hJitMgr16 = JitCreateContext(16, "", "swr");
+#endif
 
swr_fence_init(&screen->base);
 
diff --git a/src/gallium/drivers/swr/swr_screen.h 
b/src/gallium/drivers/swr/swr_screen.h
index a11ea9f41d..1c4e331583 100644
--- a/src/gallium/drivers/swr/swr_screen.h
+++ b/src/gallium/drivers/swr/swr_screen.h
@@ -49,6 +49,9 @@ struct swr_screen {
uint32_t client_copy_limit;
 
HANDLE hJitMgr;
+#if USE_SIMD16_SHADERS
+   HANDLE hJitMgr16;
+#endif
 
PFNSwrGetInterface pfnSwrGetInterface;
 };
diff --git a/src/gallium/drivers/swr/swr_shader.cpp 
b/src/gallium/drivers/swr/swr_shader.cpp
index 510bc0e457..732e08dae7 100644
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -693,7 +693,7 @@ swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key)
 void
 BuilderSWR::WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, 
unsigned slot, unsigned channel)
 {
-#if USE_SIMD16_FRONTEND
+#if USE_SIMD16_FRONTEND && !USE_SIMD16_SHADERS
// interleave the simdvertex components into the dest simd16vertex
//   slot16offset = slot8offset * 2
//   comp16offset = comp8offset * 2 + alternateOffset
@@ -756,6 +756,9 @@ BuilderSWR::CompileVS(struct swr_context *ctx, 
swr_jit_vs_key &key)
const_sizes_ptr->setName("num_vs_constants");
 
Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin});
+#if USE_SIMD16_SHADERS
+   vtxInput = BITCAST(vtxInput, PointerType::get(Gen_simd16vertex(JM()), 0));
+#endif
 
for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) {
   const unsigned mask = swr_vs->info.base.input_usage_mask[attrib];
@@ -777,7 +780,7 @@ BuilderSWR::CompileVS(struct swr_context *ctx, 
swr_jit_vs_key &key)
 
lp_build_tgsi_soa(gallivm,
  swr_vs->pipe.tokens,
- lp_type_float_vec(32, 32 * 8),
+ lp_type_float_vec(32, 32 * mVWidth),
  NULL, // mask
  wrap(consts_ptr),
  wrap(const_sizes_ptr),
@@ -795,6 +798,9 @@ BuilderSWR::CompileVS(struct swr_context *ctx, 
swr_jit_vs_key &key)
IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
 
Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout});
+#if USE_SIMD16_SHADERS
+   vtxOutput = BITCAST(vtxOutput, PointerType::get(Gen_simd16vertex(JM()), 0));
+#endif
 
for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) {
   for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) {
@@ -905,7 +911,11 @@ swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key 
&key)
   return NULL;
 
BuilderSWR builder(
+#if USE_SIMD16_SHADERS
+  reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr16),
+#else
   reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr),
+#endif
   "VS");
PFN_VERTEX_FUNC func = builder.CompileVS(ctx, key);
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: use proper alignment for debug transposedPrims

2017-10-06 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 9716c69e22613229bdc78c0a28491f39bec2520d
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=9716c69e22613229bdc78c0a28491f39bec2520d

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Oct  3 15:23:44 2017 -0500

swr/rast: use proper alignment for debug transposedPrims

Causing a crash in ParaView waveletcontour.py test when
_DEBUG defined due to vector aligned copy with unaligned
address.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/clip.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index cde5261521..e9a410daa3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -561,7 +561,7 @@ public:
 
 #if defined(_DEBUG)
 // TODO: need to increase stack size, allocating SIMD16-widened 
transposedPrims causes stack overflow in debug builds
-SIMDVERTEX_T *transposedPrims = 
reinterpret_cast<SIMDVERTEX_T *>(malloc(sizeof(SIMDVERTEX_T) * 
2));
+SIMDVERTEX_T *transposedPrims = 
reinterpret_cast<SIMDVERTEX_T 
*>(AlignedMalloc(sizeof(SIMDVERTEX_T) * 2, 64));
 
 #else
 SIMDVERTEX_T transposedPrims[2];
@@ -667,7 +667,7 @@ public:
 }
 
 #if defined(_DEBUG)
-free(transposedPrims);
+AlignedFree(transposedPrims);
 
 #endif
 // update global pipeline stat

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Add support for R10G10B10_FLOAT_A2_UNORM pixel format

2017-09-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 41565ddf7a7f8986d232b5619ac80233251d0900
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=41565ddf7a7f8986d232b5619ac80233251d0900

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Sep 11 16:07:32 2017 -0500

swr/rast: Add support for R10G10B10_FLOAT_A2_UNORM pixel format

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/common/formats.cpp  | 27 +++---
 .../drivers/swr/rasterizer/core/format_traits.h|  2 +-
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 16 ++---
 3 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp 
b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
index 263dec649a..1c086ff188 100644
--- a/src/gallium/drivers/swr/rasterizer/common/formats.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
@@ -2729,16 +2729,27 @@ const SWR_FORMAT_INFO gFormatInfo[] = {
 { 0.0f, 0.0f, 0.0f, 0.0f },
 1, 1
 },
-// padding (0xD5)
+
+// R10G10B10_FLOAT_A2_UNORM (0xD5)
 {
-nullptr,
-{ SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN },
-{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-0, 0, 0, false, false, false, false,
-{ false, false, false, false },
-{ 0.0f, 0.0f, 0.0f, 0.0f },
-1, 1
+"R10G10B10_FLOAT_A2_UNORM",
+{ SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNORM },
+{ 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+{ 0, 1, 2, 3 }, // Swizzle
+{ 10, 10, 10, 2 }, // Bits per component
+32, // Bits per element
+4, // Bytes per element
+4, // Num components
+false, // isSRGB
+false, // isBC
+false, // isSubsampled
+false, // isLuminance
+{ false, false, false, false }, // Is normalized?
+{ 1.0f, 1.0f, 1.0f, 1.0f / 3.0f }, // To float scale factor
+1, // bcWidth
+1, // bcHeight
 },
+
 // R32_SINT (0xD6)
 {
 "R32_SINT",
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h 
b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
index c04ea5f8ee..bc585dd175 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_traits.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
@@ -1237,7 +1237,7 @@ template<> struct FormatTraits :
 /// FormatTraits - Format traits specialization for 
R10G10B10_FLOAT_A2_UNORM
 //
 template<> struct FormatTraits<R10G10B10_FLOAT_A2_UNORM> :
-ComponentTraits<SWR_TYPE_FLOAT, 10, SWR_TYPE_FLOAT, 10, SWR_TYPE_FLOAT, 
10, SWR_TYPE_FLOAT, 2>,
+ComponentTraits<SWR_TYPE_FLOAT, 10, SWR_TYPE_FLOAT, 10, SWR_TYPE_FLOAT, 
10, SWR_TYPE_UNORM, 2>,
 FormatSwizzle<0, 1, 2, 3>,
 Defaults<0, 0, 0, 0x3f800000>
 {
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 402fd2652f..b943909a57 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -42,7 +42,7 @@ namespace SwrJit
 ///number of mantissa bits.
 /// @param val - 32-bit float
 /// @todo Maybe move this outside of this file into a header?
-static uint16_t Convert32To16Float(float val)
+static uint16_t ConvertFloat32ToFloat16(float val)
 {
 uint32_t sign, exp, mant;
 uint32_t roundBits;
@@ -112,7 +112,7 @@ namespace SwrJit
 ///float
 /// @param val - 16-bit float
 /// @todo Maybe move this outside of this file into a header?
-static float ConvertSmallFloatTo32(uint32_t val)
+static float ConvertFloat16ToFloat32(uint32_t val)
 {
 uint32_t result;
 if ((val & 0x7fff) == 0)
@@ -888,11 +888,11 @@ namespace SwrJit
 else
 {
 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
-Function* pCvtPh2Ps = 
cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32",
 pFuncTy));
+Function* pCvtPh2Ps = 
cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32",
 pFuncTy));
 
-if 
(sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == 
nullptr)
+if 
(sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == 
nullptr)
 {
-sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void 
*)&ConvertSmallFloatTo32);
+sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", 
(void *)&ConvertFloat16ToFloat32);
 }
 
 Value* pResult = UndefValue::get(mSimdFP32Ty);
@@ -921,11 +921,11 @@ namespace SwrJit
 {

Mesa (master): swr/rast: Properly sized null GS buffer

2017-09-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 5033d49d5d04efd01f9f4957e3b3dce0250908ad
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=5033d49d5d04efd01f9f4957e3b3dce0250908ad

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Sep 12 15:11:07 2017 -0500

swr/rast: Properly sized null GS buffer

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 15bc93db63..22a5705c48 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -798,7 +798,7 @@ static void GeometryShaderStage(
 const SWR_GS_STATE* pState = &state.gsState;
 SWR_GS_CONTEXT gsContext;
 
-static uint8_t sNullBuffer[1024] = { 0 };
+static uint8_t sNullBuffer[128] = { 0 };
 
 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
 {

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Slightly more efficient blend jit

2017-09-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: d18c2a1fa415b660244b25081c6597ea0439565c
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=d18c2a1fa415b660244b25081c6597ea0439565c

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Sep 13 19:16:45 2017 -0500

swr/rast: Slightly more efficient blend jit

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/blend_jit.cpp| 30 --
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index f2e6e532bb..3258639d38 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -581,13 +581,13 @@ struct BlendJit : public Builder
 // load src1
 src1[i] = LOAD(pSrc1, { i });
 }
-Value* currentMask = VIMMED1(-1);
+Value* currentSampleMask = VIMMED1(-1);
 if (state.desc.alphaToCoverageEnable)
 {
 Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
 uint32_t bits = (1 << state.desc.numSamples) - 1;
-currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
-currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), 
mSimdInt32Ty);
+currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
+currentSampleMask = FP_TO_SI(FADD(currentSampleMask, 
VIMMED1(0.5f)), mSimdInt32Ty);
 }
 
 // alpha test
@@ -766,34 +766,24 @@ struct BlendJit : public Builder
 assert(!(state.desc.alphaToCoverageEnable));
 // load current mask
 Value* oMask = LOAD(ppoMask);
-Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
-oMask = AND(oMask, sampleMasked);
-currentMask = AND(oMask, currentMask);
+currentSampleMask = AND(oMask, currentSampleMask);
 }
 
 if(state.desc.sampleMaskEnable)
 {
 Value* sampleMask = LOAD(pBlendState, { 0, 
SWR_BLEND_STATE_sampleMask});
-Value* sampleMasked = SHL(C(1), sampleNum);
-sampleMask = AND(sampleMask, sampleMasked);
-sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
-sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
-currentMask = AND(sampleMask, currentMask);
-}
-
-if (state.desc.alphaToCoverageEnable)
-{
-Value* sampleMasked = SHL(C(1), sampleNum);
-currentMask = AND(currentMask, VBROADCAST(sampleMasked));
+currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
 }
 
 if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
state.desc.oMaskEnable)
 {
-// load coverage mask
+// load coverage mask and mask off any lanes with no samples
 Value* pMask = LOAD(ppMask);
-currentMask = S_EXT(ICMP_UGT(currentMask, VBROADCAST(C(0))), 
mSimdInt32Ty);
-Value* outputMask = AND(pMask, currentMask);
+Value* sampleMasked = SHL(C(1), sampleNum);
+currentSampleMask = AND(currentSampleMask, 
VBROADCAST(sampleMasked));
+currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, 
VBROADCAST(C(0))), mSimdInt32Ty);
+Value* outputMask = AND(pMask, currentSampleMask);
 // store new mask
 STORE(outputMask, GEP(ppMask, C(0)));
 }

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Fix allocation of DS output data for USE_SIMD16_FRONTEND

2017-09-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 9c468c775b666f6da7468a795a98e2fd021c23bf
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=9c468c775b666f6da7468a795a98e2fd021c23bf

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Fri Sep 15 18:53:47 2017 -0500

swr/rast: Fix allocation of DS output data for USE_SIMD16_FRONTEND

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 22a5705c48..aea8e88de4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1062,7 +1062,7 @@ struct TessellationThreadLocalData
 size_t tsCtxSize;
 
 simdscalar* pDSOutput;
-size_t numDSOutputVectors;
+size_t dsOutputAllocSize;
 };
 
 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
@@ -1210,24 +1210,20 @@ static void TessellationStages(
 
 // Allocate DS Output memory
 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, 
KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
-size_t requiredDSOutputVectors = requiredDSVectorInvocations * 
tsState.numDsOutputAttribs;
 #if USE_SIMD16_FRONTEND
 size_t requiredAllocSize = sizeof(simdvector) * 
RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs;  // 
simd8 -> simd16, padding
 #else
+size_t requiredDSOutputVectors = requiredDSVectorInvocations * 
tsState.numDsOutputAttribs;
 size_t requiredAllocSize = sizeof(simdvector) * 
requiredDSOutputVectors;
 #endif
-if (requiredDSOutputVectors > 
gt_pTessellationThreadData->numDSOutputVectors)
+if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
 {
 AlignedFree(gt_pTessellationThreadData->pDSOutput);
 gt_pTessellationThreadData->pDSOutput = 
(simdscalar*)AlignedMalloc(requiredAllocSize, 64);
-#if USE_SIMD16_FRONTEND
-gt_pTessellationThreadData->numDSOutputVectors = 
RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 
-> simd16, padding
-#else
-gt_pTessellationThreadData->numDSOutputVectors = 
requiredDSOutputVectors;
-#endif
+gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
 }
 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
-SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= 
requiredDSOutputVectors);
+SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= 
requiredAllocSize);
 
 #if defined(_DEBUG)
 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
@@ -1356,7 +1352,7 @@ static void TessellationStages(
 AlignedFree(gt_pTessellationThreadData->pDSOutput);
 gt_pTessellationThreadData->pDSOutput = nullptr;
 }
-gt_pTessellationThreadData->numDSOutputVectors = 0;
+gt_pTessellationThreadData->dsOutputAllocSize = 0;
 
 #endif
 TSDestroyCtx(tsCtx);

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Fetch compile state changes

2017-09-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: efe7fa4384f89ba909c7a5a303658a6442f4f787
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=efe7fa4384f89ba909c7a5a303658a6442f4f787

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Sep 12 13:38:31 2017 -0500

swr/rast: Fetch compile state changes

Add ForceSequentialAccessEnable and InstanceIDOffsetEnable bools to
FETCH_COMPILE_STATE.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 6 ++
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h   | 7 ++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index f3a4b27d9a..906129829c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -275,6 +275,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
  : JitGatherVertices(fetchState, streams, 
vIndices, pVtxOut);
 #endif
 
+if (fetchState.bInstanceIDOffsetEnable)
+{
+// TODO: 
+SWR_ASSERT((0), "Add support for handling InstanceID Offset Enable.");
+}
+
 RET_VOID();
 
 JitManager::DumpToFile(fetch, "src");
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
index 0dd6de759a..18fa96357b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
@@ -107,6 +107,9 @@ struct FETCH_COMPILE_STATE
 bool bVertexIDOffsetEnable{ false };// Offset vertexID by StartVertex 
for non-indexed draws or BaseVertex for indexed draws
 bool bPartialVertexBuffer{ false }; // for indexed draws, map illegal 
indices to a known resident vertex
 
+bool bForceSequentialAccessEnable{ false };
+bool bInstanceIDOffsetEnable{ false };
+
 FETCH_COMPILE_STATE(bool disableVGATHER = false, bool diableIndexOOBCheck 
= false):
 bDisableVGATHER(disableVGATHER), 
bDisableIndexOOBCheck(diableIndexOOBCheck){ };
 
@@ -120,11 +123,13 @@ struct FETCH_COMPILE_STATE
 if (cutIndex != other.cutIndex) return false;
 if (bVertexIDOffsetEnable != other.bVertexIDOffsetEnable) return false;
 if (bPartialVertexBuffer != other.bPartialVertexBuffer) return false;
+if (bForceSequentialAccessEnable != 
other.bForceSequentialAccessEnable) return false;
+if (bInstanceIDOffsetEnable != other.bInstanceIDOffsetEnable) return 
false;
 
 for(uint32_t i = 0; i < numAttribs; ++i)
 {
 if((layout[i].bits != other.layout[i].bits) ||
-   ((layout[i].InstanceEnable == 1) &&
+   (((layout[i].InstanceEnable == 1) || 
(layout[i].InstanceStrideEnable == 1)) &&
 (layout[i].InstanceAdvancementState != 
other.layout[i].InstanceAdvancementState))){
 return false;
 }

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Remove code supporting legacy llvm (<3.9)

2017-09-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 68d8dd1fb5a0c28e4f6dfd8512ff6c3550458b46
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=68d8dd1fb5a0c28e4f6dfd8512ff6c3550458b46

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Sep 19 18:19:53 2017 -0500

swr/rast: Remove code supporting legacy llvm (<3.9)

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/JitManager.cpp   |  11 ++-
 .../drivers/swr/rasterizer/jitter/JitManager.h |   7 --
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 102 ++---
 3 files changed, 15 insertions(+), 105 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index e4281f8e92..3f0772c942 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -48,8 +48,9 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Config/llvm-config.h"
 
-#if HAVE_LLVM < 0x400
+#if LLVM_VERSION_MAJOR < 4
 #include "llvm/Bitcode/ReaderWriter.h"
 #else
 #include "llvm/Bitcode/BitcodeWriter.h"
@@ -231,8 +232,8 @@ void JitManager::DumpAsm(Function* pFunction, const char* 
fileName)
 
 #if defined(_WIN32)
 DWORD pid = GetCurrentProcessId();
-TCHAR procname[MAX_PATH];
-GetModuleFileName(NULL, procname, MAX_PATH);
+char procname[MAX_PATH];
+GetModuleFileNameA(NULL, procname, MAX_PATH);
 const char* pBaseName = strrchr(procname, '\\');
 std::stringstream outDir;
 outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
@@ -269,8 +270,8 @@ void JitManager::DumpToFile(Function *f, const char 
*fileName)
 {
 #if defined(_WIN32)
 DWORD pid = GetCurrentProcessId();
-TCHAR procname[MAX_PATH];
-GetModuleFileName(NULL, procname, MAX_PATH);
+char procname[MAX_PATH];
+GetModuleFileNameA(NULL, procname, MAX_PATH);
 const char* pBaseName = strrchr(procname, '\\');
 std::stringstream outDir;
 outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index 4bc543b560..46ffe276a0 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -47,13 +47,6 @@
 #include "llvm/ExecutionEngine/ObjectCache.h"
 
 #include "llvm/Config/llvm-config.h"
-#ifndef LLVM_VERSION_MAJOR
-#include "llvm/Config/config.h"
-#endif
-
-#ifndef HAVE_LLVM
-#define HAVE_LLVM ((LLVM_VERSION_MAJOR << 8) | LLVM_VERSION_MINOR)
-#endif
 
 #include "llvm/IR/Verifier.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index b943909a57..9ca36b2467 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -763,22 +763,10 @@ namespace SwrJit
 /// lower 8 values are used.
 Value *Builder::PMOVSXBD(Value* a)
 {
-// llvm-3.9 removed the pmovsxbd intrinsic
-#if HAVE_LLVM < 0x309
-// use avx2 byte sign extend instruction if available
-if(JM()->mArch.AVX2())
-{
-Function *pmovsxbd = 
Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
-return CALL(pmovsxbd, std::initializer_list<Value*>{a});
-}
-else
-#endif
-{
-// VPMOVSXBD output type
-Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
-// Extract 8 values from 128bit lane and sign extend
-return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), 
v8x32Ty);
-}
+// VPMOVSXBD output type
+Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+// Extract 8 values from 128bit lane and sign extend
+return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), 
v8x32Ty);
 }
 
 //
@@ -787,22 +775,10 @@ namespace SwrJit
 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
 Value *Builder::PMOVSXWD(Value* a)
 {
-// llvm-3.9 removed the pmovsxwd intrinsic
-#if HAVE_LLVM < 0x309
-// use avx2 word sign extend if available
-if(JM()->mArch.AVX2())
-{
-Function *pmovsxwd = 
Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
-retur

Mesa (master): swr/rast: Handle instanceID offset / Instance Stride enable

2017-09-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 5a2bca5db5e025f0884487f590feac0c33db48fd
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=5a2bca5db5e025f0884487f590feac0c33db48fd

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Sep 20 11:50:32 2017 -0500

swr/rast: Handle instanceID offset / Instance Stride enable

Supported in JitGatherVertices(); FetchJit::JitLoadVertices() may require
similar changes; we will need to address this if it is determined that this
path is still in use.

Handle Force Sequential Access in FetchJit::Create.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 46 ++
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 906129829c..1e3db902bb 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -222,6 +222,18 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; 
break;
 }
 
+if(fetchState.bForceSequentialAccessEnable)
+{
+Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+
+// VertexData buffers are accessed sequentially, the index is equal to 
the vertex number
+vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, 
SWR_FETCH_CONTEXT_StartVertex }));
+vIndices = ADD(vIndices, pOffsets);
+#if USE_SIMD16_SHADERS
+vIndices2 = ADD(vIndices, VIMMED1(8));
+#endif
+}
+
 Value* vVertexId = vIndices;
 #if USE_SIMD16_SHADERS
 Value* vVertexId2 = vIndices2;
@@ -275,12 +287,6 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
  : JitGatherVertices(fetchState, streams, 
vIndices, pVtxOut);
 #endif
 
-if (fetchState.bInstanceIDOffsetEnable)
-{
-// TODO: 
-SWR_ASSERT((0), "Add support for handling InstanceID Offset Enable.");
-}
-
 RET_VOID();
 
 JitManager::DumpToFile(fetch, "src");
@@ -362,6 +368,11 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE 
, Value* str
 
 vectors.clear();
 
+if (fetchState.bInstanceIDOffsetEnable)
+{
+SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
+}
+
 Value *vCurIndices;
 Value *startOffset;
 if(ied.InstanceEnable)
@@ -831,8 +842,16 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 minVertex = LOAD(minVertex);
 }
 
+if (fetchState.bInstanceIDOffsetEnable)
+{
+// the InstanceID (curInstance) value is offset by 
StartInstanceLocation
+curInstance = ADD(curInstance, startInstance);
+}
+
 Value *vCurIndices;
 Value *startOffset;
+Value *vInstanceStride = VIMMED1(0);
+
 if(ied.InstanceEnable)
 {
 Value* stepRate = C(ied.InstanceAdvancementState);
@@ -853,11 +872,19 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 }
 else if (ied.InstanceStrideEnable)
 {
+// grab the instance advancement state, determines stride in bytes 
from one instance to the next
+Value* stepRate = C(ied.InstanceAdvancementState);
+vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
+
+// offset indices by baseVertex
+vCurIndices = ADD(vIndices, vBaseVertex);
+
+startOffset = startVertex;
 SWR_ASSERT((0), "TODO: Fill out more once driver sends this 
down.");
 }
 else
 {
-// offset indices by baseVertex
+// offset indices by baseVertex
 vCurIndices = ADD(vIndices, vBaseVertex);
 
 startOffset = startVertex;
@@ -925,6 +952,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 Value* vOffsets = MUL(vCurIndices, vStride);
 vOffsets = ADD(vOffsets, vAlignmentOffsets);
 
+// if instance stride enable is:
+//  true  - add product of the instanceID and advancement state to the 
offst into the VB
+//  false - value of vInstanceStride has been initialialized to zero
+vOffsets = ADD(vOffsets, vInstanceStride);
+
 // Packing and component control 
 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
 const ComponentControl compCtrl[4] { 
(ComponentControl)ied.ComponentControl0, 
(ComponentControl)ied.ComponentControl1, 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: New GS state/context API

2017-09-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: cd6e91d3a2012d2177732f27795e66c8c38e0aba
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=cd6e91d3a2012d2177732f27795e66c8c38e0aba

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Sep 11 17:29:12 2017 -0500

swr/rast: New GS state/context API

One piglit regression, which was a false pass:
  spec@glsl-1.50@execution@geometry@dynamic_input_array_index

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/core/frontend.cpp   | 227 -
 src/gallium/drivers/swr/rasterizer/core/state.h|  55 +++--
 src/gallium/drivers/swr/swr_shader.cpp | 183 -
 3 files changed, 253 insertions(+), 212 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f882869eb7..26e76a92ef 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* 
pStreamIdBase, uint32_t num
 
 THREAD SWR_GS_CONTEXT tlsGsContext;
 
-template
-struct GsBufferInfo
+// Buffers that are allocated if GS is enabled
+struct GsBuffers
 {
-GsBufferInfo(const SWR_GS_STATE )
-{
-const uint32_t vertexCount = gsState.maxNumVerts;
-const uint32_t vertexStride = sizeof(SIMDVERTEX);
-const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / 
SIMD_WIDTH;
+uint8_t* pGsIn;
+uint8_t* pGsOut[KNOB_SIMD_WIDTH];
+uint8_t* pGsTransposed;
+void* pStreamCutBuffer;
+};
 
-vertexPrimitiveStride = vertexStride * numSimdBatches;
-vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+//
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, 
fed into the primitive assembler
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
+/// @param numVerts - Number of vertices outputted by the GS
+/// @param numAttribs - Number of attributes per vertex
+template
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, 
uint32_t numAttribs)
+{
+uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
+uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;
 
-if (gsState.isSingleStream)
-{
-cutPrimitiveStride = (vertexCount + 7) / 8;
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
 
-streamCutPrimitiveStride = 0;
-streamCutInstanceStride = 0;
-}
-else
-{
-cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
-
-streamCutPrimitiveStride = (vertexCount + 7) / 8;
-streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
-}
+for (uint32_t i = 0; i < SimdWidth; ++i)
+{
+gatherOffsets[i] = srcVertexStride * i;
 }
+auto vGatherOffsets = SIMD_T::load_si((typename 
SIMD_T::Integer*)[0]);
 
-uint32_t vertexPrimitiveStride;
-uint32_t vertexInstanceStride;
+uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
+uint32_t remainingVerts = numVerts;
 
-uint32_t cutPrimitiveStride;
-uint32_t cutInstanceStride;
+for (uint32_t s = 0; s < numSimd; ++s)
+{
+uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
+uint8_t* pDstBase = pDst + s * dstVertexStride;
 
-uint32_t streamCutPrimitiveStride;
-uint32_t streamCutInstanceStride;
-};
+// Compute mask to prevent src overflow
+uint32_t mask = std::min(remainingVerts, SimdWidth);
+mask = GenMask(mask);
+auto vMask = SIMD_T::vmask_ps(mask);
+auto viMask = SIMD_T::castps_si(vMask);
+
+for (uint32_t a = 0; a < numAttribs; ++a)
+{
+auto attribGatherX = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)pSrcBase, 
vGatherOffsets, vMask);
+auto attribGatherY = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
sizeof(float)), vGatherOffsets, vMask);
+auto attribGatherZ = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
sizeof(float) * 2), vGatherOffsets, vMask);
+auto attribGatherW = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
sizeof(float) * 3), vGatherOffsets, vMask);
+
+SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
+SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename 
SIMD_T::Float)), viMask, attribGatherY);
+SIMD_T::maskstore

Mesa (master): swr/rast: Move SWR_GS_CONTEXT from thread local storage to stack

2017-09-25 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 9c82cf0f1e2b0496d135dc35dbb512e67b4e23f5
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=9c82cf0f1e2b0496d135dc35dbb512e67b4e23f5

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Sep 12 14:37:36 2017 -0500

swr/rast: Move SWR_GS_CONTEXT from thread local storage to stack

Move structure, as the size is significantly reduced due to dynamic
allocation of the GS buffers.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/core/frontend.cpp   | 23 +++---
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 26e76a92ef..15bc93db63 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -708,8 +708,6 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* 
pStreamIdBase, uint32_t num
 }
 }
 
-THREAD SWR_GS_CONTEXT tlsGsContext;
-
 // Buffers that are allocated if GS is enabled
 struct GsBuffers
 {
@@ -798,21 +796,22 @@ static void GeometryShaderStage(
 
 const API_STATE& state = GetApiState(pDC);
 const SWR_GS_STATE* pState = 
+SWR_GS_CONTEXT gsContext;
 
 static uint8_t sNullBuffer[1024] = { 0 };
 
 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
 {
-tlsGsContext.pStreams[i] = pGsBuffers->pGsOut[i];
+gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
 }
-tlsGsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
-tlsGsContext.PrimitiveID = primID;
+gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
+gsContext.PrimitiveID = primID;
 
 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
 simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
 
 // assemble all attributes for the input primitive
-tlsGsContext.inputVertStride = pState->inputVertStride;
+gsContext.inputVertStride = pState->inputVertStride;
 for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
 {
 uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
@@ -821,7 +820,7 @@ static void GeometryShaderStage(
 
 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
 {
-tlsGsContext.pVerts[attribSlot + pState->inputVertStride * i] = 
attrib[i];
+gsContext.pVerts[attribSlot + pState->inputVertStride * i] = 
attrib[i];
 }
 }
 
@@ -829,7 +828,7 @@ static void GeometryShaderStage(
 pa.Assemble(VERTEX_POSITION_SLOT, attrib);
 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
 {
-tlsGsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * 
i] = attrib[i];
+gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = 
attrib[i];
 }
 
 // record valid prims from the frontend to avoid over binning the newly 
generated
@@ -842,15 +841,15 @@ static void GeometryShaderStage(
 
 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
 {
-tlsGsContext.InstanceID = instance;
-tlsGsContext.mask = GenerateMask(numInputPrims);
+gsContext.InstanceID = instance;
+gsContext.mask = GenerateMask(numInputPrims);
 
 // execute the geometry shader
-state.pfnGsFunc(GetPrivateState(pDC), );
+state.pfnGsFunc(GetPrivateState(pDC), );
 
 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
 {
-tlsGsContext.pStreams[i] += pState->allocationSize;
+gsContext.pStreams[i] += pState->allocationSize;
 }
 }
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: remove llvm fence/atomics from generated files

2017-09-22 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 066d1dc951d3a0833de6abd8e004bf467e6e50eb
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=066d1dc951d3a0833de6abd8e004bf467e6e50eb

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Sep 19 14:04:20 2017 -0500

swr/rast: remove llvm fence/atomics from generated files

We currently don't use these instructions, and since their API
changed in llvm-5.0 having them in the autogen files broke the mesa
release tarballs which ship with generated autogen files.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102847
CC: mesa-sta...@lists.freedesktop.org
Tested-by: Laurent Carlier <lordhea...@gmail.com>
Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py | 8 
 1 file changed, 8 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 025d38ab33..ce892a9abe 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -140,6 +140,14 @@ def parse_ir_builder(input_file):
 
 ignore = False
 
+# The following functions need to be ignored in openswr.
+# API change in llvm-5.0 breaks baked autogen files
+if (
+(func_name == 'CreateFence' or
+ func_name == 'CreateAtomicCmpXchg' or
+ func_name == 'CreateAtomicRMW')):
+ignore = True
+
 # The following functions need to be ignored.
 if (func_name == 'CreateInsertNUWNSWBinOp' or
 func_name == 'CreateMaskedIntrinsic' or

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Missed conversion to SIMD_T

2017-09-13 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 1ccf9ad280415536056095314b470156e29b057e
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=1ccf9ad280415536056095314b470156e29b057e

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Aug 30 11:02:16 2017 -0500

swr/rast: Missed conversion to SIMD_T

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index a6713e8c5d..e08e4896f3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -1195,7 +1195,7 @@ void BinPostSetupPointsImpl(
 }
 
 OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
-_simd16_store_ps(reinterpret_cast(aPointSize), vPointSize);
+SIMD_T::store_ps(reinterpret_cast(aPointSize), vPointSize);
 
 uint32_t *pPrimID = (uint32_t *)
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Remove hardcoded clip/cull slot from clipper

2017-09-13 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: ae2412dbbdcff6583d7e4cf0430a409b86cb9e80
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=ae2412dbbdcff6583d7e4cf0430a409b86cb9e80

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Aug 22 17:46:14 2017 -0500

swr/rast: Remove hardcoded clip/cull slot from clipper

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/clip.h | 35 +++---
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index e0aaf81541..cde5261521 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -372,13 +372,15 @@ public:
 int ComputeUserClipCullMask(PA_STATE , typename SIMD_T::Vec4 prim[])
 {
 uint8_t cullMask = state.backendState.cullDistanceMask;
+uint32_t vertexClipCullOffset = 
state.backendState.vertexClipCullOffset;
+
 typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
 
 typename SIMD_T::Vec4 vClipCullDistLo[3];
 typename SIMD_T::Vec4 vClipCullDistHi[3];
 
-pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
-pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
+pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
+pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
 
 DWORD index;
 while (_BitScanForward(, cullMask))
@@ -488,21 +490,22 @@ public:
 }
 
 // assemble user clip distances if enabled
+uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
 if (state.backendState.clipDistanceMask & 0xf)
 {
-pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
+pa.Assemble(vertexClipCullSlot, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
-vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = 
tmpVector[i];
+vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
 }
 }
 
 if (state.backendState.clipDistanceMask & 0xf0)
 {
-pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
+pa.Assemble(vertexClipCullSlot + 1, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
-vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = 
tmpVector[i];
+vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
 }
 }
 
@@ -613,26 +616,27 @@ public:
 }
 
 // transpose user clip distances if enabled
+uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
 if (state.backendState.clipDistanceMask & 0x0f)
 {
-pBase = reinterpret_cast([0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * 
inputPrim;
+pBase = reinterpret_cast([0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
 
 for (uint32_t c = 0; c < 4; ++c)
 {
 SIMD256::Float temp = SIMD256::template 
mask_i32gather_ps(SIMD256::setzero_ps(), 
reinterpret_cast(pBase), vOffsets, vMask);
-transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] 
= SimdHelper::insert_lo_ps(temp);
+transposedPrims[0].attrib[vertexClipCullSlot][c] = 
SimdHelper::insert_lo_ps(temp);
 pBase += sizeof(typename SIMD_T::Float);
 }
 }
 
 if (state.backendState.clipDistanceMask & 0xf0)
 {
-pBase = reinterpret_cast([0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * 
inputPrim;
+pBase = reinterpret_cast([0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
 
 for (uint32_t c = 0; c < 4; ++c)
 {
 SIMD256::Float temp = SIMD256::template 
mask_i32gather_ps(SIMD256::setzero_ps(), 
reinterpret_cast(pBase), vOffsets, vMask);
-transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] 
= SimdHelper::insert_lo_ps(temp);
+transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = 
SimdHelper::insert_lo_ps(temp);
 pBase += sizeof(typename SIMD_T::Float);
 }
 }
@@ -692,6 +696,7 @@ public:
 
 // OOB indices => forced to zero.
 typename SIMD_T::Integer vpai = 
SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 typename SIMD_T::Integer vNumViewports = 
SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vCl

Mesa (master): swr/rast: Add new API SwrStallBE

2017-09-13 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: f5031fb9521ecf3be4af8584a80516c7307ad61a
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=f5031fb9521ecf3be4af8584a80516c7307ad61a

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Fri Aug 18 12:34:48 2017 -0500

swr/rast: Add new API SwrStallBE

SwrStallBE stalls the backend threads until all work submitted before
the stall has finished.  The frontend threads can continue to make
forward progress.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/api.cpp | 9 +
 src/gallium/drivers/swr/rasterizer/core/api.h   | 8 
 2 files changed, 17 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index ccb6dfb7a1..632309821f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -458,6 +458,14 @@ void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, 
uint64_t userData, uint
 AR_API_END(APISync, 1);
 }
 
+void SwrStallBE(HANDLE hContext)
+{
+SWR_CONTEXT* pContext = GetContext(hContext);
+DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+pDC->dependent = true;
+}
+
 void SwrWaitForIdle(HANDLE hContext)
 {
 SWR_CONTEXT *pContext = GetContext(hContext);
@@ -1672,6 +1680,7 @@ void SwrGetInterface(SWR_INTERFACE _funcs)
 out_funcs.pfnSwrSaveState = SwrSaveState;
 out_funcs.pfnSwrRestoreState = SwrRestoreState;
 out_funcs.pfnSwrSync = SwrSync;
+out_funcs.pfnSwrStallBE = SwrStallBE;
 out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle;
 out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE;
 out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers;
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h
index a39420552b..577cfb157a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -263,6 +263,13 @@ SWR_FUNC(void, SwrSync,
 uint64_t userData3);
 
 //
+/// @brief Stall cmd. Stalls the backend until all previous work has been 
completed.
+///Frontend work can continue to make progress
+/// @param hContext - Handle passed back from SwrCreateContext
+SWR_FUNC(void, SwrStallBE,
+HANDLE hContext);
+
+//
 /// @brief Blocks until all rendering has been completed.
 /// @param hContext - Handle passed back from SwrCreateContext
 SWR_FUNC(void, SwrWaitForIdle,
@@ -709,6 +716,7 @@ struct SWR_INTERFACE
 PFNSwrSaveState pfnSwrSaveState;
 PFNSwrRestoreState pfnSwrRestoreState;
 PFNSwrSync pfnSwrSync;
+PFNSwrStallBE pfnSwrStallBE;
 PFNSwrWaitForIdle pfnSwrWaitForIdle;
 PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE;
 PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers;

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: add graph write to jit debug output

2017-09-13 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 6b9e801832c2691b311ab2429fda1f9ec774f021
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=6b9e801832c2691b311ab2429fda1f9ec774f021

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Sep  7 15:18:08 2017 -0500

swr/rast: add graph write to jit debug output

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index fc32b627bd..e4281f8e92 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -296,10 +296,10 @@ void JitManager::DumpToFile(Function *f, const char 
*fileName)
 #endif
 fd.flush();
 
-//raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text);
-//WriteGraph(fd_cfg, (const Function*)f);
+raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text);
+WriteGraph(fd_cfg, (const Function*)f);
 
-//fd_cfg.flush();
+fd_cfg.flush();
 }
 }
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: adjust linux cpu topology identification code

2017-09-13 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: ead0dfe31ec7a1b1928e4abbfa99d59e0e5e929a
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=ead0dfe31ec7a1b1928e4abbfa99d59e0e5e929a

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Sep  6 14:59:33 2017 -0500

swr/rast: adjust linux cpu topology identification code

Make more robust to handle strange configurations like a vmware
exported 4-way numa X 1-core configuration.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/core/threads.cpp| 81 ++
 1 file changed, 38 insertions(+), 43 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp 
b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index b704d23f54..4bb395dec3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -169,37 +169,16 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, 
uint32_t& out_numThread
 std::ifstream input("/proc/cpuinfo");
 std::string line;
 char* c;
-uint32_t threadId = uint32_t(-1);
+uint32_t procId = uint32_t(-1);
 uint32_t coreId = uint32_t(-1);
-uint32_t numaId = uint32_t(-1);
+uint32_t physId = uint32_t(-1);
 
 while (std::getline(input, line))
 {
 if (line.find("processor") != std::string::npos)
 {
-if (threadId != uint32_t(-1))
-{
-// Save information.
-if (out_nodes.size() <= numaId)
-{
-out_nodes.resize(numaId + 1);
-}
-
-auto& numaNode = out_nodes[numaId];
-if (numaNode.cores.size() <= coreId)
-{
-numaNode.cores.resize(coreId + 1);
-}
-
-auto& core = numaNode.cores[coreId];
-core.procGroup = coreId;
-core.threadIds.push_back(threadId);
-
-out_numThreadsPerProcGroup++;
-}
-
 auto data_start = line.find(": ") + 2;
-threadId = std::strtoul(_str()[data_start], , 10);
+procId = std::strtoul(_str()[data_start], , 10);
 continue;
 }
 if (line.find("core id") != std::string::npos)
@@ -211,29 +190,32 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, 
uint32_t& out_numThread
 if (line.find("physical id") != std::string::npos)
 {
 auto data_start = line.find(": ") + 2;
-numaId = std::strtoul(_str()[data_start], , 10);
+physId = std::strtoul(_str()[data_start], , 10);
 continue;
 }
+if (line.length() == 0)
+{
+if (physId + 1 > out_nodes.size())
+out_nodes.resize(physId + 1);
+auto& numaNode = out_nodes[physId];
+numaNode.numaId = physId;
+
+if (coreId + 1 > numaNode.cores.size())
+numaNode.cores.resize(coreId + 1);
+auto& core = numaNode.cores[coreId];
+core.procGroup = coreId;
+core.threadIds.push_back(procId);
+}
 }
 
-if (threadId != uint32_t(-1))
+out_numThreadsPerProcGroup = 0;
+for (auto  : out_nodes)
 {
-// Save information.
-if (out_nodes.size() <= numaId)
+for (auto  : node.cores)
 {
-out_nodes.resize(numaId + 1);
+out_numThreadsPerProcGroup = 
std::max((size_t)out_numThreadsPerProcGroup,
+  core.threadIds.size());
 }
-auto& numaNode = out_nodes[numaId];
-numaNode.numaId = numaId;
-if (numaNode.cores.size() <= coreId)
-{
-numaNode.cores.resize(coreId + 1);
-}
-auto& core = numaNode.cores[coreId];
-
-core.procGroup = coreId;
-core.threadIds.push_back(threadId);
-out_numThreadsPerProcGroup++;
 }
 
 #else
@@ -316,7 +298,11 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, 
uint32_t procGroupId =
 CPU_ZERO();
 CPU_SET(threadId, );
 
-pthread_setaffinity_np(thread, sizeof(cpu_set_t), );
+int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), );
+if (err != 0)
+{
+fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", 
threadId, strerror(err));
+}
 
 #endif
 }
@@ -1031,7 +1017,16 @@ void CreateThreadPool(SWR_CONTEXT* pContext, 
THREAD_POOL* pPool)
 }
 else
 {
-pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 
2, 4, etc.)
+// numa distribution assumes workers on all nodes
+bool useNuma = true;
+if (numCoresPerNode * numHyperThreads == 1)
+useNuma = false;
+
+if (useNuma) {
+  

Mesa (master): swr/rast: Migrate memory pointers to gfxptr_t type

2017-09-13 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 6f0fcec07a16eb48ebdafffd0b4ae0bb5ac611a4
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=6f0fcec07a16eb48ebdafffd0b4ae0bb5ac611a4

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Sep  7 15:17:23 2017 -0500

swr/rast: Migrate memory pointers to gfxptr_t type

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../swr/rasterizer/codegen/gen_llvm_types.py|  2 +-
 src/gallium/drivers/swr/rasterizer/core/state.h |  5 +++--
 .../drivers/swr/rasterizer/memory/StoreTile.h   |  4 ++--
 .../drivers/swr/rasterizer/memory/TilingFunctions.h |  2 +-
 src/gallium/drivers/swr/swr_context.cpp | 18 +-
 src/gallium/drivers/swr/swr_draw.cpp|  8 
 src/gallium/drivers/swr/swr_resource.h  |  2 +-
 src/gallium/drivers/swr/swr_screen.cpp  | 21 ++---
 src/gallium/drivers/swr/swr_state.cpp   | 10 +-
 9 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
index 94f3f9feff..ccf2bde1ed 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
@@ -42,7 +42,7 @@ def gen_llvm_type(type, name, is_pointer, is_pointer_pointer, 
is_array, is_array
 else:
 if type == 'BYTE' or type == 'char' or type == 'uint8_t' or type == 
'int8_t' or type == 'bool':
 llvm_type = 'Type::getInt8Ty(ctx)'
-elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type 
== 'int64_t':
+elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type 
== 'int64_t' or type == 'gfxptr_t':
 llvm_type = 'Type::getInt64Ty(ctx)'
 elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t':
 llvm_type = 'Type::getInt16Ty(ctx)'
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index b0af663d50..13c1d8b7e9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -29,6 +29,7 @@
 
 #include "common/formats.h"
 #include "common/intrin.h"
+using gfxptr_t = unsigned long long;
 #include 
 #include 
 
@@ -513,7 +514,7 @@ enum SWR_AUX_MODE
 //
 struct SWR_SURFACE_STATE
 {
-uint8_t *pBaseAddress;
+gfxptr_t xpBaseAddress;
 SWR_SURFACE_TYPE type;  // @llvm_enum
 SWR_FORMAT format;  // @llvm_enum
 uint32_t width;
@@ -536,7 +537,7 @@ struct SWR_SURFACE_STATE
 
 uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces
 
-uint8_t *pAuxBaseAddress;   // Used for compression, append/consume 
counter, etc.
+gfxptr_t xpAuxBaseAddress;   // Used for compression, append/consume 
counter, etc.
 SWR_AUX_MODE auxMode;  // @llvm_enum
 
 
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h 
b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
index c3d14e9509..512c338027 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
@@ -1179,7 +1179,7 @@ struct StoreRasterTile
 resolveColor[3] *= oneOverNumSamples;
 
 // Use the resolve surface state
-SWR_SURFACE_STATE* pResolveSurface = 
(SWR_SURFACE_STATE*)pDstSurface->pAuxBaseAddress;
+SWR_SURFACE_STATE* pResolveSurface = 
(SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, 
false>((x + rx), (y + ry),
 pResolveSurface->arrayIndex + renderTargetArrayIndex, 
pResolveSurface->arrayIndex + renderTargetArrayIndex,
 0, pResolveSurface->lod, pResolveSurface);
@@ -2390,7 +2390,7 @@ struct StoreMacroTile
 }
 }
 
-if (pDstSurface->pAuxBaseAddress)
+if (pDstSurface->xpAuxBaseAddress)
 {
 uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * 
(FormatTraits::bpp / 8);
 // Store each raster tile from the hot tile to the destination 
surface.
diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h 
b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
index 9222d3edfb..6c801c7ff6 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
@@ -694,5 +694,5 @@ template
 INLINE
 void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t 
array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState)
 {
-return pState->pBaseAddress + ComputeSurfaceOffset(x, y, 
z, array, samp

Mesa (master): swr/rast: Start to remove hardcoded clipcull_dist vertex attrib slot

2017-09-13 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 5471f65976f39299b9fec7e98fd3b122fa86b499
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=5471f65976f39299b9fec7e98fd3b122fa86b499

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Aug 22 16:42:57 2017 -0500

swr/rast: Start to remove hardcoded clipcull_dist vertex attrib slot

Add new field in SWR_BACKEND_STATE::vertexClipCullOffset to specify the
start of the clip/cull section of the vertex header.  Removed use of
hardcoded slot from binner.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 11 ++-
 src/gallium/drivers/swr/rasterizer/core/state.h|  9 ++---
 src/gallium/drivers/swr/swr_state.cpp  |  3 +++
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 19afd1f292..a6713e8c5d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -366,16 +366,17 @@ PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t 
NumVerts, bool IsSwizzl
 /// @param clipDistMask - mask of enabled clip distances
 /// @param pUserClipBuffer - buffer to store results
 template
-void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t 
clipDistMask, float *pRecipW, float* pUserClipBuffer)
+void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, 
uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
 {
 DWORD clipDist;
+uint32_t clipDistMask = state.clipDistanceMask;
 while (_BitScanForward(, clipDistMask))
 {
 clipDistMask &= ~(1 << clipDist);
 uint32_t clipSlot = clipDist >> 2;
 uint32_t clipComp = clipDist & 0x3;
 uint32_t clipAttribSlot = clipSlot == 0 ?
-VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
+state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
 
 simd4scalar primClipDist[3];
 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
@@ -872,7 +873,7 @@ endBinTriangles:
 {
 uint32_t numClipDist = 
_mm_popcnt_u32(state.backendState.clipDistanceMask);
 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * 
sizeof(float));
-ProcessUserClipDist<3>(pa, triIndex, 
state.backendState.clipDistanceMask, [12], 
desc.pUserClipBuffer);
+ProcessUserClipDist<3>(state.backendState, pa, triIndex, 
[12], desc.pUserClipBuffer);
 }
 
 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
@@ -1248,7 +1249,7 @@ void BinPostSetupPointsImpl(
 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * 
sizeof(float));
 float dists[8];
 float one = 1.0f;
-ProcessUserClipDist<1>(pa, primIndex, 
backendState.clipDistanceMask, , dists);
+ProcessUserClipDist<1>(backendState, pa, primIndex, , 
dists);
 for (uint32_t i = 0; i < numClipDist; i++) {
 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
@@ -1577,7 +1578,7 @@ void BinPostSetupLinesImpl(
 {
 uint32_t numClipDist = 
_mm_popcnt_u32(state.backendState.clipDistanceMask);
 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * 
sizeof(float));
-ProcessUserClipDist<2>(pa, primIndex, 
state.backendState.clipDistanceMask, [12], 
desc.pUserClipBuffer);
+ProcessUserClipDist<2>(state.backendState, pa, primIndex, 
[12], desc.pUserClipBuffer);
 }
 
 MacroTileMgr *pTileMgr = pDC->pTileMgr;
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index 284c523eba..b0af663d50 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -1070,12 +1070,15 @@ struct SWR_BACKEND_STATE
 bool readRenderTargetArrayIndex;// Forward render target array index 
from last FE stage to the backend
 bool readViewportArrayIndex;// Read viewport array index from last 
FE stage during binning
 
-// user clip/cull distance enables
+   // Offset to the start of the attributes of the input vertices, in 
simdvector units
+uint32_t vertexAttribOffset;
+
+// User clip/cull distance enables
 uint8_t cullDistanceMask;
 uint8_t clipDistanceMask;
 
-   // Offset to the start of the attributes of the input vertices, in 
simdvector units
-uint32_t vertexAttribOffset;
+// Offset to clip/cull attrib section of the vertex, in simdvector units
+uint32_t vertexClipCullOffset;
 };
 
 
diff --git a/src/gallium/drivers/swr/swr_state.cpp 
b/src/gallium/drivers

Mesa (master): swr/rast: Fetch compile state changes

2017-09-13 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 000e2958f59a8d8e07f06e384546aa942d49b15f
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=000e2958f59a8d8e07f06e384546aa942d49b15f

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Sep  7 18:53:03 2017 -0500

swr/rast: Fetch compile state changes

Add InstanceStrideEnable field and rename InstanceDataStepRate to
InstanceAdvancementState in INPUT_ELEMENT_DESC structure.

Add stubs for handling InstanceStrideEnable in FetchJit::JitLoadVertices()
and FetchJit::JitGatherVertices() and assert if they are triggered.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 12 ++--
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h   |  7 ---
 src/gallium/drivers/swr/swr_state.cpp   |  2 +-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 761c58ca27..f3a4b27d9a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -360,7 +360,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE 
, Value* str
 Value *startOffset;
 if(ied.InstanceEnable)
 {
-Value* stepRate = C(ied.InstanceDataStepRate);
+Value* stepRate = C(ied.InstanceAdvancementState);
 
 // prevent a div by 0 for 0 step rate
 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
@@ -376,6 +376,10 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE 
, Value* str
 
 startOffset = startInstance;
 }
+else if (ied.InstanceStrideEnable)
+{
+SWR_ASSERT((0), "TODO: Fill out more once driver sends this 
down.");
+}
 else
 {
 // offset indices by baseVertex
@@ -825,7 +829,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 Value *startOffset;
 if(ied.InstanceEnable)
 {
-Value* stepRate = C(ied.InstanceDataStepRate);
+Value* stepRate = C(ied.InstanceAdvancementState);
 
 // prevent a div by 0 for 0 step rate
 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
@@ -841,6 +845,10 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 
 startOffset = startInstance;
 }
+else if (ied.InstanceStrideEnable)
+{
+SWR_ASSERT((0), "TODO: Fill out more once driver sends this 
down.");
+}
 else
 {
 // offset indices by baseVertex
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
index 4f456afffc..0dd6de759a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
@@ -45,16 +45,17 @@ struct INPUT_ELEMENT_DESC
 uint32_tFormat : 10;
 uint32_tStreamIndex : 6;
 uint32_tInstanceEnable : 1;
+uint32_tInstanceStrideEnable : 1;
 uint32_tComponentControl0 : 3;
 uint32_tComponentControl1 : 3;
 uint32_tComponentControl2 : 3;
 uint32_tComponentControl3 : 3;
 uint32_tComponentPacking : 4;
-uint32_t_reserved : 19;
+uint32_t_reserved : 18;
 };
 uint64_t bits;
 };
-uint32_t InstanceDataStepRate;
+uint32_t InstanceAdvancementState;
 };
 
 // used to set ComponentPacking
@@ -124,7 +125,7 @@ struct FETCH_COMPILE_STATE
 {
 if((layout[i].bits != other.layout[i].bits) ||
((layout[i].InstanceEnable == 1) &&
-(layout[i].InstanceDataStepRate != 
other.layout[i].InstanceDataStepRate))){
+(layout[i].InstanceAdvancementState != 
other.layout[i].InstanceAdvancementState))){
 return false;
 }
 }
diff --git a/src/gallium/drivers/swr/swr_state.cpp 
b/src/gallium/drivers/swr/swr_state.cpp
index 1491868eae..93108de065 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -531,7 +531,7 @@ swr_create_vertex_elements_state(struct pipe_context *pipe,
 ? ComponentControl::StoreSrc
 : ComponentControl::Store1Fp;
  velems->fsState.layout[i].ComponentPacking = ComponentEnable::XYZW;
- velems->fsState.layout[i].InstanceDataStepRate =
+ velems->fsState.layout[i].InstanceAdvancementState =
 attribs[i].instance_divisor;
 
  /* Calculate the pitch of each stream */

___
mesa-commit mai

Mesa (master): swr/rast: Move clip/cull enables in API

2017-09-13 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 966997269278d5eeeb6baf7d70fb99df0038b081
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=966997269278d5eeeb6baf7d70fb99df0038b081

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Aug 21 17:11:34 2017 -0500

swr/rast: Move clip/cull enables in API

Moved from SWR_RASTSTATE to SWR_BACKEND_STATE.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/core/backend.cpp|  4 ++--
 .../drivers/swr/rasterizer/core/backend_impl.h |  2 +-
 .../drivers/swr/rasterizer/core/backend_sample.cpp |  4 ++--
 .../swr/rasterizer/core/backend_singlesample.cpp   |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 18 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h | 22 +++---
 .../drivers/swr/rasterizer/core/rasterizer.cpp |  2 +-
 src/gallium/drivers/swr/rasterizer/core/state.h|  8 
 src/gallium/drivers/swr/swr_state.cpp  | 16 
 9 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 363349f6c8..6282e87f31 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -272,9 +272,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, 
uint32_t x, uint32_t y,
 AR_END(BEBarycentric, 0);
 
 // interpolate user clip distance if available
-if (state.rastState.clipDistanceMask)
+if (state.backendState.clipDistanceMask)
 {
-coverageMask &= 
~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.sample, psContext.vJ.sample);
+coverageMask &= 
~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.sample, psContext.vJ.sample);
 }
 
 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h 
b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
index 0f430ef3ab..593082bd7d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -886,7 +886,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, 
uint32_t x, uint32_t
 
 AR_END(BESetup, 0);
 
-PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, 
pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
+PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, 
pDepthBuffer, pStencilBuffer, state.backendState.clipDistanceMask);
 
 psContext.vY.UL = _simd_add_ps(vULOffsetsY, 
_simd_set1_ps(static_cast(y)));
 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, 
_simd_set1_ps(static_cast(y)));
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
index bb2e9a9f63..04e34aa264 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
@@ -128,9 +128,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t 
workerId, uint32_t x, uint32_
 AR_END(BEBarycentric, 0);
 
 // interpolate user clip distance if available
-if (state.rastState.clipDistanceMask)
+if (state.backendState.clipDistanceMask)
 {
-coverageMask &= 
~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.sample, psContext.vJ.sample);
+coverageMask &= 
~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.sample, psContext.vJ.sample);
 }
 
 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
index 18f4299f51..686b97912c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
@@ -112,9 +112,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t 
workerId, uint32_t x, uint3
 AR_END(BEBarycentric, 1);
 
 // interpolate user clip distance if available
-if (state.rastState.clipDistanceMask)
+if (state.backendState.clipDistanceMask)
 {
-coverageMask &= 
~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.center, psContext.vJ.center);
+

Mesa (master): swr/rast: whitespace changes

2017-09-13 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: c0ce5c4422a8a49124196da00577196ab22ab89c
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=c0ce5c4422a8a49124196da00577196ab22ab89c

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Sep  7 15:18:35 2017 -0500

swr/rast: whitespace changes

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/jitter/jit_api.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h 
b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
index 9f69669735..e589d2c6a7 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
@@ -51,6 +51,7 @@
 
 struct ShaderInfo;
 
+
 //
 /// Jit Compile Info Input
 //
@@ -63,6 +64,7 @@ struct JIT_COMPILE_INPUT
 size_t irLength;
 
 bool enableJitSampler;
+
 };
 
 extern "C"

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr: set caps for VB 4-byte alignment

2017-09-06 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 4edc5d830550355681df2147fd25dae4c77bccc0
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=4edc5d830550355681df2147fd25dae4c77bccc0

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Fri Aug 18 11:51:59 2017 -0500

swr: set caps for VB 4-byte alignment

Needed to compensate for change to fetch jit requiring
alignment.

Fixes regressions in piglit: vertex-buffer-offsets and about
another hundred of the vs-input*byte* tests.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/swr_screen.cpp | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/swr_screen.cpp 
b/src/gallium/drivers/swr/swr_screen.cpp
index cc8d9955b8..85bf765841 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -263,6 +263,12 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap 
param)
case PIPE_CAP_FAKE_SW_MSAA:
   return (swr_screen(screen)->msaa_max_count > 1) ? 0 : 1;
 
+   /* fetch jit change for 2-4GB buffers requires alignment */
+   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+  return 1;
+
   /* unsupported features */
case PIPE_CAP_ANISOTROPIC_FILTER:
case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
@@ -274,9 +280,6 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap 
param)
case PIPE_CAP_COMPUTE:
case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
-   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
-   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
-   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
case PIPE_CAP_TGSI_TEXCOORD:
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: FE/Clipper - unify SIMD8/16 functions using simdlib types

2017-09-06 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: dad32fc61c21601e3700b88914cd6b9c1271aa85
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=dad32fc61c21601e3700b88914cd6b9c1271aa85

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Aug 15 18:51:45 2017 -0500

swr/rast: FE/Clipper - unify SIMD8/16 functions using simdlib types

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/clip.cpp |   16 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h   | 1650 ++
 src/gallium/drivers/swr/rasterizer/core/state.h  |7 +
 3 files changed, 465 insertions(+), 1208 deletions(-)

Diff:   
http://cgit.freedesktop.org/mesa/mesa/diff/?id=dad32fc61c21601e3700b88914cd6b9c1271aa85
___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: FE/Binner - unify SIMD8/16 functions using simdlib types

2017-09-06 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 6cb20c9f3a327fe3c1a99d6824632aea238d7d72
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=6cb20c9f3a327fe3c1a99d6824632aea238d7d72

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Fri Aug  4 18:07:01 2017 -0500

swr/rast: FE/Binner - unify SIMD8/16 functions using simdlib types

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 2366 ++--
 src/gallium/drivers/swr/rasterizer/core/binner.h   |  186 +-
 .../drivers/swr/rasterizer/core/conservativeRast.h |1 +
 src/gallium/drivers/swr/rasterizer/core/pa.h   |   16 +
 src/gallium/drivers/swr/rasterizer/core/utils.h|8 +
 5 files changed, 767 insertions(+), 1810 deletions(-)

Diff:   
http://cgit.freedesktop.org/mesa/mesa/diff/?id=6cb20c9f3a327fe3c1a99d6824632aea238d7d72
___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Allow gather of floats from fetch shader with 2-4GB offsets

2017-09-06 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 4475583f5ea44c3585e0ffea6118ba3a32fddd72
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=4475583f5ea44c3585e0ffea6118ba3a32fddd72

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Aug  9 17:32:28 2017 -0500

swr/rast: Allow gather of floats from fetch shader with 2-4GB offsets

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py | 1 +
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp  | 7 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 2ed2b2f61e..025d38ab33 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -45,6 +45,7 @@ intrinsics = [
 ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
+['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
 ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
 ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']],
 ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index dcfe8970f5..761c58ca27 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1005,7 +1005,12 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 Value *vMask = vGatherMask;
 
 // Gather a SIMD of vertices
-vVertexElements[currentVertexElement++] = 
GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+// APIs allow a 4GB range for offsets
+// However, GATHERPS uses signed 32-bit 
offsets, so only a 2GB range :(
+// But, we know that elements must be aligned 
for FETCH. :)
+// Right shift the offset by a bit and then 
scale by 2 to remove the sign extension.
+Value* vShiftedOffsets = VPSRLI(vOffsets, 
C(1));
+vVertexElements[currentVertexElement++] = 
GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2));
 }
 else
 {

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: SIMD16 FE remove templated immediates workaround

2017-09-06 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 9df5691fffafdc31b82eb18f3cd5ce7d45eb83a2
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=9df5691fffafdc31b82eb18f3cd5ce7d45eb83a2

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Aug  9 18:48:58 2017 -0500

swr/rast: SIMD16 FE remove templated immediates workaround

Fixed properly in gcc-compatible fashion.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 110 -
 1 file changed, 20 insertions(+), 90 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index e09ff7a399..832c47d6e4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -404,35 +404,6 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, 
uint8_t clipDistMask,
 }
 }
 
-// WA linux compiler issue with SIMDLIB and shift immediates
-#define SIMD_WA_SXXI_EPI32 1
-
-#if SIMD_WA_SXXI_EPI32
-template
-simdscalari simd_wa_slli_epi32(simdscalari a)
-{
-return SIMD256::slli_epi32(a);
-}
-
-template
-simd16scalari simd_wa_slli_epi32(simd16scalari a)
-{
-return SIMD512::slli_epi32(a);
-}
-
-template
-simdscalari simd_wa_srai_epi32(simdscalari a)
-{
-return SIMD256::srai_epi32(a);
-}
-
-template
-simd16scalari simd_wa_srai_epi32(simd16scalari a)
-{
-return SIMD512::srai_epi32(a);
-}
-
-#endif
 INLINE
 void TransposeVertices(simd4scalar()[8], const simdscalar , const 
simdscalar , const simdscalar )
 {
@@ -804,17 +775,10 @@ endBinTriangles:
 }
 
 // Convert triangle bbox to macrotile units.
-#if SIMD_WA_SXXI_EPI32
-bbox.xmin = 
simd_wa_srai_epi32(bbox.xmin);
-bbox.ymin = 
simd_wa_srai_epi32(bbox.ymin);
-bbox.xmax = 
simd_wa_srai_epi32(bbox.xmax);
-bbox.ymax = 
simd_wa_srai_epi32(bbox.ymax);
-#else
-bbox.xmin = 
SIMD_T::srai_epi32(bbox.xmin);
-bbox.ymin = 
SIMD_T::srai_epi32(bbox.ymin);
-bbox.xmax = 
SIMD_T::srai_epi32(bbox.xmax);
-bbox.ymax = 
SIMD_T::srai_epi32(bbox.ymax);
-#endif
+bbox.xmin = SIMD_T::template 
srai_epi32(bbox.xmin);
+bbox.ymin = SIMD_T::template 
srai_epi32(bbox.ymin);
+bbox.xmax = SIMD_T::template 
srai_epi32(bbox.xmax);
+bbox.ymax = SIMD_T::template 
srai_epi32(bbox.ymax);
 
 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], 
aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
 
@@ -1034,13 +998,8 @@ void BinPostSetupPointsImpl(
 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
 
 // compute macro tile coordinates 
-#if SIMD_WA_SXXI_EPI32
-typename SIMD_T::Integer macroX = 
simd_wa_srai_epi32(vXi);
-typename SIMD_T::Integer macroY = 
simd_wa_srai_epi32(vYi);
-#else
-typename SIMD_T::Integer macroX = 
SIMD_T::srai_epi32(vXi);
-typename SIMD_T::Integer macroY = 
SIMD_T::srai_epi32(vYi);
-#endif
+typename SIMD_T::Integer macroX = SIMD_T::template 
srai_epi32(vXi);
+typename SIMD_T::Integer macroY = SIMD_T::template 
srai_epi32(vYi);
 
 OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
 
@@ -1048,30 +1007,15 @@ void BinPostSetupPointsImpl(
 SIMD_T::store_si(reinterpret_cast(aMacroY), macroY);
 
 // compute raster tile coordinates
-#if SIMD_WA_SXXI_EPI32
-typename SIMD_T::Integer rasterX = 
simd_wa_srai_epi32(vXi);
-typename SIMD_T::Integer rasterY = 
simd_wa_srai_epi32(vYi);
-#else
-typename SIMD_T::Integer rasterX = 
SIMD_T::srai_epi32(vXi);
-typename SIMD_T::Integer rasterY = 
SIMD_T::srai_epi32(vYi);
-#endif
+typename SIMD_T::Integer rasterX = SIMD_T::template 
srai_epi32(vXi);
+typename SIMD_T::Integer rasterY = SIMD_T::template 
srai_epi32(vYi);
 
 // compute raster tile relative x,y for coverage mask
-#if SIMD_WA_SXXI_EPI32
-typename SIMD_T::Integer tileAlignedX = 
simd_wa_slli_epi32(rasterX);
-typename SIMD_T::Integer tileAlignedY = 
simd_wa_slli_epi32(rasterY);
-#else
-typename SIMD_T::Integer tileAlignedX = 
SIMD_T::slli_epi32(rasterX);
-typename SIMD_T::Integer tileAlignedY = 
SIMD_T::slli_epi32(rasterY);
-#endif
+typename SIMD_T::Integer tileAlignedX = SIMD_T::template 
slli_epi32(rasterX);
+typename SIMD_T::Integer tileAlignedY = SIMD_T::template 
slli_epi32(rasterY);
 
-#if SIMD_WA_SXXI_EPI32
-typename SIMD_T::Integer tileRelativeX = 
SIMD_T::sub_epi32(simd_wa_srai_epi32(vXi), tileAlignedX);
-typename SIMD_T::Integer tileRelativeY = 
SIMD_T::sub_epi32(simd_wa_srai_epi32(vYi), tileAlignedY);
-#else
-typename SIMD_T::Integer tileRelativeX = 
SIMD_T::sub_epi32(SIMD_T::srai_epi32(vXi), tileAlignedX);
-typename SIMD_T::Integer tileRelativeY = 
SIMD_T::sub_epi32(SIMD_T::srai_epi32(vYi), tileAlignedY);
-#endif
+typename SIMD_T::Integer tileRelativeX = 
SIMD_T::sub_epi32(SIMD_T:

Mesa (master): swr/rast: Removed some trailing whitespace caught during review

2017-09-06 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 6afdc8732c4fca735803b6cbacf9723bbd02afa4
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=6afdc8732c4fca735803b6cbacf9723bbd02afa4

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Aug  1 15:21:04 2017 -0500

swr/rast: Removed some trailing whitespace caught during review

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/fifo.hpp |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/pa.h | 12 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git 
a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
index 0ca9a7828d..d1852b35fd 100644
--- 
a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
+++ 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
@@ -23,7 +23,7 @@
 * @file ${filename}
 *
 * @brief Event handler interface.  auto-generated file
-* 
+*
 * DO NOT EDIT
 *
 * Generation Command Line:
@@ -57,7 +57,7 @@ namespace ArchRast
 std::stringstream outDir;
 outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << 
std::ends;
 CreateDirectory(outDir.str().c_str(), NULL);
-
+
 // There could be multiple threads creating thread pools. We
 // want to make sure they are uniquly identified by adding in
 // the creator's thread id into the filename.
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp 
b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index 3be72f37cd..43d3a83226 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -79,7 +79,7 @@ struct QUEUE
 long initial = InterlockedCompareExchange(, 1, 0);
 return (initial == 0);
 }
-
+
 void unlock()
 {
 mLock = 0;
@@ -112,7 +112,7 @@ struct QUEUE
 __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
 _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
 };
-
+
 const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
 static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
 "FIFO element size should be multiple of SIMD width.");
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h 
b/src/gallium/drivers/swr/rasterizer/core/pa.h
index cb3470ff6b..87dba22bf8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -162,7 +162,7 @@ struct PA_STATE_OPT : public PA_STATE
 bool   isStreaming{ false };
 
 SIMDMASK   junkIndices  { 0 };  // temporary index store 
for unused virtual function
-
+
 PA_STATE_OPT() {}
 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, 
uint32_t streamSizeInVerts,
 uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = 
TOP_UNKNOWN);
@@ -412,7 +412,7 @@ struct PA_STATE_CUT : public PA_STATE
 uint32_t vertsPerPrim{ 0 };
 bool processCutVerts{ false };   // vertex indices with cuts should be 
processed as normal, otherwise they
  // are ignored.  Fetch shader sends 
invalid verts on cuts that should be ignored
- // while the GS sends valid verts for 
every index 
+ // while the GS sends valid verts for 
every index
 
 simdvector  junkVector;  // junk simdvector for unimplemented 
API
 #if ENABLE_AVX512_SIMD16
@@ -575,7 +575,7 @@ struct PA_STATE_CUT : public PA_STATE
 return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
 }
 
-// iterates across the unprocessed verts until we hit the end or we 
+// iterates across the unprocessed verts until we hit the end or we
 // have assembled SIMD prims
 void ProcessVerts()
 {
@@ -583,7 +583,7 @@ struct PA_STATE_CUT : public PA_STATE
 this->numRemainingVerts > 0 &&
 this->curVertex != this->headVertex)
 {
-// if cut index, restart topology 
+// if cut index, restart topology
 if (IsCutIndex(this->curVertex))
 {
 if (this->processCutVerts)
@@ -923,7 +923,7 @@ struct PA_STATE_CUT : public PA_STATE
 case 6:
 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
 AssembleTriStripAdj();
-
+
 uint32_t nextTri[6];
 if (this->reverseWinding)
 {
@@ -939,7 +939,7 @@ struct PA_STATE_CUT : p

Mesa (master): swr/rast: SIMD16 PA - rename Assemble_simd16 to Assemble

2017-09-06 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 404ac6da9e7eadd62c38e20f382b5280b29fa8bb
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=404ac6da9e7eadd62c38e20f382b5280b29fa8bb

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Aug  7 18:13:54 2017 -0500

swr/rast: SIMD16 PA - rename Assemble_simd16 to Assemble

For consistency and to support overloading.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/clip.h | 18 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   |  6 +++---
 src/gallium/drivers/swr/rasterizer/core/pa.h   | 22 +++---
 3 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index ffc69c4229..5238284e32 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -399,8 +399,8 @@ public:
 simd16vector vClipCullDistLo[3];
 simd16vector vClipCullDistHi[3];
 
-pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
-pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
+pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
+pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
 
 DWORD index;
 while (_BitScanForward(&index, cullMask))
@@ -680,7 +680,7 @@ public:
 {
 #if USE_SIMD16_FRONTEND
 simd16vector attrib_simd16[NumVertsPerPrim];
-bool assemble = 
clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
+bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, 
attrib_simd16);
 
 if (assemble)
 {
@@ -731,7 +731,7 @@ public:
 
 // assemble pos
 simd16vector tmpVector[NumVertsPerPrim];
-pa.Assemble_simd16(VERTEX_POSITION_SLOT, tmpVector);
+pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
 vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
@@ -748,7 +748,7 @@ public:
 maxSlot = std::max(maxSlot, mapSlot);
 uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
 
-pa.Assemble_simd16(inputSlot, tmpVector);
+pa.Assemble(inputSlot, tmpVector);
 
 // if constant interpolation enabled for this attribute, assign 
the provoking
 // vertex values to all edges
@@ -771,7 +771,7 @@ public:
 // assemble user clip distances if enabled
 if (this->state.rastState.clipDistanceMask & 0xf)
 {
-pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
+pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
 vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = 
tmpVector[i];
@@ -780,7 +780,7 @@ public:
 
 if (this->state.rastState.clipDistanceMask & 0xf0)
 {
-pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
+pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
 vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = 
tmpVector[i];
@@ -919,7 +919,7 @@ public:
 do
 {
 simd16vector attrib[NumVertsPerPrim];
-bool assemble = 
clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib);
+bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, 
attrib);
 
 if (assemble)
 {
@@ -1060,7 +1060,7 @@ public:
 if (state.backendState.readViewportArrayIndex)
 {
 simd16vector vpiAttrib[NumVertsPerPrim];
-pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
 // OOB indices => forced to zero.
 simd16scalari vpai = 
_simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 406a0e0bec..f882869eb7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -929,7 +929,7 @@ static void GeometryShaderStage(
 #if USE_SIMD16_FRONTEND
 simd16vector attrib_simd16[3];
 
-bool assemble = 
gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
+bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, 
attrib_simd16);
 
 #else
 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, 
attrib);
@@ -1297,7 +1297,7 @@ static void TessellationStages(
 

Mesa (master): swr/rast: Remove use of C++14 template variable

2017-09-06 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 1ebf6fc86556669fbb7b30e560119622497a5051
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=1ebf6fc86556669fbb7b30e560119622497a5051

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Aug 10 16:11:35 2017 -0500

swr/rast: Remove use of C++14 template variable

SWR rasterizer must remain C++11 compliant.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  6 +++---
 src/gallium/drivers/swr/rasterizer/core/binner.h   | 14 +++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 832c47d6e4..01c2f8f7a3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -502,7 +502,7 @@ void SIMDCALL BinTrianglesImpl(
 }
 
 // Adjust for pixel center location
-typename SIMD_T::Float offset = 
g_pixelOffsets[rastState.pixelLocation];
+typename SIMD_T::Float offset = 
SwrPixelOffsets::GetOffset(rastState.pixelLocation);
 
 tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
 tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
@@ -1332,7 +1332,7 @@ void BinPointsImpl(
 }
 }
 
-typename SIMD_T::Float offset = 
g_pixelOffsets[rastState.pixelLocation];
+typename SIMD_T::Float offset = 
SwrPixelOffsets::GetOffset(rastState.pixelLocation);
 
 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
@@ -1666,7 +1666,7 @@ void SIMDCALL BinLinesImpl(
 }
 
 // adjust for pixel center location
-typename SIMD_T::Float offset = 
g_pixelOffsets[rastState.pixelLocation];
+typename SIMD_T::Float offset = 
SwrPixelOffsets::GetOffset(rastState.pixelLocation);
 
 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h 
b/src/gallium/drivers/swr/rasterizer/core/binner.h
index e842aa663b..97e113f7f2 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.h
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.h
@@ -31,11 +31,19 @@
 //
 /// @brief Offsets added to post-viewport vertex positions based on
 /// raster state.
+///
+/// Can't use templated variable because we must stick with C++11 features.
+/// Template variables were introduced with C++14
 template <typename SIMD_T>
-static const typename SIMD_T::Float g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
+struct SwrPixelOffsets
 {
-SIMD_T::set1_ps(0.0f),  // SWR_PIXEL_LOCATION_CENTER
-SIMD_T::set1_ps(0.5f),  // SWR_PIXEL_LOCATION_UL
+public:
+INLINE static typename SIMD_T::Float GetOffset(uint32_t loc)
+{
+SWR_ASSERT(loc <= 1);
+
+return SIMD_T::set1_ps(loc ? 0.5f : 0.0f);
+}
 };
 
 //

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr: limit pipe_draw_info->restart_index usage

2017-08-23 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: f0602dc92044ea6d738d0e539e52f938a41f6093
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=f0602dc92044ea6d738d0e539e52f938a41f6093

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Aug 22 10:39:57 2017 -0500

swr: limit pipe_draw_info->restart_index usage

Only copy this value when in restart drawing mode.

Eliminates valgrind errors when running trivial programs.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/swr_draw.cpp | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/swr_draw.cpp 
b/src/gallium/drivers/swr/swr_draw.cpp
index df1c11abeb..2363800f80 100644
--- a/src/gallium/drivers/swr/swr_draw.cpp
+++ b/src/gallium/drivers/swr/swr_draw.cpp
@@ -107,7 +107,10 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
pipe_draw_info *info)
}
 
struct swr_vertex_element_state *velems = ctx->velems;
-   velems->fsState.cutIndex = info->restart_index;
+   if (info->primitive_restart)
+  velems->fsState.cutIndex = info->restart_index;
+   else
+  velems->fsState.cutIndex = 0;
velems->fsState.bEnableCutIndex = info->primitive_restart;
velems->fsState.bPartialVertexBuffer = (info->min_index > 0);
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: Fix invalid casting for calls to Interlocked* functions

2017-08-16 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: b333bc753e2dd1ed1a676606046a4289e7d58187
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=b333bc753e2dd1ed1a676606046a4289e7d58187

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Aug  7 20:33:24 2017 -0500

swr/rast: Fix invalid casting for calls to Interlocked* functions

CID: 1416243, 1416244, 1416255
CC: mesa-sta...@lists.freedesktop.org
Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/api.cpp | 2 +-
 src/gallium/drivers/swr/rasterizer/core/context.h   | 8 
 src/gallium/drivers/swr/rasterizer/core/threads.cpp | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 8dc9ac24a7..ccb6dfb7a1 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -189,7 +189,7 @@ void QueueWork(SWR_CONTEXT *pContext)
 
 if (IsDraw)
 {
-InterlockedIncrement((volatile long*)&pContext->drawsOutstandingFE);
+InterlockedIncrement(&pContext->drawsOutstandingFE);
 }
 
 _ReadWriteBarrier();
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h 
b/src/gallium/drivers/swr/rasterizer/core/context.h
index 131b3cbbb0..bcd5801a3b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -409,12 +409,12 @@ struct DRAW_CONTEXT
 bool dependent;  // Backend work is dependent on all 
previous BE
 bool isCompute;  // Is this DC a compute context?
 bool cleanupState;   // True if this is the last draw using an 
entry in the state ring.
-volatile bool   doneFE; // Is FE work done for this draw?
 
 FE_WORK FeWork;
 
+volatile OSALIGNLINE(bool)   doneFE; // Is FE work done for 
this draw?
 volatile OSALIGNLINE(uint32_t)   FeLock;
-volatile int32_t threadsDone;
+volatile OSALIGNLINE(uint32_t)   threadsDone;
 
 SYNC_DESC   retireCallback; // Call this func when this DC is retired.
 };
@@ -503,9 +503,9 @@ struct SWR_CONTEXT
 // Scratch space for workers.
 uint8_t** ppScratch;
 
-volatile int32_t  drawsOutstandingFE;
+volatile OSALIGNLINE(uint32_t)  drawsOutstandingFE;
 
-CachingAllocator cachingArenaAllocator;
+OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
 uint32_t frameCount;
 
 uint32_t lastFrameChecked;
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp 
b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 70bde027ee..b704d23f54 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -393,7 +393,7 @@ INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, 
uint32_t workerId, DRAW_CONT
 // inlined-only version
 INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t 
workerId, DRAW_CONTEXT* pDC)
 {
-int32_t result = InterlockedDecrement((volatile long*)&pDC->threadsDone);
+int32_t result = 
static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
 SWR_ASSERT(result >= 0);
 
 AR_FLUSH(pDC->drawId);
@@ -639,7 +639,7 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t 
workerId, DRAW_CONTEX
 _mm_mfence();
 pDC->doneFE = true;
 
-InterlockedDecrement((volatile long*)&pContext->drawsOutstandingFE);
+InterlockedDecrement(&pContext->drawsOutstandingFE);
 }
 
 void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t 
&curDrawFE)

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): configure: remove trailing "-a" in swr architecture test

2017-08-10 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 4d9b0dcccb81ad10113d9aef52b4c84496e879f1
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=4d9b0dcccb81ad10113d9aef52b4c84496e879f1

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Aug 10 12:58:57 2017 -0500

configure: remove trailing "-a" in swr architecture test

Fixes "configure: line 27326: test: argument expected"

CC: mesa-sta...@lists.freedesktop.org
Reviewed-by: Matt Turner <matts...@gmail.com>

---

 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index d2704bce05..f131788e3a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2545,7 +2545,7 @@ if test -n "$with_gallium_drivers"; then
 if test "x$HAVE_SWR_AVX" != xyes -a \
 "x$HAVE_SWR_AVX2" != xyes -a \
 "x$HAVE_SWR_KNL" != xyes -a \
-"x$HAVE_SWR_SKX" != xyes -a; then
+"x$HAVE_SWR_SKX" != xyes; then
AC_MSG_ERROR([swr enabled but no swr architectures selected])
 fi
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): st/osmesa: add osmesa framebuffer iface hash table per st manager

2017-08-03 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 9966c85e01a4344d2a6bb76e432e0bed70d52ff6
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=9966c85e01a4344d2a6bb76e432e0bed70d52ff6

Author: Bruce Cherniak 
Date:   Wed Aug  2 18:14:19 2017 -0500

st/osmesa: add osmesa framebuffer iface hash table per st manager

Commit bbc29393d3 didn't include osmesa state_tracker.  This patch adds
necessary initialization.

Fixes crash in OSMesa initialization.

Created-by: Charmaine Lee 
Tested-by: Bruce Cherniak 
Reviewed-by: Charmaine Lee 

Cc: 17.2 

---

 src/gallium/state_trackers/osmesa/osmesa.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/src/gallium/state_trackers/osmesa/osmesa.c 
b/src/gallium/state_trackers/osmesa/osmesa.c
index 18f1b88128..751d255c54 100644
--- a/src/gallium/state_trackers/osmesa/osmesa.c
+++ b/src/gallium/state_trackers/osmesa/osmesa.c
@@ -439,6 +439,7 @@ osmesa_st_framebuffer_validate(struct st_context_iface 
*stctx,
return TRUE;
 }
 
+static uint32_t osmesa_fb_ID = 0;
 
 static struct st_framebuffer_iface *
 osmesa_create_st_framebuffer(void)
@@ -448,6 +449,8 @@ osmesa_create_st_framebuffer(void)
   stfbi->flush_front = osmesa_st_framebuffer_flush_front;
   stfbi->validate = osmesa_st_framebuffer_validate;
   p_atomic_set(&stfbi->stamp, 1);
+  stfbi->ID = p_atomic_inc_return(&osmesa_fb_ID);
+  stfbi->state_manager = get_st_manager();
}
return stfbi;
 }
@@ -508,6 +511,14 @@ osmesa_find_buffer(enum pipe_format color_format,
 static void
 osmesa_destroy_buffer(struct osmesa_buffer *osbuffer)
 {
+   struct st_api *stapi = get_st_api();
+
+   /*
+* Notify the state manager that the associated framebuffer interface
+* is no longer valid.
+*/
+   stapi->destroy_drawable(stapi, osbuffer->stfb);
+
FREE(osbuffer->stfb);
FREE(osbuffer);
 }

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr: Add arch flags to support Cray and PGI compilers

2017-08-03 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: f0da70a96432dff8f9ebf054b352ce9db45f3ad6
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=f0da70a96432dff8f9ebf054b352ce9db45f3ad6

Author: Chuck Atkins <chuck.atk...@kitware.com>
Date:   Mon Jul 31 15:53:13 2017 -0400

swr: Add arch flags to support Cray and PGI compilers

Note that the Cray flags (-target-cpu=) need to come first since the
cray programming environment uses wrappers around other compilers.  By
checking the wrapper flags first, you can be sure to match the wrapper
flag instead of the underlying compiler (gcc, intel, pgi, etc.) flags.

Signed-off-by: Chuck Atkins <chuck.atk...@kitware.com>
Reviewed-by: Tim Rowley <timothy.o.row...@intel.com>

---

 configure.ac | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index 6302aa2b0c..3b45baf6d0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2511,7 +2511,7 @@ if test -n "$with_gallium_drivers"; then
 AC_SUBST([SWR_CXX11_CXXFLAGS])
 
 swr_require_cxx_feature_flags "AVX" "defined(__AVX__)" \
-",-mavx,-march=core-avx" \
+
",-target-cpu=sandybridge,-mavx,-march=core-avx,-tp=sandybridge" \
 SWR_AVX_CXXFLAGS
 AC_SUBST([SWR_AVX_CXXFLAGS])
 
@@ -2523,21 +2523,21 @@ if test -n "$with_gallium_drivers"; then
 ;;
 xavx2)
 swr_require_cxx_feature_flags "AVX2" "defined(__AVX2__)" \
-",-mavx2 -mfma -mbmi2 -mf16c,-march=core-avx2" \
+",-target-cpu=haswell,-mavx2 -mfma -mbmi2 
-mf16c,-march=core-avx2,-tp=haswell" \
 SWR_AVX2_CXXFLAGS
 AC_SUBST([SWR_AVX2_CXXFLAGS])
 HAVE_SWR_AVX2=yes
 ;;
 xknl)
 swr_require_cxx_feature_flags "KNL" "defined(__AVX512F__) 
&& defined(__AVX512ER__)" \
-",-march=knl,-xMIC-AVX512" \
+",-target-cpu=mic-knl,-march=knl,-xMIC-AVX512" \
 SWR_KNL_CXXFLAGS
 AC_SUBST([SWR_KNL_CXXFLAGS])
 HAVE_SWR_KNL=yes
 ;;
 xskx)
 swr_require_cxx_feature_flags "SKX" "defined(__AVX512F__) 
&& defined(__AVX512BW__)" \
-",-march=skylake-avx512,-xCORE-AVX512" \
+
",-target-cpu=x86-skylake,-march=skylake-avx512,-xCORE-AVX512" \
 SWR_SKX_CXXFLAGS
 AC_SUBST([SWR_SKX_CXXFLAGS])
 HAVE_SWR_SKX=yes

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: fix scons gen_knobs.h dependency

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: e4a6ae06cf01a21d7fe32e3ff2fc441102d68f82
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=e4a6ae06cf01a21d7fe32e3ff2fc441102d68f82

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Jul 31 16:59:06 2017 -0500

swr/rast: fix scons gen_knobs.h dependency

Copy/paste error was duplicating a gen_knobs.cpp rule.

Fixes: 5079c277b57 ("swr: [scons] Fix windows build")
Reviewed-by: Emil Velikov <emil.veli...@collabora.com>
Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/SConscript | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/SConscript 
b/src/gallium/drivers/swr/SConscript
index a32807d36b..c578d7a648 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -53,7 +53,7 @@ env.CodeGenerate(
 source = '',
 command = python_cmd + ' $SCRIPT --output $TARGET --gen_h'
 )
-Depends('rasterizer/codegen/gen_knobs.cpp',
+Depends('rasterizer/codegen/gen_knobs.h',
 swrroot + 'rasterizer/codegen/templates/gen_knobs.cpp')
 
 env.CodeGenerate(

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: fix movemask_ps / movemask_pd on AVX512

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: eddbd781af15f655a1dba6949e7c6b214f47e2f8
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=eddbd781af15f655a1dba6949e7c6b214f47e2f8

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Jul 20 17:06:14 2017 -0500

swr/rast: fix movemask_ps / movemask_pd on AVX512

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
index 1001417704..1dbfff8c9c 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -554,15 +554,20 @@ static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer 
a)
 
 static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
 {
-__mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi32(-1));
+__mmask8 m = _mm512_test_epi64_mask(castpd_si(a), 
set1_epi64(0x8000000000000000LL));
 return static_cast<uint32_t>(m);
 }
 static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
 {
-__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(-1));
+__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000));
 return static_cast<uint32_t>(m);
 }
 
+static SIMDINLINE Integer SIMDCALL set1_epi64(long long i) // return i (all 
elements are same value)
+{
+return _mm512_set1_epi64(i);
+}
+
 static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements 
are same value)
 {
 return _mm512_set1_epi32(i);

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: split gen_knobs templates into .h/.cpp

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 844be91e70413c1c3871d5f93b1e4766eb495df9
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=844be91e70413c1c3871d5f93b1e4766eb495df9

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Jul 31 17:22:12 2017 -0500

swr/rast: split gen_knobs templates into .h/.cpp

Switch to a 1:1 mapping template:generated for future maintenance.

Reviewed-by: Emil Velikov <emil.veli...@collabora.com>
Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/Makefile.am|   3 +-
 src/gallium/drivers/swr/SConscript |   2 +-
 .../drivers/swr/rasterizer/codegen/gen_knobs.py|  14 +-
 .../swr/rasterizer/codegen/templates/gen_knobs.cpp | 108 --
 .../swr/rasterizer/codegen/templates/gen_knobs.h   | 157 +
 5 files changed, 166 insertions(+), 118 deletions(-)

diff --git a/src/gallium/drivers/swr/Makefile.am 
b/src/gallium/drivers/swr/Makefile.am
index 73fe904a7d..b20f128bd2 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -115,7 +115,7 @@ rasterizer/codegen/gen_knobs.cpp: 
rasterizer/codegen/gen_knobs.py rasterizer/cod
--output rasterizer/codegen/gen_knobs.cpp \
--gen_cpp
 
-rasterizer/codegen/gen_knobs.h: rasterizer/codegen/gen_knobs.py 
rasterizer/codegen/knob_defs.py rasterizer/codegen/templates/gen_knobs.cpp 
rasterizer/codegen/gen_common.py
+rasterizer/codegen/gen_knobs.h: rasterizer/codegen/gen_knobs.py 
rasterizer/codegen/knob_defs.py rasterizer/codegen/templates/gen_knobs.h 
rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
$(PYTHON_GEN) \
$(srcdir)/rasterizer/codegen/gen_knobs.py \
@@ -347,5 +347,6 @@ EXTRA_DIST = \
rasterizer/codegen/templates/gen_builder.hpp \
rasterizer/codegen/templates/gen_header_init.hpp \
rasterizer/codegen/templates/gen_knobs.cpp \
+   rasterizer/codegen/templates/gen_knobs.h \
rasterizer/codegen/templates/gen_llvm.hpp \
rasterizer/codegen/templates/gen_rasterizer.cpp
diff --git a/src/gallium/drivers/swr/SConscript 
b/src/gallium/drivers/swr/SConscript
index c578d7a648..b394cbc17e 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -54,7 +54,7 @@ env.CodeGenerate(
 command = python_cmd + ' $SCRIPT --output $TARGET --gen_h'
 )
 Depends('rasterizer/codegen/gen_knobs.h',
-swrroot + 'rasterizer/codegen/templates/gen_knobs.cpp')
+swrroot + 'rasterizer/codegen/templates/gen_knobs.h')
 
 env.CodeGenerate(
 target = 'rasterizer/jitter/gen_state_llvm.h',
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py
index 2c271c7f5c..33f62a28ce 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py
@@ -37,27 +37,25 @@ def main(args=sys.argv[1:]):
 args = parser.parse_args()
 
 cur_dir = os.path.dirname(os.path.abspath(__file__))
-template_file = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp')
+template_cpp = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp')
+template_h = os.path.join(cur_dir, 'templates', 'gen_knobs.h')
 
 if args.gen_h:
 MakoTemplateWriter.to_file(
-template_file,
+template_h,
 args.output,
 cmdline=sys.argv,
 filename='gen_knobs',
-knobs=knob_defs.KNOBS,
-includes=['core/knobs_init.h', 'common/os.h', 'sstream', 
'iomanip'],
-gen_header=True)
+knobs=knob_defs.KNOBS)
 
 if args.gen_cpp:
 MakoTemplateWriter.to_file(
-template_file,
+template_cpp,
 args.output,
 cmdline=sys.argv,
 filename='gen_knobs',
 knobs=knob_defs.KNOBS,
-includes=['core/knobs_init.h', 'common/os.h', 'sstream', 
'iomanip'],
-gen_header=False)
+includes=['core/knobs_init.h', 'common/os.h', 'sstream', 
'iomanip'])
 
 return 0
 
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
index a9506434c6..2f4c47a92e 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
@@ -20,11 +20,7 @@
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
-% if gen_header:
-* @file ${filename}.h
-% else:
 * @file ${filename}.cpp
-% endif 
 *
 * @brief Dynamic Knobs for Core.
 *
@@ -35,105 +31,6 @@
 *
 **/
 <% calc_max_knob_len(knobs) %>
-%if gen_header:
-#pragma once
-#include 
-
-struct KnobBase
-{
-private:
-// Update the input 

Mesa (master): swr/rast: enable USE_SIMD16_FRONTEND by default

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 030cfa8eed9a91fe5b5ae59670a3001ac0b0f339
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=030cfa8eed9a91fe5b5ae59670a3001ac0b0f339

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Jul 19 17:49:17 2017 -0500

swr/rast: enable USE_SIMD16_FRONTEND by default

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/knobs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h 
b/src/gallium/drivers/swr/rasterizer/core/knobs.h
index 7ad6fe33f0..10bd4a5e70 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -40,7 +40,7 @@
 
 #define ENABLE_AVX512_SIMD16 1
 #define USE_8x2_TILE_BACKEND 1
-#define USE_SIMD16_FRONTEND 0
+#define USE_SIMD16_FRONTEND 1
 
 ///
 // Architecture validation

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: switch gen_knobs.cpp license

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: fb3e50a351b52014479a9a81226b7c51b176afed
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=fb3e50a351b52014479a9a81226b7c51b176afed

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Jul 31 17:48:12 2017 -0500

swr/rast: switch gen_knobs.cpp license

Unintentionally added with an apache2 license; relicense to match
the rest of the tree.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../swr/rasterizer/codegen/templates/gen_knobs.cpp | 29 +-
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
index 06b93bd72b..e6fe16533a 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
@@ -1,19 +1,24 @@
 /**
+* Copyright (C) 2015-2017 Intel Corporation.   All Rights Reserved.
 *
-* Copyright 2015-2017
-* Intel Corporation
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
 *
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
 *
-* http ://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
 *
 % if gen_header:
 * @file ${filename}.h

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: stop using MSFT types in platform independent code

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: f253798205a3ce7f577867a96ce487bf20e10909
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=f253798205a3ce7f577867a96ce487bf20e10909

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Jul 20 13:48:28 2017 -0500

swr/rast: stop using MSFT types in platform independent code

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/common/os.h |  6 --
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  2 +-
 src/gallium/drivers/swr/rasterizer/core/api.h  |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/blend.h|  2 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h |  8 
 src/gallium/drivers/swr/rasterizer/core/fifo.hpp   |  2 +-
 src/gallium/drivers/swr/rasterizer/core/format_traits.h|  4 ++--
 src/gallium/drivers/swr/rasterizer/core/pa.h   |  2 +-
 src/gallium/drivers/swr/rasterizer/core/threads.cpp|  4 ++--
 src/gallium/drivers/swr/rasterizer/core/tilemgr.h  | 12 ++--
 src/gallium/drivers/swr/rasterizer/core/utils.h| 10 ++
 src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp|  2 +-
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp |  4 ++--
 14 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h 
b/src/gallium/drivers/swr/rasterizer/common/os.h
index dc90fca750..4ed6b88e45 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -220,12 +220,6 @@ void *AlignedMalloc(unsigned int size, unsigned int 
alignment)
 return ret;
 }
 
-inline
-unsigned char _bittest(const LONG *a, LONG b)
-{
-return ((*(unsigned *)(a) & (1 << b)) != 0);
-}
-
 static inline
 void AlignedFree(void* p)
 {
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 855d133920..8dc9ac24a7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -189,7 +189,7 @@ void QueueWork(SWR_CONTEXT *pContext)
 
 if (IsDraw)
 {
-InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE);
+InterlockedIncrement((volatile long*)&pContext->drawsOutstandingFE);
 }
 
 _ReadWriteBarrier();
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h
index 236e0fcd66..a39420552b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -697,8 +697,8 @@ SWR_FUNC(void, SwrStoreHotTileToSurface,
 SWR_FUNC(void, SwrStoreHotTileClear,
  SWR_SURFACE_STATE *pDstSurface,
  SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
- UINT x,
- UINT y,
+ uint32_t x,
+ uint32_t y,
  uint32_t renderTargetArrayIndex,
  const float* pClearColor);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index de6691b4cf..c1f0f07804 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -64,7 +64,7 @@ INLINE void ProcessAttributes(
 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid 
value for NumVertsT");
 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
 // Conservative Rasterization requires degenerate tris to have constant 
attribute interpolation
-LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : 
backendState.constantInterpolationMask;
+uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : 
backendState.constantInterpolationMask;
 const uint32_t provokingVertex = 
pDC->pState->state.frontendState.topologyProvokingVertex;
 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
 
@@ -93,7 +93,7 @@ INLINE void ProcessAttributes(
 
 if (HasConstantInterpT::value || IsDegenerate::value)
 {
-if (_bittest(&constantInterpMask, i))
+if (CheckBit(constantInterpMask, i))
 {
 uint32_t vid;
 uint32_t adjustedTriIndex;
diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h 
b/src/gallium/drivers/swr/rasterizer/core/blend.h
index 1b98e442fd..c89c47646a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/blend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/blend.h
@@ -278,7 +278,7 @@ INLINE void Clamp(simdvector )
 }
 
 template
-void Blend(const SWR_BLEND_STATE *pBlendState, const 
SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector , simdvector& src1, BYTE 
*pDst, simdvector )
+void Blend(const SWR_BLEND_STATE *pBlendState, const 
SWR_RENDER_TARGET_BLEND_STATE *pState, simd

Mesa (master): swr/rast: fix USE_SIMD16_FRONTEND issues

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: d08493f9cef236af57538d4dd3087277f3a65ad2
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=d08493f9cef236af57538d4dd3087277f3a65ad2

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Tue Jul 18 23:52:38 2017 -0500

swr/rast: fix USE_SIMD16_FRONTEND issues

Fix problems found when enabling USE_SIMD16_FRONTEND, mostly related to
vMask / movemask_ps(pd).

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/common/simd16intrin.h| 14 ++
 .../drivers/swr/rasterizer/common/simdintrin.h  | 21 +
 .../swr/rasterizer/common/simdlib_128_avx.inl   | 15 +++
 .../swr/rasterizer/common/simdlib_256_avx.inl   | 10 ++
 .../swr/rasterizer/common/simdlib_512_avx512.inl|  4 ++--
 .../common/simdlib_512_avx512_knights.inl   | 21 -
 .../swr/rasterizer/common/simdlib_512_emu.inl   | 12 +---
 src/gallium/drivers/swr/rasterizer/core/backend.cpp |  2 +-
 .../drivers/swr/rasterizer/core/backend_impl.h  |  8 
 .../drivers/swr/rasterizer/core/backend_sample.cpp  |  2 +-
 .../swr/rasterizer/core/backend_singlesample.cpp|  2 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h  |  6 +++---
 .../drivers/swr/rasterizer/core/frontend.cpp|  2 +-
 src/gallium/drivers/swr/rasterizer/core/pa.h|  4 +++-
 14 files changed, 49 insertions(+), 74 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h 
b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
index a160ca2c5e..019b26d8cf 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
@@ -159,20 +159,10 @@ typedef SIMD512 SIMD16;
 #define _simd16_packus_epi32SIMD16::packus_epi32
 #define _simd16_packs_epi32 SIMD16::packs_epi32
 #define _simd16_cmplt_ps_mask   
SIMD16::cmp_ps_mask
+#define _simd16_cmpeq_ps_mask   
SIMD16::cmp_ps_mask
 #define _simd16_int2mask(mask)  simd16mask(mask)
 #define _simd16_mask2int(mask)  int(mask)
-
-// convert bitmask to vector mask
-SIMDINLINE simd16scalar vMask16(int32_t mask)
-{
-simd16scalari temp = _simd16_set1_epi32(mask);
-
-simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 
0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 
0x0002, 0x0001);
-
-simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), 
_simd16_and_si(temp, bits));
-
-return _simd16_castsi_ps(result);
-}
+#define _simd16_vmask_psSIMD16::vmask_ps
 
 #endif//ENABLE_AVX512_SIMD16
 
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h 
b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index f95c109e6f..f4b9e1055c 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -181,6 +181,7 @@ typedef SIMD256 SIMD;
 #define _simd_storeu2_siSIMD::storeu2_si
 
 #define _simd_blendv_epi32  SIMD::blendv_epi32
+#define _simd_vmask_ps  SIMD::vmask_ps
 
 template SIMDINLINE
 SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b)
@@ -188,26 +189,6 @@ SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, 
SIMD128::Integer b)
 return SIMD128::castps_si(SIMD128::blend_ps(SIMD128::castsi_ps(a), 
SIMD128::castsi_ps(b)));
 }
 
-// convert bitmask to vector mask
-SIMDINLINE
-SIMD256::Float vMask(int32_t mask)
-{
-SIMD256::Integer vec = SIMD256::set1_epi32(mask);
-const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 
0x08, 0x04, 0x02, 0x01);
-vec = SIMD256::and_si(vec, bit);
-vec = SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
-return SIMD256::castsi_ps(vec);
-}
-
-SIMDINLINE
-SIMD256::Integer vMaski(int32_t mask)
-{
-SIMD256::Integer vec = SIMD256::set1_epi32(mask);
-const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 
0x08, 0x04, 0x02, 0x01);
-vec = SIMD256::and_si(vec, bit);
-return SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
-}
-
 SIMDINLINE
 void _simd_mov(simdscalar , unsigned int rlane, simdscalar& s, unsigned int 
slane)
 {
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
index 5bcedf3971..7232791893 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
@@ -519,6 +519,11 @@ static SIMDINLINE Float SIMDCALL set_ps(float in3, float 
in2, float in1, float i
 return _mm_set_ps(in3, in2, in1, in0);
 }
 
+static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int 
in0)
+{
+return _

Mesa (master): swr/rast: disable AVX512 optimization of SSE / AVX code

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: f8a572cdf0cf7fb52348adc7862a7ffc612180ef
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=f8a572cdf0cf7fb52348adc7862a7ffc612180ef

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Jul 19 16:16:57 2017 -0500

swr/rast: disable AVX512 optimization of SSE / AVX code

Disable an optimization which implemented sse/avx operations on avx512
using avx512 intrinsics (to avoid switching between lane widths).

Compile with SIMD_OPT_128_AVX512 / SIMD_OPT_256_AVX512 defined to enable
these optimizations.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/rasterizer/common/simdlib.hpp | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp 
b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
index 0c79cdd660..a4b5854d00 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -53,6 +53,7 @@ namespace SIMDImpl
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
 struct AVX512Impl : AVX2Impl
 {
+#if defined(SIMD_OPT_128_AVX512)
 #define __SIMD_LIB_AVX512_HPP__
 #include "simdlib_128_avx512.inl"
 #if defined(SIMD_ARCH_KNIGHTS)
@@ -61,6 +62,7 @@ namespace SIMDImpl
 #include "simdlib_128_avx512_core.inl"
 #endif // defined(SIMD_ARCH_KNIGHTS)
 #undef __SIMD_LIB_AVX512_HPP__
+#endif // SIMD_OPT_128_AVX512
 }; // struct AVX2Impl
 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
 
@@ -108,6 +110,7 @@ namespace SIMDImpl
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
 struct AVX512Impl : AVX2Impl
 {
+#if defined(SIMD_OPT_256_AVX512)
 #define __SIMD_LIB_AVX512_HPP__
 #include "simdlib_256_avx512.inl"
 #if defined(SIMD_ARCH_KNIGHTS)
@@ -116,6 +119,7 @@ namespace SIMDImpl
 #include "simdlib_256_avx512_core.inl"
 #endif // defined(SIMD_ARCH_KNIGHTS)
 #undef __SIMD_LIB_AVX512_HPP__
+#endif // SIMD_OPT_256_AVX512
 }; // struct AVX2Impl
 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
 

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: constify swr rasterizer

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 08e3c369550be2842d32fd05d2e9ba68fb1b08f8
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=08e3c369550be2842d32fd05d2e9ba68fb1b08f8

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Wed Jul 26 12:27:44 2017 -0500

swr/rast: constify swr rasterizer

Add "const" as appropriate in method/function signatures.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/common/simdintrin.h |  14 +-
 .../drivers/swr/rasterizer/common/simdlib.hpp  |  10 +-
 .../swr/rasterizer/common/simdlib_256_avx.inl  | 130 +--
 .../swr/rasterizer/common/simdlib_256_avx2.inl |  32 ++---
 .../swr/rasterizer/common/simdlib_512_emu.inl  | 143 +++--
 .../swr/rasterizer/common/simdlib_types.hpp|  76 +--
 .../drivers/swr/rasterizer/core/backend_impl.h |  12 +-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  36 +++---
 src/gallium/drivers/swr/rasterizer/core/binner.h   |   4 +-
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   |  12 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h |  66 +-
 src/gallium/drivers/swr/rasterizer/core/context.h  |   8 +-
 .../drivers/swr/rasterizer/core/depthstencil.h |  12 +-
 .../swr/rasterizer/core/format_conversion.h|  18 ++-
 .../drivers/swr/rasterizer/core/format_types.h |  71 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   |   4 +-
 src/gallium/drivers/swr/rasterizer/core/frontend.h |  12 +-
 src/gallium/drivers/swr/rasterizer/core/state.h|   2 +-
 18 files changed, 339 insertions(+), 323 deletions(-)

Diff:   
http://cgit.freedesktop.org/mesa/mesa/diff/?id=08e3c369550be2842d32fd05d2e9ba68fb1b08f8
___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr/rast: fix core / knights split of AVX512 intrinsics

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 7cd50b9e47a8ad131795da270039da87e0175143
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=7cd50b9e47a8ad131795da270039da87e0175143

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Thu Jul 27 15:33:10 2017 -0500

swr/rast: fix core / knights split of AVX512 intrinsics

Move AVX512BW specific intrinsics to be Core-only.

Move some AVX512F intrinsics back to common implementation file.

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 .../drivers/swr/rasterizer/common/simdlib.hpp  |  2 +
 .../swr/rasterizer/common/simdlib_512_avx512.inl   | 53 +
 .../rasterizer/common/simdlib_512_avx512_core.inl  | 54 ++
 .../common/simdlib_512_avx512_knights.inl  | 15 --
 4 files changed, 69 insertions(+), 55 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp 
b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
index 22d7da42d0..500cf8a87e 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -214,6 +214,8 @@ struct SIMDBase : Traits::IsaImpl
 using Vec4  = typename Traits::Vec4;
 using Mask  = typename Traits::Mask;
 
+static const size_t VECTOR_BYTES = sizeof(Float);
+
 // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx 
yyyy zzzz wwww.
 static SIMDINLINE
 void vec4_load1_ps(Vec4& r, const float *p)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
index 1dbfff8c9c..95e4c31909 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -158,6 +158,11 @@ private:
 return _mm512_maskz_set1_epi32(m, -1);
 }
 
+static SIMDINLINE Integer vmask(__mmask8 m)
+{
+return _mm512_maskz_set1_epi64(m, -1LL);
+}
+
 public:
 //---
 // Single precision floating point arithmetic operations
@@ -187,8 +192,8 @@ static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return 
round_ps 0xff) ? 0xff : (a + b) 
(uint8) 
+//SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+//SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) 
(uint8) 
 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
@@ -202,7 +207,7 @@ SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
 SIMD_IWRAPPER_2(mullo_epi32);
 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
-SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+//SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
 
 //---
 // Logical operations
@@ -276,7 +281,7 @@ static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // 
return (float)a(i
 return _mm512_cvtepi32_ps(a);
 }
 
-SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a(uint8 --> int16)
+//SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a(uint8 --> int16)
 SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a(uint8 --> int32)
 SIMD_IWRAPPER_1_8(cvtepu16_epi32);// return (int32)a(uint16 --> int32)
 SIMD_IWRAPPER_1_4(cvtepu16_epi64);// return (int64)a(uint16 --> int64)
@@ -317,20 +322,6 @@ static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float 
b) { return cmp_ps(a, b); }
 
 template
-static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
-{
-// Legacy vector mask generator
-__mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast(CmpTypeT));
-return vmask(result);
-}
-template
-static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
-{
-// Legacy vector mask generator
-__mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast(CmpTypeT));
-return vmask(result);
-}
-template
 static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
 {
 // Legacy vector mask generator
@@ -345,12 +336,12 @@ static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, 
Integer b)
 return vmask(result);
 }
 
-SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8);// return a 
== b (int8)
-SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16);   // return a 
== b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8);// return 
a == b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16);   // return 
a == b (int16)
 SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32);   // return a 
== b (int32)
 SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64);   // return a 
== b (int64)
-SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8);// return a 
> b (int8)
-SIMD_IWRAPPER_2_CM

Mesa (master): swr/rast: simdlib better separation of core vs knights avx512

2017-08-02 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 07062daae93b146458db55ba22a2e27d3d59552b
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=07062daae93b146458db55ba22a2e27d3d59552b

Author: Tim Rowley <timothy.o.row...@intel.com>
Date:   Mon Jul 24 16:13:12 2017 -0500

swr/rast: simdlib better separation of core vs knights avx512

Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com>

---

 src/gallium/drivers/swr/Makefile.am|   2 +-
 src/gallium/drivers/swr/Makefile.sources   |   8 +
 .../drivers/swr/rasterizer/common/simdlib.hpp  |  21 ++-
 .../swr/rasterizer/common/simdlib_128_avx512.inl   | 108 +++-
 .../rasterizer/common/simdlib_128_avx512_core.inl  | 193 +
 .../common/simdlib_128_avx512_knights.inl  |  35 
 .../swr/rasterizer/common/simdlib_256_avx512.inl   | 128 +++---
 .../rasterizer/common/simdlib_256_avx512_core.inl  | 127 ++
 .../common/simdlib_256_avx512_knights.inl  |  35 
 .../swr/rasterizer/common/simdlib_512_avx512.inl   |  79 +++--
 .../rasterizer/common/simdlib_512_avx512_core.inl  | 181 +++
 .../common/simdlib_512_avx512_knights.inl  | 183 +++
 .../common/simdlib_512_avx512_masks_core.inl   |  27 +++
 .../common/simdlib_512_avx512_masks_knights.inl|  27 +++
 .../swr/rasterizer/common/simdlib_types.hpp|   2 +-
 15 files changed, 911 insertions(+), 245 deletions(-)

Diff:   
http://cgit.freedesktop.org/mesa/mesa/diff/?id=07062daae93b146458db55ba22a2e27d3d59552b
___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


  1   2   3   4   5   6   >