Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP - PA

2017-04-11 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: c8cc07ca25914511830c2ce92ff1cd221f5afaae
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=c8cc07ca25914511830c2ce92ff1cd221f5afaae

Author: Tim Rowley 
Date:   Thu Apr  6 16:37:03 2017 -0500

swr: [rasterizer core] SIMD16 Frontend WIP - PA

Fix PA NextPrim for SIMD8 on SIMD16.

Reviewed-by: Bruce Cherniak 

---

 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 44 +++---
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp 
b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
index 3e3b7abab5..6a249638ad 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
@@ -456,7 +456,7 @@ static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t 
slot, simdvector verts[])
 PaPatchList,
 PaPatchListSingle,
 0,
-KNOB_SIMD_WIDTH,
+PA_STATE_OPT::SIMD_WIDTH,
 true);
 
 return true;
@@ -509,7 +509,7 @@ static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, 
uint32_t slot, simd16vector
 PaPatchList,
 PaPatchListSingle,
 0,
-KNOB_SIMD16_WIDTH,
+PA_STATE_OPT::SIMD_WIDTH,
 true);
 
 return true;
@@ -736,7 +736,7 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector 
verts[])
 }
 
 #endif
-SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true);
+SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, 
PA_STATE_OPT::SIMD_WIDTH, true);
 return true;
 }
 
@@ -783,7 +783,7 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, 
simd16vector verts[])
 v2[i] = _simd16_permute_ps(temp2, perm2);
 }
 
-SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 
0, KNOB_SIMD16_WIDTH, true);
+SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 
0, PA_STATE_OPT::SIMD_WIDTH, true);
 return true;
 }
 
@@ -1014,7 +1014,7 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, 
simdvector verts[])
 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
 }
 
-SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH);
+SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, 
PA_STATE_OPT::SIMD_WIDTH);
 return true;
 }
 
@@ -1052,7 +1052,7 @@ bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, 
simd16vector verts[])
 v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2));  
 // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
 }
 
-SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, 
PaTriStripSingle0, 0, KNOB_SIMD16_WIDTH);
+SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, 
PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
 return true;
 }
 
@@ -1288,7 +1288,7 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, 
simdvector verts[])
 v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
 }
 
-SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH);
+SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, 
PA_STATE_OPT::SIMD_WIDTH);
 return true;
 }
 
@@ -1345,7 +1345,7 @@ bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, 
simd16vector verts[])
 v1[i] = _simd16_shuffle_ps(b[i], v2[i], _MM_SHUFFLE(2, 1, 2, 1));  
 // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
 }
 
-SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, 
KNOB_SIMD16_WIDTH);
+SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, 
PA_STATE_OPT::SIMD_WIDTH);
 return true;
 }
 
@@ -1480,7 +1480,7 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, 
simdvector verts[])
 v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
 }
 
-SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, 
true);
+SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, 
PA_STATE_OPT::SIMD_WIDTH, true);
 return true;
 }
 
@@ -1515,7 +1515,7 @@ bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, 
simd16vector verts[])
 v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2)); 
 // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
 }
 
-SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, 
PaQuadListSingle0, 0, KNOB_SIMD16_WIDTH, true);
+SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, 
PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
 return true;
 }
 
@@ -1735,7 +1735,7 @@ bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, 
simdvector verts[])
 }
 }
 
-SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH);
+SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, 
PA_STATE_OPT::SIMD_WIDTH);
 return true;
 }
 
@@ -1765,7 +1765,7 @@ bool 

Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP - Clipper

2017-04-11 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 08a71368483f2e35b135ebe56ec5746cc94ac452
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=08a71368483f2e35b135ebe56ec5746cc94ac452

Author: Tim Rowley 
Date:   Thu Apr  6 15:22:55 2017 -0500

swr: [rasterizer core] SIMD16 Frontend WIP - Clipper

Implement widened clipper for SIMD16.

Reviewed-by: Bruce Cherniak 

---

 .../drivers/swr/rasterizer/common/simd16intrin.h   |   41 +-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |   17 +-
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   |   91 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h | 1027 ++--
 src/gallium/drivers/swr/rasterizer/core/frontend.h |   29 +-
 5 files changed, 1011 insertions(+), 194 deletions(-)

Diff:   
http://cgit.freedesktop.org/mesa/mesa/diff/?id=08a71368483f2e35b135ebe56ec5746cc94ac452
___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP

2017-04-05 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: cd6c200223f7c6f5bac6bd2f2991bccf363fa7d9
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=cd6c200223f7c6f5bac6bd2f2991bccf363fa7d9

Author: Tim Rowley 
Date:   Sun Mar 26 15:46:42 2017 -0500

swr: [rasterizer core] SIMD16 Frontend WIP

Implement widened VS output for SIMD16

Reviewed-by: Bruce Cherniak 

---

 .../drivers/swr/rasterizer/core/frontend.cpp   | 42 +-
 src/gallium/drivers/swr/rasterizer/core/state.h|  9 +++--
 2 files changed, 14 insertions(+), 37 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 9df7eeadc1..8cf234cd67 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1543,6 +1543,8 @@ void ProcessDraw(
 
 vsContext_lo.pVin = &vin_lo;
 vsContext_hi.pVin = &vin_hi;
+vsContext_lo.AlternateOffset = 0;
+vsContext_hi.AlternateOffset = 1;
 
 SWR_FETCH_CONTEXT   fetchInfo_lo = { 0 };
 
@@ -1612,20 +1614,18 @@ void ProcessDraw(
 pvCutIndices_hi = _cast(())[1];
 }
 
-simdvertex vout_lo;
-simdvertex vout_hi;
-
-vsContext_lo.pVout = &vout_lo;
-vsContext_hi.pVout = &vout_hi;
-
 simd16vertex &vout = pa.GetNextVsOutput();
 
+vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
+vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
+
 if (i < endVertex)
 {
 // 1. Execute FS/VS for a single SIMD.
 AR_BEGIN(FEFetchShader, pDC->drawId);
 state.pfnFetchFunc(fetchInfo_lo, vin_lo);
-if ((i + KNOB_SIMD_WIDTH) < endVertex)
+
+if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of 
KNOB_SIMD16_WIDTH
 {
 state.pfnFetchFunc(fetchInfo_hi, vin_hi);
 }
@@ -1655,35 +1655,9 @@ void ProcessDraw(
 AR_BEGIN(FEVertexShader, pDC->drawId);
 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
 
-// copy SIMD vout_lo to lo part of SIMD16 vout
-{
-const uint32_t attribCount = sizeof(vout.attrib) / 
sizeof(vout.attrib[0]);
-
-for (uint32_t i = 0; i < attribCount; i += 1)
-{
-for (uint32_t j = 0; j < 4; j += 1)
-{
-vout.attrib[i][j] = 
_simd16_insert_ps(_simd16_setzero_ps(), vout_lo.attrib[i][j], 0);
-}
-}
-}
-
-if ((i + KNOB_SIMD_WIDTH) < endVertex)
+if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of 
KNOB_SIMD16_WIDTH
 {
 state.pfnVertexFunc(GetPrivateState(pDC), 
&vsContext_hi);
-
-// copy SIMD vout_hi to hi part of SIMD16 vout
-{
-const uint32_t attribCount = sizeof(vout.attrib) / 
sizeof(vout.attrib[0]);
-
-for (uint32_t i = 0; i < attribCount; i += 1)
-{
-for (uint32_t j = 0; j < 4; j += 1)
-{
-vout.attrib[i][j] = 
_simd16_insert_ps(vout.attrib[i][j], vout_hi.attrib[i][j], 1);
-}
-}
-}
 }
 AR_END(FEVertexShader, 0);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index bb1336c429..623e70a151 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -201,9 +201,12 @@ struct SWR_VS_CONTEXT
 simdvertex* pVin;   // IN: SIMD input vertex data store
 simdvertex* pVout;  // OUT: SIMD output vertex data store
 
-uint32_t InstanceID;// IN: Instance ID, constant across all verts of 
the SIMD
-simdscalari VertexID;   // IN: Vertex ID
-simdscalari mask;   // IN: Active mask for shader
+uint32_t InstanceID;// IN: Instance ID, constant across all verts 
of the SIMD
+simdscalari VertexID;   // IN: Vertex ID
+simdscalari mask;   // IN: Active mask for shader
+#if USE_SIMD16_FRONTEND
+uint32_t AlternateOffset;   // IN: amount to offset for interleaving 
even/odd simd8 in simd16vertex output
+#endif
 };
 
 /

___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP

2017-04-05 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: d5157ddca4072856e0afce3d7af8929a7d387044
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=d5157ddca4072856e0afce3d7af8929a7d387044

Author: Tim Rowley 
Date:   Wed Mar 29 12:58:18 2017 -0500

swr: [rasterizer core] SIMD16 Frontend WIP

Implement widened binner for SIMD16

Reviewed-by: Bruce Cherniak 

---

 .../drivers/swr/rasterizer/common/simd16intrin.h   |   44 +-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 1642 +---
 src/gallium/drivers/swr/rasterizer/core/frontend.h |   98 ++
 src/gallium/drivers/swr/rasterizer/core/utils.h|   10 +
 4 files changed, 1509 insertions(+), 285 deletions(-)

Diff:   
http://cgit.freedesktop.org/mesa/mesa/diff/?id=d5157ddca4072856e0afce3d7af8929a7d387044
___
mesa-commit mailing list
mesa-commit@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-commit


Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP

2017-03-28 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: 549b9d2e9f1547af3fb061a7956b04fb30870a6d
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=549b9d2e9f1547af3fb061a7956b04fb30870a6d

Author: Tim Rowley 
Date:   Mon Mar 20 12:17:07 2017 -0500

swr: [rasterizer core] SIMD16 Frontend WIP

Fix GS and streamout.

Reviewed-by: George Kyriazis 

---

 src/gallium/drivers/swr/rasterizer/core/clip.h | 61 ++
 .../drivers/swr/rasterizer/core/frontend.cpp   | 97 +-
 2 files changed, 136 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index eec65707e7..3a79d6a34c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -376,7 +376,16 @@ public:
 const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
 
 uint32_t numClippedPrims = 0;
+#if USE_SIMD16_FRONTEND
+const uint32_t numPrims = pa.NumPrims();
+const uint32_t numPrims_lo = std::min(numPrims, 
KNOB_SIMD_WIDTH);
+
+SWR_ASSERT(numPrims <= numPrims_lo);
+
+for (uint32_t inputPrim = 0; inputPrim < numPrims_lo; ++inputPrim)
+#else
 for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
+#endif
 {
 uint32_t numEmittedVerts = pVertexCount[inputPrim];
 if (numEmittedVerts < NumVertsPerPrim)
@@ -391,13 +400,28 @@ public:
 // tranpose clipper output so that each lane's vertices are in 
SIMD order
 // set aside space for 2 vertices, as the PA will try to read up 
to 16 verts
 // for triangle fan
+#if USE_SIMD16_FRONTEND
+simd16vertex transposedPrims[2];
+#else
 simdvertex transposedPrims[2];
+#endif
 
 // transpose pos
 uint8_t* pBase = 
(uint8_t*)([0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * 
inputPrim;
+
+#if USE_SIMD16_FRONTEND
+// TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug - use 
dx11_clipping_03-09 failures to check for existence of bug
+static const float *dummy = reinterpret_cast<const float *>(pBase);
+#endif
+
 for (uint32_t c = 0; c < 4; ++c)
 {
+#if USE_SIMD16_FRONTEND
+simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), 
(const float *)pBase, vOffsets, vMask, 1);
+transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = 
_simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
+#else
 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = 
_simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, 
vMask, 1);
+#endif
 pBase += sizeof(simdscalar);
 }
 
@@ -408,7 +432,12 @@ public:
 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
 for (uint32_t c = 0; c < 4; ++c)
 {
+#if USE_SIMD16_FRONTEND
+simdscalar temp = 
_simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, 
vMask, 1);
+transposedPrims[0].attrib[attribSlot][c] = 
_simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
+#else
 transposedPrims[0].attrib[attribSlot][c] = 
_simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, 
vMask, 1);
+#endif
 pBase += sizeof(simdscalar);
 }
 }
@@ -419,7 +448,12 @@ public:
 pBase = 
(uint8_t*)([0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * 
inputPrim;
 for (uint32_t c = 0; c < 4; ++c)
 {
+#if USE_SIMD16_FRONTEND
+simdscalar temp = 
_simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, 
vMask, 1);
+transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] 
= _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
+#else
 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] 
= _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, 
vMask, 1);
+#endif
 pBase += sizeof(simdscalar);
 }
 }
@@ -429,7 +463,12 @@ public:
 pBase = 
(uint8_t*)([0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * 
inputPrim;
 for (uint32_t c = 0; c < 4; ++c)
 {
+#if USE_SIMD16_FRONTEND
+simdscalar temp = 
_simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, 
vMask, 1);
+transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] 
= _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
+#else
 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] 
= _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, 
vMask, 1);
+#endif
 pBase += 

Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP

2017-03-28 Thread Tim Rowley
Module: Mesa
Branch: master
Commit: aee5276375d79f5d73680d6038a1fd838894679a
URL:
http://cgit.freedesktop.org/mesa/mesa/commit/?id=aee5276375d79f5d73680d6038a1fd838894679a

Author: Tim Rowley 
Date:   Wed Mar 22 12:36:49 2017 -0500

swr: [rasterizer core] SIMD16 Frontend WIP

Implement widened clipper and binner interfaces for SIMD16.

Reviewed-by: George Kyriazis 

---

 src/gallium/drivers/swr/rasterizer/core/api.cpp|  24 
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 154 +
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   | 131 ++
 src/gallium/drivers/swr/rasterizer/core/clip.h |   6 +
 src/gallium/drivers/swr/rasterizer/core/context.h  |   3 +
 .../drivers/swr/rasterizer/core/frontend.cpp   | 115 +--
 src/gallium/drivers/swr/rasterizer/core/frontend.h |   7 +
 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp |  12 ++
 8 files changed, 371 insertions(+), 81 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index bd63796d13..dabd0616d3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -839,11 +839,18 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
 }
 
 PFN_PROCESS_PRIMS pfnBinner;
+#if USE_SIMD16_FRONTEND
+PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
+#endif
 switch (pState->state.topology)
 {
 case TOP_POINT_LIST:
 pState->pfnProcessPrims = ClipPoints;
 pfnBinner = BinPoints;
+#if USE_SIMD16_FRONTEND
+pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
+pfnBinner_simd16 = BinPoints_simd16;
+#endif
 break;
 case TOP_LINE_LIST:
 case TOP_LINE_STRIP:
@@ -852,10 +859,18 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
 case TOP_LISTSTRIP_ADJ:
 pState->pfnProcessPrims = ClipLines;
 pfnBinner = BinLines;
+#if USE_SIMD16_FRONTEND
+pState->pfnProcessPrims_simd16 = ClipLines_simd16;
+pfnBinner_simd16 = BinLines_simd16;
+#endif
 break;
 default:
 pState->pfnProcessPrims = ClipTriangles;
 pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
+#if USE_SIMD16_FRONTEND
+pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
+pfnBinner_simd16 = 
GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
+#endif
 break;
 };
 
@@ -864,6 +879,9 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
 if (pState->state.frontendState.vpTransformDisable)
 {
 pState->pfnProcessPrims = pfnBinner;
+#if USE_SIMD16_FRONTEND
+pState->pfnProcessPrims_simd16 = pfnBinner_simd16;
+#endif
 }
 
 if ((pState->state.psState.pfnPixelShader == nullptr) &&
@@ -874,11 +892,17 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
 (pState->state.backendState.numAttributes == 0))
 {
 pState->pfnProcessPrims = nullptr;
+#if USE_SIMD16_FRONTEND
+pState->pfnProcessPrims_simd16 = nullptr;
+#endif
 }
 
 if (pState->state.soState.rasterizerDisable == true)
 {
 pState->pfnProcessPrims = nullptr;
+#if USE_SIMD16_FRONTEND
+pState->pfnProcessPrims_simd16 = nullptr;
+#endif
 }
 
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 490a86804f..63eab33ac0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -856,6 +856,58 @@ endBinTriangles:
 AR_END(FEBinTriangles, 1);
 }
 
+#if USE_SIMD16_FRONTEND
+inline uint32_t GetPrimMaskLo(uint32_t primMask)
+{
+return primMask & 255;
+}
+
+inline uint32_t GetPrimMaskHi(uint32_t primMask)
+{
+return (primMask >> 8) & 255;
+}
+
+template 
+void BinTriangles_simd16(
+DRAW_CONTEXT *pDC,
+PA_STATE& pa,
+uint32_t workerId,
+simd16vector tri[3],
+uint32_t triMask,
+simd16scalari primID,
+simd16scalari viewportIdx)
+{
+enum { VERTS_PER_PRIM = 3 };
+
+simdvector verts[VERTS_PER_PRIM];
+
+for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+{
+for (uint32_t j = 0; j < 4; j += 1)
+{
+verts[i][j] = _simd16_extract_ps(tri[i][j], 0);
+}
+}
+
+pa.useAlternateOffset = false;
+BinTriangles(pDC, pa, workerId, verts, GetPrimMaskLo(triMask), 
_simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0));
+
+if (GetPrimMaskHi(triMask))
+{
+for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+{
+for (uint32_t j = 0; j < 4; j += 1)
+{
+verts[i][j] = _simd16_extract_ps(tri[i][j], 1);
+}
+}
+
+pa.useAlternateOffset = true;
+BinTriangles(pDC, pa, workerId, verts, GetPrimMaskHi(triMask), 
_simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1));
+}
+}
+
+#endif
 struct