Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP - PA
Module: Mesa Branch: master Commit: c8cc07ca25914511830c2ce92ff1cd221f5afaae URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=c8cc07ca25914511830c2ce92ff1cd221f5afaae Author: Tim RowleyDate: Thu Apr 6 16:37:03 2017 -0500 swr: [rasterizer core] SIMD16 Frontend WIP - PA Fix PA NextPrim for SIMD8 on SIMD16. Reviewed-by: Bruce Cherniak --- src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 44 +++--- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp index 3e3b7abab5..6a249638ad 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp @@ -456,7 +456,7 @@ static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) PaPatchList, PaPatchListSingle, 0, -KNOB_SIMD_WIDTH, +PA_STATE_OPT::SIMD_WIDTH, true); return true; @@ -509,7 +509,7 @@ static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector PaPatchList, PaPatchListSingle, 0, -KNOB_SIMD16_WIDTH, +PA_STATE_OPT::SIMD_WIDTH, true); return true; @@ -736,7 +736,7 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } #endif -SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true); +SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; } @@ -783,7 +783,7 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) v2[i] = _simd16_permute_ps(temp2, perm2); } -SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD16_WIDTH, true); +SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; } @@ -1014,7 +1014,7 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2)); } -SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, 
KNOB_SIMD_WIDTH); +SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); return true; } @@ -1052,7 +1052,7 @@ bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0 } -SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD16_WIDTH); +SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); return true; } @@ -1288,7 +1288,7 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1)); } -SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH); +SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); return true; } @@ -1345,7 +1345,7 @@ bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) v1[i] = _simd16_shuffle_ps(b[i], v2[i], _MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 } -SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD16_WIDTH); +SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); return true; } @@ -1480,7 +1480,7 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2)); } -SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true); +SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; } @@ -1515,7 +1515,7 @@ bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2)); // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF } -SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD16_WIDTH, true); +SetNextPaState_simd16(pa, 
PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true); return true; } @@ -1735,7 +1735,7 @@ bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) } } -SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH); +SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH); return true; } @@ -1765,7 +1765,7 @@ bool
Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP - Clipper
Module: Mesa Branch: master Commit: 08a71368483f2e35b135ebe56ec5746cc94ac452 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=08a71368483f2e35b135ebe56ec5746cc94ac452 Author: Tim Rowley Date: Thu Apr 6 15:22:55 2017 -0500 swr: [rasterizer core] SIMD16 Frontend WIP - Clipper Implement widened clipper for SIMD16. Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/common/simd16intrin.h | 41 +- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 17 +- src/gallium/drivers/swr/rasterizer/core/clip.cpp | 91 +- src/gallium/drivers/swr/rasterizer/core/clip.h | 1027 ++-- src/gallium/drivers/swr/rasterizer/core/frontend.h | 29 +- 5 files changed, 1011 insertions(+), 194 deletions(-) Diff: http://cgit.freedesktop.org/mesa/mesa/diff/?id=08a71368483f2e35b135ebe56ec5746cc94ac452 ___ mesa-commit mailing list mesa-commit@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-commit
Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP
Module: Mesa Branch: master Commit: cd6c200223f7c6f5bac6bd2f2991bccf363fa7d9 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=cd6c200223f7c6f5bac6bd2f2991bccf363fa7d9 Author: Tim RowleyDate: Sun Mar 26 15:46:42 2017 -0500 swr: [rasterizer core] SIMD16 Frontend WIP Implement widened VS output for SIMD16 Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/core/frontend.cpp | 42 +- src/gallium/drivers/swr/rasterizer/core/state.h| 9 +++-- 2 files changed, 14 insertions(+), 37 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 9df7eeadc1..8cf234cd67 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1543,6 +1543,8 @@ void ProcessDraw( vsContext_lo.pVin = _lo; vsContext_hi.pVin = _hi; +vsContext_lo.AlternateOffset = 0; +vsContext_hi.AlternateOffset = 1; SWR_FETCH_CONTEXT fetchInfo_lo = { 0 }; @@ -1612,20 +1614,18 @@ void ProcessDraw( pvCutIndices_hi = _cast(())[1]; } -simdvertex vout_lo; -simdvertex vout_hi; - -vsContext_lo.pVout = _lo; -vsContext_hi.pVout = _hi; - simd16vertex = pa.GetNextVsOutput(); +vsContext_lo.pVout = reinterpret_cast(); +vsContext_hi.pVout = reinterpret_cast(); + if (i < endVertex) { // 1. Execute FS/VS for a single SIMD. 
AR_BEGIN(FEFetchShader, pDC->drawId); state.pfnFetchFunc(fetchInfo_lo, vin_lo); -if ((i + KNOB_SIMD_WIDTH) < endVertex) + +if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH { state.pfnFetchFunc(fetchInfo_hi, vin_hi); } @@ -1655,35 +1655,9 @@ void ProcessDraw( AR_BEGIN(FEVertexShader, pDC->drawId); state.pfnVertexFunc(GetPrivateState(pDC), _lo); -// copy SIMD vout_lo to lo part of SIMD16 vout -{ -const uint32_t attribCount = sizeof(vout.attrib) / sizeof(vout.attrib[0]); - -for (uint32_t i = 0; i < attribCount; i += 1) -{ -for (uint32_t j = 0; j < 4; j += 1) -{ -vout.attrib[i][j] = _simd16_insert_ps(_simd16_setzero_ps(), vout_lo.attrib[i][j], 0); -} -} -} - -if ((i + KNOB_SIMD_WIDTH) < endVertex) +if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH { state.pfnVertexFunc(GetPrivateState(pDC), _hi); - -// copy SIMD vout_hi to hi part of SIMD16 vout -{ -const uint32_t attribCount = sizeof(vout.attrib) / sizeof(vout.attrib[0]); - -for (uint32_t i = 0; i < attribCount; i += 1) -{ -for (uint32_t j = 0; j < 4; j += 1) -{ -vout.attrib[i][j] = _simd16_insert_ps(vout.attrib[i][j], vout_hi.attrib[i][j], 1); -} -} -} } AR_END(FEVertexShader, 0); diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index bb1336c429..623e70a151 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -201,9 +201,12 @@ struct SWR_VS_CONTEXT simdvertex* pVin; // IN: SIMD input vertex data store simdvertex* pVout; // OUT: SIMD output vertex data store -uint32_t InstanceID;// IN: Instance ID, constant across all verts of the SIMD -simdscalari VertexID; // IN: Vertex ID -simdscalari mask; // IN: Active mask for shader +uint32_t InstanceID;// IN: Instance ID, constant across all verts of the SIMD +simdscalari VertexID; // IN: Vertex ID +simdscalari mask; // IN: Active mask for shader +#if USE_SIMD16_FRONTEND +uint32_t AlternateOffset; // IN: amount to 
offset for interleaving even/odd simd8 in simd16vertex output +#endif }; / ___ mesa-commit mailing list mesa-commit@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-commit
Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP
Module: Mesa Branch: master Commit: d5157ddca4072856e0afce3d7af8929a7d387044 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=d5157ddca4072856e0afce3d7af8929a7d387044 Author: Tim Rowley Date: Wed Mar 29 12:58:18 2017 -0500 swr: [rasterizer core] SIMD16 Frontend WIP Implement widened binner for SIMD16 Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/common/simd16intrin.h | 44 +- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 1642 +--- src/gallium/drivers/swr/rasterizer/core/frontend.h | 98 ++ src/gallium/drivers/swr/rasterizer/core/utils.h | 10 + 4 files changed, 1509 insertions(+), 285 deletions(-) Diff: http://cgit.freedesktop.org/mesa/mesa/diff/?id=d5157ddca4072856e0afce3d7af8929a7d387044 ___ mesa-commit mailing list mesa-commit@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-commit
Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP
Module: Mesa Branch: master Commit: 549b9d2e9f1547af3fb061a7956b04fb30870a6d URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=549b9d2e9f1547af3fb061a7956b04fb30870a6d Author: Tim RowleyDate: Mon Mar 20 12:17:07 2017 -0500 swr: [rasterizer core] SIMD16 Frontend WIP Fix GS and streamout. Reviewed-by: George Kyriazis --- src/gallium/drivers/swr/rasterizer/core/clip.h | 61 ++ .../drivers/swr/rasterizer/core/frontend.cpp | 97 +- 2 files changed, 136 insertions(+), 22 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index eec65707e7..3a79d6a34c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -376,7 +376,16 @@ public: const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1); uint32_t numClippedPrims = 0; +#if USE_SIMD16_FRONTEND +const uint32_t numPrims = pa.NumPrims(); +const uint32_t numPrims_lo = std::min(numPrims, KNOB_SIMD_WIDTH); + +SWR_ASSERT(numPrims <= numPrims_lo); + +for (uint32_t inputPrim = 0; inputPrim < numPrims_lo; ++inputPrim) +#else for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim) +#endif { uint32_t numEmittedVerts = pVertexCount[inputPrim]; if (numEmittedVerts < NumVertsPerPrim) @@ -391,13 +400,28 @@ public: // tranpose clipper output so that each lane's vertices are in SIMD order // set aside space for 2 vertices, as the PA will try to read up to 16 verts // for triangle fan +#if USE_SIMD16_FRONTEND +simd16vertex transposedPrims[2]; +#else simdvertex transposedPrims[2]; +#endif // transpose pos uint8_t* pBase = (uint8_t*)([0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; + +#if USE_SIMD16_FRONTEND +// TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug - use dx11_clipping_03-09 failures to check for existence of bug +static const float *dummy = reinterpret_cast(pBase); +#endif + for (uint32_t c = 0; c < 4; ++c) { +#if USE_SIMD16_FRONTEND +simdscalar temp = 
_simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); +transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); +#else transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); +#endif pBase += sizeof(simdscalar); } @@ -408,7 +432,12 @@ public: uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib; for (uint32_t c = 0; c < 4; ++c) { +#if USE_SIMD16_FRONTEND +simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); +transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); +#else transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); +#endif pBase += sizeof(simdscalar); } } @@ -419,7 +448,12 @@ public: pBase = (uint8_t*)([0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim; for (uint32_t c = 0; c < 4; ++c) { +#if USE_SIMD16_FRONTEND +simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); +transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); +#else transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); +#endif pBase += sizeof(simdscalar); } } @@ -429,7 +463,12 @@ public: pBase = (uint8_t*)([0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim; for (uint32_t c = 0; c < 4; ++c) { +#if USE_SIMD16_FRONTEND +simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); +transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); +#else transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = 
_simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); +#endif pBase +=
Mesa (master): swr: [rasterizer core] SIMD16 Frontend WIP
Module: Mesa Branch: master Commit: aee5276375d79f5d73680d6038a1fd838894679a URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=aee5276375d79f5d73680d6038a1fd838894679a Author: Tim RowleyDate: Wed Mar 22 12:36:49 2017 -0500 swr: [rasterizer core] SIMD16 Frontend WIP Implement widened clipper and binner interfaces for SIMD16. Reviewed-by: George Kyriazis --- src/gallium/drivers/swr/rasterizer/core/api.cpp| 24 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 154 + src/gallium/drivers/swr/rasterizer/core/clip.cpp | 131 ++ src/gallium/drivers/swr/rasterizer/core/clip.h | 6 + src/gallium/drivers/swr/rasterizer/core/context.h | 3 + .../drivers/swr/rasterizer/core/frontend.cpp | 115 +-- src/gallium/drivers/swr/rasterizer/core/frontend.h | 7 + src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 12 ++ 8 files changed, 371 insertions(+), 81 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index bd63796d13..dabd0616d3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -839,11 +839,18 @@ void SetupPipeline(DRAW_CONTEXT *pDC) } PFN_PROCESS_PRIMS pfnBinner; +#if USE_SIMD16_FRONTEND +PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16; +#endif switch (pState->state.topology) { case TOP_POINT_LIST: pState->pfnProcessPrims = ClipPoints; pfnBinner = BinPoints; +#if USE_SIMD16_FRONTEND +pState->pfnProcessPrims_simd16 = ClipPoints_simd16; +pfnBinner_simd16 = BinPoints_simd16; +#endif break; case TOP_LINE_LIST: case TOP_LINE_STRIP: @@ -852,10 +859,18 @@ void SetupPipeline(DRAW_CONTEXT *pDC) case TOP_LISTSTRIP_ADJ: pState->pfnProcessPrims = ClipLines; pfnBinner = BinLines; +#if USE_SIMD16_FRONTEND +pState->pfnProcessPrims_simd16 = ClipLines_simd16; +pfnBinner_simd16 = BinLines_simd16; +#endif break; default: pState->pfnProcessPrims = ClipTriangles; pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0)); +#if USE_SIMD16_FRONTEND 
+pState->pfnProcessPrims_simd16 = ClipTriangles_simd16; +pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0)); +#endif break; }; @@ -864,6 +879,9 @@ void SetupPipeline(DRAW_CONTEXT *pDC) if (pState->state.frontendState.vpTransformDisable) { pState->pfnProcessPrims = pfnBinner; +#if USE_SIMD16_FRONTEND +pState->pfnProcessPrims_simd16 = pfnBinner_simd16; +#endif } if ((pState->state.psState.pfnPixelShader == nullptr) && @@ -874,11 +892,17 @@ void SetupPipeline(DRAW_CONTEXT *pDC) (pState->state.backendState.numAttributes == 0)) { pState->pfnProcessPrims = nullptr; +#if USE_SIMD16_FRONTEND +pState->pfnProcessPrims_simd16 = nullptr; +#endif } if (pState->state.soState.rasterizerDisable == true) { pState->pfnProcessPrims = nullptr; +#if USE_SIMD16_FRONTEND +pState->pfnProcessPrims_simd16 = nullptr; +#endif } diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 490a86804f..63eab33ac0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -856,6 +856,58 @@ endBinTriangles: AR_END(FEBinTriangles, 1); } +#if USE_SIMD16_FRONTEND +inline uint32_t GetPrimMaskLo(uint32_t primMask) +{ +return primMask & 255; +} + +inline uint32_t GetPrimMaskHi(uint32_t primMask) +{ +return (primMask >> 8) & 255; +} + +template +void BinTriangles_simd16( +DRAW_CONTEXT *pDC, +PA_STATE& pa, +uint32_t workerId, +simd16vector tri[3], +uint32_t triMask, +simd16scalari primID, +simd16scalari viewportIdx) +{ +enum { VERTS_PER_PRIM = 3 }; + +simdvector verts[VERTS_PER_PRIM]; + +for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) +{ +for (uint32_t j = 0; j < 4; j += 1) +{ +verts[i][j] = _simd16_extract_ps(tri[i][j], 0); +} +} + +pa.useAlternateOffset = false; +BinTriangles(pDC, pa, workerId, verts, GetPrimMaskLo(triMask), _simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0)); + +if (GetPrimMaskHi(triMask)) +{ +for (uint32_t i = 0; i 
< VERTS_PER_PRIM; i += 1) +{ +for (uint32_t j = 0; j < 4; j += 1) +{ +verts[i][j] = _simd16_extract_ps(tri[i][j], 1); +} +} + +pa.useAlternateOffset = true; +BinTriangles(pDC, pa, workerId, verts, GetPrimMaskHi(triMask), _simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1)); +} +} + +#endif struct