Reviewed-by: Bruce Cherniak <bruce.chern...@intel.com> 

> On Dec 12, 2016, at 1:30 PM, Rowley, Timothy O <timothy.o.row...@intel.com> wrote:
> 
> Fixes to 128-bit formats.
> ---
> src/gallium/drivers/swr/rasterizer/core/utils.h    |   8 +-
> .../drivers/swr/rasterizer/memory/StoreTile.h      | 352 ++++++++-------------
> 2 files changed, 138 insertions(+), 222 deletions(-)
> 
> diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
> index 8f96864..a236575 100644
> --- a/src/gallium/drivers/swr/rasterizer/core/utils.h
> +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
> @@ -421,8 +421,8 @@ struct Transpose32_32_32_32
> 
>         vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), 
> _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), 
> _simd16_extract_ps(src3, 1));
> 
> -        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, 
> reinterpret_cast<simd16scalar *>(vDst)[2]);
> -        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, 
> reinterpret_cast<simd16scalar *>(vDst)[3]);
> +        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, 
> reinterpret_cast<simd16scalar *>(vDst)[0]);
> +        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, 
> reinterpret_cast<simd16scalar *>(vDst)[1]);
>     }
> #endif
> };
> @@ -474,8 +474,8 @@ struct Transpose32_32_32
> 
>         vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), 
> _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1));
> 
> -        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, 
> reinterpret_cast<simd16scalar *>(vDst)[2]);
> -        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, 
> reinterpret_cast<simd16scalar *>(vDst)[3]);
> +        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, 
> reinterpret_cast<simd16scalar *>(vDst)[0]);
> +        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, 
> reinterpret_cast<simd16scalar *>(vDst)[1]);
>     }
> #endif
> };
> diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
> index 4fa6683..473ebae 100644
> --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
> +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
> @@ -899,8 +899,8 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* 
> pSrc, uint8_t* pDst, uint8_
> 
> #if KNOB_ARCH == KNOB_ARCH_AVX
> 
> -                                              // splitting into two sets of 
> 4 wide integer vector types
> -                                              // because AVX doesn't have 
> instructions to support this operation at 8 wide
> +    // splitting into two sets of 4 wide integer vector types
> +    // because AVX doesn't have instructions to support this operation at 8 
> wide
>     __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
>     __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
>     __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
> @@ -921,7 +921,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* 
> pSrc, uint8_t* pDst, uint8_
>     srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
>     srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
> 
> -                                           // unpack into rows that get the 
> tiling order correct
> +    // unpack into rows that get the tiling order correct
>     __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // 
> 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
>     __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
> 
> @@ -1169,8 +1169,8 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 
> 8>, SrcFormat, DstFormat>
>         // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> @@ -1256,8 +1256,8 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 
> 16>, SrcFormat, DstFormat
>         // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> @@ -1343,8 +1343,8 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 
> 32>, SrcFormat, DstFormat
>         // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> @@ -1410,7 +1410,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 
> 32>, SrcFormat, DstFormat
> //////////////////////////////////////////////////////////////////////////
> /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
> //////////////////////////////////////////////////////////////////////////
> -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat >
> +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
> struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, 
> DstFormat>
> {
>     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, 
> DstFormat> GenericStoreTile;
> @@ -1435,8 +1435,8 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 
> 64>, SrcFormat, DstFormat
>         // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> @@ -1451,33 +1451,18 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat
>         // we have to break these large spans up, since 
> ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
>         static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column 
> offsets");
> 
> -#if 1
> -        uint8_t *ppDsts[8];
> -
> -        {
> -            for (uint32_t y = 0; y < 2; y += 1)
> -            {
> -                for (uint32_t x = 0; x < 4; x += 1)
> -                {
> -                    ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * 
> MAX_DST_COLUMN_BYTES;
> -                }
> -            }
> -        }
> -
> -#else
>         uint8_t *ppDsts[] =
>         {
> -            pDst,                                                   // row 
> 0, col 0
> -            pDst + pDstSurface->pitch,                              // row 
> 1, col 0
> -            pDst + MAX_DST_COLUMN_BYTES,                            // row 
> 0, col 1
> -            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,       // row 
> 1, col 1
> -            pDst + MAX_DST_COLUMN_BYTES * 2,                        // row 
> 0, col 2
> -            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,   // row 
> 1, col 2
> -            pDst + MAX_DST_COLUMN_BYTES * 3,                        // row 
> 0, col 3
> -            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3    // row 
> 1, col 3
> +            pDst,                                                            
>    // row 0, col 0
> +            pDst + pDstSurface->pitch,                                       
>    // row 1, col 0
> +            pDst + MAX_DST_COLUMN_BYTES,                                     
>    // row 0, col 1
> +            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                
>    // row 1, col 1
> +            pDst + MAX_DST_COLUMN_BYTES * 2,                                 
>    // row 0, col 2
> +            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,            
>    // row 1, col 2
> +            pDst + MAX_DST_COLUMN_BYTES * 3,                                 
>    // row 0, col 3
> +            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3             
>    // row 1, col 3
>         };
> 
> -#endif
>         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
>         {
>             // Raster tile width is same as simd16 tile width
> @@ -1560,8 +1545,8 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 
> 128>, SrcFormat, DstForma
>         // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> @@ -1571,75 +1556,36 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstForma
> #if USE_8x2_TILE_BACKEND
> 
>         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
> -        const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * pDstSurface->pitch; // 
> double up on tile y dim, one simd16 tile will do twice the rows
> +        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
> 
>         // we have to break these large spans up, since 
> ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
>         static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column 
> offsets");
> 
> -#if 1
> -        uint8_t *ppDsts[16];
> -
> -        {
> -            for (uint32_t y = 0; y < 2; y += 1)
> -            {
> -                for (uint32_t x = 0; x < 4; x += 1)
> -                {
> -                    ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * 
> pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
> -                    ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * 
> pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
> -                }
> -            }
> -        }
> -
> -#else
>         uint8_t* ppDsts[] =
>         {
> -            pDst,                                                       // 
> row 0, col 0
> -            pDst + pDstSurface->pitch,                                  // 
> row 1, col 0
> -            pDst + MAX_DST_COLUMN_BYTES,                                // 
> row 0, col 1
> -            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,           // 
> row 1, col 1
> -            pDst + MAX_DST_COLUMN_BYTES * 2,                            // 
> row 0, col 2
> -            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,       // 
> row 1, col 2
> -            pDst + MAX_DST_COLUMN_BYTES * 3,                            // 
> row 0, col 3
> -            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3,       // 
> row 1, col 3
> -
> -            pDst + pDstSurface->pitch * 2,                              // 
> row 2, col 0
> -            pDst + pDstSurface->pitch * 3,                              // 
> row 3, col 0
> -            pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES,       // 
> row 2, col 1
> -            pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES,       // 
> row 3, col 1
> -            pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 2,   // 
> row 2, col 2
> -            pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 2,   // 
> row 3, col 2
> -            pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 3,   // 
> row 2, col 3
> -            pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 3    // 
> row 3, col 3
> +            pDst,                                                            
>    // row 0, col 0
> +            pDst + pDstSurface->pitch,                                       
>    // row 1, col 0
> +            pDst + MAX_DST_COLUMN_BYTES,                                     
>    // row 0, col 1
> +            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                
>    // row 1, col 1
> +            pDst + MAX_DST_COLUMN_BYTES * 2,                                 
>    // row 0, col 2
> +            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,            
>    // row 1, col 2
> +            pDst + MAX_DST_COLUMN_BYTES * 3,                                 
>    // row 0, col 3
> +            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3,            
>    // row 1, col 3
> +            pDst + MAX_DST_COLUMN_BYTES * 4,                                 
>    // row 0, col 4
> +            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4,            
>    // row 1, col 4
> +            pDst + MAX_DST_COLUMN_BYTES * 5,                                 
>    // row 0, col 5
> +            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5,            
>    // row 1, col 5
> +            pDst + MAX_DST_COLUMN_BYTES * 6,                                 
>    // row 0, col 6
> +            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6,            
>    // row 1, col 6
> +            pDst + MAX_DST_COLUMN_BYTES * 7,                                 
>    // row 0, col 7
> +            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7,            
>    // row 1, col 7
>         };
> 
> -#endif
> -#if 1
> -        // Raster tile height is quadruple simd16 tile height
> -        static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid 
> tile y dim");
> -
> -        // Raster tile width is same as simd16 tile width
> -        static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x 
> dim");
> -
> -        // tile rows 0 thru 3
> -        ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
> -
> -        pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
> -
> -        for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
> -        {
> -            ppDsts[i] += dy;
> -        }
> -
> -        // tile rows 4 thru 7
> -        ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
> -
> -#else
> -        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM 
> * 2)
> +        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
>         {
>             // Raster tile width is same as simd16 tile width
> -            static_assert(KNOB_TILE_X_DIM * 2 == SIMD16_TILE_X_DIM, "Invalid 
> tile x dim");
> +            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid 
> tile x dim");
> 
> -            // Format conversion, convert from SOA to AOS, and store
>             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, 
> ppDsts);
> 
>             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
> @@ -1649,8 +1595,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 
> 128>, SrcFormat, DstForma
>                 ppDsts[i] += dy;
>             }
>         }
> -
> -#endif
> #else
>         struct DstPtrs
>         {
> @@ -1723,21 +1667,22 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, Dst
>         // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> 
>         // TileY is a column-major tiling mode where each 4KB tile consist of 
> 8 columns of 32 x 16B rows.
>         // We can compute the offsets to each column within the raster tile 
> once and increment from these.
> -        // There will be 2 x 4-wide columns in an 8x8 raster tile.
> #if USE_8x2_TILE_BACKEND
> +        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
>         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, 
> pDstSurface->arrayIndex + renderTargetArrayIndex,
>             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, 
> pDstSurface->lod, pDstSurface);
> 
>         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
> 
> +        // The Hot Tile uses a row-major tiling mode and has a larger memory 
> footprint. So we iterate in a row-major pattern.
>         uint8_t *ppDsts[] =
>         {
>             pDst,
> @@ -1746,9 +1691,11 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, Dst
>             pDst + DestRowWidthBytes + DestRowWidthBytes / 4
>         };
> 
> -        // The Hot Tile uses a row-major tiling mode and has a larger memory 
> footprint. So we iterate in a row-major pattern.
>         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
>         {
> +            // Raster tile width is same as simd16 tile width
> +            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid 
> tile x dim");
> +
>             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, 
> ppDsts);
> 
>             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
> @@ -1759,6 +1706,7 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, Dst
>             ppDsts[3] += dy;
>         }
> #else
> +        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
>         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, 
> pDstSurface->arrayIndex + renderTargetArrayIndex,
>             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, 
> pDstSurface->lod, pDstSurface);
> 
> @@ -1810,21 +1758,22 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, Ds
>         // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> 
>         // TileY is a column-major tiling mode where each 4KB tile consist of 
> 8 columns of 32 x 16B rows.
>         // We can compute the offsets to each column within the raster tile 
> once and increment from these.
> -        // There will be 2 x 4-wide columns in an 8x8 raster tile.
> #if USE_8x2_TILE_BACKEND
> +        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
>         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, 
> pDstSurface->arrayIndex + renderTargetArrayIndex,
>             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, 
> pDstSurface->lod, pDstSurface);
> 
>         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
> 
> +        // The Hot Tile uses a row-major tiling mode and has a larger memory 
> footprint. So we iterate in a row-major pattern.
>         uint8_t *ppDsts[] =
>         {
>             pDst,
> @@ -1833,9 +1782,11 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, Ds
>             pDst + DestRowWidthBytes + DestRowWidthBytes / 2
>         };
> 
> -        // The Hot Tile uses a row-major tiling mode and has a larger memory 
> footprint. So we iterate in a row-major pattern.
>         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
>         {
> +            // Raster tile width is same as simd16 tile width
> +            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid 
> tile x dim");
> +
>             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, 
> ppDsts);
> 
>             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
> @@ -1846,6 +1797,7 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, Ds
>             ppDsts[3] += dy;
>         }
> #else
> +        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
>         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, 
> pDstSurface->arrayIndex + renderTargetArrayIndex,
>             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, 
> pDstSurface->lod, pDstSurface);
> 
> @@ -1895,11 +1847,11 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, Ds
>     {
>         static const uint32_t DestRowWidthBytes = 512;                   // 
> 512B rows
> 
> -                                                                     // Punt 
> non-full tiles to generic store
> +        // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> @@ -1990,32 +1942,36 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, Ds
>         // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> 
>         // TileY is a column-major tiling mode where each 4KB tile consist of 
> 8 columns of 32 x 16B rows.
>         // We can compute the offsets to each column within the raster tile 
> once and increment from these.
> -        // There will be 2 x 4-wide columns in an 8x8 raster tile.
> #if USE_8x2_TILE_BACKEND
> +        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
>         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, 
> pDstSurface->arrayIndex + renderTargetArrayIndex,
>             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, 
> pDstSurface->lod, pDstSurface);
> 
> +        // we have to break these large spans up, since 
> ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
>         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
> 
> -        uint8_t *ppDsts[] = 
> +        // The Hot Tile uses a row-major tiling mode and has a larger memory 
> footprint. So we iterate in a row-major pattern.
> +        uint8_t *ppDsts[] =
>         {
> -            pDst,
> -            pDst + DestRowWidthBytes,
> -            pDst + DestColumnBytes,
> -            pDst + DestRowWidthBytes + DestColumnBytes
> +            pDst,                                           // row 0, col 0
> +            pDst + DestRowWidthBytes,                       // row 1, col 0
> +            pDst + DestColumnBytes,                         // row 0, col 1
> +            pDst + DestRowWidthBytes + DestColumnBytes      // row 1, col 1
>         };
> 
> -        // The Hot Tile uses a row-major tiling mode and has a larger memory 
> footprint. So we iterate in a row-major pattern.
>         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
>         {
> +            // Raster tile width is same as simd16 tile width
> +            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid 
> tile x dim");
> +
>             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, 
> ppDsts);
> 
>             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
> @@ -2026,6 +1982,7 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, Ds
>             ppDsts[3] += dy;
>         }
> #else
> +        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
>         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, 
> pDstSurface->arrayIndex + renderTargetArrayIndex,
>             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, 
> pDstSurface->lod, pDstSurface);
> 
> @@ -2078,52 +2035,40 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, Ds
>         // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> 
>         // TileY is a column-major tiling mode where each 4KB tile consist of 
> 8 columns of 32 x 16B rows.
>         // We can compute the offsets to each column within the raster tile 
> once and increment from these.
> -        // There will be 2 x 4-wide columns in an 8x8 raster tile.
> #if USE_8x2_TILE_BACKEND
> +        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
>         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, 
> pDstSurface->arrayIndex + renderTargetArrayIndex,
>             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, 
> pDstSurface->lod, pDstSurface);
> 
> -        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
> -
> -#if 1
>         // we have to break these large spans up, since 
> ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
> -        uint8_t *ppDsts[8];
> -
> -        {
> -            for (uint32_t y = 0; y < 2; y += 1)
> -            {
> -                for (uint32_t x = 0; x < 4; x += 1)
> -                {
> -                    ppDsts[x * 2 + y] = pDst + y * DestRowWidthBytes + x * 
> DestColumnBytes;
> -                }
> -            }
> -        }
> +        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
> 
> -#else
> +        // The Hot Tile uses a row-major tiling mode and has a larger memory 
> footprint. So we iterate in a row-major pattern.
>         uint8_t *ppDsts[] =
>         {
> -            pDst,
> -            pDst + DestRowWidthBytes,
> -            pDst + DestColumnBytes,
> -            pDst + DestRowWidthBytes + DestColumnBytes,
> -            pDst + DestColumnBytes * 2,
> -            pDst + DestRowWidthBytes + DestColumnBytes * 2,
> -            pDst + DestColumnBytes * 3,
> -            pDst + DestRowWidthBytes + DestColumnBytes * 3
> +            pDst,                                           // row 0, col 0
> +            pDst + DestRowWidthBytes,                       // row 1, col 0
> +            pDst + DestColumnBytes,                         // row 0, col 1
> +            pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
> +            pDst + DestColumnBytes * 2,                     // row 0, col 2
> +            pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
> +            pDst + DestColumnBytes * 3,                     // row 0, col 3
> +            pDst + DestRowWidthBytes + DestColumnBytes * 3  // row 1, col 3
>         };
> 
> -#endif
> -        // The Hot Tile uses a row-major tiling mode and has a larger memory 
> footprint. So we iterate in a row-major pattern.
>         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
>         {
> +            // Raster tile width is same as simd16 tile width
> +            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid 
> tile x dim");
> +
>             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, 
> ppDsts);
> 
>             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
> @@ -2134,6 +2079,7 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, Ds
>             }
>         }
> #else
> +        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
>         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, 
> pDstSurface->arrayIndex + renderTargetArrayIndex,
>             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, 
> pDstSurface->lod, pDstSurface);
>         uint8_t* pCol1 = pCol0 + DestColumnBytes;
> @@ -2210,79 +2156,48 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D
>         // Punt non-full tiles to generic store
>         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 
> 1U);
>         uint32_t lodHeight = std::max(pDstSurface->height >> 
> pDstSurface->lod, 1U);
> -        if (x + KNOB_TILE_X_DIM > lodWidth ||
> -            y + KNOB_TILE_Y_DIM > lodHeight)
> +
> +        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > 
> lodHeight)
>         {
>             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, 
> sampleNum, renderTargetArrayIndex);
>         }
> 
> +        // TileY is a column-major tiling mode where each 4KB tile consist 
> of 8 columns of 32 x 16B rows.
> +        // We can compute the offsets to each column within the raster tile 
> once and increment from these.
> #if USE_8x2_TILE_BACKEND
> +        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
>         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, 
> pDstSurface->arrayIndex + renderTargetArrayIndex,
>             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, 
> pDstSurface->lod, pDstSurface);
> 
> -        const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * DestRowWidthBytes;  // 
> double up on tile y dim, one simd16 tile will do twice the rows
> -
> -#if 1
>         // we have to break these large spans up, since 
> ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
> -        uint8_t *ppDsts[16];
> -
> -        {
> -            for (uint32_t y = 0; y < 2; y += 1)
> -            {
> -                for (uint32_t x = 0; x < 4; x += 1)
> -                {
> -                    ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * 
> DestRowWidthBytes + x * DestColumnBytes;
> -                    ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * 
> DestRowWidthBytes + x * DestColumnBytes;
> -                }
> -            }
> -        }
> +        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
> 
> -#else
> +        // The Hot Tile uses a row-major tiling mode and has a larger memory 
> footprint. So we iterate in a row-major pattern.
>         uint8_t *ppDsts[] =
>         {
> -            pDst,
> -            pDst + DestRowWidthBytes,
> -            pDst + DestColumnBytes,
> -            pDst + DestRowWidthBytes + DestColumnBytes,
> -            pDst + DestColumnBytes * 2,
> -            pDst + DestRowWidthBytes + DestColumnBytes * 2,
> -            pDst + DestColumnBytes * 3,
> -            pDst + DestRowWidthBytes + DestColumnBytes * 3,
> -
> -            pDst + DestRowWidthBytes * 2,
> -            pDst + DestRowWidthBytes * 3,
> -            pDst + DestRowWidthBytes * 2 + DestColumnBytes,
> -            pDst + DestRowWidthBytes * 3 + DestColumnBytes,
> -            pDst + DestRowWidthBytes * 2 + DestColumnBytes * 2,
> -            pDst + DestRowWidthBytes * 3 + DestColumnBytes * 2,
> -            pDst + DestRowWidthBytes * 2 + DestColumnBytes * 3,
> -            pDst + DestRowWidthBytes * 3 + DestColumnBytes * 3
> -    };
> -
> -#endif
> -#if 1
> -        // Raster tile height is quadruple simd16 tile height
> -        static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid 
> tile y dim");
> -
> -        // Raster tile width is same as simd16 tile width
> -        static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x 
> dim");
> -
> -        // tile rows 0 thru 3
> -        ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
> -
> -        pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
> +            pDst,                                           // row 0, col 0
> +            pDst + DestRowWidthBytes,                       // row 1, col 0
> +            pDst + DestColumnBytes,                         // row 0, col 1
> +            pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
> +            pDst + DestColumnBytes * 2,                     // row 0, col 2
> +            pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
> +            pDst + DestColumnBytes * 3,                     // row 0, col 3
> +            pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
> +            pDst + DestColumnBytes * 4,                     // row 0, col 4
> +            pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
> +            pDst + DestColumnBytes * 5,                     // row 0, col 5
> +            pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
> +            pDst + DestColumnBytes * 6,                     // row 0, col 6
> +            pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
> +            pDst + DestColumnBytes * 7,                     // row 0, col 7
> +            pDst + DestRowWidthBytes + DestColumnBytes * 7  // row 1, col 7
> +        };
> 
> -        for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
> +        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
>         {
> -            ppDsts[i] += dy;
> -        }
> +            // Raster tile width is same as simd16 tile width
> +            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid 
> tile x dim");
> 
> -        // tile rows 4 thru 7
> -        ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
> -#else
> -        // The Hot Tile uses a row-major tiling mode and has a larger memory 
> footprint. So we iterate in a row-major pattern.
> -        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM 
> * 2)
> -        {
>             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, 
> ppDsts);
> 
>             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
> @@ -2292,8 +2207,8 @@ struct OptStoreRasterTile< 
> TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D
>                 ppDsts[i] += dy;
>             }
>         }
> -#endif
> #else
> +        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
>         uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, 
> pDstSurface->arrayIndex + renderTargetArrayIndex,
>             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, 
> pDstSurface->lod, pDstSurface);
>         struct DstPtrs
> @@ -2389,22 +2304,23 @@ struct StoreMacroTile
>         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
>     {
>         PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
> +
>         for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; 
> sampleNum++)
>         {
> -           size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, 
> false>(
> -              0,
> -              0,
> -              pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D 
> surfaces
> -              pDstSurface->arrayIndex + renderTargetArrayIndex, // array 
> index for 2D arrays
> -              sampleNum,
> -              pDstSurface->lod,
> -              pDstSurface);
> -
> -           // Only support generic store-tile if lod surface doesn't start 
> on a page boundary and is non-linear
> -           bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && 
> (0 != (dstSurfAddress & 0xfff))) ||
> -              (pDstSurface->bInterleavedSamples);
> -
> -           pfnStore[sampleNum] = (bForceGeneric || 
> KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, 
> DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
> +            size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, 
> false>(
> +                0,
> +                0,
> +                pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 
> 3D surfaces
> +                pDstSurface->arrayIndex + renderTargetArrayIndex, // array 
> index for 2D arrays
> +                sampleNum,
> +                pDstSurface->lod,
> +                pDstSurface);
> +
> +            // Only support generic store-tile if lod surface doesn't start 
> on a page boundary and is non-linear
> +            bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) 
> && (0 != (dstSurfAddress & 0xfff))) ||
> +                (pDstSurface->bInterleavedSamples);
> +
> +            pfnStore[sampleNum] = (bForceGeneric || 
> KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, 
> DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
>         }
> 
>         // Store each raster tile from the hot tile to the destination 
> surface.
> -- 
> 2.7.4
> 

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to