At 2019-09-23 12:50:22, "Akil" <[email protected]> wrote:
# HG changeset patch # User Akil Ayyappan<[email protected]> # Date 1568370446 -19800 # Fri Sep 13 15:57:26 2019 +0530 # Node ID 531f6b03eed0a40a38d3589dec03f14743293146 # Parent c4b098f973e6b0ee4aee3bf0d7b54da4e2734d42 Adaptive Frame duplication + uint32_t y = 0; + + /* Consume rows in ever narrower chunks of height */ + for (int size = BLOCK_64x64; size >= BLOCK_4x4 && y < height; size--) + { + uint32_t rowHeight = 1 << (size + 2); + + for (; y + rowHeight <= height; y += rowHeight) + { + uint32_t y1, x = 0; + + /* Consume each row using the largest square blocks possible */ + if (size == BLOCK_64x64 && !(stride & 31)) + for (; x + 64 <= width; x += 64) + ssd += primitives.cu[BLOCK_64x64].sse_pp(fenc + x, stride, rec + x, stride); + + if (size >= BLOCK_32x32 && !(stride & 15)) + for (; x + 32 <= width; x += 32) + for (y1 = 0; y1 + 32 <= rowHeight; y1 += 32) + ssd += primitives.cu[BLOCK_32x32].sse_pp(fenc + y1 * stride + x, stride, rec + y1 * stride + x, stride); + + if (size >= BLOCK_16x16) + for (; x + 16 <= width; x += 16) + for (y1 = 0; y1 + 16 <= rowHeight; y1 += 16) + ssd += primitives.cu[BLOCK_16x16].sse_pp(fenc + y1 * stride + x, stride, rec + y1 * stride + x, stride); + + if (size >= BLOCK_8x8) + for (; x + 8 <= width; x += 8) + for (y1 = 0; y1 + 8 <= rowHeight; y1 += 8) + ssd += primitives.cu[BLOCK_8x8].sse_pp(fenc + y1 * stride + x, stride, rec + y1 * stride + x, stride); + + for (; x + 4 <= width; x += 4) + for (y1 = 0; y1 + 4 <= rowHeight; y1 += 4) + ssd += primitives.cu[BLOCK_4x4].sse_pp(fenc + y1 * stride + x, stride, rec + y1 * stride + x, stride); + + fenc += stride * rowHeight; + rec += stride * rowHeight; + } + } + + return ssd; +} You try to process blocks that are as big as possible; however, this coding style is less readable. I suggest putting this trick in the optimized version rather than inside the C model.
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
