- Revision
- 138936
- Author
- rga...@webkit.org
- Date
- 2013-01-07 06:31:32 -0800 (Mon, 07 Jan 2013)
Log Message
Optimizing RGBA16, RGB16, ARGB16, BGRA16 unpacking functions with NEON intrinsics
https://bugs.webkit.org/show_bug.cgi?id=103614
Reviewed by Zoltan Herczeg.
Optimizing RGBA16, RGB16, ARGB16, BGRA16 unpacking functions in GraphicsContext3D
with ARM NEON intrinsics. The optimized functions are 2-3x faster than the origin.
* platform/graphics/GraphicsContext3D.cpp:
(WebCore):
* platform/graphics/cpu/arm/GraphicsContext3DNEON.h:
(WebCore::SIMD::unpackOneRowOfRGBA16LittleToRGBA8):
(SIMD):
(WebCore::SIMD::unpackOneRowOfRGB16LittleToRGBA8):
(WebCore::SIMD::unpackOneRowOfARGB16LittleToRGBA8):
(WebCore::SIMD::unpackOneRowOfBGRA16LittleToRGBA8):
Modified Paths
Diff
Modified: trunk/Source/WebCore/ChangeLog (138935 => 138936)
--- trunk/Source/WebCore/ChangeLog 2013-01-07 13:26:13 UTC (rev 138935)
+++ trunk/Source/WebCore/ChangeLog 2013-01-07 14:31:32 UTC (rev 138936)
@@ -1,3 +1,22 @@
+2013-01-07 Gabor Rapcsanyi <rga...@webkit.org>
+
+ Optimizing RGBA16, RGB16, ARGB16, BGRA16 unpacking functions with NEON intrinsics
+ https://bugs.webkit.org/show_bug.cgi?id=103614
+
+ Reviewed by Zoltan Herczeg.
+
+ Optimizing RGBA16, RGB16, ARGB16, BGRA16 unpacking functions in GraphicsContext3D
+ with ARM NEON intrinsics. The optimized functions are 2-3x faster than the origin.
+
+ * platform/graphics/GraphicsContext3D.cpp:
+ (WebCore):
+ * platform/graphics/cpu/arm/GraphicsContext3DNEON.h:
+ (WebCore::SIMD::unpackOneRowOfRGBA16LittleToRGBA8):
+ (SIMD):
+ (WebCore::SIMD::unpackOneRowOfRGB16LittleToRGBA8):
+ (WebCore::SIMD::unpackOneRowOfARGB16LittleToRGBA8):
+ (WebCore::SIMD::unpackOneRowOfBGRA16LittleToRGBA8):
+
2013-01-07 Christophe Dumez <christophe.du...@intel.com>
Regression(r138786): Causes webaudio tests to crash
Modified: trunk/Source/WebCore/platform/graphics/GraphicsContext3D.cpp (138935 => 138936)
--- trunk/Source/WebCore/platform/graphics/GraphicsContext3D.cpp 2013-01-07 13:26:13 UTC (rev 138935)
+++ trunk/Source/WebCore/platform/graphics/GraphicsContext3D.cpp 2013-01-07 14:31:32 UTC (rev 138936)
@@ -392,6 +392,9 @@
void unpackOneRowOfRGBA16LittleToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
{
+#if HAVE(ARM_NEON_INTRINSICS)
+ SIMD::unpackOneRowOfRGBA16LittleToRGBA8(source, destination, pixelsPerRow);
+#endif
for (unsigned int i = 0; i < pixelsPerRow; ++i) {
destination[0] = convertColor16LittleTo8(source[0]);
destination[1] = convertColor16LittleTo8(source[1]);
@@ -428,6 +431,9 @@
void unpackOneRowOfRGB16LittleToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
{
+#if HAVE(ARM_NEON_INTRINSICS)
+ SIMD::unpackOneRowOfRGB16LittleToRGBA8(source, destination, pixelsPerRow);
+#endif
for (unsigned int i = 0; i < pixelsPerRow; ++i) {
destination[0] = convertColor16LittleTo8(source[0]);
destination[1] = convertColor16LittleTo8(source[1]);
@@ -476,6 +482,9 @@
void unpackOneRowOfARGB16LittleToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
{
+#if HAVE(ARM_NEON_INTRINSICS)
+ SIMD::unpackOneRowOfARGB16LittleToRGBA8(source, destination, pixelsPerRow);
+#endif
for (unsigned int i = 0; i < pixelsPerRow; ++i) {
destination[0] = convertColor16LittleTo8(source[1]);
destination[1] = convertColor16LittleTo8(source[2]);
@@ -530,6 +539,9 @@
void unpackOneRowOfBGRA16LittleToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
{
+#if HAVE(ARM_NEON_INTRINSICS)
+ SIMD::unpackOneRowOfBGRA16LittleToRGBA8(source, destination, pixelsPerRow);
+#endif
for (unsigned int i = 0; i < pixelsPerRow; ++i) {
destination[0] = convertColor16LittleTo8(source[2]);
destination[1] = convertColor16LittleTo8(source[1]);
Modified: trunk/Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h (138935 => 138936)
--- trunk/Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h 2013-01-07 13:26:13 UTC (rev 138935)
+++ trunk/Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h 2013-01-07 14:31:32 UTC (rev 138936)
@@ -34,6 +34,86 @@
namespace SIMD {
+// Converts whole groups of 4 pixels (16 components) of 16-bit little-endian RGBA to 8-bit RGBA.
+// Every output byte is the high byte of the matching 16-bit source component.
+// On return, source and destination have been advanced past the converted data and
+// pixelsPerRow holds the number of leftover pixels (fewer than 4) that were not converted.
+ALWAYS_INLINE void unpackOneRowOfRGBA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
+{
+    unsigned componentsPerRow = pixelsPerRow * 4;
+    unsigned tailComponents = componentsPerRow % 16;
+    unsigned componentsSize = componentsPerRow - tailComponents;
+    // View the 16-bit components as raw bytes so each can be split into its two halves.
+    const uint8_t* src = reinterpret_cast<const uint8_t*>(source);
+
+    for (unsigned i = 0; i < componentsSize; i += 16) {
+        // vld2q_u8 de-interleaves 32 bytes: val[0] receives the even-indexed bytes and
+        // val[1] the odd-indexed ones, i.e. the high bytes of little-endian 16-bit values.
+        uint8x16x2_t components = vld2q_u8(src + i * 2);
+        vst1q_u8(destination + i, components.val[1]);
+    }
+
+    source += componentsSize;
+    destination += componentsSize;
+    pixelsPerRow = tailComponents / 4;
+}
+
+// Converts whole groups of 8 pixels (24 components) of 16-bit RGB to 8-bit RGBA.
+// Each colour channel keeps the upper 8 bits of its 16-bit value; alpha is set to 0xFF.
+// On return, source and destination have been advanced past the converted data and
+// pixelsPerRow holds the number of leftover pixels (fewer than 8) that were not converted.
+ALWAYS_INLINE void unpackOneRowOfRGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
+{
+    unsigned componentsPerRow = pixelsPerRow * 3;
+    unsigned tailComponents = componentsPerRow % 24;
+    unsigned componentsSize = componentsPerRow - tailComponents;
+
+    uint8x8_t componentA = vdup_n_u8(0xFF);
+    for (unsigned i = 0; i < componentsSize; i += 24) {
+        uint16x8x3_t RGB16 = vld3q_u16(source + i);
+        // A narrowing shift by 8 yields the high byte of every lane in one instruction.
+        uint8x8_t componentR = vshrn_n_u16(RGB16.val[0], 8);
+        uint8x8_t componentG = vshrn_n_u16(RGB16.val[1], 8);
+        uint8x8_t componentB = vshrn_n_u16(RGB16.val[2], 8);
+        uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
+        vst4_u8(destination, RGBA8);
+        // 8 pixels * 4 output bytes; the output stride differs from the 24-component input stride.
+        destination += 32;
+    }
+
+    source += componentsSize;
+    pixelsPerRow = tailComponents / 3;
+}
+
+// Converts whole groups of 8 pixels (32 components) of 16-bit ARGB to 8-bit RGBA.
+// Each channel keeps the upper 8 bits of its 16-bit value and the channels are reordered
+// from A,R,G,B to R,G,B,A.
+// On return, source and destination have been advanced past the converted data and
+// pixelsPerRow holds the number of leftover pixels (fewer than 8) that were not converted.
+ALWAYS_INLINE void unpackOneRowOfARGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
+{
+    unsigned componentsPerRow = pixelsPerRow * 4;
+    unsigned tailComponents = componentsPerRow % 32;
+    unsigned componentsSize = componentsPerRow - tailComponents;
+
+    for (unsigned i = 0; i < componentsSize; i += 32) {
+        uint16x8x4_t ARGB16 = vld4q_u16(source + i);
+        // A narrowing shift by 8 yields the high byte of every lane in one instruction.
+        uint8x8_t componentA = vshrn_n_u16(ARGB16.val[0], 8);
+        uint8x8_t componentR = vshrn_n_u16(ARGB16.val[1], 8);
+        uint8x8_t componentG = vshrn_n_u16(ARGB16.val[2], 8);
+        uint8x8_t componentB = vshrn_n_u16(ARGB16.val[3], 8);
+        uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
+        vst4_u8(destination + i, RGBA8);
+    }
+
+    source += componentsSize;
+    destination += componentsSize;
+    pixelsPerRow = tailComponents / 4;
+}
+
+// Converts whole groups of 8 pixels (32 components) of 16-bit BGRA to 8-bit RGBA.
+// Each channel keeps the upper 8 bits of its 16-bit value and the channels are reordered
+// from B,G,R,A to R,G,B,A.
+// On return, source and destination have been advanced past the converted data and
+// pixelsPerRow holds the number of leftover pixels (fewer than 8) that were not converted.
+ALWAYS_INLINE void unpackOneRowOfBGRA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
+{
+    unsigned componentsPerRow = pixelsPerRow * 4;
+    unsigned tailComponents = componentsPerRow % 32;
+    unsigned componentsSize = componentsPerRow - tailComponents;
+
+    for (unsigned i = 0; i < componentsSize; i += 32) {
+        uint16x8x4_t BGRA16 = vld4q_u16(source + i);
+        // A narrowing shift by 8 yields the high byte of every lane in one instruction.
+        uint8x8_t componentB = vshrn_n_u16(BGRA16.val[0], 8);
+        uint8x8_t componentG = vshrn_n_u16(BGRA16.val[1], 8);
+        uint8x8_t componentR = vshrn_n_u16(BGRA16.val[2], 8);
+        uint8x8_t componentA = vshrn_n_u16(BGRA16.val[3], 8);
+        uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
+        vst4_u8(destination + i, RGBA8);
+    }
+
+    source += componentsSize;
+    destination += componentsSize;
+    pixelsPerRow = tailComponents / 4;
+}
+
ALWAYS_INLINE void unpackOneRowOfRGBA4444ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
{
unsigned tailPixels = pixelsPerRow % 8;