On Tue, 2006-01-03 at 15:49 -0700, Tres Melton wrote: > Michael, > Crap, Wrong damn tree. Try this instead of the last one please.
-- Tres Melton IRC & Gentoo: RiverRat
Index: eterm/Eterm/src/pixmap.c
===================================================================
RCS file: /cvsroot/enlightenment/eterm/Eterm/src/pixmap.c,v
retrieving revision 1.115
diff -u -b -B -u -r1.115 pixmap.c
--- eterm/Eterm/src/pixmap.c 22 Dec 2005 23:31:33 -0000 1.115
+++ eterm/Eterm/src/pixmap.c 3 Jan 2006 22:59:40 -0000
@@ -66,10 +66,30 @@
extern void shade_ximage_32_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
/* Assembler routines for 64 bit cpu with sse2 */
-extern void shade_ximage_15_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
-extern void shade_ximage_16_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+#ifdef HAVE_SSE2
+extern void shade_ximage_15_sse2_A(void *data, int bpl, int w, int h, int rm, int gm, int bm );
+extern void shade_ximage_15_sse2_U(void *data, int bpl, int w, int h, int rm, int gm, int bm );
+extern void shade_ximage_16_sse2_A(void *data, int bpl, int w, int h, int rm, int gm, int bm );
+extern void shade_ximage_16_sse2_U(void *data, int bpl, int w, int h, int rm, int gm, int bm );
extern void shade_ximage_32_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+#define ETERM_ALIGNMENT 16
+
+#define shade_ximage_15_sse2( data, bpl, w, h, rm, gm, bm ) \
+ ( /* Use the aligned (_A) routine only when BOTH the buffer address and the row stride are multiples of ETERM_ALIGNMENT; otherwise use the unaligned (_U) one. data and bpl are evaluated twice. */ \
+ ((((long) ( data )) | ((long) ( bpl ))) & ((long) ( ETERM_ALIGNMENT - 1 ))) ? \
+ shade_ximage_15_sse2_U((data), (bpl), (w), (h), (rm), (gm), (bm)) : \
+ shade_ximage_15_sse2_A((data), (bpl), (w), (h), (rm), (gm), (bm)) \
+ )
+
+#define shade_ximage_16_sse2( data, bpl, w, h, rm, gm, bm ) \
+ ( /* Use the aligned (_A) routine only when BOTH the buffer address and the row stride are multiples of ETERM_ALIGNMENT; otherwise use the unaligned (_U) one. data and bpl are evaluated twice. */ \
+ ((((long) ( data )) | ((long) ( bpl ))) & ((long) ( ETERM_ALIGNMENT - 1 ))) ? \
+ shade_ximage_16_sse2_U((data), (bpl), (w), (h), (rm), (gm), (bm)) : \
+ shade_ximage_16_sse2_A((data), (bpl), (w), (h), (rm), (gm), (bm)) \
+ )
+#endif
+
#ifdef PIXMAP_SUPPORT
static Imlib_Border bord_none = { 0, 0, 0, 0 };
#endif
Index: eterm/Eterm/src/sse2_cmod.c
===================================================================
RCS file: /cvsroot/enlightenment/eterm/Eterm/src/sse2_cmod.c,v
retrieving revision 1.1
diff -u -b -B -u -r1.1 sse2_cmod.c
--- eterm/Eterm/src/sse2_cmod.c 14 Jun 2005 19:39:01 -0000 1.1
+++ eterm/Eterm/src/sse2_cmod.c 3 Jan 2006 22:59:41 -0000
@@ -94,7 +88,7 @@
#ifdef HAVE_SSE2
-void shade_ximage_15_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+void shade_ximage_15_sse2_U( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
{
__asm__ __volatile__ (
".align 16 \n\t" /* SIMD instructions should be aligned on 16 byte (128 bit) boundraries for performance reasons.*/
@@ -269,7 +263,7 @@
}
-void shade_ximage_16_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+void shade_ximage_16_sse2_U( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
{
__asm__ __volatile__ (
".align 16 \n\t" /* SIMD instructions should be aligned on 16 byte (128 bit) boundraries for performance reasons.*/
@@ -447,6 +441,359 @@
); /* End of Assembly */
}
+void shade_ximage_15_sse2_A( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+{ /* Shade a w x h image of 15 bit (5-5-5) pixels in place using aligned 128 bit accesses (movdqa): each channel becomes (channel * modifier) >> 8. data and bpl must both be multiples of 16. */
+ __asm__ __volatile__ (
+ ".align 16 \n\t" /* SIMD instructions should be aligned on 16 byte (128 bit) boundaries for performance reasons. */
+ "leaq -14(%%rsi, %%rbx, 2), %%rsi\n\t" /* rsi = data + ( width * 2 bytes/pixel ) - 14, so that (rsi + rcx*2) with rcx = 7 - width is the first pixel of the line */
+ "negq %%rbx \n\t" /* Negate the width so that we can increment the counter up towards zero */
+ "jz 10f \n\t" /* Jump to end if the width is zero */
+ "movd %[red_mod], %%xmm5 \n\t" /* Load the color modifiers into xmm registers */
+ "movd %[green_mod], %%xmm6 \n\t" /* " " */
+ "movd %[blue_mod], %%xmm7 \n\t" /* " " */
+ "punpcklwd %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low words. From A64_128bit_Media_Programming (p. 380) */
+ "punpcklwd %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 16 bits into the next 16 bits (both operands are the same) */
+ "punpcklwd %%xmm7, %%xmm7 \n\t"
+ "punpckldq %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low double words. From A64_128bit_Media_Programming (p. 376) */
+ "punpckldq %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 32 bits into the next 32 bits (both operands are the same) */
+ "punpckldq %%xmm7, %%xmm7 \n\t"
+ "punpcklqdq %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low quad words. From A64_128bit_Media_Programming (p. 378) */
+ "punpcklqdq %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 64 bits into the next 64 bits; each modifier now fills all 8 words */
+ "punpcklqdq %%xmm7, %%xmm7 \n\t"
+ "or %[red_mod], %[green_mod] \n\t" /* This, and the following 5 instructions, check whether all three color modifiers are */
+ "or %[blue_mod], %[green_mod] \n\t" /* less than 256. If any of the modifiers is >= 256 it will have the 9th, or a higher, */
+ "sar $8, %[green_mod] \n\t" /* bit set. Shifting off eight bits leaves something set if a modifier is >= 256. */
+ "movq %%rax, %[blue_mod] \n\t" /* Use the register named blue_mod to now store bytes_per_line. */
+ "xor %[red_mod], %[red_mod] \n\t" /* zero red so we don't have to load an immediate value for the following compare. */
+ "cmp %[red_mod], %[green_mod] \n\t" /* Compare the left over bits to zero */
+ "jg 5f \n\t" /* If one of the colors might need saturating then jump to the secondary set of loops. */
+ "1: \n\t" /* Start of the outer loop (lines). */
+ "movq %%rbx, %%rcx \n\t" /* Move the negated width into the count register */
+ "addq $7, %%rcx \n\t" /* rcx = 7 - width */
+ "jns 3f \n\t" /* Fewer than 8 pixels in the line: go straight to the one-pixel loop */
+ "2: \n\t" /* Start of the inner loop (pixels 8 at a time --> 8 * 16 = 128bits/xmm register ) */
+ "movdqa (%%rsi, %%rcx, 2), %%xmm0\n\t" /* Aligned load of 8 pixels, 16 bits each (5 bits red, 5 bits green, 5 bits blue) */
+ "movdqa %%xmm0, %%xmm1 \n\t" /* Create a copy of the pixel for the green color */
+ "movdqa %%xmm0, %%xmm2 \n\t" /* Create a copy of the pixel for the blue color */
+ "psrlw $5, %%xmm1 \n\t" /* Packed Shift Right Logical Words */
+ /* From A64_128bit_Media_Programming (p. 347) */
+ /* Shifts the blue off of the green color */
+ "psrlw $10, %%xmm0 \n\t" /* Shifts the blue & green off of the red color */
+ "psllw $11, %%xmm2 \n\t" /* Packed Shift Left Logical Words */
+ /* From A64_128bit_Media_Programming (p. 330) */
+ /* Shifts the red & green off of the blue color */
+ "psllw $11, %%xmm1 \n\t" /* Shifts the red off of the green color */
+ "psllw $8, %%xmm0 \n\t" /* Shifts the red color into position (bits 8 and up) for the multiply */
+ "psrlw $3, %%xmm1 \n\t" /* Shifts the green color into position */
+ "psrlw $3, %%xmm2 \n\t" /* Shifts the blue color into position */
+ "pmulhw %%xmm5, %%xmm0 \n\t" /* color = high word of ((color << 8) * modifier), i.e. (color * modifier) >> 8 */
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "psllw $10, %%xmm0 \n\t" /* Shift red back into its original position */
+ "psllw $5, %%xmm1 \n\t" /* Shift green back into its original position */
+ "por %%xmm2, %%xmm0 \n\t" /* Mesh the colors back together */
+ "por %%xmm1, %%xmm0 \n\t"
+ "movdqa %%xmm0, (%%rsi, %%rcx, 2)\n\t" /* Place the shaded 8 pixels back into the image map */
+ "addq $8, %%rcx \n\t" /* Advance 8 pixels */
+ "js 2b \n\t" /* Keep going while at least 8 pixels remain */
+ "jmp 4f \n\t" /* Finish the remaining (0-7) pixels of the line one at a time */
+ "3: \n\t" /* Deal with pixels one at a time here. */
+ "movw (%%rsi, %%rcx, 2), %%ax \n\t" /* Load a single pixel (rax was already copied into blue_mod) */
+ "movd %%eax, %%xmm0 \n\t"
+ "movq %%xmm0, %%xmm1 \n\t"
+ "movq %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t" /* Same channel split, multiply and merge as the 8 pixel loop above */
+ "psrlw $10, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $11, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $3, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "psllw $10, %%xmm0 \n\t"
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movd %%xmm0, %%eax \n\t"
+ "movw %%ax, (%%rsi, %%rcx, 2) \n\t" /* Store only the low 16 bits: the shaded pixel */
+ "incq %%rcx \n\t"
+ "4: \n\t"
+ "cmpq $6, %%rcx \n\t" /* rcx == 7 means the end of the line has been reached */
+ "jng 3b \n\t"
+ "addq %[blue_mod], %%rsi \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+ "decq %%rdx \n\t" /* One line done. NOTE(review): a height of zero is not guarded against before this point. */
+ "jnz 1b \n\t"
+ "jmp 10f \n\t" /* We're done! */
+
+ "5: \n\t" /* Saturation is required */
+ "pcmpeqw %%xmm3, %%xmm3 \n\t" /* Packed Compare Equal Words */
+ /* From A64_128bit_Media_Programming (p. 276) */
+ /* This sets xmm3 to 128 1's (since xmm3 == xmm3) */
+ "psllw $5, %%xmm3 \n\t" /* xmm3 = 8 copies of 1111 1111 1110 0000 */
+ "6: \n\t" /* Start of the outer loop (lines), saturating version. */
+ "movq %%rbx, %%rcx \n\t"
+ "addq $7, %%rcx \n\t"
+ "jns 8f \n\t" /* Fewer than 8 pixels in the line: go straight to the one-pixel loop */
+ "7: \n\t" /* Inner loop, 8 pixels at a time, saturating version. */
+ "movdqa (%%rsi, %%rcx, 2), %%xmm0\n\t"
+ "movdqa %%xmm0, %%xmm1 \n\t"
+ "movdqa %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t"
+ "psrlw $10, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $11, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $3, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "paddusw %%xmm3, %%xmm0 \n\t" /* Unsigned saturating add of 0xFFE0: anything above 31 sticks at 0xFFFF */
+ "paddusw %%xmm3, %%xmm1 \n\t"
+ "paddusw %%xmm3, %%xmm2 \n\t"
+ "psubw %%xmm3, %%xmm0 \n\t" /* Subtract 0xFFE0 again, leaving min(color, 31). FIXME: This line needs to be added to the original asm code */
+ "psubw %%xmm3, %%xmm1 \n\t"
+ "psubw %%xmm3, %%xmm2 \n\t"
+ "psllw $10, %%xmm0 \n\t"
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movdqa %%xmm0, (%%rsi, %%rcx, 2)\n\t"
+ "addq $8, %%rcx \n\t"
+ "js 7b \n\t"
+ "jmp 9f \n\t"
+ "8: \n\t" /* One pixel at a time, saturating version. */
+ "movw (%%rsi, %%rcx, 2), %%ax \n\t"
+ "movd %%eax, %%xmm0 \n\t"
+ "movq %%xmm0, %%xmm1 \n\t"
+ "movq %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t"
+ "psrlw $10, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $11, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $3, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "paddusw %%xmm3, %%xmm0 \n\t"
+ "paddusw %%xmm3, %%xmm1 \n\t"
+ "paddusw %%xmm3, %%xmm2 \n\t"
+ "psubw %%xmm3, %%xmm0 \n\t" /* FIXME: This line needs to be added to the original asm code */
+ "psubw %%xmm3, %%xmm1 \n\t"
+ "psubw %%xmm3, %%xmm2 \n\t"
+ "psllw $10, %%xmm0 \n\t"
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movd %%xmm0, %%eax \n\t"
+ "movw %%ax, (%%rsi, %%rcx, 2) \n\t"
+ "incq %%rcx \n\t"
+ "9: \n\t"
+ "cmpq $6, %%rcx \n\t" /* rcx == 7 means the end of the line has been reached */
+ "jng 8b \n\t"
+ "addq %[blue_mod], %%rsi \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+ "decq %%rdx \n\t"
+ "jnz 6b \n\t"
+ "10: \n\t" /* This is the end. Jump here if the width is zero. */
+ "emms \n\t" /* exit multi-media state (last asm instruction). NOTE(review): only xmm registers are used here, so this looks unnecessary -- confirm. */
+ : /* outputs: none */
+ /* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly) */
+ /* (however the compiler/assembler can preload 32bit values into 64bit registers, which is why some operands are used by register rather than by name) */
+ /* NOTE(review): rsi, rbx, rdx, rax and the three named registers are all modified above although declared input-only; they should become in/out operands. */
+ : [data] "S" (data), /* put the pointer data into the rsi register */
+ [width] "b" (w), /* put the width in the %rbx register (cannot be referenced by name) */
+ [height] "d" (h), /* put the height in the %rdx register (cannot be referenced by name) */
+ [red_mod] "r" ((unsigned long)(rm)),/* put the red_modifier in a register (referenced by name) */
+ [green_mod] "r" ((unsigned long)(gm)),/* put the green_modifier in a register (referenced by name) */
+ [blue_mod] "r" ((unsigned long)(bm)),/* put the blue_modifier in a register (referenced by name) Later store the bytes_line here */
+ [bytes_line] "a" (bpl) /* put the bytes_per_line in the %rax register (cannot be referenced by name) */
+ : "memory", "cc", "rcx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7" /* clobbers: image memory, flags, the rcx pixel counter (keeps "r" operands out of rcx) and the xmm scratch registers */
+ ); /* End of Assembly */
+}
+
+
+void shade_ximage_16_sse2_A( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+{ /* Shade a w x h image of 16 bit (5-6-5) pixels in place using aligned 128 bit accesses (movdqa): each channel becomes (channel * modifier) >> 8. data and bpl must both be multiples of 16. */
+ __asm__ __volatile__ (
+ ".align 16 \n\t" /* SIMD instructions should be aligned on 16 byte (128 bit) boundaries for performance reasons. */
+ "leaq -14(%%rsi, %%rbx, 2), %%rsi\n\t" /* rsi = data + ( width * 2 bytes/pixel ) - 14, so that (rsi + rcx*2) with rcx = 7 - width is the first pixel of the line */
+ "negq %%rbx \n\t" /* Negate the width so that we can increment the counter up towards zero */
+ "jz 10f \n\t" /* Jump to end if the width is zero */
+ "movd %[red_mod], %%xmm5 \n\t" /* Load the color modifiers into xmm registers */
+ "movd %[green_mod], %%xmm6 \n\t" /* " " */
+ "movd %[blue_mod], %%xmm7 \n\t" /* " " */
+ "punpcklwd %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low words. From A64_128bit_Media_Programming (p. 380) */
+ "punpcklwd %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 16 bits into the next 16 bits (both operands are the same) */
+ "punpcklwd %%xmm7, %%xmm7 \n\t"
+ "punpckldq %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low double words. From A64_128bit_Media_Programming (p. 376) */
+ "punpckldq %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 32 bits into the next 32 bits (both operands are the same) */
+ "punpckldq %%xmm7, %%xmm7 \n\t"
+ "punpcklqdq %%xmm5, %%xmm5 \n\t" /* Unpack and Interleave low quad words. From A64_128bit_Media_Programming (p. 378) */
+ "punpcklqdq %%xmm6, %%xmm6 \n\t" /* Duplicate the bottom 64 bits into the next 64 bits; each modifier now fills all 8 words */
+ "punpcklqdq %%xmm7, %%xmm7 \n\t"
+ "or %[red_mod], %[green_mod] \n\t" /* This, and the following 5 instructions, check whether all three color modifiers are */
+ "or %[blue_mod], %[green_mod] \n\t" /* less than 256. If any of the modifiers is >= 256 it will have the 9th, or a higher, */
+ "sar $8, %[green_mod] \n\t" /* bit set. Shifting off eight bits leaves something set if a modifier is >= 256. */
+ "movq %%rax, %[blue_mod] \n\t" /* Use the register named blue_mod to now store bytes_per_line. */
+ "xor %[red_mod], %[red_mod] \n\t" /* zero red so we don't have to load an immediate value for the following compare. */
+ "cmp %[red_mod], %[green_mod] \n\t" /* Compare the left over bits to zero */
+ "jg 5f \n\t" /* If one of the colors might need saturating then jump to the secondary set of loops. */
+ "1: \n\t" /* Start of the outer loop (lines). */
+ "movq %%rbx, %%rcx \n\t" /* Move the negated width into the count register */
+ "addq $7, %%rcx \n\t" /* rcx = 7 - width */
+ "jns 3f \n\t" /* Fewer than 8 pixels in the line: go straight to the one-pixel loop */
+ "2: \n\t" /* Start of the inner loop (pixels 8 at a time --> 8 * 16 = 128bits/xmm register ) */
+ "movdqa (%%rsi, %%rcx, 2), %%xmm0\n\t" /* Aligned load of 8 pixels, 16 bits each (5 bits for red, 6 bits for green, 5 bits for blue) */
+ "movdqa %%xmm0, %%xmm1 \n\t" /* Create a copy of the pixel for the green color */
+ "movdqa %%xmm0, %%xmm2 \n\t" /* Create a copy of the pixel for the blue color */
+ "psrlw $5, %%xmm1 \n\t" /* Packed Shift Right Logical Words */
+ /* From A64_128bit_Media_Programming (p. 347) */
+ /* Shifts the blue off of the green color */
+ "psrlw $11, %%xmm0 \n\t" /* Shifts the blue & green off of the red color */
+ "psllw $11, %%xmm2 \n\t" /* Packed Shift Left Logical Words */
+ /* From A64_128bit_Media_Programming (p. 330) */
+ /* Shifts the red & green off of the blue color */
+ "psllw $10, %%xmm1 \n\t" /* Shifts the red off of the green color */
+ "psllw $8, %%xmm0 \n\t" /* Shifts the red color into position (bits 8 and up) for the multiply */
+ "psrlw $2, %%xmm1 \n\t" /* Shifts the green color into position */
+ "psrlw $3, %%xmm2 \n\t" /* Shifts the blue color into position */
+ "pmulhw %%xmm5, %%xmm0 \n\t" /* color = high word of ((color << 8) * modifier), i.e. (color * modifier) >> 8 */
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "psllw $11, %%xmm0 \n\t" /* Shift red back into its original position */
+ "psllw $5, %%xmm1 \n\t" /* Shift green back into its original position */
+ "por %%xmm2, %%xmm0 \n\t" /* Mesh the colors back together */
+ "por %%xmm1, %%xmm0 \n\t"
+ "movdqa %%xmm0, (%%rsi, %%rcx, 2)\n\t" /* Place the shaded 8 pixels back into the image map */
+ "addq $8, %%rcx \n\t" /* Advance 8 pixels */
+ "js 2b \n\t" /* Keep going while at least 8 pixels remain */
+ "jmp 4f \n\t" /* Finish the remaining (0-7) pixels of the line one at a time */
+ "3: \n\t" /* Deal with pixels one at a time here. */
+ "movw (%%rsi, %%rcx, 2), %%ax \n\t" /* Load a single pixel (rax was already copied into blue_mod) */
+ "movd %%eax, %%xmm0 \n\t"
+ "movq %%xmm0, %%xmm1 \n\t"
+ "movq %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t" /* Same channel split, multiply and merge as the 8 pixel loop above */
+ "psrlw $11, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $10, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $2, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "psllw $11, %%xmm0 \n\t"
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movd %%xmm0, %%eax \n\t"
+ "movw %%ax, (%%rsi, %%rcx, 2) \n\t" /* Store only the low 16 bits: the shaded pixel */
+ "incq %%rcx \n\t"
+ "4: \n\t"
+ "cmpq $6, %%rcx \n\t" /* rcx == 7 means the end of the line has been reached */
+ "jng 3b \n\t"
+ "addq %[blue_mod], %%rsi \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+ "decq %%rdx \n\t" /* One line done. NOTE(review): a height of zero is not guarded against before this point. */
+ "jnz 1b \n\t"
+ "jmp 10f \n\t" /* We're done! */
+
+ "5: \n\t" /* Saturation is required */
+ "pcmpeqw %%xmm3, %%xmm3 \n\t" /* Packed Compare Equal Words */
+ /* From A64_128bit_Media_Programming (p. 276) */
+ /* This sets xmm3 to 128 1's (since xmm3 == xmm3) */
+ "movdqa %%xmm3, %%xmm4 \n\t" /* Make copy of 128 ones */
+ "psllw $5, %%xmm3 \n\t" /* xmm3 = 8 copies of 1111 1111 1110 0000 (clamp for the 5 bit channels) */
+ "psllw $6, %%xmm4 \n\t" /* xmm4 = 8 copies of 1111 1111 1100 0000 (clamp for the 6 bit green channel) */
+ "6: \n\t" /* Start of the outer loop (lines), saturating version. */
+ "movq %%rbx, %%rcx \n\t"
+ "addq $7, %%rcx \n\t"
+ "jns 8f \n\t" /* Fewer than 8 pixels in the line: go straight to the one-pixel loop */
+ "7: \n\t" /* Inner loop, 8 pixels at a time, saturating version. */
+ "movdqa (%%rsi, %%rcx, 2), %%xmm0\n\t"
+ "movdqa %%xmm0, %%xmm1 \n\t"
+ "movdqa %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t"
+ "psrlw $11, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $10, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $2, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ "paddusw %%xmm3, %%xmm0 \n\t" /* Unsigned saturating add of the mask: anything above the channel maximum sticks at 0xFFFF */
+ "paddusw %%xmm4, %%xmm1 \n\t"
+ "paddusw %%xmm3, %%xmm2 \n\t"
+ "psubw %%xmm4, %%xmm1 \n\t" /* Subtract the mask again, leaving min(green, 63) */
+ "psubw %%xmm3, %%xmm2 \n\t" /* ... and min(blue, 31) */
+ "psllw $11, %%xmm0 \n\t" /* Red needs no subtract: this shift discards everything above its low 5 bits */
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movdqa %%xmm0, (%%rsi, %%rcx, 2)\n\t"
+ "addq $8, %%rcx \n\t"
+ "js 7b \n\t"
+ "jmp 9f \n\t"
+ "8: \n\t" /* One pixel at a time, saturating version. */
+ "movw (%%rsi, %%rcx, 2), %%ax \n\t"
+ "movd %%eax, %%xmm0 \n\t"
+ "movq %%xmm0, %%xmm1 \n\t"
+ "movq %%xmm0, %%xmm2 \n\t"
+ "psrlw $5, %%xmm1 \n\t"
+ "psrlw $11, %%xmm0 \n\t"
+ "psllw $11, %%xmm2 \n\t"
+ "psllw $10, %%xmm1 \n\t"
+ "psllw $8, %%xmm0 \n\t"
+ "psrlw $2, %%xmm1 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ " \n\t"
+ "pmulhw %%xmm5, %%xmm0 \n\t"
+ "pmulhw %%xmm6, %%xmm1 \n\t"
+ "pmulhw %%xmm7, %%xmm2 \n\t"
+ " \n\t"
+ "paddusw %%xmm3, %%xmm0 \n\t"
+ "paddusw %%xmm4, %%xmm1 \n\t"
+ "paddusw %%xmm3, %%xmm2 \n\t"
+ " \n\t"
+ "psubw %%xmm4, %%xmm1 \n\t"
+ "psubw %%xmm3, %%xmm2 \n\t"
+ " \n\t"
+ "psllw $11, %%xmm0 \n\t"
+ "psllw $5, %%xmm1 \n\t"
+ "por %%xmm2, %%xmm0 \n\t"
+ "por %%xmm1, %%xmm0 \n\t"
+ "movd %%xmm0, %%eax \n\t"
+ "movw %%ax, (%%rsi, %%rcx, 2) \n\t"
+ "incq %%rcx \n\t"
+ "9: \n\t"
+ "cmpq $6, %%rcx \n\t" /* rcx == 7 means the end of the line has been reached */
+ "jng 8b \n\t"
+ "addq %[blue_mod], %%rsi \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+ "decq %%rdx \n\t"
+ "jnz 6b \n\t"
+ "10: \n\t" /* This is the end. Jump here if the width is zero. */
+ "emms \n\t" /* exit multi-media state (last asm instruction). NOTE(review): only xmm registers are used here, so this looks unnecessary -- confirm. */
+ : /* outputs: none */
+ /* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly) */
+ /* (however the compiler/assembler can preload 32bit values into 64bit registers, which is why some operands are used by register rather than by name) */
+ /* NOTE(review): rsi, rbx, rdx, rax and the three named registers are all modified above although declared input-only; they should become in/out operands. */
+ : [data] "S" (data), /* put the pointer data into the rsi register */
+ [width] "b" (w), /* put the width in the %rbx register (cannot be referenced by name) */
+ [height] "d" (h), /* put the height in the %rdx register (cannot be referenced by name) */
+ [red_mod] "r" ((unsigned long)(rm)),/* put the red_modifier in a register (referenced by name) */
+ [green_mod] "r" ((unsigned long)(gm)),/* put the green_modifier in a register (referenced by name) */
+ [blue_mod] "r" ((unsigned long)(bm)),/* put the blue_modifier in a register (referenced by name) Later store the bytes_line here */
+ [bytes_line] "a" (bpl) /* put the bytes_per_line in the %rax register (cannot be referenced by name) */
+ : "memory", "cc", "rcx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" /* clobbers: image memory, flags, the rcx pixel counter (keeps "r" operands out of rcx) and the xmm scratch registers */
+ ); /* End of Assembly */
+}
+
void shade_ximage_32_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
{
signature.asc
Description: This is a digitally signed message part
