Package: liboil
Followup-For: Bug #368991

Attached is a rework of the NMU patch that now compiles and runs on both i386 and amd64.
Enjoy,
Goswin

-- System Information:
Debian Release: 3.1
  APT prefers unstable
  APT policy: (500, 'unstable')
Architecture: amd64 (x86_64)
Shell: /bin/sh linked to /bin/bash
Kernel: Linux 2.6.16-rc4-xen
Locale: LANG=C, LC_CTYPE=C (charmap=ANSI_X3.4-1968)
diff -u liboil-0.3.9/debian/changelog liboil-0.3.9/debian/changelog --- liboil-0.3.9/debian/changelog +++ liboil-0.3.9/debian/changelog @@ -1,3 +1,23 @@ +liboil (0.3.9-1.2) unstable; urgency=low + + * Non-maintainer upload, second attempt. + * Rework patch by Christian Aichinger to not call the wraper on amd64. + Thanks to Goswin von Brederlow. + + -- Andreas Barth <[EMAIL PROTECTED]> Thu, 22 Jun 2006 21:53:26 +0200 + +liboil (0.3.9-1.1) unstable; urgency=low + + * Non-maintainer upload. + * fix possible unalignment on i386 - this change not perfect + and should also contain a test suite, but is still better + than nothing at all. Thanks to Christian Aichinger for his + good work on this and the patch. Closes: #368991 + (also keeping the patch around in the diff, so that it's + obvious what was changed) + + -- Andreas Barth <[EMAIL PROTECTED]> Thu, 22 Jun 2006 19:31:26 +0200 + liboil (0.3.9-1) unstable; urgency=low * New upstream release. only in patch2: unchanged: --- liboil-0.3.9.orig/liboil/sse/composite_sse_2pix.c +++ liboil-0.3.9/liboil/sse/composite_sse_2pix.c @@ -32,6 +32,42 @@ #include <emmintrin.h> #include <liboil/liboilcolorspace.h> +/* Work around non-aligned stack frames (which causes the intristics to crash + * by making sure the stack frame is always aligned + */ +#if defined(__i386__) +#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \ + ret sse_name(__VA_ARGS__) __attribute__((used)); \ + ret sse_name ## _wrap (__VA_ARGS__) { \ + OIL_SSE_WRAPPER_CALL(sse_name); \ + } \ + OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags); + +#define OIL_SSE_WRAPPER_CALL(name) \ + asm volatile( \ + "\n\t" \ + "subl $0x10,%%esp\n\t" \ + "andl $0xfffffff0,%%esp\n\t" \ + \ + "movdqu 8(%%ebp),%%xmm0\n\t" \ + "movdqa %%xmm0,(%%esp)\n\t" \ + \ + "call " #name "\n\t" \ + "movl %%ebp,%%esp\n\t" \ + : : \ + : "eax","ecx","edx","xmm0") + +#elif defined(__amd64__) + +/* Needed because we call *_wrap. 
Should get optimized away anyway */ + +#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \ + OIL_DEFINE_IMPL_FULL(sse_name, name, flags); + +#else +#error Can't use sse on !i386 and !amd64 +#endif + /* non-SSE2 compositing support */ #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m))) #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s)) @@ -41,20 +77,12 @@ * the channel value in the low byte. This means 2 pixels per pass. */ -union m128_int { - __m128i m128; - uint64_t ull[2]; -}; - -static const struct _SSEData { - union m128_int sse_8x00ff; - union m128_int sse_8x0080; -} c = { - .sse_8x00ff.ull = {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL}, - .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL}, -}; +static const __m128i c_sse_8x00ff = + {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL}; +static const __m128i c_sse_8x0080 = + {0x0080008000800080ULL, 0x0080008000800080ULL}; -#define MC(x) (c.sse_##x.m128) +#define MC(x) (c_sse_##x) /* Shuffles the given value such that the alpha for each pixel appears in each * channel of the pixel. 
@@ -188,8 +216,12 @@ COMPOSITE_IN(oil_argb_B(*src), m)); } } -OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse_2pix, - composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2); + +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse_2pix, + composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); + static void composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src, @@ -216,8 +248,10 @@ COMPOSITE_IN(oil_argb_B(s), mask[0])); } } -OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse_2pix, - composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2); +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse_2pix, + composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); static void composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n) @@ -272,8 +306,11 @@ *dest++ = d; } } -OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse_2pix, - composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2); + +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse_2pix, + composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, int n); static void composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, @@ -309,8 +346,11 @@ *dest++ = d; } } -OIL_DEFINE_IMPL_FULL (composite_in_over_argb_sse_2pix, composite_in_over_argb, - OIL_IMPL_FLAG_SSE2); + +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse_2pix, + composite_in_over_argb, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); static void composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src, @@ -348,8 +388,11 @@ *dest++ = d; } } -OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse_2pix, - composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2); + +OIL_DEFINE_IMPL_FULL_WRAPPER 
(composite_in_over_argb_const_src_sse_2pix, + composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); static void composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src, @@ -387,8 +430,11 @@ *dest++ = d; } } -OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse_2pix, - composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2); + +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse_2pix, + composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); static void composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n) only in patch2: unchanged: --- liboil-0.3.9.orig/liboil/sse/composite_sse_4pix.c +++ liboil-0.3.9/liboil/sse/composite_sse_4pix.c @@ -32,20 +32,49 @@ #include <emmintrin.h> #include <liboil/liboilcolorspace.h> -union m128_int { - __m128i m128; - uint64_t ull[2]; -}; - -static const struct _SSEData { - union m128_int sse_16xff; - union m128_int sse_8x0080; -} c = { - .sse_16xff.ull = {0xffffffffffffffffULL, 0xffffffffffffffffULL}, - .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL}, -}; +/* Work around non-aligned stack frames (which causes the intristics to crash + * by making sure the stack frame is always aligned + */ +#if defined(__i386__) +#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) 
\ + ret sse_name(__VA_ARGS__) __attribute__((used)); \ + ret sse_name ## _wrap (__VA_ARGS__) { \ + OIL_SSE_WRAPPER_CALL(sse_name); \ + } \ + OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags); + +#define OIL_SSE_WRAPPER_CALL(name) \ + asm volatile( \ + "\n\t" \ + "subl $0x10,%%esp\n\t" \ + "andl $0xfffffff0,%%esp\n\t" \ + \ + "movdqu 8(%%ebp),%%xmm0\n\t" \ + "movdqa %%xmm0,(%%esp)\n\t" \ + \ + "call " #name "\n\t" \ + "movl %%ebp,%%esp\n\t" \ + : : \ + : "eax","ecx","edx","xmm0") + +#elif defined(__amd64__) + +/* Needed because we call *_wrap. Should get optimized away anyway */ + +#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \ + OIL_DEFINE_IMPL_FULL(sse_name, name, flags); + +#else +#error Can't use sse on !i386 and !amd64 +#endif -#define MC(x) (c.sse_##x.m128) + +static const __m128i c_sse_16xff = + {0xffffffffffffffffULL, 0xffffffffffffffffULL}; +static const __m128i c_sse_8x0080 = + {0x0080008000800080ULL, 0x0080008000800080ULL}; + +#define MC(x) (c_sse_##x) /* non-SSE2 compositing support */ #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m))) @@ -193,8 +222,11 @@ COMPOSITE_IN(oil_argb_B(s), m)); } } -OIL_DEFINE_IMPL_FULL (composite_in_argb_sse, composite_in_argb, - OIL_IMPL_FLAG_SSE2); + +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb, + OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); static void composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src, @@ -230,8 +262,11 @@ COMPOSITE_IN(oil_argb_B(*src), m)); } } -OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse, - composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2); + +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse, + composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); static void composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src, @@ -267,8 +302,10 @@ 
COMPOSITE_IN(oil_argb_B(s), mask[0])); } } -OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse, - composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2); +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse, + composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); static void composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n) @@ -339,8 +376,11 @@ *dest++ = d; } } -OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse, - composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2); + +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse, + composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, int n); static void composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src, @@ -447,8 +487,10 @@ *dest++ = d; } } -OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse, - composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2); +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse, + composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); static void composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src, @@ -502,8 +544,11 @@ *dest++ = d; } } -OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse, - composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2); + +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse, + composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2, + static void, + uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); static void composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n) only in patch2: unchanged: --- liboil-0.3.9.orig/liboil/sse/sad8x8_sse.c +++ liboil-0.3.9/liboil/sse/sad8x8_sse.c @@ -31,6 +31,44 @@ #include <liboil/liboilfunction.h> #include <emmintrin.h> +/* Work around non-aligned stack frames (which causes the 
intristics to crash + * by making sure the stack frame is always aligned + */ +#if defined(__i386__) +#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \ + ret sse_name(__VA_ARGS__) __attribute__((used)); \ + ret sse_name ## _wrap (__VA_ARGS__) { \ + OIL_SSE_WRAPPER_CALL(sse_name); \ + } \ + OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags); + +#define OIL_SSE_WRAPPER_CALL(name) \ + asm volatile( \ + "\n\t" \ + "subl $0x18,%%esp\n\t" \ + "andl $0xfffffff0,%%esp\n\t" \ + \ + "movdqu 8(%%ebp),%%xmm0\n\t" \ + "movdqa %%xmm0,(%%esp)\n\t" \ + "movl 0x18(%%ebp), %%ecx\n\t" \ + "movl %%ecx, 0x10(%%esp)\n\t" \ + \ + "call " #name "\n\t" \ + "movl %%ebp,%%esp\n\t" \ + : : \ + : "eax","ecx","edx","xmm0") + +#elif defined(__amd64__) + +/* Needed because we call *_wrap. Should get optimized away anyway */ + +#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \ + OIL_DEFINE_IMPL_FULL(sse_name, name, flags); + +#else +#error Can't use sse on !i386 and !amd64 +#endif + union m128_int { __m128i m128; uint32_t i[4]; @@ -42,7 +80,7 @@ int sstr2) { int i; - __m128i sum = _mm_setzero_si128(); + __m128i sum __attribute__ ((aligned (16))) = _mm_setzero_si128(); union m128_int sumi; for (i = 0; i < 4; i++) { @@ -60,4 +98,8 @@ sumi.m128 = sum; *dest = sumi.i[0] + sumi.i[2]; } -OIL_DEFINE_IMPL_FULL (sad8x8_u8_sse, sad8x8_u8, OIL_IMPL_FLAG_SSE2); + +OIL_DEFINE_IMPL_FULL_WRAPPER (sad8x8_u8_sse, sad8x8_u8, + OIL_IMPL_FLAG_SSE2, static void, + uint32_t *dest, + uint8_t *src1, int sstr1, uint8_t *src2, int sstr2); only in patch2: unchanged: --- liboil-0.3.9.orig/liboil-368991-sse-segv-fix.5.diff +++ liboil-0.3.9/liboil-368991-sse-segv-fix.5.diff @@ -0,0 +1,374 @@ +only in patch2: +unchanged: +--- liboil-0.3.9.orig/liboil/sse/composite_sse_2pix.c ++++ liboil-0.3.9/liboil/sse/composite_sse_2pix.c +@@ -32,6 +32,42 @@ + #include <emmintrin.h> + #include <liboil/liboilcolorspace.h> + ++/* Work around non-aligned stack frames (which causes the intristics to 
crash ++ * by making sure the stack frame is always aligned ++ */ ++#if defined(__i386__) ++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \ ++ ret sse_name(__VA_ARGS__) __attribute__((used)); \ ++ ret sse_name ## _wrap (__VA_ARGS__) { \ ++ OIL_SSE_WRAPPER_CALL(sse_name); \ ++ } \ ++ OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags); ++ ++#define OIL_SSE_WRAPPER_CALL(name) \ ++ asm volatile( \ ++ "\n\t" \ ++ "subl $0x10,%%esp\n\t" \ ++ "andl $0xfffffff0,%%esp\n\t" \ ++ \ ++ "movdqu 8(%%ebp),%%xmm0\n\t" \ ++ "movdqa %%xmm0,(%%esp)\n\t" \ ++ \ ++ "call " #name "\n\t" \ ++ "movl %%ebp,%%esp\n\t" \ ++ : : \ ++ : "eax","ecx","edx","xmm0") ++ ++#elif defined(__amd64__) ++ ++/* Needed because we call *_wrap. Should get optimized away anyway */ ++ ++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \ ++ OIL_DEFINE_IMPL_FULL(sse_name, name, flags); ++ ++#else ++#error Can't use sse on !i386 and !amd64 ++#endif ++ + /* non-SSE2 compositing support */ + #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m))) + #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s)) +@@ -41,20 +77,12 @@ + * the channel value in the low byte. This means 2 pixels per pass. + */ + +-union m128_int { +- __m128i m128; +- uint64_t ull[2]; +-}; +- +-static const struct _SSEData { +- union m128_int sse_8x00ff; +- union m128_int sse_8x0080; +-} c = { +- .sse_8x00ff.ull = {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL}, +- .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL}, +-}; ++static const __m128i c_sse_8x00ff = ++ {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL}; ++static const __m128i c_sse_8x0080 = ++ {0x0080008000800080ULL, 0x0080008000800080ULL}; + +-#define MC(x) (c.sse_##x.m128) ++#define MC(x) (c_sse_##x) + + /* Shuffles the given value such that the alpha for each pixel appears in each + * channel of the pixel. 
+@@ -188,8 +216,12 @@ + COMPOSITE_IN(oil_argb_B(*src), m)); + } + } +-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse_2pix, +- composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2); ++ ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse_2pix, ++ composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); ++ + + static void + composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src, +@@ -216,8 +248,10 @@ + COMPOSITE_IN(oil_argb_B(s), mask[0])); + } + } +-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse_2pix, +- composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2); ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse_2pix, ++ composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); + + static void + composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n) +@@ -272,8 +306,11 @@ + *dest++ = d; + } + } +-OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse_2pix, +- composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2); ++ ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse_2pix, ++ composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, int n); + + static void + composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, +@@ -309,8 +346,11 @@ + *dest++ = d; + } + } +-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_sse_2pix, composite_in_over_argb, +- OIL_IMPL_FLAG_SSE2); ++ ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse_2pix, ++ composite_in_over_argb, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); + + static void + composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src, +@@ -348,8 +388,11 @@ + *dest++ = d; + } + } +-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse_2pix, +- 
composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2); ++ ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse_2pix, ++ composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); + + static void + composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src, +@@ -387,8 +430,11 @@ + *dest++ = d; + } + } +-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse_2pix, +- composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2); ++ ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse_2pix, ++ composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); + + static void + composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n) +only in patch2: +unchanged: +--- liboil-0.3.9.orig/liboil/sse/composite_sse_4pix.c ++++ liboil-0.3.9/liboil/sse/composite_sse_4pix.c +@@ -32,20 +32,49 @@ + #include <emmintrin.h> + #include <liboil/liboilcolorspace.h> + +-union m128_int { +- __m128i m128; +- uint64_t ull[2]; +-}; +- +-static const struct _SSEData { +- union m128_int sse_16xff; +- union m128_int sse_8x0080; +-} c = { +- .sse_16xff.ull = {0xffffffffffffffffULL, 0xffffffffffffffffULL}, +- .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL}, +-}; ++/* Work around non-aligned stack frames (which causes the intristics to crash ++ * by making sure the stack frame is always aligned ++ */ ++#if defined(__i386__) ++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) 
\ ++ ret sse_name(__VA_ARGS__) __attribute__((used)); \ ++ ret sse_name ## _wrap (__VA_ARGS__) { \ ++ OIL_SSE_WRAPPER_CALL(sse_name); \ ++ } \ ++ OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags); ++ ++#define OIL_SSE_WRAPPER_CALL(name) \ ++ asm volatile( \ ++ "\n\t" \ ++ "subl $0x10,%%esp\n\t" \ ++ "andl $0xfffffff0,%%esp\n\t" \ ++ \ ++ "movdqu 8(%%ebp),%%xmm0\n\t" \ ++ "movdqa %%xmm0,(%%esp)\n\t" \ ++ \ ++ "call " #name "\n\t" \ ++ "movl %%ebp,%%esp\n\t" \ ++ : : \ ++ : "eax","ecx","edx","xmm0") ++ ++#elif defined(__amd64__) ++ ++/* Needed because we call *_wrap. Should get optimized away anyway */ ++ ++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \ ++ OIL_DEFINE_IMPL_FULL(sse_name, name, flags); ++ ++#else ++#error Can't use sse on !i386 and !amd64 ++#endif + +-#define MC(x) (c.sse_##x.m128) ++ ++static const __m128i c_sse_16xff = ++ {0xffffffffffffffffULL, 0xffffffffffffffffULL}; ++static const __m128i c_sse_8x0080 = ++ {0x0080008000800080ULL, 0x0080008000800080ULL}; ++ ++#define MC(x) (c_sse_##x) + + /* non-SSE2 compositing support */ + #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m))) +@@ -193,8 +222,11 @@ + COMPOSITE_IN(oil_argb_B(s), m)); + } + } +-OIL_DEFINE_IMPL_FULL (composite_in_argb_sse, composite_in_argb, +- OIL_IMPL_FLAG_SSE2); ++ ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb, ++ OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); + + static void + composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src, +@@ -230,8 +262,11 @@ + COMPOSITE_IN(oil_argb_B(*src), m)); + } + } +-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse, +- composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2); ++ ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse, ++ composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); + + static void + 
composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src, +@@ -267,8 +302,10 @@ + COMPOSITE_IN(oil_argb_B(s), mask[0])); + } + } +-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse, +- composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2); ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse, ++ composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); + + static void + composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n) +@@ -339,8 +376,11 @@ + *dest++ = d; + } + } +-OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse, +- composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2); ++ ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse, ++ composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, int n); + + static void + composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src, +@@ -447,8 +487,10 @@ + *dest++ = d; + } + } +-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse, +- composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2); ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse, ++ composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); + + static void + composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src, +@@ -502,8 +544,11 @@ + *dest++ = d; + } + } +-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse, +- composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2); ++ ++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse, ++ composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2, ++ static void, ++ uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n); + + static void + composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n) +only in patch2: +unchanged: +--- liboil-0.3.9.orig/liboil/sse/sad8x8_sse.c ++++ 
liboil-0.3.9/liboil/sse/sad8x8_sse.c +@@ -31,6 +31,44 @@ + #include <liboil/liboilfunction.h> + #include <emmintrin.h> + ++/* Work around non-aligned stack frames (which causes the intristics to crash ++ * by making sure the stack frame is always aligned ++ */ ++#if defined(__i386__) ++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \ ++ ret sse_name(__VA_ARGS__) __attribute__((used)); \ ++ ret sse_name ## _wrap (__VA_ARGS__) { \ ++ OIL_SSE_WRAPPER_CALL(sse_name); \ ++ } \ ++ OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags); ++ ++#define OIL_SSE_WRAPPER_CALL(name) \ ++ asm volatile( \ ++ "\n\t" \ ++ "subl $0x18,%%esp\n\t" \ ++ "andl $0xfffffff0,%%esp\n\t" \ ++ \ ++ "movdqu 8(%%ebp),%%xmm0\n\t" \ ++ "movdqa %%xmm0,(%%esp)\n\t" \ ++ "movl 0x18(%%ebp), %%ecx\n\t" \ ++ "movl %%ecx, 0x10(%%esp)\n\t" \ ++ \ ++ "call " #name "\n\t" \ ++ "movl %%ebp,%%esp\n\t" \ ++ : : \ ++ : "eax","ecx","edx","xmm0") ++ ++#elif defined(__amd64__) ++ ++/* Needed because we call *_wrap. Should get optimized away anyway */ ++ ++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \ ++ OIL_DEFINE_IMPL_FULL(sse_name, name, flags); ++ ++#else ++#error Can't use sse on !i386 and !amd64 ++#endif ++ + union m128_int { + __m128i m128; + uint32_t i[4]; +@@ -42,7 +78,7 @@ + int sstr2) + { + int i; +- __m128i sum = _mm_setzero_si128(); ++ __m128i sum __attribute__ ((aligned (16))) = _mm_setzero_si128(); + union m128_int sumi; + + for (i = 0; i < 4; i++) { +@@ -60,4 +98,8 @@ + sumi.m128 = sum; + *dest = sumi.i[0] + sumi.i[2]; + } +-OIL_DEFINE_IMPL_FULL (sad8x8_u8_sse, sad8x8_u8, OIL_IMPL_FLAG_SSE2); ++ ++OIL_DEFINE_IMPL_FULL_WRAPPER (sad8x8_u8_sse, sad8x8_u8, ++ OIL_IMPL_FLAG_SSE2, static void, ++ uint32_t *dest, ++ uint8_t *src1, int sstr1, uint8_t *src2, int sstr2);