Package: liboil
Followup-For: Bug #368991

Attached is a rework of the NMU patch that now compiles and runs on
both i386 and amd64.

Enjoy,
        Goswin

-- System Information:
Debian Release: 3.1
  APT prefers unstable
  APT policy: (500, 'unstable')
Architecture: amd64 (x86_64)
Shell:  /bin/sh linked to /bin/bash
Kernel: Linux 2.6.16-rc4-xen
Locale: LANG=C, LC_CTYPE=C (charmap=ANSI_X3.4-1968)
diff -u liboil-0.3.9/debian/changelog liboil-0.3.9/debian/changelog
--- liboil-0.3.9/debian/changelog
+++ liboil-0.3.9/debian/changelog
@@ -1,3 +1,23 @@
+liboil (0.3.9-1.2) unstable; urgency=low
+
+  * Non-maintainer upload, second attempt.
+  * Rework patch by Christian Aichinger to not call the wrapper on amd64.
+    Thanks to Goswin von Brederlow.
+
+ -- Andreas Barth <[EMAIL PROTECTED]>  Thu, 22 Jun 2006 21:53:26 +0200
+
+liboil (0.3.9-1.1) unstable; urgency=low
+
+  * Non-maintainer upload.
+  * fix possible misalignment on i386 - this change is not perfect
+    and should also contain a test suite, but is still better
+    than nothing at all. Thanks to Christian Aichinger for his
+    good work on this and the patch. Closes: #368991
+    (also keeping the patch around in the diff, so that it's
+    obvious what was changed)
+
+ -- Andreas Barth <[EMAIL PROTECTED]>  Thu, 22 Jun 2006 19:31:26 +0200
+
 liboil (0.3.9-1) unstable; urgency=low
   
   * New upstream release.
only in patch2:
unchanged:
--- liboil-0.3.9.orig/liboil/sse/composite_sse_2pix.c
+++ liboil-0.3.9/liboil/sse/composite_sse_2pix.c
@@ -32,6 +32,42 @@
 #include <emmintrin.h>
 #include <liboil/liboilcolorspace.h>
 
+/* Work around non-aligned stack frames (which cause the intrinsics to crash)
+ * by making sure the stack frame is always aligned.
+ */
+#if defined(__i386__)
+#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...)  \
+    ret sse_name(__VA_ARGS__) __attribute__((used));                    \
+    ret sse_name ## _wrap (__VA_ARGS__) {                               \
+        OIL_SSE_WRAPPER_CALL(sse_name);                                 \
+    }                                                                  \
+    OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags);
+
+#define OIL_SSE_WRAPPER_CALL(name)                                      \
+    asm volatile(                                                       \
+            "\n\t"                                                      \
+            "subl $0x10,%%esp\n\t"                                      \
+            "andl $0xfffffff0,%%esp\n\t"                                \
+                                                                        \
+            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
+            "movdqa %%xmm0,(%%esp)\n\t"                                 \
+                                                                        \
+            "call " #name "\n\t"                                        \
+            "movl %%ebp,%%esp\n\t"                                      \
+            : :                                                         \
+            : "eax","ecx","edx","xmm0")
+
+#elif defined(__amd64__)
+
+/* No wrapper is generated on amd64: register the SSE function directly. */
+
+#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...)  \
+    OIL_DEFINE_IMPL_FULL(sse_name, name, flags);
+
+#else
+#error Can't use sse on !i386 and !amd64
+#endif
+
 /* non-SSE2 compositing support */
 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
 #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
@@ -41,20 +77,12 @@
  * the channel value in the low byte.  This means 2 pixels per pass.
  */
 
-union m128_int {
-  __m128i m128;
-  uint64_t ull[2];
-};
-
-static const struct _SSEData {
-  union m128_int sse_8x00ff;
-  union m128_int sse_8x0080;
-} c = {
-    .sse_8x00ff.ull =  {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
-    .sse_8x0080.ull =  {0x0080008000800080ULL, 0x0080008000800080ULL},
-};
+static const __m128i c_sse_8x00ff = 
+        {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL};
+static const __m128i c_sse_8x0080 = 
+        {0x0080008000800080ULL, 0x0080008000800080ULL};
 
-#define MC(x) (c.sse_##x.m128)
+#define MC(x) (c_sse_##x)
 
 /* Shuffles the given value such that the alpha for each pixel appears in each
  * channel of the pixel.
@@ -188,8 +216,12 @@
        COMPOSITE_IN(oil_argb_B(*src), m));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse_2pix,
-    composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
+
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse_2pix,
+        composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2,
+       static void, 
+       uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
+
 
 static void
 composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
@@ -216,8 +248,10 @@
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse_2pix,
-    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse_2pix,
+    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2,
+    static void, 
+    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
 
 static void
 composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
@@ -272,8 +306,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse_2pix,
-    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
+
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse_2pix,
+    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2,
+    static void, 
+    uint32_t *dest, const uint32_t *src, int n);
 
 static void
 composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
@@ -309,8 +346,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_sse_2pix, composite_in_over_argb,
-    OIL_IMPL_FLAG_SSE2);
+
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse_2pix,
+    composite_in_over_argb, OIL_IMPL_FLAG_SSE2,
+    static void, 
+    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
 
 static void
 composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
@@ -348,8 +388,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse_2pix,
-    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
+
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse_2pix,
+    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2,
+    static void, 
+    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
 
 static void
 composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
@@ -387,8 +430,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse_2pix,
-    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
+
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse_2pix,
+    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2,
+    static void, 
+    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
 
 static void
 composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
only in patch2:
unchanged:
--- liboil-0.3.9.orig/liboil/sse/composite_sse_4pix.c
+++ liboil-0.3.9/liboil/sse/composite_sse_4pix.c
@@ -32,20 +32,49 @@
 #include <emmintrin.h>
 #include <liboil/liboilcolorspace.h>
 
-union m128_int {
-  __m128i m128;
-  uint64_t ull[2];
-};
-
-static const struct _SSEData {
-  union m128_int sse_16xff;
-  union m128_int sse_8x0080;
-} c = {
-    .sse_16xff.ull =   {0xffffffffffffffffULL, 0xffffffffffffffffULL},
-    .sse_8x0080.ull =  {0x0080008000800080ULL, 0x0080008000800080ULL},
-};
+/* Work around non-aligned stack frames (which cause the intrinsics to crash)
+ * by making sure the stack frame is always aligned.
+ */
+#if defined(__i386__)
+#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...)  \
+    ret sse_name(__VA_ARGS__) __attribute__((used));                    \
+    ret sse_name ## _wrap (__VA_ARGS__) {                               \
+        OIL_SSE_WRAPPER_CALL(sse_name);                                 \
+    }                                                                  \
+    OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags);
+
+#define OIL_SSE_WRAPPER_CALL(name)                                      \
+    asm volatile(                                                       \
+            "\n\t"                                                      \
+            "subl $0x10,%%esp\n\t"                                      \
+            "andl $0xfffffff0,%%esp\n\t"                                \
+                                                                        \
+            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
+            "movdqa %%xmm0,(%%esp)\n\t"                                 \
+                                                                        \
+            "call " #name "\n\t"                                        \
+            "movl %%ebp,%%esp\n\t"                                      \
+            : :                                                         \
+            : "eax","ecx","edx","xmm0")
+
+#elif defined(__amd64__)
+
+/* No wrapper is generated on amd64: register the SSE function directly. */
+
+#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...)  \
+    OIL_DEFINE_IMPL_FULL(sse_name, name, flags);
+
+#else
+#error Can't use sse on !i386 and !amd64
+#endif
 
-#define MC(x) (c.sse_##x.m128)
+
+static const __m128i c_sse_16xff =
+        {0xffffffffffffffffULL, 0xffffffffffffffffULL};
+static const __m128i c_sse_8x0080 =
+        {0x0080008000800080ULL, 0x0080008000800080ULL};
+
+#define MC(x) (c_sse_##x)
 
 /* non-SSE2 compositing support */
 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
@@ -193,8 +222,11 @@
        COMPOSITE_IN(oil_argb_B(s), m));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_sse, composite_in_argb,
-    OIL_IMPL_FLAG_SSE2);
+
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb,
+    OIL_IMPL_FLAG_SSE2,
+    static void,
+    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
 
 static void
 composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
@@ -230,8 +262,11 @@
        COMPOSITE_IN(oil_argb_B(*src), m));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse,
-    composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
+
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse,
+      composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2,
+      static void,
+      uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
 
 static void
 composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
@@ -267,8 +302,10 @@
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse,
-    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse,
+    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2,
+    static void, 
+    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
 
 static void
 composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n)
@@ -339,8 +376,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse,
-    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
+
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse,
+    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2,
+    static void, 
+    uint32_t *dest, const uint32_t *src, int n);
 
 static void
 composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src,
@@ -447,8 +487,10 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse,
-    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse,
+    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2,
+    static void, 
+    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
 
 static void
 composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
@@ -502,8 +544,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse,
-    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
+
+OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse,
+    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2,
+    static void, 
+    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
 
 static void
 composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n)
only in patch2:
unchanged:
--- liboil-0.3.9.orig/liboil/sse/sad8x8_sse.c
+++ liboil-0.3.9/liboil/sse/sad8x8_sse.c
@@ -31,6 +31,44 @@
 #include <liboil/liboilfunction.h>
 #include <emmintrin.h>
 
+/* Work around non-aligned stack frames (which cause the intrinsics to crash)
+ * by making sure the stack frame is always aligned.
+ */
+#if defined(__i386__)
+#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...)  \
+    ret sse_name(__VA_ARGS__) __attribute__((used));                    \
+    ret sse_name ## _wrap (__VA_ARGS__) {                               \
+        OIL_SSE_WRAPPER_CALL(sse_name);                                 \
+    }                                                                  \
+    OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags);
+
+#define OIL_SSE_WRAPPER_CALL(name)                                      \
+    asm volatile(                                                       \
+            "\n\t"                                                      \
+            "subl $0x18,%%esp\n\t"                                      \
+            "andl $0xfffffff0,%%esp\n\t"                                \
+                                                                        \
+            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
+            "movdqa %%xmm0,(%%esp)\n\t"                                 \
+            "movl 0x18(%%ebp), %%ecx\n\t"                               \
+            "movl %%ecx, 0x10(%%esp)\n\t"                               \
+                                                                        \
+            "call " #name "\n\t"                                        \
+            "movl %%ebp,%%esp\n\t"                                      \
+            : :                                                         \
+            : "eax","ecx","edx","xmm0")
+
+#elif defined(__amd64__)
+
+/* No wrapper is generated on amd64: register the SSE function directly. */
+
+#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...)  \
+    OIL_DEFINE_IMPL_FULL(sse_name, name, flags);
+
+#else
+#error Can't use sse on !i386 and !amd64
+#endif
+
 union m128_int {
   __m128i m128;
   uint32_t i[4];
@@ -42,7 +80,7 @@
     int sstr2)
 {
   int i;
-  __m128i sum = _mm_setzero_si128();
+  __m128i sum __attribute__ ((aligned (16))) = _mm_setzero_si128();
   union m128_int sumi;
 
   for (i = 0; i < 4; i++) {
@@ -60,4 +98,8 @@
   sumi.m128 = sum;
   *dest = sumi.i[0] + sumi.i[2];
 }
-OIL_DEFINE_IMPL_FULL (sad8x8_u8_sse, sad8x8_u8, OIL_IMPL_FLAG_SSE2);
+
+OIL_DEFINE_IMPL_FULL_WRAPPER (sad8x8_u8_sse, sad8x8_u8,
+    OIL_IMPL_FLAG_SSE2, static void, 
+    uint32_t *dest,
+    uint8_t *src1, int sstr1, uint8_t *src2, int sstr2);
only in patch2:
unchanged:
--- liboil-0.3.9.orig/liboil-368991-sse-segv-fix.5.diff
+++ liboil-0.3.9/liboil-368991-sse-segv-fix.5.diff
@@ -0,0 +1,374 @@
+only in patch2:
+unchanged:
+--- liboil-0.3.9.orig/liboil/sse/composite_sse_2pix.c
++++ liboil-0.3.9/liboil/sse/composite_sse_2pix.c
+@@ -32,6 +32,42 @@
+ #include <emmintrin.h>
+ #include <liboil/liboilcolorspace.h>
+ 
++/* Work around non-aligned stack frames (which causes the intristics to crash
++ * by making sure the stack frame is always aligned
++ */
++#if defined(__i386__)
++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \
++    ret sse_name(__VA_ARGS__) __attribute__((used));                    \
++    ret sse_name ## _wrap (__VA_ARGS__) {                               \
++        OIL_SSE_WRAPPER_CALL(sse_name);                                 \
++    }                                                                 \
++    OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags);
++
++#define OIL_SSE_WRAPPER_CALL(name)                                      \
++    asm volatile(                                                       \
++            "\n\t"                                                      \
++            "subl $0x10,%%esp\n\t"                                      \
++            "andl $0xfffffff0,%%esp\n\t"                                \
++                                                                        \
++            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
++            "movdqa %%xmm0,(%%esp)\n\t"                                 \
++                                                                        \
++            "call " #name "\n\t"                                        \
++            "movl %%ebp,%%esp\n\t"                                      \
++            : :                                                         \
++            : "eax","ecx","edx","xmm0")
++
++#elif defined(__amd64__)
++
++/* Needed because we call *_wrap. Should get optimized away anyway */ 
++
++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \
++    OIL_DEFINE_IMPL_FULL(sse_name, name, flags);
++
++#else
++#error Can't use sse on !i386 and !amd64
++#endif
++
+ /* non-SSE2 compositing support */
+ #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
+ #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
+@@ -41,20 +77,12 @@
+  * the channel value in the low byte.  This means 2 pixels per pass.
+  */
+ 
+-union m128_int {
+-  __m128i m128;
+-  uint64_t ull[2];
+-};
+-
+-static const struct _SSEData {
+-  union m128_int sse_8x00ff;
+-  union m128_int sse_8x0080;
+-} c = {
+-    .sse_8x00ff.ull = {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
+-    .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL},
+-};
++static const __m128i c_sse_8x00ff = 
++        {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL};
++static const __m128i c_sse_8x0080 = 
++        {0x0080008000800080ULL, 0x0080008000800080ULL};
+ 
+-#define MC(x) (c.sse_##x.m128)
++#define MC(x) (c_sse_##x)
+ 
+ /* Shuffles the given value such that the alpha for each pixel appears in each
+  * channel of the pixel.
+@@ -188,8 +216,12 @@
+       COMPOSITE_IN(oil_argb_B(*src), m));
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse_2pix,
+-    composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
++
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse_2pix,
++        composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2,
++      static void, 
++      uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
++
+ 
+ static void
+ composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
+@@ -216,8 +248,10 @@
+       COMPOSITE_IN(oil_argb_B(s), mask[0]));
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse_2pix,
+-    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse_2pix,
++    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2,
++    static void, 
++    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
+ 
+ static void
+ composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
+@@ -272,8 +306,11 @@
+     *dest++ = d;
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse_2pix,
+-    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
++
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse_2pix,
++    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2,
++    static void, 
++    uint32_t *dest, const uint32_t *src, int n);
+ 
+ static void
+ composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
+@@ -309,8 +346,11 @@
+     *dest++ = d;
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_sse_2pix, composite_in_over_argb,
+-    OIL_IMPL_FLAG_SSE2);
++
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse_2pix,
++    composite_in_over_argb, OIL_IMPL_FLAG_SSE2,
++    static void, 
++    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
+ 
+ static void
+ composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
+@@ -348,8 +388,11 @@
+     *dest++ = d;
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse_2pix,
+-    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
++
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse_2pix,
++    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2,
++    static void, 
++    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
+ 
+ static void
+ composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
+@@ -387,8 +430,11 @@
+     *dest++ = d;
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse_2pix,
+-    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
++
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse_2pix,
++    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2,
++    static void, 
++    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
+ 
+ static void
+ composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
+only in patch2:
+unchanged:
+--- liboil-0.3.9.orig/liboil/sse/composite_sse_4pix.c
++++ liboil-0.3.9/liboil/sse/composite_sse_4pix.c
+@@ -32,20 +32,49 @@
+ #include <emmintrin.h>
+ #include <liboil/liboilcolorspace.h>
+ 
+-union m128_int {
+-  __m128i m128;
+-  uint64_t ull[2];
+-};
+-
+-static const struct _SSEData {
+-  union m128_int sse_16xff;
+-  union m128_int sse_8x0080;
+-} c = {
+-    .sse_16xff.ull =  {0xffffffffffffffffULL, 0xffffffffffffffffULL},
+-    .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL},
+-};
++/* Work around non-aligned stack frames (which causes the intristics to crash
++ * by making sure the stack frame is always aligned
++ */
++#if defined(__i386__)
++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \
++    ret sse_name(__VA_ARGS__) __attribute__((used));                    \
++    ret sse_name ## _wrap (__VA_ARGS__) {                               \
++        OIL_SSE_WRAPPER_CALL(sse_name);                                 \
++    }                                                                 \
++    OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags);
++
++#define OIL_SSE_WRAPPER_CALL(name)                                      \
++    asm volatile(                                                       \
++            "\n\t"                                                      \
++            "subl $0x10,%%esp\n\t"                                      \
++            "andl $0xfffffff0,%%esp\n\t"                                \
++                                                                        \
++            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
++            "movdqa %%xmm0,(%%esp)\n\t"                                 \
++                                                                        \
++            "call " #name "\n\t"                                        \
++            "movl %%ebp,%%esp\n\t"                                      \
++            : :                                                         \
++            : "eax","ecx","edx","xmm0")
++
++#elif defined(__amd64__)
++
++/* Needed because we call *_wrap. Should get optimized away anyway */ 
++
++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \
++    OIL_DEFINE_IMPL_FULL(sse_name, name, flags);
++
++#else
++#error Can't use sse on !i386 and !amd64
++#endif
+ 
+-#define MC(x) (c.sse_##x.m128)
++
++static const __m128i c_sse_16xff =
++        {0xffffffffffffffffULL, 0xffffffffffffffffULL};
++static const __m128i c_sse_8x0080 =
++        {0x0080008000800080ULL, 0x0080008000800080ULL};
++
++#define MC(x) (c_sse_##x)
+ 
+ /* non-SSE2 compositing support */
+ #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
+@@ -193,8 +222,11 @@
+       COMPOSITE_IN(oil_argb_B(s), m));
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_in_argb_sse, composite_in_argb,
+-    OIL_IMPL_FLAG_SSE2);
++
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb,
++    OIL_IMPL_FLAG_SSE2,
++    static void,
++    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
+ 
+ static void
+ composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
+@@ -230,8 +262,11 @@
+       COMPOSITE_IN(oil_argb_B(*src), m));
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse,
+-    composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
++
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse,
++      composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2,
++      static void,
++      uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
+ 
+ static void
+ composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
+@@ -267,8 +302,10 @@
+       COMPOSITE_IN(oil_argb_B(s), mask[0]));
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse,
+-    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse,
++    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2,
++    static void, 
++    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
+ 
+ static void
+ composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n)
+@@ -339,8 +376,11 @@
+     *dest++ = d;
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse,
+-    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
++
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse,
++    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2,
++    static void, 
++    uint32_t *dest, const uint32_t *src, int n);
+ 
+ static void
+ composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src,
+@@ -447,8 +487,10 @@
+     *dest++ = d;
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse,
+-    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse,
++    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2,
++    static void, 
++    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
+ 
+ static void
+ composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
+@@ -502,8 +544,11 @@
+     *dest++ = d;
+   }
+ }
+-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse,
+-    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
++
++OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse,
++    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2,
++    static void, 
++    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n);
+ 
+ static void
+ composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n)
+only in patch2:
+unchanged:
+--- liboil-0.3.9.orig/liboil/sse/sad8x8_sse.c
++++ liboil-0.3.9/liboil/sse/sad8x8_sse.c
+@@ -31,6 +31,44 @@
+ #include <liboil/liboilfunction.h>
+ #include <emmintrin.h>
+ 
++/* Work around non-aligned stack frames (which causes the intristics to crash
++ * by making sure the stack frame is always aligned
++ */
++#if defined(__i386__)
++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \
++    ret sse_name(__VA_ARGS__) __attribute__((used));                    \
++    ret sse_name ## _wrap (__VA_ARGS__) {                               \
++        OIL_SSE_WRAPPER_CALL(sse_name);                                 \
++    }                                                                 \
++    OIL_DEFINE_IMPL_FULL(sse_name ## _wrap, name, flags);
++
++#define OIL_SSE_WRAPPER_CALL(name)                                      \
++    asm volatile(                                                       \
++            "\n\t"                                                      \
++            "subl $0x18,%%esp\n\t"                                      \
++            "andl $0xfffffff0,%%esp\n\t"                                \
++                                                                        \
++            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
++            "movdqa %%xmm0,(%%esp)\n\t"                                 \
++            "movl 0x18(%%ebp), %%ecx\n\t"                               \
++            "movl %%ecx, 0x10(%%esp)\n\t"                               \
++                                                                        \
++            "call " #name "\n\t"                                        \
++            "movl %%ebp,%%esp\n\t"                                      \
++            : :                                                         \
++            : "eax","ecx","edx","xmm0")
++
++#elif defined(__amd64__)
++
++/* Needed because we call *_wrap. Should get optimized away anyway */ 
++
++#define OIL_DEFINE_IMPL_FULL_WRAPPER(sse_name, name, flags, ret, ...) \
++    OIL_DEFINE_IMPL_FULL(sse_name, name, flags);
++
++#else
++#error Can't use sse on !i386 and !amd64
++#endif
++
+ union m128_int {
+   __m128i m128;
+   uint32_t i[4];
+@@ -42,7 +78,7 @@
+     int sstr2)
+ {
+   int i;
+-  __m128i sum = _mm_setzero_si128();
++  __m128i sum __attribute__ ((aligned (16))) = _mm_setzero_si128();
+   union m128_int sumi;
+ 
+   for (i = 0; i < 4; i++) {
+@@ -60,4 +98,8 @@
+   sumi.m128 = sum;
+   *dest = sumi.i[0] + sumi.i[2];
+ }
+-OIL_DEFINE_IMPL_FULL (sad8x8_u8_sse, sad8x8_u8, OIL_IMPL_FLAG_SSE2);
++
++OIL_DEFINE_IMPL_FULL_WRAPPER (sad8x8_u8_sse, sad8x8_u8,
++    OIL_IMPL_FLAG_SSE2, static void, 
++    uint32_t *dest,
++    uint8_t *src1, int sstr1, uint8_t *src2, int sstr2);

Reply via email to