It seems liboil works fine with banshee on amd64, probably because
16 bytes is the minimum stack alignment there. So no fix is needed,
and copying stuff around on the stack only takes extra time.

So I've updated my patch to only adjust the stack on i386. It feels
suboptimal, since it needs that extra function, but I haven't found
a way to get rid of the _wrap functions by a preprocessor macro.

Anyway, it's better than otherwise.

Cheers,
Christian Aichinger
--- liboil-0.3.9.orig/liboil/sse/composite_sse_2pix.c   2005-12-21 02:27:54.000000000 +0100
+++ liboil-0.3.9/liboil/sse/composite_sse_2pix.c        2006-06-20 19:10:33.000000000 +0200
@@ -32,6 +32,42 @@
 #include <emmintrin.h>
 #include <liboil/liboilcolorspace.h>
 
+/* Work around non-aligned stack frames (which cause the intrinsics to crash)
+ * by making sure the stack frame is always aligned.
+ */
+#if defined(__i386__)
+#define OIL_SSE_WRAPPER(name,ret, ...)                                  \
+    ret name(__VA_ARGS__) __attribute__((used));                        \
+    ret name ## _wrap (__VA_ARGS__) {                                   \
+        OIL_SSE_WRAPPER_CALL(name);                                     \
+    }
+
+#define OIL_SSE_WRAPPER_CALL(name)                                      \
+    asm volatile(                                                       \
+            "\n\t"                                                      \
+            "subl $0x10,%%esp\n\t"                                      \
+            "andl $0xfffffff0,%%esp\n\t"                                \
+                                                                        \
+            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
+            "movdqa %%xmm0,(%%esp)\n\t"                                 \
+                                                                        \
+            "call " #name "\n\t"                                        \
+            "movl %%ebp,%%esp\n\t"                                      \
+            : :                                                         \
+            : "eax","ecx","edx","xmm0")
+
+#elif defined(__amd64__)
+
+/* Needed because we call *_wrap. Should get optimized away anyway */ 
+#define OIL_SSE_WRAPPER(name,ret, ...)                                  \
+    ret name ## _wrap (__VA_ARGS__) {                                   \
+        name(__VA_ARGS__);                                              \
+    }
+
+#else
+#error Can't use sse on !i386 and !amd64
+#endif
+
 /* non-SSE2 compositing support */
 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
 #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
@@ -41,20 +77,12 @@
  * the channel value in the low byte.  This means 2 pixels per pass.
  */
 
-union m128_int {
-  __m128i m128;
-  uint64_t ull[2];
-};
-
-static const struct _SSEData {
-  union m128_int sse_8x00ff;
-  union m128_int sse_8x0080;
-} c = {
-    .sse_8x00ff.ull =  {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
-    .sse_8x0080.ull =  {0x0080008000800080ULL, 0x0080008000800080ULL},
-};
+static const __m128i c_sse_8x00ff = 
+        {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL};
+static const __m128i c_sse_8x0080 = 
+        {0x0080008000800080ULL, 0x0080008000800080ULL};
 
-#define MC(x) (c.sse_##x.m128)
+#define MC(x) (c_sse_##x)
 
 /* Shuffles the given value such that the alpha for each pixel appears in each
  * channel of the pixel.
@@ -188,7 +216,11 @@
        COMPOSITE_IN(oil_argb_B(*src), m));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse_2pix,
+
+OIL_SSE_WRAPPER(composite_in_argb_const_src_sse_2pix, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse_2pix_wrap,
     composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -216,7 +248,10 @@
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse_2pix,
+OIL_SSE_WRAPPER(composite_in_argb_const_mask_sse_2pix, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse_2pix_wrap,
     composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -272,7 +307,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse_2pix,
+
+OIL_SSE_WRAPPER(composite_over_argb_const_src_sse_2pix, static void, 
+        uint32_t *dest, const uint32_t *src, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse_2pix_wrap,
     composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -309,8 +348,12 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_sse_2pix, composite_in_over_argb,
-    OIL_IMPL_FLAG_SSE2);
+
+OIL_SSE_WRAPPER(composite_in_over_argb_sse_2pix , static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_over_argb_sse_2pix_wrap, 
+        composite_in_over_argb, OIL_IMPL_FLAG_SSE2);
 
 static void
 composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
@@ -348,7 +391,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse_2pix,
+
+OIL_SSE_WRAPPER(composite_in_over_argb_const_src_sse_2pix , static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse_2pix_wrap,
     composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -387,7 +434,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse_2pix,
+
+OIL_SSE_WRAPPER(composite_in_over_argb_const_mask_sse_2pix, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse_2pix_wrap,
     composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
 static void
--- liboil-0.3.9.orig/liboil/sse/composite_sse_4pix.c   2005-12-21 02:27:54.000000000 +0100
+++ liboil-0.3.9/liboil/sse/composite_sse_4pix.c        2006-06-20 19:10:34.000000000 +0200
@@ -32,20 +32,49 @@
 #include <emmintrin.h>
 #include <liboil/liboilcolorspace.h>
 
-union m128_int {
-  __m128i m128;
-  uint64_t ull[2];
-};
-
-static const struct _SSEData {
-  union m128_int sse_16xff;
-  union m128_int sse_8x0080;
-} c = {
-    .sse_16xff.ull =   {0xffffffffffffffffULL, 0xffffffffffffffffULL},
-    .sse_8x0080.ull =  {0x0080008000800080ULL, 0x0080008000800080ULL},
-};
+/* Work around non-aligned stack frames (which cause the intrinsics to crash)
+ * by making sure the stack frame is always aligned.
+ */
+#if defined(__i386__)
+#define OIL_SSE_WRAPPER(name,ret, ...)                                  \
+    ret name(__VA_ARGS__) __attribute__((used));                        \
+    ret name ## _wrap (__VA_ARGS__) {                                   \
+        OIL_SSE_WRAPPER_CALL(name);                                     \
+    }
+
+#define OIL_SSE_WRAPPER_CALL(name)                                      \
+    asm volatile(                                                       \
+            "\n\t"                                                      \
+            "subl $0x10,%%esp\n\t"                                      \
+            "andl $0xfffffff0,%%esp\n\t"                                \
+                                                                        \
+            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
+            "movdqa %%xmm0,(%%esp)\n\t"                                 \
+                                                                        \
+            "call " #name "\n\t"                                        \
+            "movl %%ebp,%%esp\n\t"                                      \
+            : :                                                         \
+            : "eax","ecx","edx","xmm0")
+
+#elif defined(__amd64__)
+
+/* Needed because we call *_wrap. Should get optimized away anyway */ 
+#define OIL_SSE_WRAPPER(name,ret, ...)                                  \
+    ret name ## _wrap (__VA_ARGS__) {                                   \
+        name(__VA_ARGS__);                                              \
+    }
+
+#else
+#error Can't use sse on !i386 and !amd64
+#endif
+
 
-#define MC(x) (c.sse_##x.m128)
+static const __m128i c_sse_16xff =
+        {0xffffffffffffffffULL, 0xffffffffffffffffULL};
+static const __m128i c_sse_8x0080 =
+        {0x0080008000800080ULL, 0x0080008000800080ULL};
+
+#define MC(x) (c_sse_##x)
 
 /* non-SSE2 compositing support */
 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
@@ -193,7 +222,11 @@
        COMPOSITE_IN(oil_argb_B(s), m));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_sse, composite_in_argb,
+
+OIL_SSE_WRAPPER(composite_in_argb_sse, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_argb_sse_wrap, composite_in_argb,
     OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -230,7 +263,11 @@
        COMPOSITE_IN(oil_argb_B(*src), m));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse,
+
+OIL_SSE_WRAPPER(composite_in_argb_const_src_sse , static void, 
+    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse_wrap,
     composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -267,7 +304,10 @@
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse,
+OIL_SSE_WRAPPER(composite_in_argb_const_mask_sse, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse_wrap,
     composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -339,7 +379,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse,
+
+OIL_SSE_WRAPPER(composite_over_argb_const_src_sse, static void, 
+    uint32_t *dest, const uint32_t *src, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse_wrap,
     composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -447,9 +491,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse,
-    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
+OIL_SSE_WRAPPER(composite_in_over_argb_const_src_sse , static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
 
+OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse_wrap,
+    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 static void
 composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
@@ -502,7 +548,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse,
+
+OIL_SSE_WRAPPER(composite_in_over_argb_const_mask_sse, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse_wrap,
     composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
 static void
--- liboil-0.3.9.orig/liboil/sse/sad8x8_sse.c   2005-12-23 22:46:25.000000000 +0100
+++ liboil-0.3.9/liboil/sse/sad8x8_sse.c        2006-06-20 19:10:32.000000000 +0200
@@ -31,6 +31,44 @@
 #include <liboil/liboilfunction.h>
 #include <emmintrin.h>
 
+/* Work around non-aligned stack frames (which cause the intrinsics to crash)
+ * by making sure the stack frame is always aligned.
+ */
+#if defined(__i386__)
+#define OIL_SSE_WRAPPER(name,ret, ...)                                  \
+    ret name(__VA_ARGS__) __attribute__((used));                        \
+    ret name ## _wrap (__VA_ARGS__) {                                   \
+        OIL_SSE_WRAPPER_CALL(name);                                     \
+    }
+
+#define OIL_SSE_WRAPPER_CALL(name)                                      \
+    asm volatile(                                                       \
+            "\n\t"                                                      \
+            "subl $0x18,%%esp\n\t"                                      \
+            "andl $0xfffffff0,%%esp\n\t"                                \
+                                                                        \
+            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
+            "movdqa %%xmm0,(%%esp)\n\t"                                 \
+            "movl 0x18(%%ebp), %%ecx\n\t"                               \
+            "movl %%ecx, 0x10(%%esp)\n\t"                               \
+                                                                        \
+            "call " #name "\n\t"                                        \
+            "movl %%ebp,%%esp\n\t"                                      \
+            : :                                                         \
+            : "eax","ecx","edx","xmm0")
+
+#elif defined(__amd64__)
+
+/* Needed because we call *_wrap. Should get optimized away anyway */ 
+#define OIL_SSE_WRAPPER(name,ret, ...)                                  \
+    ret name ## _wrap (__VA_ARGS__) {                                   \
+        name(__VA_ARGS__);                                              \
+    }
+
+#else
+#error Can't use sse on !i386 and !amd64
+#endif
+
 union m128_int {
   __m128i m128;
   uint32_t i[4];
@@ -42,7 +80,7 @@
     int sstr2)
 {
   int i;
-  __m128i sum = _mm_setzero_si128();
+  __m128i sum __attribute__ ((aligned (16))) = _mm_setzero_si128();
   union m128_int sumi;
 
   for (i = 0; i < 4; i++) {
@@ -60,4 +98,7 @@
   sumi.m128 = sum;
   *dest = sumi.i[0] + sumi.i[2];
 }
-OIL_DEFINE_IMPL_FULL (sad8x8_u8_sse, sad8x8_u8, OIL_IMPL_FLAG_SSE2);
+
+OIL_SSE_WRAPPER(sad8x8_u8_sse, static void, 
+        uint32_t *dest, uint8_t *src1, int sstr1, uint8_t *src2, int sstr2)
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_sse_wrap, sad8x8_u8, OIL_IMPL_FLAG_SSE2);

Attachment: signature.asc
Description: Digital signature

Reply via email to