From 45de7c94fb76820b4a96def73cf6816c3eabcea2 Mon Sep 17 00:00:00 2001
From: mancoast <RobertPancoast77@gmail.com>
Date: Fri, 22 Jan 2016 13:50:55 -0500
Subject: [PATCH] k1om libffi

---
 Modules/_ctypes/libffi/src/raw_api.c    |  16 +--
 Modules/_ctypes/libffi/src/x86/ffi64.c  | 129 +++++++++--------------
 Modules/_ctypes/libffi/src/x86/unix64.S | 180 +++++++++++++++++++++++---------
 3 files changed, 187 insertions(+), 138 deletions(-)

diff --git a/Modules/_ctypes/libffi/src/raw_api.c b/Modules/_ctypes/libffi/src/raw_api.c
index ce21372..4c6af5f 100644
--- a/Modules/_ctypes/libffi/src/raw_api.c
+++ b/Modules/_ctypes/libffi/src/raw_api.c
@@ -29,7 +29,7 @@
 #include <ffi.h>
 #include <ffi_common.h>
 
-#if !FFI_NO_RAW_API
+//#if !FFI_NO_RAW_API
 
 size_t
 ffi_raw_size (ffi_cif *cif)
@@ -178,7 +178,7 @@ ffi_ptrarray_to_raw (ffi_cif *cif, void **args, ffi_raw *raw)
     }
 }
 
-#if !FFI_NATIVE_RAW_API
+//#if !FFI_NATIVE_RAW_API
 
 
 /* This is a generic definition of ffi_raw_call, to be used if the
@@ -195,7 +195,7 @@ void ffi_raw_call (ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *raw)
   ffi_call (cif, fn, rvalue, avalue);
 }
 
-#if FFI_CLOSURES		/* base system provides closures */
+//#if FFI_CLOSURES		/* base system provides closures */
 
 static void
 ffi_translate_args (ffi_cif *cif, void *rvalue,
@@ -231,10 +231,10 @@ ffi_prep_raw_closure_loc (ffi_raw_closure* cl,
   return status;
 }
 
-#endif /* FFI_CLOSURES */
-#endif /* !FFI_NATIVE_RAW_API */
+//#endif /* FFI_CLOSURES */
+//#endif /* !FFI_NATIVE_RAW_API */
 
-#if FFI_CLOSURES
+//#if FFI_CLOSURES
 
 /* Again, here is the generic version of ffi_prep_raw_closure, which
  * will install an intermediate "hub" for translation of arguments from
@@ -249,6 +249,6 @@ ffi_prep_raw_closure (ffi_raw_closure* cl,
   return ffi_prep_raw_closure_loc (cl, cif, fun, user_data, cl);
 }
 
-#endif /* FFI_CLOSURES */
+//#endif /* FFI_CLOSURES */
 
-#endif /* !FFI_NO_RAW_API */
+//#endif /* !FFI_NO_RAW_API */
diff --git a/Modules/_ctypes/libffi/src/x86/ffi64.c b/Modules/_ctypes/libffi/src/x86/ffi64.c
index 5a5e043..d106c0b 100644
--- a/Modules/_ctypes/libffi/src/x86/ffi64.c
+++ b/Modules/_ctypes/libffi/src/x86/ffi64.c
@@ -1,10 +1,8 @@
 /* -----------------------------------------------------------------------
-   ffi64.c - Copyright (c) 2013  The Written Word, Inc.
-             Copyright (c) 2011  Anthony Green
-             Copyright (c) 2008, 2010  Red Hat, Inc.
-             Copyright (c) 2002, 2007  Bo Thorsen <bo@suse.de>
-
-   x86-64 Foreign Function Interface
+   ffi64.c - Copyright (c) 2002, 2007  Bo Thorsen <bo@suse.de>
+             Copyright (c) 2008  Red Hat, Inc.
+   
+   x86-64 Foreign Function Interface 
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -38,30 +36,13 @@
 #define MAX_GPR_REGS 6
 #define MAX_SSE_REGS 8
 
-#if defined(__INTEL_COMPILER)
-#include "xmmintrin.h"
-#define UINT128 __m128
-#else
-#if defined(__SUNPRO_C)
-#include <sunmedia_types.h>
-#define UINT128 __m128i
-#else
-#define UINT128 __int128_t
-#endif
-#endif
-
-union big_int_union
-{
-  UINT32 i32;
-  UINT64 i64;
-  UINT128 i128;
-};
+typedef struct { int64_t m[8]; } __int512_t;
 
 struct register_args
 {
   /* Registers for argument passing.  */
   UINT64 gpr[MAX_GPR_REGS];
-  union big_int_union sse[MAX_SSE_REGS];
+  __int512_t sse[MAX_SSE_REGS];
 };
 
 extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
@@ -152,7 +133,7 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
 
    See the x86-64 PS ABI for details.
 */
-static size_t
+static int
 classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 		   size_t byte_offset)
 {
@@ -168,7 +149,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
     case FFI_TYPE_SINT64:
     case FFI_TYPE_POINTER:
       {
-	size_t size = byte_offset + type->size;
+	int size = byte_offset + type->size;
 
 	if (size <= 4)
 	  {
@@ -203,17 +184,15 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
     case FFI_TYPE_DOUBLE:
       classes[0] = X86_64_SSEDF_CLASS;
       return 1;
-#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
     case FFI_TYPE_LONGDOUBLE:
       classes[0] = X86_64_X87_CLASS;
       classes[1] = X86_64_X87UP_CLASS;
       return 2;
-#endif
     case FFI_TYPE_STRUCT:
       {
-	const size_t UNITS_PER_WORD = 8;
-	size_t words = (type->size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
-	ffi_type **ptr;
+	const int UNITS_PER_WORD = 8;
+	int words = (type->size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
+	ffi_type **ptr; 
 	int i;
 	enum x86_64_reg_class subclasses[MAX_CLASSES];
 
@@ -235,7 +214,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 	/* Merge the fields of structure.  */
 	for (ptr = type->elements; *ptr != NULL; ptr++)
 	  {
-	    size_t num;
+	    int num;
 
 	    byte_offset = ALIGN (byte_offset, (*ptr)->alignment);
 
@@ -244,7 +223,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 	      return 0;
 	    for (i = 0; i < num; i++)
 	      {
-		size_t pos = byte_offset / 8;
+		int pos = byte_offset / 8;
 		classes[i + pos] =
 		  merge_classes (subclasses[i], classes[i + pos]);
 	      }
@@ -308,12 +287,11 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
    class.  Return zero iff parameter should be passed in memory, otherwise
    the number of registers.  */
 
-static size_t
+static int
 examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES],
 		  _Bool in_return, int *pngpr, int *pnsse)
 {
-  size_t n;
-  int i, ngpr, nsse;
+  int i, n, ngpr, nsse;
 
   n = classify_argument (type, classes, 0);
   if (n == 0)
@@ -354,9 +332,9 @@ examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES],
 ffi_status
 ffi_prep_cif_machdep (ffi_cif *cif)
 {
-  int gprcount, ssecount, i, avn, ngpr, nsse, flags;
+  int gprcount, ssecount, i, avn, n, ngpr, nsse, flags;
   enum x86_64_reg_class classes[MAX_CLASSES];
-  size_t bytes, n;
+  size_t bytes;
 
   gprcount = ssecount = 0;
 
@@ -402,7 +380,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 	  if (align < 8)
 	    align = 8;
 
-	  bytes = ALIGN (bytes, align);
+	  bytes = ALIGN(bytes, align);
 	  bytes += cif->arg_types[i]->size;
 	}
       else
@@ -414,7 +392,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
   if (ssecount)
     flags |= 1 << 11;
   cif->flags = flags;
-  cif->bytes = (unsigned)ALIGN (bytes, 8);
+  cif->bytes = bytes;
 
   return FFI_OK;
 }
@@ -450,14 +428,15 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
   /* If the return value is passed in memory, add the pointer as the
      first integer argument.  */
   if (ret_in_memory)
-    reg_args->gpr[gprcount++] = (unsigned long) rvalue;
+    reg_args->gpr[gprcount++] = (long) rvalue;
 
   avn = cif->nargs;
   arg_types = cif->arg_types;
 
   for (i = 0; i < avn; ++i)
     {
-      size_t n, size = arg_types[i]->size;
+      size_t size = arg_types[i]->size;
+      int n;
 
       n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
       if (n == 0
@@ -487,33 +466,32 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 		{
 		case X86_64_INTEGER_CLASS:
 		case X86_64_INTEGERSI_CLASS:
-		  /* Sign-extend integer arguments passed in general
-		     purpose registers, to cope with the fact that
-		     LLVM incorrectly assumes that this will be done
-		     (the x86-64 PS ABI does not specify this). */
-		  switch (arg_types[i]->type)
-		    {
-		    case FFI_TYPE_SINT8:
-		      *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT8 *) a);
-		      break;
-		    case FFI_TYPE_SINT16:
-		      *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT16 *) a);
-		      break;
-		    case FFI_TYPE_SINT32:
-		      *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT32 *) a);
-		      break;
-		    default:
-		      reg_args->gpr[gprcount] = 0;
-		      memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
-		    }
+		  reg_args->gpr[gprcount] = 0;
+		  memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
 		  gprcount++;
 		  break;
 		case X86_64_SSE_CLASS:
 		case X86_64_SSEDF_CLASS:
-		  reg_args->sse[ssecount++].i64 = *(UINT64 *) a;
+		  reg_args->sse[ssecount].m[0] = *(UINT64 *) a;
+		  reg_args->sse[ssecount].m[1] = 0;
+		  reg_args->sse[ssecount].m[2] = 0;	
+		  reg_args->sse[ssecount].m[3] = 0;	
+		  reg_args->sse[ssecount].m[4] = 0;	
+		  reg_args->sse[ssecount].m[5] = 0;	
+		  reg_args->sse[ssecount].m[6] = 0;	
+		  reg_args->sse[ssecount].m[7] = 0;	
+		  ssecount++;
 		  break;
 		case X86_64_SSESF_CLASS:
-		  reg_args->sse[ssecount++].i32 = *(UINT32 *) a;
+		  reg_args->sse[ssecount].m[0] = *(UINT32 *) a;
+		  reg_args->sse[ssecount].m[1] = 0;
+		  reg_args->sse[ssecount].m[2] = 0;	
+		  reg_args->sse[ssecount].m[3] = 0;	
+		  reg_args->sse[ssecount].m[4] = 0;	
+		  reg_args->sse[ssecount].m[5] = 0;	
+		  reg_args->sse[ssecount].m[6] = 0;	
+		  reg_args->sse[ssecount].m[7] = 0;	
+		  ssecount++;
 		  break;
 		default:
 		  abort();
@@ -538,21 +516,12 @@ ffi_prep_closure_loc (ffi_closure* closure,
 {
   volatile unsigned short *tramp;
 
-  /* Sanity check on the cif ABI.  */
-  {
-    int abi = cif->abi;
-    if (UNLIKELY (! (abi > FFI_FIRST_ABI && abi < FFI_LAST_ABI)))
-      return FFI_BAD_ABI;
-  }
-
   tramp = (volatile unsigned short *) &closure->tramp[0];
 
   tramp[0] = 0xbb49;		/* mov <code>, %r11	*/
-  *((unsigned long long * volatile) &tramp[1])
-    = (unsigned long) ffi_closure_unix64;
+  *(void * volatile *) &tramp[1] = ffi_closure_unix64;
   tramp[5] = 0xba49;		/* mov <data>, %r10	*/
-  *((unsigned long long * volatile) &tramp[6])
-    = (unsigned long) codeloc;
+  *(void * volatile *) &tramp[6] = codeloc;
 
   /* Set the carry bit iff the function uses any sse registers.
      This is clc or stc, together with the first byte of the jmp.  */
@@ -586,12 +555,12 @@ ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
   if (ret != FFI_TYPE_VOID)
     {
       enum x86_64_reg_class classes[MAX_CLASSES];
-      size_t n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
+      int n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
       if (n == 0)
 	{
 	  /* The return value goes in memory.  Arrange for the closure
 	     return value to go directly back to the original caller.  */
-	  rvalue = (void *) (unsigned long) reg_args->gpr[gprcount++];
+	  rvalue = (void *) reg_args->gpr[gprcount++];
 	  /* We don't have to do anything in asm for the return.  */
 	  ret = FFI_TYPE_VOID;
 	}
@@ -609,11 +578,11 @@ ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
 
   avn = cif->nargs;
   arg_types = cif->arg_types;
-
+  
   for (i = 0; i < avn; ++i)
     {
       enum x86_64_reg_class classes[MAX_CLASSES];
-      size_t n;
+      int n;
 
       n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
       if (n == 0
@@ -652,7 +621,7 @@ ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
       /* Otherwise, allocate space to make them consecutive.  */
       else
 	{
-	  char *a = alloca (16);
+	  char *a = alloca (64);
 	  int j;
 
 	  avalue[i] = a;
diff --git a/Modules/_ctypes/libffi/src/x86/unix64.S b/Modules/_ctypes/libffi/src/x86/unix64.S
index 45a0ed7..b79fe01 100644
--- a/Modules/_ctypes/libffi/src/x86/unix64.S
+++ b/Modules/_ctypes/libffi/src/x86/unix64.S
@@ -1,7 +1,6 @@
 /* -----------------------------------------------------------------------
-   unix64.S - Copyright (c) 2013  The Written Word, Inc.
-	    - Copyright (c) 2008  Red Hat, Inc
-	    - Copyright (c) 2002  Bo Thorsen <bo@suse.de>
+   unix64.S - Copyright (c) 2002  Bo Thorsen <bo@suse.de>
+	      Copyright (c) 2008  Red Hat, Inc
 
    x86-64 Foreign Function Interface 
 
@@ -24,8 +23,17 @@
    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.
+
+
+   PORT TO THE INTEL MIC ARCHITECTURE:
+   EMILIO CASTILLO VILLAR
+   CRISTOBAL CAMARERO COTERILLO
+
+   UNIVERSITY OF CANTABRIA
+   SPAIN
    ----------------------------------------------------------------------- */
 
+
 #ifdef __x86_64__
 #define LIBFFI_ASM	
 #include <fficonfig.h>
@@ -70,7 +78,7 @@ ffi_call_unix64:
 .Lret_from_load_sse:
 
 	/* Deallocate the reg arg area.  */
-	leaq	176(%r10), %rsp
+	leaq	560(%r10), %rsp
 
 	/* Call the user function.  */
 	call	*%r11
@@ -146,11 +154,20 @@ ffi_call_unix64:
 
 	.align 2
 .Lst_float:
-	movss	%xmm0, (%rdi)
+
+	movl 	$1, %eax
+	kmov	%eax, %k1
+	vpackstorelps %zmm0, (%rdi){%k1}
+	vpackstorehps %zmm0, 64(%rdi){%k1}
+	/*movss	%xmm0, (%rdi)*/
 	ret
 	.align 2
 .Lst_double:
-	movsd	%xmm0, (%rdi)
+	movl 	$1, %eax
+	kmov	%eax, %k1
+	vpackstorelpd %zmm0, (%rdi){%k1}
+	vpackstorehpd %zmm0, 64(%rdi){%k1}
+	/*movsd	%xmm0, (%rdi)*/
 	ret
 .Lst_ldouble:
 	fstpt	(%rdi)
@@ -165,16 +182,39 @@ ffi_call_unix64:
 	   value to a 16 byte scratch area first.  Bits 8, 9, and 10
 	   control where the values are located.  Only one of the three
 	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
-	movd	%xmm0, %r10
-	movd	%xmm1, %r11
+
+
+	movq 	%rax, %r10
+	movl 	$1, %eax
+	kmov	%eax, %k1
+	movq	%r10, %rax
+
+	vpackstorelpd %zmm0, -200(%rsp){%k1}
+	vpackstorehpd %zmm0, -136(%rsp){%k1}
+	movq 	-200(%rsp), %r10
+
+
+	vpackstorelpd %zmm1, -200(%rsp){%k1}
+	vpackstorehpd %zmm1, -136(%rsp){%k1}
+	movq 	-200(%rsp), %r11
+
+	/*movd	%zmm0, %r10
+	movd	%zmm1, %r11*/
 	testl	$0x100, %ecx
-	cmovnz	%rax, %rdx
-	cmovnz	%r10, %rax
+        jz .Lst_struct_n1
+		movq	%rax, %rdx
+		movq	%r10, %rax
+.Lst_struct_n1:
+
 	testl	$0x200, %ecx
-	cmovnz	%r10, %rdx
+	jz .Lst_struct_n2
+		movq	%r10, %rdx
+.Lst_struct_n2:
 	testl	$0x400, %ecx
-	cmovnz	%r10, %rax
-	cmovnz	%r11, %rdx
+	jz .Lst_struct_n3
+		movq	%r10, %rax
+		movq	%r11, %rdx
+.Lst_struct_n3:
 	movq	%rax, (%rsi)
 	movq	%rdx, 8(%rsi)
 
@@ -190,14 +230,33 @@ ffi_call_unix64:
 	.align 2
 .LUW3:
 .Lload_sse:
-	movdqa	48(%r10), %xmm0
-	movdqa	64(%r10), %xmm1
-	movdqa	80(%r10), %xmm2
-	movdqa	96(%r10), %xmm3
-	movdqa	112(%r10), %xmm4
-	movdqa	128(%r10), %xmm5
-	movdqa	144(%r10), %xmm6
-	movdqa	160(%r10), %xmm7
+	
+	vloadunpacklq	48(%r10),  %zmm0
+	vloadunpacklq	112(%r10), %zmm1
+	vloadunpacklq	176(%r10), %zmm2
+	vloadunpacklq	240(%r10), %zmm3
+	vloadunpacklq	304(%r10), %zmm4
+	vloadunpacklq	368(%r10), %zmm5
+	vloadunpacklq	432(%r10), %zmm6
+	vloadunpacklq	496(%r10), %zmm7
+
+	vloadunpackhq	112(%r10), %zmm0
+	vloadunpackhq	176(%r10), %zmm1
+	vloadunpackhq	240(%r10), %zmm2
+	vloadunpackhq	304(%r10), %zmm3
+	vloadunpackhq	368(%r10), %zmm4
+	vloadunpackhq	432(%r10), %zmm5
+	vloadunpackhq	496(%r10), %zmm6
+	vloadunpackhq	560(%r10), %zmm7
+
+	/*vmovaps	48(%r10),  %zmm0
+	vmovaps	112(%r10), %zmm1
+	vmovaps	176(%r10), %zmm2
+	vmovaps	240(%r10), %zmm3
+	vmovaps	304(%r10), %zmm4
+	vmovaps	368(%r10), %zmm5
+	vmovaps	432(%r10), %zmm6
+	vmovaps	496(%r10), %zmm7*/
 	jmp	.Lret_from_load_sse
 
 .LUW4:
@@ -211,7 +270,7 @@ ffi_closure_unix64:
 .LUW5:
 	/* The carry flag is set by the trampoline iff SSE registers
 	   are used.  Don't clobber it before the branch instruction.  */
-	leaq    -200(%rsp), %rsp
+	leaq    -584(%rsp), %rsp
 .LUW6:
 	movq	%rdi, (%rsp)
 	movq    %rsi, 8(%rsp)
@@ -223,13 +282,13 @@ ffi_closure_unix64:
 .Lret_from_save_sse:
 
 	movq	%r10, %rdi
-	leaq	176(%rsp), %rsi
+	leaq	560(%rsp), %rsi
 	movq	%rsp, %rdx
-	leaq	208(%rsp), %rcx
+	leaq	592(%rsp), %rcx
 	call	ffi_closure_unix64_inner@PLT
 
 	/* Deallocate stack frame early; return value is now in redzone.  */
-	addq	$200, %rsp
+	addq	$584, %rsp
 .LUW7:
 
 	/* The first byte of the return value contains the FFI_TYPE.  */
@@ -279,11 +338,13 @@ ffi_closure_unix64:
 
 	.align 2
 .Lld_float:
-	movss	-24(%rsp), %xmm0
+	vbroadcastss	-24(%rsp), %zmm0
+	/*movss	-24(%rsp), %xmm0*/
 	ret
 	.align 2
 .Lld_double:
-	movsd	-24(%rsp), %xmm0
+	vbroadcastsd	-24(%rsp), %zmm0
+	/*movsd	-24(%rsp), %xmm0*/
 	ret
 	.align 2
 .Lld_ldouble:
@@ -299,40 +360,61 @@ ffi_closure_unix64:
 	   that rax gets the second word.  */
 	movq	-24(%rsp), %rcx
 	movq	-16(%rsp), %rdx
-	movq	-16(%rsp), %xmm1
+	vbroadcastsd	-16(%rsp), %zmm1
+	/*movq	-16(%rsp), %xmm1*/
 	testl	$0x100, %eax
-	cmovnz	%rdx, %rcx
-	movd	%rcx, %xmm0
-	testl	$0x200, %eax
+	jz .Lld_struct_1
+
+	movq	%rdx, %rcx
+.Lld_struct_1:
+	subq	$8, %rsp
+	movq	%rcx, (%rsp)	/* spill the selected word; it ends up at -8(%rsp) once %rsp is restored */
+	addq	$8, %rsp
+	vbroadcastsd	-8(%rsp), %zmm0	/* reload all 64 bits from the slot written above, not the return address at (%rsp) */
+	
+	/*movd	%rcx, %zmm0*/
+	testl	$0x200, %eax	/* test the flag bits while %eax still holds them */
 	movq	-24(%rsp), %rax
-	cmovnz	%rdx, %rax
+	jz .Lld_struct_2	/* movq does not modify ZF, so this still sees the testl result */
+	movq	%rdx, %rax
+.Lld_struct_2:
 	ret
 
 	/* See the comment above .Lload_sse; the same logic applies here.  */
 	.align 2
 .LUW8:
 .Lsave_sse:
-	movdqa	%xmm0, 48(%rsp)
-	movdqa	%xmm1, 64(%rsp)
-	movdqa	%xmm2, 80(%rsp)
-	movdqa	%xmm3, 96(%rsp)
-	movdqa	%xmm4, 112(%rsp)
-	movdqa	%xmm5, 128(%rsp)
-	movdqa	%xmm6, 144(%rsp)
-	movdqa	%xmm7, 160(%rsp)
+	vpackstorelq	%zmm0, 48(%rsp)
+	vpackstorelq	%zmm1, 112(%rsp)
+	vpackstorelq	%zmm2, 176(%rsp)
+	vpackstorelq	%zmm3, 240(%rsp)
+	vpackstorelq	%zmm4, 304(%rsp)
+	vpackstorelq	%zmm5, 368(%rsp)
+	vpackstorelq	%zmm6, 432(%rsp)
+	vpackstorelq	%zmm7, 496(%rsp)
+	
+	vpackstorehq	%zmm0, 112(%rsp)
+	vpackstorehq	%zmm1, 176(%rsp)
+	vpackstorehq	%zmm2, 240(%rsp)
+	vpackstorehq	%zmm3, 304(%rsp)
+	vpackstorehq	%zmm4, 368(%rsp)
+	vpackstorehq	%zmm5, 432(%rsp)
+	vpackstorehq	%zmm6, 496(%rsp)
+	vpackstorehq	%zmm7, 560(%rsp)
+        /*vmovaps	%zmm0, 48(%rsp)
+	vmovaps	%zmm1, 112(%rsp)
+	vmovaps	%zmm2, 176(%rsp)
+	vmovaps	%zmm3, 240(%rsp)
+	vmovaps	%zmm4, 304(%rsp)
+	vmovaps	%zmm5, 368(%rsp)
+	vmovaps	%zmm6, 432(%rsp)
+	vmovaps	%zmm7, 496(%rsp) */
 	jmp	.Lret_from_save_sse
 
 .LUW9:
 	.size	ffi_closure_unix64,.-ffi_closure_unix64
 
-#ifdef __GNUC__
-/* Only emit DWARF unwind info when building with the GNU toolchain.  */
-
-#ifdef HAVE_AS_X86_64_UNWIND_SECTION_TYPE
-	.section	.eh_frame,"a",@unwind
-#else
 	.section	.eh_frame,"a",@progbits
-#endif
 .Lframe1:
 	.long	.LECIE1-.LSCIE1		/* CIE Length */
 .LSCIE1:
@@ -366,7 +448,7 @@ ffi_closure_unix64:
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.long	.LUW1-.LUW0
 
-	/* New stack frame based off rbp.  This is an itty bit of unwind
+	/* New stack frame based off rbp.  This is an itty bit of unwind
 	   trickery in that the CFA *has* changed.  There is no easy way
 	   to describe it correctly on entry to the function.  Fortunately,
 	   it doesn't matter too much since at all points we can correctly
@@ -423,8 +505,6 @@ ffi_closure_unix64:
 	.align 8
 .LEFDE3:
 
-#endif /* __GNUC__ */
-	
 #endif /* __x86_64__ */
 
 #if defined __ELF__ && defined __linux__
-- 
2.5.3.windows.1

