    Pip Cet <pipcet@gmail.com>

    Okay, here's a first patch that fixes the problem (though I've
    found another, as yet unfixed, bug in the process). The code isn't
    particularly pretty; I tried hard to keep the changes to the
    minimum necessary. If we decide to actually get rid of VT_QLONG
    and VT_QFLOAT (please, can we?), there are some further
    simplifications in tccgen.c that might offset some of the cost of
    this patch.

    The idea is that an integer is no longer enough to describe how an
    argument is stored in registers. There are a number of possibilities
    (none, integer register, two integer registers, float register, two
    float registers, integer register plus float register, float register
    plus integer register), and instead of enumerating them I've
    introduced a RegArgs type that stores the offsets for each of our
    registers (for the other architectures, it's simply an int specifying
    the number of registers). If someone strongly prefers an enum, we
    could do that instead, but I believe this is a place where keeping
    things general is worth it, because this way it should be doable to
    add SSE or AVX support.
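
    For concreteness, here is roughly what the x86-64 RegArgs type in
    the patch looks like (this just restates the comment from the
    x86_64-gen.c hunk below): it records the struct offsets at which
    %rax, %rdx, %xmm0 and %xmm1 are to be stored, with -1 meaning
    "unused".

        #define REG_ARGS_MAX 2      /* at most 2 registers per argument */

        typedef struct {
            int ireg[REG_ARGS_MAX]; /* offsets stored from %rax, %rdx */
            int freg[REG_ARGS_MAX]; /* offsets stored from %xmm0, %xmm1 */
        } RegArgs;

        /* struct { long long l; double x; }:     ireg = { 0, -1 }   freg = { 8, -1 }
           struct { double x; long long l; }:     ireg = { 8, -1 }   freg = { 0, -1 }
           struct { long long l; long long l2; }: ireg = { 0, 8 }    freg = { -1, -1 }
           struct { double x; double x2; }:       ireg = { -1, -1 }  freg = { 0, 8 }  */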

    There is one line in the patch that looks suspicious:

             } else {
                 addr = (addr + align - 1) & -align;
                 param_addr = addr;
                 addr += size;
    -            sse_param_index += reg_count;
             }
             break;

    However, this actually fixes one half of a bug we have when calling a
    function with eight double arguments "interrupted" by a two-double
    structure after the seventh double argument:

    f(double, double, double, double, double, double, double,
      struct { double x, y; }, double);

    In this case, the last argument should be passed in %xmm7. This patch
    fixes the problem in gfunc_prolog, but not the corresponding problem
    in gfunc_call, which I'll try tackling next.
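
    Here is a standalone sketch of that case (the names are mine): per
    the psABI the seven leading doubles take %xmm0-%xmm6, the
    two-double struct no longer fits in the single remaining SSE
    register and is passed on the stack, so the trailing double should
    still end up in %xmm7.

        struct two_doubles { double x, y; };

        double f(double a, double b, double c, double d,
                 double e, double g, double h,
                 struct two_doubles s, double i)
        {
            return i; /* expected to arrive in %xmm7 */
        }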


diff -urN tinycc.old/arm64-gen.c tinycc/arm64-gen.c
--- tinycc.old/arm64-gen.c	2015-04-27 06:27:26.000000000 +0300
+++ tinycc/arm64-gen.c	2015-04-27 06:27:34.000000000 +0300
@@ -14,6 +14,8 @@
 // Number of registers available to allocator:
 #define NB_REGS 28 // x0-x18, x30, v0-v7
 
+typedef int RegArgs;
+
 #define TREG_R(x) (x) // x = 0..18
 #define TREG_R30  19
 #define TREG_F(x) (x + 20) // x = 0..7
@@ -1196,8 +1198,15 @@
     }
 }
 
-ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *align, int *regsize)
+ST_FUNC int regargs_nregs(RegArgs *args)
 {
+    return *args;
+}
+
+ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *align, int *regsize, RegArgs *args)
+{
+    *args = 0;
+
     return 0;
 }
 
diff -urN tinycc.old/arm-gen.c tinycc/arm-gen.c
--- tinycc.old/arm-gen.c	2015-04-27 06:27:26.000000000 +0300
+++ tinycc/arm-gen.c	2015-04-27 06:27:34.000000000 +0300
@@ -34,6 +34,8 @@
 #define NB_REGS             9
 #endif
 
+typedef int RegArgs;
+
 #ifndef TCC_ARM_VERSION
 # define TCC_ARM_VERSION 5
 #endif
@@ -867,9 +869,14 @@
   }
 }
 
+ST_FUNC int regargs_nregs(RegArgs *args)
+{
+    return *args;
+}
+
 /* Return the number of registers needed to return the struct, or 0 if
    returning via struct pointer. */
-ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize) {
+ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize, RegArgs *args) {
 #ifdef TCC_ARM_EABI
     int size, align;
     size = type_size(vt, &align);
@@ -879,18 +886,20 @@
 	*regsize = 8;
         ret->ref = NULL;
         ret->t = VT_DOUBLE;
-        return (size + 7) >> 3;
+        *args = (size + 7) >> 3;
     } else if (size <= 4) {
         *ret_align = 4;
 	*regsize = 4;
         ret->ref = NULL;
         ret->t = VT_INT;
-        return 1;
+        *args = 1;
     } else
-        return 0;
+        *args = 0;
 #else
-    return 0;
+    *args = 0;
 #endif
+
+    return *args != 0;
 }
 
 /* Parameters are classified according to how they are copied to their final
diff -urN tinycc.old/c67-gen.c tinycc/c67-gen.c
--- tinycc.old/c67-gen.c	2015-04-27 06:27:26.000000000 +0300
+++ tinycc/c67-gen.c	2015-04-27 06:27:34.000000000 +0300
@@ -25,6 +25,8 @@
 /* number of available registers */
 #define NB_REGS            24
 
+typedef int RegArgs;
+
 /* a register can belong to several classes. The classes must be
    sorted from more general to more precise (see gv2() code which does
    assumptions on it). */
@@ -1879,10 +1881,17 @@
     }
 }
 
+ST_FUNC int regargs_nregs(RegArgs *args)
+{
+    return *args;
+}
+
 /* Return the number of registers needed to return the struct, or 0 if
    returning via struct pointer. */
-ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize) {
+ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize, RegArgs *args) {
     *ret_align = 1; // Never have to re-align return values for x86-64
+    *args = 0;
+
     return 0;
 }
 
diff -urN tinycc.old/i386-gen.c tinycc/i386-gen.c
--- tinycc.old/i386-gen.c	2015-04-27 06:27:26.000000000 +0300
+++ tinycc/i386-gen.c	2015-04-27 06:27:34.000000000 +0300
@@ -24,6 +24,8 @@
 #define NB_REGS         4
 #define NB_ASM_REGS     8
 
+typedef int RegArgs;
+
 /* a register can belong to several classes. The classes must be
    sorted from more general to more precise (see gv2() code which does
    assumptions on it). */
@@ -374,9 +376,14 @@
 static uint8_t fastcall_regs[3] = { TREG_EAX, TREG_EDX, TREG_ECX };
 static uint8_t fastcallw_regs[2] = { TREG_ECX, TREG_EDX };
 
+ST_FUNC int regargs_nregs(RegArgs *args)
+{
+    return *args;
+}
+
 /* Return the number of registers needed to return the struct, or 0 if
    returning via struct pointer. */
-ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
+ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize, RegArgs *args)
 {
 #ifdef TCC_TARGET_PE
     int size, align;
@@ -385,20 +392,22 @@
     *regsize = 4;
     size = type_size(vt, &align);
     if (size > 8) {
-        return 0;
+        *args = 0;
     } else if (size > 4) {
         ret->ref = NULL;
         ret->t = VT_LLONG;
-        return 1;
+        *args = 1;
     } else {
         ret->ref = NULL;
         ret->t = VT_INT;
-        return 1;
+        *args = 1;
     }
 #else
     *ret_align = 1; // Never have to re-align return values for x86
-    return 0;
+    *args = 0;
 #endif
+
+    return *args != 0;
 }
 
 /* Generate function call. The function address is pushed first, then
diff -urN tinycc.old/tccgen.c tinycc/tccgen.c
--- tinycc.old/tccgen.c	2015-04-27 06:27:26.000000000 +0300
+++ tinycc/tccgen.c	2015-04-27 06:27:34.000000000 +0300
@@ -519,7 +519,7 @@
     *vtop = *v;
 }
 
-static void vdup(void)
+ST_FUNC void vdup(void)
 {
     vpushv(vtop);
 }
@@ -4193,6 +4193,7 @@
             SValue ret;
             Sym *sa;
             int nb_args, ret_nregs, ret_align, regsize, variadic;
+            RegArgs args;
 
             /* function call  */
             if ((vtop->type.t & VT_BTYPE) != VT_FUNC) {
@@ -4217,8 +4218,10 @@
             /* compute first implicit argument if a structure is returned */
             if ((s->type.t & VT_BTYPE) == VT_STRUCT) {
                 variadic = (s->c == FUNC_ELLIPSIS);
-                ret_nregs = gfunc_sret(&s->type, variadic, &ret.type,
-                                       &ret_align, &regsize);
+                gfunc_sret(&s->type, variadic, &ret.type,
+                           &ret_align, &regsize, &args);
+                ret_nregs = regargs_nregs(&args);
+
                 if (!ret_nregs) {
                     /* get some space for the returned structure */
                     size = type_size(&s->type, &align);
@@ -4304,6 +4307,36 @@
 		  align = regsize;
                 loc = (loc - size) & -align;
                 addr = loc;
+#if defined(TCC_TARGET_X86_64) && !defined(TCC_TARGET_PE)
+                int i;
+
+                for (i=0; i<REG_ARGS_MAX; i++) {
+                    offset = args.ireg[i];
+
+                    if (offset == -1)
+                        break;
+
+                    ret.type.t = VT_LLONG;
+                    vset(&ret.type, VT_LOCAL | VT_LVAL, addr + offset);
+                    vsetc(&ret.type, i ? REG_LRET : REG_IRET, &ret.c);
+                    vstore();
+                    vtop--;
+                    vtop--;
+                }
+                for (i=0; i<REG_ARGS_MAX; i++) {
+                    offset = args.freg[i];
+
+                    if (offset == -1)
+                        break;
+
+                    ret.type.t = VT_DOUBLE;
+                    vset(&ret.type, VT_LOCAL | VT_LVAL, addr + offset);
+                    vsetc(&ret.type, i ? REG_QRET : REG_FRET, &ret.c);
+                    vstore();
+                    vtop--;
+                    vtop--;
+                }
+#else
                 offset = 0;
                 for (;;) {
                     vset(&ret.type, VT_LOCAL | VT_LVAL, addr + offset);
@@ -4314,6 +4347,7 @@
                         break;
                     offset += regsize;
                 }
+#endif
                 vset(&s->type, VT_LOCAL | VT_LVAL, addr);
             }
         } else {
@@ -4894,8 +4928,11 @@
             if ((func_vt.t & VT_BTYPE) == VT_STRUCT) {
                 CType type, ret_type;
                 int ret_align, ret_nregs, regsize;
-                ret_nregs = gfunc_sret(&func_vt, func_var, &ret_type,
-                                       &ret_align, &regsize);
+                RegArgs args;
+
+                gfunc_sret(&func_vt, func_var, &ret_type,
+                           &ret_align, &regsize, &args);
+                ret_nregs = regargs_nregs(&args);
                 if (0 == ret_nregs) {
                     /* if returning structure, must copy it to implicit
                        first pointer arg location */
@@ -4921,6 +4958,41 @@
                         vset(&ret_type, VT_LOCAL | VT_LVAL, addr);
                     }
                     vtop->type = ret_type;
+#if defined(TCC_TARGET_X86_64) && !defined(TCC_TARGET_PE)
+                    int i;
+
+                    for (i=0; i<REG_ARGS_MAX; i++) {
+                        int off = args.ireg[i];
+
+                        if (off == -1)
+                            break;
+
+                        r = i ? RC_LRET : RC_IRET;
+
+                        vdup();
+                        vtop->c.i += off;
+                        vtop->type.t = VT_LLONG;
+                        gv(r);
+                        vpop();
+                    }
+                    for (i=0; i<REG_ARGS_MAX; i++) {
+                        int off = args.freg[i];
+
+                        if (off == -1)
+                            break;
+
+                        /* We assume that when a structure is returned in multiple
+                           registers, their classes are consecutive values of the
+                           suite s(n) = 2^n */
+                        r = rc_fret(ret_type.t) << i;
+
+                        vdup();
+                        vtop->c.i += off;
+                        vtop->type.t = VT_DOUBLE;
+                        gv(r);
+                        vpop();
+                    }
+#else
                     if (is_float(ret_type.t))
                         r = rc_fret(ret_type.t);
                     else
@@ -4937,6 +5009,7 @@
                         vtop->c.i += regsize;
                         vtop->r = VT_LOCAL | VT_LVAL;
                     }
+#endif
                 }
             } else if (is_float(func_vt.t)) {
                 gv(rc_fret(func_vt.t));
diff -urN tinycc.old/tcc.h tinycc/tcc.h
--- tinycc.old/tcc.h	2015-04-27 06:27:26.000000000 +0300
+++ tinycc/tcc.h	2015-04-27 06:27:34.000000000 +0300
@@ -795,8 +795,8 @@
 #define VT_LLONG           12  /* 64 bit integer */
 #define VT_LONG            13  /* long integer (NEVER USED as type, only
                                   during parsing) */
-#define VT_QLONG           14  /* 128-bit integer. Only used for x86-64 ABI */
-#define VT_QFLOAT          15  /* 128-bit float. Only used for x86-64 ABI */
+#define VT_QLONG           14  /* 128-bit integer. No longer used. */
+#define VT_QFLOAT          15  /* 128-bit float. No longer used. */
 #define VT_UNSIGNED    0x0010  /* unsigned type */
 #define VT_ARRAY       0x0020  /* array type (also has VT_PTR) */
 #define VT_BITFIELD    0x0040  /* bitfield modifier */
@@ -1245,6 +1245,7 @@
 ST_FUNC void gaddrof(void);
 ST_FUNC int gv(int rc);
 ST_FUNC void gv2(int rc1, int rc2);
+ST_FUNC void vdup(void);
 ST_FUNC void vpop(void);
 ST_FUNC void gen_op(int op);
 ST_FUNC int type_size(CType *type, int *a);
@@ -1333,7 +1334,8 @@
 ST_FUNC void gsym(int t);
 ST_FUNC void load(int r, SValue *sv);
 ST_FUNC void store(int r, SValue *v);
-ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *align, int *regsize);
+ST_FUNC int regargs_nregs(RegArgs *args);
+ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *align, int *regsize, RegArgs *args);
 ST_FUNC void gfunc_call(int nb_args);
 ST_FUNC void gfunc_prolog(CType *func_type);
 ST_FUNC void gfunc_epilog(void);
diff -urN tinycc.old/tests/abitest.c tinycc/tests/abitest.c
--- tinycc.old/tests/abitest.c	2015-04-27 06:27:26.000000000 +0300
+++ tinycc/tests/abitest.c	2015-04-27 06:39:41.000000000 +0300
@@ -628,8 +628,8 @@
   RUN_TEST(ret_2double_test);
   /* RUN_TEST(ret_8plus2double_test); currently broken on x86_64 */
   /* RUN_TEST(ret_6plus2longlong_test); currently broken on x86_64 */
-  /* RUN_TEST(ret_mixed_test); currently broken on x86_64 */
-  /* RUN_TEST(ret_mixed2_test); currently broken on x86_64 */
+  RUN_TEST(ret_mixed_test);
+  RUN_TEST(ret_mixed2_test);
   RUN_TEST(ret_mixed3_test);
   RUN_TEST(reg_pack_test);
   RUN_TEST(reg_pack_longlong_test);
diff -urN tinycc.old/x86_64-gen.c tinycc/x86_64-gen.c
--- tinycc.old/x86_64-gen.c	2015-04-27 06:27:26.000000000 +0300
+++ tinycc/x86_64-gen.c	2015-04-27 06:37:44.000000000 +0300
@@ -25,6 +25,24 @@
 /* number of available registers */
 #define NB_REGS         25
 #define NB_ASM_REGS     8
+#define REG_ARGS_MAX    2 /* at most 2 registers used for each argument */
+
+#ifdef TCC_TARGET_PE
+typedef int RegArgs;
+#else
+/* This struct stores the struct offsets at which %rax, %rdx, %xmm0, and
+ * %xmm1 are to be stored.
+ *
+ * struct { long long l; double x; }: ireg = { 0, -1 } freg = { 8, -1 }
+ * struct { double x; long long l; }: ireg = { 8, -1 } freg = { 0, -1 }
+ * struct { long long l; long long l2; }: ireg = { 0, 8 } freg = { -1, -1 }
+ * struct { double x; double x2; }: ireg = { -1, -1 } freg = { 0, 8 }
+ */
+typedef struct {
+    int ireg[REG_ARGS_MAX];
+    int freg[REG_ARGS_MAX];
+} RegArgs;
+#endif
 
 /* a register can belong to several classes. The classes must be
    sorted from more general to more precise (see gv2() code which does
@@ -1041,7 +1059,9 @@
         return x86_64_mode_sse;
 }
 
-static X86_64_Mode classify_x86_64_inner(CType *ty)
+/* classify the x86 eightbytes from byte index start to byte index
+ * end, at offset offset from the root struct */
+static X86_64_Mode classify_x86_64_inner(CType *ty, int offset, int start, int end)
 {
     X86_64_Mode mode;
     Sym *f;
@@ -1067,8 +1087,10 @@
         f = ty->ref;
 
         mode = x86_64_mode_none;
-        for (f = f->next; f; f = f->next)
-            mode = classify_x86_64_merge(mode, classify_x86_64_inner(&f->type));
+        while ((f = f->next) != NULL) {
+            if (f->c + offset >= start && f->c + offset < end)
+                mode = classify_x86_64_merge(mode, classify_x86_64_inner(&f->type, f->c + offset, start, end));
+        }
         
         return mode;
     }
@@ -1076,61 +1098,79 @@
     assert(0);
 }
 
-static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *palign, int *reg_count)
+static X86_64_Mode classify_x86_64_arg_eightbyte(CType *ty, int offset)
 {
     X86_64_Mode mode;
+
+    assert((ty->t & VT_BTYPE) == VT_STRUCT);
+
+    mode = classify_x86_64_inner(ty, 0, offset, offset + 8);
+
+    return mode;
+}
+
+static void regargs_init(RegArgs *args)
+{
+    int i;
+    for(i=0; i<REG_ARGS_MAX; i++) {
+        args->ireg[i] = -1;
+        args->freg[i] = -1;
+    }
+}
+
+static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *palign, RegArgs *args)
+{
+    X86_64_Mode mode = x86_64_mode_none;
     int size, align, ret_t = 0;
-    
+    int ireg = 0, freg = 0;
+
+    if (args)
+        regargs_init(args);
+
     if (ty->t & (VT_BITFIELD|VT_ARRAY)) {
         *psize = 8;
         *palign = 8;
-        *reg_count = 1;
+        if (args)
+            args->ireg[ireg++] = 0;
         ret_t = ty->t;
         mode = x86_64_mode_integer;
     } else {
         size = type_size(ty, &align);
         *psize = (size + 7) & ~7;
         *palign = (align + 7) & ~7;
-    
+
         if (size > 16) {
             mode = x86_64_mode_memory;
         } else {
-            mode = classify_x86_64_inner(ty);
-            switch (mode) {
-            case x86_64_mode_integer:
-                if (size > 8) {
-                    *reg_count = 2;
-                    ret_t = VT_QLONG;
+            int start;
+
+            for(start=0; start < size; start += 8) {
+                if ((ty->t & VT_BTYPE) == VT_STRUCT) {
+                    mode = classify_x86_64_arg_eightbyte(ty, start);
                 } else {
-                    *reg_count = 1;
-                    ret_t = (size > 4) ? VT_LLONG : VT_INT;
+                    mode = classify_x86_64_inner(ty, 0, 0, size);
                 }
-                break;
-                
-            case x86_64_mode_x87:
-                *reg_count = 1;
-                ret_t = VT_LDOUBLE;
-                break;
 
-            case x86_64_mode_sse:
-                if (size > 8) {
-                    *reg_count = 2;
-                    ret_t = VT_QFLOAT;
-                } else {
-                    *reg_count = 1;
+                if (mode == x86_64_mode_integer) {
+                    if (args)
+                        args->ireg[ireg++] = start;
+                    ret_t = (size > 4) ? VT_LLONG : VT_INT;
+                } else if (mode == x86_64_mode_sse) {
+                    if (args)
+                        args->freg[freg++] = start;
                     ret_t = (size > 4) ? VT_DOUBLE : VT_FLOAT;
+                } else {
+                    ret_t = VT_LDOUBLE;
                 }
-                break;
-            default: break; /* nothing to be done for x86_64_mode_memory and x86_64_mode_none*/
             }
         }
     }
-    
+
     if (ret) {
         ret->ref = NULL;
         ret->t = ret_t;
     }
-    
+
     return mode;
 }
 
@@ -1140,8 +1180,8 @@
     enum __va_arg_type {
         __va_gen_reg, __va_float_reg, __va_stack
     };
-    int size, align, reg_count;
-    X86_64_Mode mode = classify_x86_64_arg(ty, NULL, &size, &align, &reg_count);
+    int size, align;
+    X86_64_Mode mode = classify_x86_64_arg(ty, NULL, &size, &align, NULL);
     switch (mode) {
     default: return __va_stack;
     case x86_64_mode_integer: return __va_gen_reg;
@@ -1149,14 +1189,56 @@
     }
 }
 
-/* Return the number of registers needed to return the struct, or 0 if
-   returning via struct pointer. */
-ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
+static int regargs_iregs(RegArgs *args)
+{
+    int i;
+    int ret = 0;
+    for(i=0; i<REG_ARGS_MAX; i++) {
+        if(args->ireg[i] != -1)
+            ret++;
+    }
+
+    return ret;
+}
+
+static int regargs_fregs(RegArgs *args)
+{
+    int i;
+    int ret = 0;
+    for(i=0; i<REG_ARGS_MAX; i++) {
+        if(args->freg[i] != -1)
+            ret++;
+    }
+
+    return ret;
+}
+
+/* Count the total number of registers used by args */
+ST_FUNC int regargs_nregs(RegArgs *args)
+{
+    int i;
+    int ret = 0;
+    for(i=0; i<REG_ARGS_MAX; i++) {
+        if(args->ireg[i] != -1)
+            ret++;
+
+        if(args->freg[i] != -1)
+            ret++;
+    }
+
+    return ret;
+}
+
+ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize, RegArgs *args)
 {
-    int size, align, reg_count;
+    int size, align;
     *ret_align = 1; // Never have to re-align return values for x86-64
     *regsize = 8;
-    return (classify_x86_64_arg(vt, ret, &size, &align, &reg_count) != x86_64_mode_memory);
+
+    X86_64_Mode mode = classify_x86_64_arg(vt, ret, &size, &align, args);
+
+    return mode != x86_64_mode_memory &&
+        mode != x86_64_mode_none;
 }
 
 #define REGN 6
@@ -1179,7 +1261,7 @@
 {
     X86_64_Mode mode;
     CType type;
-    int size, align, r, args_size, stack_adjust, run_start, run_end, i, reg_count;
+    int size, align, r, args_size, stack_adjust, run_start, run_end, i;
     int offsets[nb_args*nb_args+1]; //// for a VLA's test
     int nb_reg_args = 0;
     int nb_sse_args = 0;
@@ -1187,11 +1269,15 @@
 
     /* calculate the number of integer/float register arguments */
     for(i = 0; i < nb_args; i++) {
-        mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
-        if (mode == x86_64_mode_sse)
-            nb_sse_args += reg_count;
-        else if (mode == x86_64_mode_integer)
-            nb_reg_args += reg_count;
+        RegArgs args;
+
+        mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &args);
+
+        if (mode == x86_64_mode_sse ||
+            mode == x86_64_mode_integer) {
+            nb_sse_args += regargs_fregs(&args);
+            nb_reg_args += regargs_iregs(&args);
+        }
     }
 
     /* arguments are collected in runs. Each run is a collection of 8-byte aligned arguments
@@ -1210,27 +1296,20 @@
         run_end = nb_args;
         stack_adjust = 0;
         for(i = run_start; (i < nb_args) && (run_end == nb_args); i++) {
-            mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
-            switch (mode) {
-            case x86_64_mode_memory:
-            case x86_64_mode_x87:
-            stack_arg:
+            RegArgs args;
+
+            classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &args);
+
+            int stack = (align == 16) || (sse_reg > 8) || (gen_reg > REGN);
+
+            sse_reg -= regargs_fregs(&args);
+            gen_reg -= regargs_iregs(&args);
+
+            if (stack) {
                 if (align == 16)
                     run_end = i;
                 else
                     stack_adjust += size;
-                break;
-                
-            case x86_64_mode_sse:
-                sse_reg -= reg_count;
-                if (sse_reg + reg_count > 8) goto stack_arg;
-                break;
-            
-            case x86_64_mode_integer:
-                gen_reg -= reg_count;
-                if (gen_reg + reg_count > REGN) goto stack_arg;
-                break;
-	    default: break; /* nothing to be done for x86_64_mode_none */
             }
         }
         
@@ -1258,23 +1337,25 @@
             vtop[0] = vtop[-i];
             vtop[-i] = tmp;
             
-            mode = classify_x86_64_arg(&vtop->type, NULL, &size, &align, &reg_count);
+            RegArgs args;
+            classify_x86_64_arg(&vtop->type, NULL, &size, &align, &args);
+            int reg_count_integer = regargs_iregs(&args);
+            int reg_count_sse = regargs_fregs(&args);
             
             int arg_stored = 1;
             switch (vtop->type.t & VT_BTYPE) {
             case VT_STRUCT:
-                if (mode == x86_64_mode_sse) {
-                    if (sse_reg > 8)
-                        sse_reg -= reg_count;
-                    else
-                        arg_stored = 0;
-                } else if (mode == x86_64_mode_integer) {
-                    if (gen_reg > REGN)
-                        gen_reg -= reg_count;
-                    else
-                        arg_stored = 0;
+                if (reg_count_integer || reg_count_sse) {
+                    if ((reg_count_sse == 0 || sse_reg <= 8) &&
+                        (reg_count_integer == 0 || gen_reg <= REGN)) {
+                      /* argument fits into registers */
+                      arg_stored = 0;
+                    } else {
+                      sse_reg -= reg_count_sse;
+                      gen_reg -= reg_count_integer;
+                    }
                 }
-                
+
                 if (arg_stored) {
                     /* allocate the necessary size on stack */
                     o(0x48);
@@ -1296,7 +1377,6 @@
                 
             case VT_FLOAT:
             case VT_DOUBLE:
-                assert(mode == x86_64_mode_sse);
                 if (sse_reg > 8) {
                     --sse_reg;
                     r = gv(RC_FLOAT);
@@ -1312,7 +1392,6 @@
                 break;
                 
             default:
-                assert(mode == x86_64_mode_integer);
                 /* simple type */
                 /* XXX: implicit cast ? */
                 if (gen_reg > REGN) {
@@ -1346,7 +1425,7 @@
         run_start = i = run_end;
         while (i < nb_args) {
             /* Rotate argument to top since it will always be popped */
-            mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
+            mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, NULL);
             if (align != 16)
               break;
 
@@ -1390,39 +1469,58 @@
     assert(gen_reg <= REGN);
     assert(sse_reg <= 8);
     for(i = 0; i < nb_args; i++) {
-        mode = classify_x86_64_arg(&vtop->type, &type, &size, &align, &reg_count);
+        RegArgs args;
+
+        classify_x86_64_arg(&vtop->type, &type, &size, &align, &args);
+
         /* Alter stack entry type so that gv() knows how to treat it */
-        vtop->type = type;
-        if (mode == x86_64_mode_sse) {
-            if (reg_count == 2) {
-                sse_reg -= 2;
-                gv(RC_FRET); /* Use pair load into xmm0 & xmm1 */
-                if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */
-                    /* movaps %xmm0, %xmmN */
-                    o(0x280f);
-                    o(0xc0 + (sse_reg << 3));
-                    /* movaps %xmm1, %xmmN */
-                    o(0x280f);
-                    o(0xc1 + ((sse_reg+1) << 3));
-                }
-            } else {
-                assert(reg_count == 1);
+        if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) {
+            int k;
+
+            for(k=REG_ARGS_MAX-1; k>=0; k--) {
+                if (args.freg[k] == -1)
+                    continue;
+
+                sse_reg--;
+                assert(sse_reg >= 0);
+
+                vdup();
+                vtop->type.t = VT_DOUBLE;
+                vtop->c.ull += args.freg[k];
+                gv(RC_XMM0 << sse_reg);
+                vpop();
+            }
+            for(k=REG_ARGS_MAX-1; k>=0; k--) {
+                if (args.ireg[k] == -1)
+                    continue;
+
+                gen_reg--;
+
+                vdup();
+                vtop->type.t = VT_LLONG;
+                vtop->c.ull += args.ireg[k];
+                r = gv(RC_INT);
+                int d = arg_prepare_reg(gen_reg);
+                orex(1,d,r,0x89); /* mov */
+                o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
+                vpop();
+            }
+        } else {
+            vtop->type = type;
+            if (args.freg[0] != -1) {
                 --sse_reg;
                 /* Load directly to register */
                 gv(RC_XMM0 << sse_reg);
-            }
-        } else if (mode == x86_64_mode_integer) {
-            /* simple type */
-            /* XXX: implicit cast ? */
-            gen_reg -= reg_count;
-            r = gv(RC_INT);
-            int d = arg_prepare_reg(gen_reg);
-            orex(1,d,r,0x89); /* mov */
-            o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
-            if (reg_count == 2) {
-                d = arg_prepare_reg(gen_reg+1);
-                orex(1,d,vtop->r2,0x89); /* mov */
-                o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d));
+            } else if (args.ireg[0] != -1) {
+                /* simple type */
+                /* XXX: implicit cast ? */
+                gen_reg--;
+                r = gv(RC_INT);
+                int d = arg_prepare_reg(gen_reg);
+                orex(1,d,r,0x89); /* mov */
+                o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
+            } else {
+                assert(0);
             }
         }
         vtop--;
@@ -1463,7 +1561,7 @@
 void gfunc_prolog(CType *func_type)
 {
     X86_64_Mode mode;
-    int i, addr, align, size, reg_count;
+    int i, addr, align, size;
     int param_addr = 0, reg_param_index, sse_param_index;
     Sym *sym;
     CType *type;
@@ -1484,31 +1582,37 @@
         sym = func_type->ref;
         while ((sym = sym->next) != NULL) {
             type = &sym->type;
-            mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
+            RegArgs args;
+
+            mode = classify_x86_64_arg(type, NULL, &size, &align, &args);
+
             switch (mode) {
             default:
             stack_arg:
                 seen_stack_size = ((seen_stack_size + align - 1) & -align) + size;
                 break;
-                
+
             case x86_64_mode_integer:
-                if (seen_reg_num + reg_count <= 8) {
-                    seen_reg_num += reg_count;
-                } else {
+            case x86_64_mode_sse: {
+                int stack = 0;
+
+                seen_sse_num += regargs_fregs(&args);
+                seen_reg_num += regargs_iregs(&args);
+
+                if (seen_reg_num > 8) {
                     seen_reg_num = 8;
-                    goto stack_arg;
+                    stack = 1;
                 }
-                break;
-                
-            case x86_64_mode_sse:
-                if (seen_sse_num + reg_count <= 8) {
-                    seen_sse_num += reg_count;
-                } else {
+                if (seen_sse_num > 8) {
                     seen_sse_num = 8;
-                    goto stack_arg;
+                    stack = 1;
                 }
+
+                if (stack)
+                    goto stack_arg;
                 break;
             }
+            }
         }
 
         loc -= 16;
@@ -1544,7 +1648,7 @@
     /* if the function returns a structure, then add an
        implicit pointer parameter */
     func_vt = sym->type;
-    mode = classify_x86_64_arg(&func_vt, NULL, &size, &align, &reg_count);
+    mode = classify_x86_64_arg(&func_vt, NULL, &size, &align, NULL);
     if (mode == x86_64_mode_memory) {
         push_arg_reg(reg_param_index);
         func_vc = loc;
@@ -1552,19 +1656,42 @@
     }
     /* define parameters */
     while ((sym = sym->next) != NULL) {
+        RegArgs args;
+        int reg_count_integer = 0;
+        int reg_count_sse = 0;
+
+        mode = classify_x86_64_arg(type, NULL, &size, &align, &args);
+        reg_count_integer = regargs_iregs(&args);
+        reg_count_sse = regargs_fregs(&args);
+
+        int arg_stored = 1;
         type = &sym->type;
-        mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
         switch (mode) {
+        case x86_64_mode_integer:
         case x86_64_mode_sse:
-            if (sse_param_index + reg_count <= 8) {
+            if (reg_count_integer || reg_count_sse) {
+                if ((reg_count_sse == 0 || sse_param_index + reg_count_sse <= 8) &&
+                    (reg_count_integer == 0 || reg_param_index + reg_count_integer <= REGN)) {
+                    /* argument fits into registers */
+                    arg_stored = 0;
+                } else {
+                    sse_param_index += reg_count_sse;
+                    reg_param_index += reg_count_integer;
+                }
+            }
+            if (!arg_stored) {
                 /* save arguments passed by register */
-                loc -= reg_count * 8;
+                loc -= (reg_count_sse + reg_count_integer) * 8;
                 param_addr = loc;
-                for (i = 0; i < reg_count; ++i) {
+                for (i = 0; i < reg_count_sse; ++i) {
                     o(0xd60f66); /* movq */
-                    gen_modrm(sse_param_index, VT_LOCAL, NULL, param_addr + i*8);
+                    gen_modrm(sse_param_index, VT_LOCAL, NULL, param_addr + args.freg[i]);
                     ++sse_param_index;
                 }
+                for (i = 0; i < reg_count_integer; ++i) {
+                    gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, param_addr + args.ireg[i]);
+                    ++reg_param_index;
+                }
             } else {
                 addr = (addr + align - 1) & -align;
                 param_addr = addr;
@@ -1578,23 +1705,7 @@
             param_addr = addr;
             addr += size;
             break;
-            
-        case x86_64_mode_integer: {
-            if (reg_param_index + reg_count <= REGN) {
-                /* save arguments passed by register */
-                loc -= reg_count * 8;
-                param_addr = loc;
-                for (i = 0; i < reg_count; ++i) {
-                    gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, param_addr + i*8);
-                    ++reg_param_index;
-                }
-            } else {
-                addr = (addr + align - 1) & -align;
-                param_addr = addr;
-                addr += size;
-            }
-            break;
-        }
+
 	default: break; /* nothing to be done for x86_64_mode_none */
         }
         sym_push(sym->v & ~SYM_FIELD, type,
