Re: arm: 8-byte alignment (AAPCS)

2016-04-23 Thread Jonathan Gray
On Sat, Apr 23, 2016 at 08:51:25PM +0200, Patrick Wildt wrote:
> On Sat, Apr 02, 2016 at 02:19:17PM +0200, Patrick Wildt wrote:
> > Hi,
> > 
> > the Procedure Call Standard used in EABI requires the stack pointer to
> > be 8-byte aligned by
> > 
> >   * exception handlers, before calling AAPCS-conforming code.
> >   * the OS, before giving control to an application.
> > 
> > This diff makes sure our kernel interfaces adhere to that requirement.
> > 
> > Can someone make sure this doesn't break armish/zaurus?
> > 
> > Patrick
> > 
> > diff --git sys/arch/arm/arm/cpuswitch.S sys/arch/arm/arm/cpuswitch.S
> > index 98e2dbe..0c3d0af 100644
> > --- sys/arch/arm/arm/cpuswitch.S
> > +++ sys/arch/arm/arm/cpuswitch.S
> > @@ -171,6 +171,7 @@ ENTRY(cpu_idle_leave)
> >  
> >  ENTRY(cpu_switchto)
> > stmfd   sp!, {r4-r7, lr}
> > +   sub sp, sp, #4
> >  
> >  #ifdef MULTIPROCESSOR
> > /* XXX use curcpu() */
> > @@ -451,6 +452,7 @@ ENTRY(cpu_switchto)
> >  * Pull the registers that got pushed when either savectx() or
> >  * cpu_switch() was called and return.
> >  */
> > +   add sp, sp, #4
> > ldmfd   sp!, {r4-r7, pc}
> >  
> >  /* LINTSTUB: Func: void savectx(struct pcb *pcb) */
> > @@ -461,6 +463,7 @@ ENTRY(savectx)
> >  
> > /* Push registers.*/
> > stmfd   sp!, {r4-r7, lr}
> > +   sub sp, sp, #4
> >  
> > /* Store all the registers in the process's pcb */
> >  #ifndef __XSCALE__
> > @@ -473,6 +476,7 @@ ENTRY(savectx)
> >  #endif
> >  
> > /* Pull the regs of the stack */
> > +   add sp, sp, #4
> > ldmfd   sp!, {r4-r7, pc}
> >  
> >  ENTRY(proc_trampoline)
> > diff --git sys/arch/arm/arm/cpuswitch7.S sys/arch/arm/arm/cpuswitch7.S
> > index 126b41a..4db9a86 100644
> > --- sys/arch/arm/arm/cpuswitch7.S
> > +++ sys/arch/arm/arm/cpuswitch7.S
> > @@ -165,6 +165,7 @@ ENTRY(cpu_idle_leave)
> >  
> >  ENTRY(cpu_switchto)
> > stmfd   sp!, {r4-r7, lr}
> > +   sub sp, sp, #4
> >  
> >  #ifdef MULTIPROCESSOR
> > /* XXX use curcpu() */
> > @@ -396,6 +397,7 @@ ENTRY(cpu_switchto)
> >  * Pull the registers that got pushed when either savectx() or
> >  * cpu_switch() was called and return.
> >  */
> > +   add sp, sp, #4
> > ldmfd   sp!, {r4-r7, pc}
> >  
> >  /* LINTSTUB: Func: void savectx(struct pcb *pcb) */
> > @@ -406,6 +408,7 @@ ENTRY(savectx)
> >  
> > /* Push registers.*/
> > stmfd   sp!, {r4-r7, lr}
> > +   sub sp, sp, #4
> >  
> > /* Store all the registers in the process's pcb */
> >  #ifndef __XSCALE__
> > @@ -418,6 +421,7 @@ ENTRY(savectx)
> >  #endif
> >  
> > /* Pull the regs of the stack */
> > +   add sp, sp, #4
> > ldmfd   sp!, {r4-r7, pc}
> >  
> >  ENTRY(proc_trampoline)
> > diff --git sys/arch/arm/arm/vm_machdep.c sys/arch/arm/arm/vm_machdep.c
> > index 06f217b..84dfb68 100644
> > --- sys/arch/arm/arm/vm_machdep.c
> > +++ sys/arch/arm/arm/vm_machdep.c
> > @@ -140,10 +140,11 @@ cpu_fork(p1, p2, stack, stacksize, func, arg)
> > *tf = *p1->p_addr->u_pcb.pcb_tf;
> >  
> > /*
> > -* If specified, give the child a different stack.
> > +* If specified, give the child a different stack (make sure
> > +* it's 8-byte aligned).
> >  */
> > if (stack != NULL)
> > -   tf->tf_usr_sp = (u_int)stack + stacksize;
> > +   tf->tf_usr_sp = ((vaddr_t)(stack) + stacksize) & -8;
> >  
> > sf = (struct switchframe *)tf - 1;
> > sf->sf_r4 = (u_int)func;
> > diff --git sys/arch/arm/include/frame.h sys/arch/arm/include/frame.h
> > index 31b2936..56e1368 100644
> > --- sys/arch/arm/include/frame.h
> > +++ sys/arch/arm/include/frame.h
> > @@ -75,6 +75,7 @@ typedef struct trapframe {
> > register_t tf_svc_sp;
> > register_t tf_svc_lr;
> > register_t tf_pc;
> > +   register_t tf_pad;
> >  } trapframe_t;
> >  
> >  /* Register numbers */
> > @@ -137,6 +138,7 @@ typedef struct irqframe {
> > unsigned int if_svc_sp;
> > unsigned int if_svc_lr;
> > unsigned int if_pc;
> > +   unsigned int if_pad;
> >  } irqframe_t;
> >  
> >  #define clockframe irqframe
> > @@ -146,6 +148,7 @@ typedef struct irqframe {
> >   */
> >  
> >  struct switchframe {
> > +   u_int   sf_pad;
> > u_int   sf_r4;
> > u_int   sf_r5;
> > u_int   sf_r6;
> > @@ -203,6 +206,7 @@ struct frame {
> >   */
> >  
> >  #define PUSHFRAME \
> > +   sub sp, sp, #4; /* Align the stack */  \
> > str lr, [sp, #-4]!; /* Push the return address */  \
> > sub sp, sp, #(4*17);/* Adjust the stack pointer */ \
> > stmia   sp, {r0-r14}^;  /* Push the user mode registers */ \
> > @@ -221,7 +225,8 @@ struct frame {
> >  ldmia   sp, {r0-r14}^; /* Restore registers (usr mode) 
> > */ \
> >  mov r0, r0; /* NOP for previous instruction */ 
> > \
> > add sp, sp, #(4*17);/* Adjust the stack pointer */ \
> > 

Re: arm: 8-byte alignment (AAPCS)

2016-04-23 Thread Patrick Wildt
On Sat, Apr 02, 2016 at 02:19:17PM +0200, Patrick Wildt wrote:
> Hi,
> 
> the Procedure Call Standard used in EABI requires the stack pointer to
> be 8-byte aligned by
> 
>   * exception handlers, before calling AAPCS-conforming code.
>   * the OS, before giving control to an application.
> 
> This diff makes sure our kernel interfaces adhere to that requirement.
> 
> Can someone make sure this doesn't break armish/zaurus?
> 
> Patrick
> 
> diff --git sys/arch/arm/arm/cpuswitch.S sys/arch/arm/arm/cpuswitch.S
> index 98e2dbe..0c3d0af 100644
> --- sys/arch/arm/arm/cpuswitch.S
> +++ sys/arch/arm/arm/cpuswitch.S
> @@ -171,6 +171,7 @@ ENTRY(cpu_idle_leave)
>  
>  ENTRY(cpu_switchto)
>   stmfd   sp!, {r4-r7, lr}
> + sub sp, sp, #4
>  
>  #ifdef MULTIPROCESSOR
>   /* XXX use curcpu() */
> @@ -451,6 +452,7 @@ ENTRY(cpu_switchto)
>* Pull the registers that got pushed when either savectx() or
>* cpu_switch() was called and return.
>*/
> + add sp, sp, #4
>   ldmfd   sp!, {r4-r7, pc}
>  
>  /* LINTSTUB: Func: void savectx(struct pcb *pcb) */
> @@ -461,6 +463,7 @@ ENTRY(savectx)
>  
>   /* Push registers.*/
>   stmfd   sp!, {r4-r7, lr}
> + sub sp, sp, #4
>  
>   /* Store all the registers in the process's pcb */
>  #ifndef __XSCALE__
> @@ -473,6 +476,7 @@ ENTRY(savectx)
>  #endif
>  
>   /* Pull the regs of the stack */
> + add sp, sp, #4
>   ldmfd   sp!, {r4-r7, pc}
>  
>  ENTRY(proc_trampoline)
> diff --git sys/arch/arm/arm/cpuswitch7.S sys/arch/arm/arm/cpuswitch7.S
> index 126b41a..4db9a86 100644
> --- sys/arch/arm/arm/cpuswitch7.S
> +++ sys/arch/arm/arm/cpuswitch7.S
> @@ -165,6 +165,7 @@ ENTRY(cpu_idle_leave)
>  
>  ENTRY(cpu_switchto)
>   stmfd   sp!, {r4-r7, lr}
> + sub sp, sp, #4
>  
>  #ifdef MULTIPROCESSOR
>   /* XXX use curcpu() */
> @@ -396,6 +397,7 @@ ENTRY(cpu_switchto)
>* Pull the registers that got pushed when either savectx() or
>* cpu_switch() was called and return.
>*/
> + add sp, sp, #4
>   ldmfd   sp!, {r4-r7, pc}
>  
>  /* LINTSTUB: Func: void savectx(struct pcb *pcb) */
> @@ -406,6 +408,7 @@ ENTRY(savectx)
>  
>   /* Push registers.*/
>   stmfd   sp!, {r4-r7, lr}
> + sub sp, sp, #4
>  
>   /* Store all the registers in the process's pcb */
>  #ifndef __XSCALE__
> @@ -418,6 +421,7 @@ ENTRY(savectx)
>  #endif
>  
>   /* Pull the regs of the stack */
> + add sp, sp, #4
>   ldmfd   sp!, {r4-r7, pc}
>  
>  ENTRY(proc_trampoline)
> diff --git sys/arch/arm/arm/vm_machdep.c sys/arch/arm/arm/vm_machdep.c
> index 06f217b..84dfb68 100644
> --- sys/arch/arm/arm/vm_machdep.c
> +++ sys/arch/arm/arm/vm_machdep.c
> @@ -140,10 +140,11 @@ cpu_fork(p1, p2, stack, stacksize, func, arg)
>   *tf = *p1->p_addr->u_pcb.pcb_tf;
>  
>   /*
> -  * If specified, give the child a different stack.
> +  * If specified, give the child a different stack (make sure
> +  * it's 8-byte aligned).
>*/
>   if (stack != NULL)
> - tf->tf_usr_sp = (u_int)stack + stacksize;
> + tf->tf_usr_sp = ((vaddr_t)(stack) + stacksize) & -8;
>  
>   sf = (struct switchframe *)tf - 1;
>   sf->sf_r4 = (u_int)func;
> diff --git sys/arch/arm/include/frame.h sys/arch/arm/include/frame.h
> index 31b2936..56e1368 100644
> --- sys/arch/arm/include/frame.h
> +++ sys/arch/arm/include/frame.h
> @@ -75,6 +75,7 @@ typedef struct trapframe {
>   register_t tf_svc_sp;
>   register_t tf_svc_lr;
>   register_t tf_pc;
> + register_t tf_pad;
>  } trapframe_t;
>  
>  /* Register numbers */
> @@ -137,6 +138,7 @@ typedef struct irqframe {
>   unsigned int if_svc_sp;
>   unsigned int if_svc_lr;
>   unsigned int if_pc;
> + unsigned int if_pad;
>  } irqframe_t;
>  
>  #define clockframe irqframe
> @@ -146,6 +148,7 @@ typedef struct irqframe {
>   */
>  
>  struct switchframe {
> + u_int   sf_pad;
>   u_int   sf_r4;
>   u_int   sf_r5;
>   u_int   sf_r6;
> @@ -203,6 +206,7 @@ struct frame {
>   */
>  
>  #define PUSHFRAME   \
> + sub sp, sp, #4; /* Align the stack */  \
>   str lr, [sp, #-4]!; /* Push the return address */  \
>   sub sp, sp, #(4*17);/* Adjust the stack pointer */ \
>   stmia   sp, {r0-r14}^;  /* Push the user mode registers */ \
> @@ -221,7 +225,8 @@ struct frame {
>  ldmia   sp, {r0-r14}^;   /* Restore registers (usr mode) 
> */ \
>  mov r0, r0; /* NOP for previous instruction */ \
>   add sp, sp, #(4*17);/* Adjust the stack pointer */ \
> - ldr lr, [sp], #0x0004   /* Pull the return address */
> + ldr lr, [sp], #0x0004;  /* Pull the return address */  \
> + add sp, sp, #4  /* Align the stack */
>  
>  /*
>   * PUSHFRAMEINSVC - 

Re: arm: 8-byte alignment (AAPCS)

2016-04-03 Thread Miod Vallat

> Can someone make sure this doesn't break armish/zaurus?

No noticeable regression on armish.



Re: arm: 8-byte alignment (AAPCS)

2016-04-02 Thread Patrick Wildt
On Sat, Apr 02, 2016 at 12:44:15PM -0700, Philip Guenther wrote:
> On Sat, Apr 2, 2016 at 5:19 AM, Patrick Wildt  wrote:
> > the Procedure Call Standard used in EABI requires the stack pointer to
> > be 8-byte aligned by
> >
> >   * exception handlers, before calling AAPCS-conforming code.
> >   * the OS, before giving control to an application.
> >
> > This diff makes sure our kernel interfaces adhere to that requirement.
> >
> > Can someone make sure this doesn't break armish/zaurus?
> 
> The changes to cpu_switchto(), savectx(), and switchframe seem odd;
> those are leaf functions, no?  When would their frame layouts result
> in misaligned stacks for EABI-expecting code?
> 
> The placement of if_pad is suspect: you're adjusting right before
> pushing sp, so shouldn't it be between if_sp and if_pc?  If that's off
> then the CLKF_PC() macro will access the padding instead of the pushed
> pc, which would break profiling.

Don't confuse the access to the "lr" register with the irqframe members
called lr!

A branch with link saves the current PC into the LR register.  So
basically the LR tells us where we came from.  This is basically the
PC of the process we came from, so we save it in the frame's PC member.

This means the trapframe is made by doing:

  1. pad -> if_pad
  2. push process' PC by storing the current LR -> if_pc
  3. push SVC's LR/SP if needed -> if_svc_*
  4. push the user mode registers -> up until spsr
  5. push the user mode cpsr (current spsr) -> if_spsr

> 
> 
> Philip Guenther
> 



Re: arm: 8-byte alignment (AAPCS)

2016-04-02 Thread Patrick Wildt
On Sat, Apr 02, 2016 at 01:26:18PM -0700, Philip Guenther wrote:
> On Sat, Apr 2, 2016 at 12:44 PM, Philip Guenther  wrote:
> > On Sat, Apr 2, 2016 at 5:19 AM, Patrick Wildt  wrote:
> >> the Procedure Call Standard used in EABI requires the stack pointer to
> >> be 8-byte aligned by
> >>
> >>   * exception handlers, before calling AAPCS-conforming code.
> >>   * the OS, before giving control to an application.
> >>
> >> This diff makes sure our kernel interfaces adhere to that requirement.
> >>
> >> Can someone make sure this doesn't break armish/zaurus?
> >
> > The changes to cpu_switchto(), savectx(), and switchframe seem odd;
> > those are leaf functions, no?  When would their frame layouts result
> > in misaligned stacks for EABI-expecting code?
> 
> Ah, I now see where cpu_switchto() makes calls to C functions, so that
> part makes sense to me now.
> 
> 
> Philip Guenther
> 

Actually, it's a bit tricky.  savectx() actually does work with the SP,
even though it's not easily visible.  If we look in the !XSCALE case,
you can see it also stores r13, which is SP.  In the XSCALE case the
strd on r12 also stores r13.  This directly correlates with how
cpu_fork() and cpu_switchto() work.

What cpu_fork() does is create/fake two frames.  First a trapframe.
This trapframe is used to unwind using a PULLFRAME macro in
proc_trampoline().  The same macro, and the PUSHFRAME macro, is used on
kernel entry (e.g. interrupt).  It's reserved from a per-process kernel
stack pointer.  Then it sets up a switchframe.  This is used to unwind
out of cpu_switchto().  It's reserved directly after the trapframe.

struct switchframe *sf;
[..]
pcb->pcb_un.un_32.pcb32_sp = (u_int)p2->p_addr + USPACE_SVC_STACK_TOP;
p2->p_addr->u_pcb.pcb_tf = tf =
(struct trapframe *)pcb->pcb_un.un_32.pcb32_sp - 1;
[..]
sf = (struct switchframe *)tf - 1;
[..]
sf->sf_pc = (u_int)proc_trampoline;
pcb->pcb_un.un_32.pcb32_sp = (u_int)sf;

As we pad the trapframe and switchframe, pcb32_sp stays aligned.
This pcb32_sp is later on being used as a SP in cpu_switchto():

/* Restore all the save registers */
[..]
ldmia   r7, {r8-r13}
[..]

/*
 * Pull the registers that got pushed when either savectx() or
 * cpu_switch() was called and return.
 */
add sp, sp, #4
ldmfd   sp!, {r4-r7, pc}

The start of cpu_switchto() saves the registers into a PCB struct,
exactly like savectx().

So this stuff essentially makes sure that the in-kernel stack is
correctly aligned, too.



Re: arm: 8-byte alignment (AAPCS)

2016-04-02 Thread Philip Guenther
On Sat, Apr 2, 2016 at 12:44 PM, Philip Guenther  wrote:
> On Sat, Apr 2, 2016 at 5:19 AM, Patrick Wildt  wrote:
>> the Procedure Call Standard used in EABI requires the stack pointer to
>> be 8-byte aligned by
>>
>>   * exception handlers, before calling AAPCS-conforming code.
>>   * the OS, before giving control to an application.
>>
>> This diff makes sure our kernel interfaces adhere to that requirement.
>>
>> Can someone make sure this doesn't break armish/zaurus?
>
> The changes to cpu_switchto(), savectx(), and switchframe seem odd;
> those are leaf functions, no?  When would their frame layouts result
> in misaligned stacks for EABI-expecting code?

Ah, I now see where cpu_switchto() makes calls to C functions, so that
part makes sense to me now.


Philip Guenther



Re: arm: 8-byte alignment (AAPCS)

2016-04-02 Thread Philip Guenther
On Sat, Apr 2, 2016 at 5:19 AM, Patrick Wildt  wrote:
> the Procedure Call Standard used in EABI requires the stack pointer to
> be 8-byte aligned by
>
>   * exception handlers, before calling AAPCS-conforming code.
>   * the OS, before giving control to an application.
>
> This diff makes sure our kernel interfaces adhere to that requirement.
>
> Can someone make sure this doesn't break armish/zaurus?

The changes to cpu_switchto(), savectx(), and switchframe seem odd;
those are leaf functions, no?  When would their frame layouts result
in misaligned stacks for EABI-expecting code?

The placement of if_pad is suspect: you're adjusting right before
pushing sp, so shouldn't it be between if_sp and if_pc?  If that's off
then the CLKF_PC() macro will access the padding instead of the pushed
pc, which would break profiling.


Philip Guenther



arm: 8-byte alignment (AAPCS)

2016-04-02 Thread Patrick Wildt
Hi,

the Procedure Call Standard used in EABI requires the stack pointer to
be 8-byte aligned by

  * exception handlers, before calling AAPCS-conforming code.
  * the OS, before giving control to an application.

This diff makes sure our kernel interfaces adhere to that requirement.

Can someone make sure this doesn't break armish/zaurus?

Patrick

diff --git sys/arch/arm/arm/cpuswitch.S sys/arch/arm/arm/cpuswitch.S
index 98e2dbe..0c3d0af 100644
--- sys/arch/arm/arm/cpuswitch.S
+++ sys/arch/arm/arm/cpuswitch.S
@@ -171,6 +171,7 @@ ENTRY(cpu_idle_leave)
 
 ENTRY(cpu_switchto)
stmfd   sp!, {r4-r7, lr}
+   sub sp, sp, #4
 
 #ifdef MULTIPROCESSOR
/* XXX use curcpu() */
@@ -451,6 +452,7 @@ ENTRY(cpu_switchto)
 * Pull the registers that got pushed when either savectx() or
 * cpu_switch() was called and return.
 */
+   add sp, sp, #4
ldmfd   sp!, {r4-r7, pc}
 
 /* LINTSTUB: Func: void savectx(struct pcb *pcb) */
@@ -461,6 +463,7 @@ ENTRY(savectx)
 
/* Push registers.*/
stmfd   sp!, {r4-r7, lr}
+   sub sp, sp, #4
 
/* Store all the registers in the process's pcb */
 #ifndef __XSCALE__
@@ -473,6 +476,7 @@ ENTRY(savectx)
 #endif
 
/* Pull the regs of the stack */
+   add sp, sp, #4
ldmfd   sp!, {r4-r7, pc}
 
 ENTRY(proc_trampoline)
diff --git sys/arch/arm/arm/cpuswitch7.S sys/arch/arm/arm/cpuswitch7.S
index 126b41a..4db9a86 100644
--- sys/arch/arm/arm/cpuswitch7.S
+++ sys/arch/arm/arm/cpuswitch7.S
@@ -165,6 +165,7 @@ ENTRY(cpu_idle_leave)
 
 ENTRY(cpu_switchto)
stmfd   sp!, {r4-r7, lr}
+   sub sp, sp, #4
 
 #ifdef MULTIPROCESSOR
/* XXX use curcpu() */
@@ -396,6 +397,7 @@ ENTRY(cpu_switchto)
 * Pull the registers that got pushed when either savectx() or
 * cpu_switch() was called and return.
 */
+   add sp, sp, #4
ldmfd   sp!, {r4-r7, pc}
 
 /* LINTSTUB: Func: void savectx(struct pcb *pcb) */
@@ -406,6 +408,7 @@ ENTRY(savectx)
 
/* Push registers.*/
stmfd   sp!, {r4-r7, lr}
+   sub sp, sp, #4
 
/* Store all the registers in the process's pcb */
 #ifndef __XSCALE__
@@ -418,6 +421,7 @@ ENTRY(savectx)
 #endif
 
/* Pull the regs of the stack */
+   add sp, sp, #4
ldmfd   sp!, {r4-r7, pc}
 
 ENTRY(proc_trampoline)
diff --git sys/arch/arm/arm/vm_machdep.c sys/arch/arm/arm/vm_machdep.c
index 06f217b..84dfb68 100644
--- sys/arch/arm/arm/vm_machdep.c
+++ sys/arch/arm/arm/vm_machdep.c
@@ -140,10 +140,11 @@ cpu_fork(p1, p2, stack, stacksize, func, arg)
*tf = *p1->p_addr->u_pcb.pcb_tf;
 
/*
-* If specified, give the child a different stack.
+* If specified, give the child a different stack (make sure
+* it's 8-byte aligned).
 */
if (stack != NULL)
-   tf->tf_usr_sp = (u_int)stack + stacksize;
+   tf->tf_usr_sp = ((vaddr_t)(stack) + stacksize) & -8;
 
sf = (struct switchframe *)tf - 1;
sf->sf_r4 = (u_int)func;
diff --git sys/arch/arm/include/frame.h sys/arch/arm/include/frame.h
index 31b2936..56e1368 100644
--- sys/arch/arm/include/frame.h
+++ sys/arch/arm/include/frame.h
@@ -75,6 +75,7 @@ typedef struct trapframe {
register_t tf_svc_sp;
register_t tf_svc_lr;
register_t tf_pc;
+   register_t tf_pad;
 } trapframe_t;
 
 /* Register numbers */
@@ -137,6 +138,7 @@ typedef struct irqframe {
unsigned int if_svc_sp;
unsigned int if_svc_lr;
unsigned int if_pc;
+   unsigned int if_pad;
 } irqframe_t;
 
 #define clockframe irqframe
@@ -146,6 +148,7 @@ typedef struct irqframe {
  */
 
 struct switchframe {
+   u_int   sf_pad;
u_int   sf_r4;
u_int   sf_r5;
u_int   sf_r6;
@@ -203,6 +206,7 @@ struct frame {
  */
 
 #define PUSHFRAME \
+   sub sp, sp, #4; /* Align the stack */  \
str lr, [sp, #-4]!; /* Push the return address */  \
sub sp, sp, #(4*17);/* Adjust the stack pointer */ \
stmia   sp, {r0-r14}^;  /* Push the user mode registers */ \
@@ -221,7 +225,8 @@ struct frame {
 ldmia   sp, {r0-r14}^; /* Restore registers (usr mode) */ \
 mov r0, r0; /* NOP for previous instruction */ \
add sp, sp, #(4*17);/* Adjust the stack pointer */ \
-   ldr lr, [sp], #0x0004   /* Pull the return address */
+   ldr lr, [sp], #0x0004;  /* Pull the return address */  \
+   add sp, sp, #4  /* Align the stack */
 
 /*
  * PUSHFRAMEINSVC - macro to push a trap frame on the stack in SVC32 mode
@@ -241,6 +246,8 @@ struct frame {
orr r2, r2, #(PSR_SVC32_MODE); \
msr cpsr_c, r2; /* Punch into SVC mode */  \