https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66697

--- Comment #10 from Uroš Bizjak <ubizjak at gmail dot com> ---
Version 2 of the patch will be attached. For the following testcase:

--cut here--
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

__v4sf
__attribute__((force_align_arg_pointer, ms_abi))
test_ms (__v4sf a, __v4sf b)
{
  /* Pin the incoming values to xmm6 and xmm7, which are callee-saved
     under the ms_abi calling convention.  */
  volatile register __v4sf x __asm__("%xmm6") = a;
  volatile register __v4sf y __asm__("%xmm7") = b;

  volatile __v4sf r = x + y;

  return r;
}
--cut here--

-O2 generates:

0000000000000000 <test_ms>:
   0:   55                      push   %rbp
   1:   48 89 e5                mov    %rsp,%rbp
   4:   48 83 ec 20             sub    $0x20,%rsp
   8:   48 83 e4 f0             and    $0xfffffffffffffff0,%rsp
   c:   48 83 ec 10             sub    $0x10,%rsp
  10:   0f 28 02                movaps (%rdx),%xmm0
  13:   0f 58 01                addps  (%rcx),%xmm0
  16:   0f 11 75 e0             movups %xmm6,0xffffffffffffffe0(%rbp)
  1a:   0f 11 7d f0             movups %xmm7,0xfffffffffffffff0(%rbp)
  1e:   0f 29 04 24             movaps %xmm0,(%rsp)
  22:   0f 10 75 e0             movups 0xffffffffffffffe0(%rbp),%xmm6
  26:   0f 28 04 24             movaps (%rsp),%xmm0
  2a:   0f 10 7d f0             movups 0xfffffffffffffff0(%rbp),%xmm7
  2e:   c9                      leaveq 
  2f:   c3                      retq   

xmm6 and xmm7 are saved outside the stack realignment area: their save slots
are addressed through %rbp, which still points into the possibly unaligned
incoming frame, while the aligned slot for r is addressed through the
realigned %rsp. This is why the saves use unaligned movups stores. Please
note that movups with an aligned operand is as fast as movaps, and since an
unaligned incoming stack should be avoided anyway, I think the above code is
acceptable.
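
For reference, a minimal caller sketch (my addition, not part of the original
testcase, with a hypothetical caller name): under ms_abi, 16-byte vector
arguments are passed by reference, which is why test_ms above reads its
operands through %rcx and %rdx.

--cut here--
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

extern __v4sf test_ms (__v4sf a, __v4sf b)
  __attribute__((force_align_arg_pointer, ms_abi));

/* Hypothetical caller: the ms_abi callee receives pointers to the
   vector arguments in %rcx and %rdx, matching the movaps (%rdx)
   and addps (%rcx) loads in the dump above.  */
__v4sf
call_test_ms (void)
{
  __v4sf a = { 1.0f, 2.0f, 3.0f, 4.0f };
  __v4sf b = { 5.0f, 6.0f, 7.0f, 8.0f };

  return test_ms (a, b);
}
--cut here--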
