https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
John Platts <john_platts at hotmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |john_platts at hotmail dot com

--- Comment #28 from John Platts <john_platts at hotmail dot com> ---
The correct way to align the stack to a 32-byte or 64-byte boundary on 64-bit
Windows is to use a frame pointer in a function that requires stack realignment
and then realign the stack to the required alignment once the frame pointer is
set and all of the non-volatile registers used in the function are saved.

class Avx2VectorGenerator {
public:
    virtual __m256i NextVector() = 0;
};

__m256i Example_AVX2_Func(Avx2VectorGenerator* generator, size_t iterations);

/*-----------------------------------------------------------------------
 * __m256i Example_AVX2_Func(Avx2VectorGenerator* generator,
 *                           size_t iterations)
 *
 * Syntax: GNU as, AT&T operand order (source first, destination last).
 * ABI:    Microsoft x64.
 * In:     rcx = generator, rdx = iterations
 * Out:    ymm0 = byte-wise sum (vpaddb) of `iterations` results of
 *         generator->NextVector(); all zero when iterations == 0.
 *         (This example takes the __m256i result of NextVector() and
 *         of this function to live in ymm0.)
 * Saved:  rbp = frame pointer, rbx = generator, rdi = iterations left.
 *         All three are non-volatile, so they survive the call.
 * Stack:  after the realignment rsp is 32-byte aligned and at least
 *         LOCAL_AREA bytes are reserved below the frame pointer:
 *             0 .. 31 (%rsp)   shadow space owed to the callee
 *            32 .. 63 (%rsp)   32-byte aligned spill slot for the result
 *---------------------------------------------------------------------*/
        .equ    SHADOW_SPACE, 32        /* caller-provided home area for any callee  */
        .equ    LOCAL_AREA,   64        /* shadow space + one 32-byte vector slot     */
        .equ    VEC_ALIGN,    32        /* alignment vmovdqa needs for a ymm operand  */

        .seh_proc Example_AVX2_Func
Example_AVX2_Func:
        pushq   %rbp
        .seh_pushreg %rbp
        pushq   %rbx
        .seh_pushreg %rbx
        pushq   %rdi
        .seh_pushreg %rdi
        movq    %rsp, %rbp
        .seh_setframe %rbp, 0
        .seh_endprologue

        /* Set rbx to generator and rdi to iterations */
        movq    %rcx, %rbx
        movq    %rdx, %rdi

        /* It is okay to allocate additional stack memory   */
        /* and re-align the stack pointer outside of the    */
        /* SEH prologue as there is a frame pointer in this */
        /* function                                         */
        subq    $LOCAL_AREA, %rsp
        andq    $-VEC_ALIGN, %rsp

        /* Zero out the result vector */
        vpxor   %ymm0, %ymm0, %ymm0

        /* Nothing to accumulate when iterations == 0 */
        testq   %rdi, %rdi
        jz      .Lloop_complete

.Lloop_iteration_start:
        /* Save the result vector to 32(%rsp); ymm0 is volatile and is */
        /* overwritten by the return value of the call below           */
        vmovdqa %ymm0, SHADOW_SPACE(%rsp)

        /* Move generator into rcx (the `this` argument) */
        movq    %rbx, %rcx

        /* Load the vtable pointer of *generator into rax; NextVector() */
        /* is the only virtual member, so it is the first vtable entry  */
        movq    (%rbx), %rax

        /* Call generator->NextVector() */
        call    *(%rax)

        /* Add the result of generator->NextVector() to the result vector */
        vpaddb  SHADOW_SPACE(%rsp), %ymm0, %ymm0

        /* Decrement iterations by 1 */
        subq    $1, %rdi

        /* Jump back to the beginning of the loop if iterations is non-zero */
        jnz     .Lloop_iteration_start

.Lloop_complete:
        /* Drop the realigned area by restoring rsp from the frame pointer, */
        /* then pop the saved registers in reverse push order               */
        leaq    (%rbp), %rsp
        popq    %rdi
        popq    %rbx
        popq    %rbp
        ret
        .seh_endproc