https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71927
Bug ID: 71927 Summary: stack alignment prologue not optimized out when no local variables remain Product: gcc Version: 5.4.0 Status: UNCONFIRMED Keywords: missed-optimization Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: mirq-gccboogs at rere dot qmqm.pl Target Milestone: --- Created attachment 38925 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=38925&action=edit test case gcc 5.4 optimizes out the local array, but does not remove the stack-alignment code. x() has a 2-element __m256i array - it gets optimized out correctly. y() has a 3-element __m256i array - the stack alignment is left in place. For the y() case, at -O2 the loop is not unrolled (unlike x()). $ gcc -mavx2 -O3 -S -o - a.c [...] x: .LFB4854: .cfi_startproc vmovdqa (%rdi), %ymm2 vpminud %ymm1, %ymm2, %ymm3 vpmaxud %ymm1, %ymm2, %ymm1 vmovdqa %ymm1, (%rdi) vmovdqa 32(%rdi), %ymm1 vpminud %ymm0, %ymm1, %ymm2 vpmaxud %ymm0, %ymm1, %ymm0 vmovdqa %ymm0, 32(%rdi) vzeroupper ret .cfi_endproc [...] y: .LFB4855: .cfi_startproc leaq 8(%rsp), %r10 .cfi_def_cfa 10, 0 andq $-32, %rsp pushq -8(%r10) pushq %rbp .cfi_escape 0x10,0x6,0x2,0x76,0 movq %rsp, %rbp pushq %r10 .cfi_escape 0xf,0x3,0x76,0x78,0x6 vmovdqa (%rdi), %ymm0 vpminud %ymm0, %ymm3, %ymm4 vpmaxud %ymm0, %ymm3, %ymm0 vmovdqa %ymm0, (%rdi) vmovdqa 32(%rdi), %ymm0 vpminud %ymm0, %ymm2, %ymm3 vpmaxud %ymm0, %ymm2, %ymm0 vmovdqa %ymm0, 32(%rdi) vmovdqa 64(%rdi), %ymm0 vpminud %ymm0, %ymm1, %ymm2 vpmaxud %ymm0, %ymm1, %ymm0 vmovdqa %ymm0, 64(%rdi) vzeroupper popq %r10 .cfi_def_cfa 10, 0 popq %rbp leaq -8(%r10), %rsp .cfi_def_cfa 7, 8 ret .cfi_endproc $ gcc -mavx2 -O2 -S -o - a.c [... 
x() - same as -O3] y: .LFB4855: .cfi_startproc leaq 8(%rsp), %r10 .cfi_def_cfa 10, 0 andq $-32, %rsp pushq -8(%r10) pushq %rbp .cfi_escape 0x10,0x6,0x2,0x76,0 movq %rsp, %rbp pushq %r10 .cfi_escape 0xf,0x3,0x76,0x78,0x6 vmovdqa %ymm2, -112(%rbp) xorl %eax, %eax vmovdqa %ymm1, -80(%rbp) vmovdqa %ymm0, -48(%rbp) .L3: vmovdqa (%rdi,%rax), %ymm0 vmovdqa -112(%rbp,%rax), %ymm1 vpminud %ymm0, %ymm1, %ymm2 vpmaxud %ymm0, %ymm1, %ymm0 vmovdqa %ymm2, -112(%rbp,%rax) vmovdqa %ymm0, (%rdi,%rax) addq $32, %rax cmpq $96, %rax jne .L3 vmovdqa -48(%rbp), %ymm2 vmovdqa -80(%rbp), %ymm1 vmovdqa -112(%rbp), %ymm0 vzeroupper popq %r10 .cfi_def_cfa 10, 0 popq %rbp leaq -8(%r10), %rsp .cfi_def_cfa 7, 8 ret .cfi_endproc $ gcc -v Using built-in specs. COLLECT_GCC=gcc COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper Target: x86_64-linux-gnu Configured with: ../src/configure -v --with-pkgversion='Debian 5.4.0-6' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu 
--target=x86_64-linux-gnu Thread model: posix gcc version 5.4.0 20160609 (Debian 5.4.0-6)