https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79355
Bug ID: 79355 Summary: poor code for AVX vector compare Product: gcc Version: 6.3.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: mirq-gccboogs at rere dot qmqm.pl Target Milestone: --- gcc-6.2 (and previous versions) generate very inefficient code for AVX when comparing 32-byte vectors: $ cat a.c #include <x86intrin.h> __v8su eq2(__v8su a, __v8su b) { return a == b; } $ gcc -S -Ofast -mavx a.c -o - .file "a.c" .text .p2align 4,,15 .globl eq2 .type eq2, @function eq2: .LFB4856: .cfi_startproc vmovd %xmm0, %edx vmovd %xmm1, %eax leaq 8(%rsp), %r10 .cfi_def_cfa 10, 0 vpextrd $1, %xmm0, %ecx andq $-32, %rsp cmpl %eax, %edx [... extracting and comparing every element here ...] vpinsrd $1, %r11d, %xmm5, %xmm1 vpinsrd $1, %r9d, %xmm7, %xmm0 popq %r10 .cfi_def_cfa 10, 0 vpunpcklqdq %xmm3, %xmm0, %xmm0 vpunpcklqdq %xmm2, %xmm1, %xmm1 popq %rbp leaq -8(%r10), %rsp .cfi_def_cfa 7, 8 vinsertf128 $0x1, %xmm1, %ymm0, %ymm0 ret It could instead generate the following (i.e. split each vector in half, compare the halves, and combine the results afterwards): vextractf128 $0x1, %ymm0, %xmm2 vextractf128 $0x1, %ymm1, %xmm3 vpcmpeqd %xmm1, %xmm0, %xmm0 vpcmpeqd %xmm3, %xmm2, %xmm2 vinsertf128 $0x1, %xmm2, %ymm0, %ymm0 ret $ gcc -v Using built-in specs. 
COLLECT_GCC=gcc COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/6/lto-wrapper Target: x86_64-linux-gnu Configured with: ../src/configure -v --with-pkgversion='Debian 6.3.0-5' --with-bugurl=file:///usr/share/doc/gcc-6/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-6 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --enable-default-pie --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-6-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-6-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-6-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --with-target-system-zlib --enable-objc-gc=auto --enable-multiarch --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu Thread model: posix gcc version 6.3.0 20170124 (Debian 6.3.0-5)