https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104188

            Bug ID: 104188
           Summary: gcc omitting AVX-512 broadcast instruction
           Product: gcc
           Version: 11.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: kvr000 at gmail dot com
  Target Milestone: ---

Hi,
there is a bug when generating AVX-512 instructions from intrinsics.  The code
is generated correctly by gcc-10, but gcc-11 completely omits the
vbroadcastf32x4 instruction.

gcc version: 11.2.0-7ubuntu2 - 11.2.0

Source code of minimal working example:
// Matrix 4*4 multiplication:

#ifndef NO_VECTORIZE
#ifdef __x86_64__
#include <immintrin.h>
#include <x86intrin.h>
#endif
#ifdef __aarch64__
#include <arm_neon.h>
#endif
#endif

// 4x4 single-precision matrix.  Being a union, every member below is a
// different view of the same storage: a plain scalar array plus, unless
// NO_VECTORIZE is defined, target-specific SIMD views.
union Mat44 {
        // Scalar view, indexed m[row][column].
        float m[4][4];
#ifndef NO_VECTORIZE
#ifdef __x86_64__
        // One 128-bit vector per matrix row.
        __m128 row[4];
        // Two 256-bit vectors, each overlaying two consecutive rows.
        __m256 rowDuet[2];
        // A single 512-bit vector overlaying the whole matrix.
        __m512 rowQuad;
#endif
#ifdef __aarch64__
        // One NEON vector of four floats per matrix row.
        float32x4_t row[4];
#endif
#endif
};

// Computes the 4x4 matrix product a * b into *out using 512-bit AVX-512
// vectors, so that all four result rows are produced at once.
//
// out: destination matrix, written with an unaligned 512-bit store.
// a:   left operand, read with an unaligned 512-bit load.
// b:   right operand, read one 128-bit row at a time.
//
// NOTE: this function uses the __m512 type and the `row` member without any
// preprocessor guard, so it only compiles when the x86_64 branch of Mat44 is
// active (NO_VECTORIZE undefined) and the AVX-512 intrinsics are available.
void matmult_avx512(union Mat44 *out, union Mat44 *a, union Mat44 *b)
{
        // All 16 floats of a: each 128-bit lane holds one row of a.
        __m512 a0123 = _mm512_loadu_ps(a->m[0]);
        // Replicate each 128-bit row of b into all four 128-bit lanes.
        // These are the broadcasts that should become vbroadcastf32x4.
        __m512 b0000 = _mm512_broadcast_f32x4(b->row[0]);
        __m512 b1111 = _mm512_broadcast_f32x4(b->row[1]);
        __m512 b2222 = _mm512_broadcast_f32x4(b->row[2]);
        __m512 b3333 = _mm512_broadcast_f32x4(b->row[3]);

        // The permute immediates 0x00, 0x55, 0xaa and 0xff splat element
        // 0, 1, 2 and 3 respectively across each 128-bit lane, so lane i
        // accumulates a[i][0]*b[0] + a[i][1]*b[1] + a[i][2]*b[2] + a[i][3]*b[3].
        __m512 result = _mm512_mul_ps(_mm512_permute_ps(a0123, 0x00), b0000);
        result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0x55), b1111,
result);
        result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xaa), b2222,
result);
        result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xff), b3333,
result);

        // Write all four result rows back in one unaligned store.
        _mm512_storeu_ps(out->m[0], result);
}


gcc-10 (correct):

        endbr64
        vmovups (%rsi), %zmm0
        vbroadcastf32x4 (%rdx), %zmm6           // note here
        vpermilps       $0, %zmm0, %zmm1
        vmulps  %zmm6, %zmm1, %zmm1
        vbroadcastf32x4 16(%rdx), %zmm5         // note here
        vpermilps       $85, %zmm0, %zmm2
        vbroadcastf32x4 32(%rdx), %zmm4         // note here
        vbroadcastf32x4 48(%rdx), %zmm3         // note here
        vfmadd132ps     %zmm5, %zmm1, %zmm2
        vpermilps       $170, %zmm0, %zmm1
        vpermilps       $255, %zmm0, %zmm0
        vfmadd132ps     %zmm4, %zmm2, %zmm1
        vfmadd132ps     %zmm3, %zmm1, %zmm0
        vmovups %zmm0, (%rdi)
        vzeroupper
        ret


gcc-11 (missing vbroadcastf32x4):

        endbr64
        vmovups (%rsi), %zmm0
        vpermilps       $0, %zmm0, %zmm1
        vmulps  (%rdx){1to16}, %zmm1, %zmm1
        vpermilps       $85, %zmm0, %zmm2
        vfmadd132ps     16(%rdx){1to16}, %zmm1, %zmm2
        vpermilps       $170, %zmm0, %zmm1
        vpermilps       $255, %zmm0, %zmm0
        vfmadd132ps     32(%rdx){1to16}, %zmm2, %zmm1
        vfmadd132ps     48(%rdx){1to16}, %zmm1, %zmm0
        vmovups %zmm0, (%rdi)
        vzeroupper
        ret

Reply via email to