https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108320
Bug ID: 108320
Summary: Missing vector/array arithmetic optimization compared to valarray
Product: gcc
Version: 12.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: diegoandres91b at hotmail dot com
Target Milestone: ---

The following code (compiled with -O3 -mavx2 -mfma):

#include <valarray>
#include <vector>
#include <array>
using namespace std;

valarray<float> fma1(const valarray<float> &a, const valarray<float> &b, const valarray<float> &c) {
    return a * b + c;
}

template<class T>
struct vec : vector<T> {
    constexpr vec(size_t count) : vector<T>(count) {}
};

template<class T>
constexpr vec<T> operator*(const vec<T> &a, const vec<T> &b) {
    vec<T> c(a.size());
    for (size_t i = 0; i < c.size(); ++i)
        c[i] = a[i] * b[i];
    return c;
}

template<class T>
constexpr vec<T> operator+(const vec<T> &a, const vec<T> &b) {
    vec<T> c(a.size());
    for (size_t i = 0; i < c.size(); ++i)
        c[i] = a[i] + b[i];
    return c;
}

vec<float> fma2(const vec<float> &a, const vec<float> &b, const vec<float> &c) {
    return a * b + c;
}

template<class T, size_t N>
struct arr : array<T, N> { };

template<class T, size_t N>
constexpr arr<T, N> operator*(const arr<T, N> &a, const arr<T, N> &b) {
    arr<T, N> c;
    for (size_t i = 0; i < c.size(); ++i)
        c[i] = a[i] * b[i];
    return c;
}

template<class T, size_t N>
constexpr arr<T, N> operator+(const arr<T, N> &a, const arr<T, N> &b) {
    arr<T, N> c;
    for (size_t i = 0; i < c.size(); ++i)
        c[i] = a[i] + b[i];
    return c;
}

constexpr size_t N = 1024;

arr<float, N> fma3(const arr<float, N> &a, const arr<float, N> &b, const arr<float, N> &c) {
    return a * b + c;
}

GCC only optimizes the valarray version (fma1) of the fma function (it uses vfmadd132ps):

...
.L4:
        vmovups ymm0, YMMWORD PTR [rdi+rax]
        vmovups ymm1, YMMWORD PTR [rcx+rax]
        vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
        vmovups YMMWORD PTR [rdx+rax], ymm0
        add rax, 32
        cmp rax, r8
        jne .L4
        mov rax, r10
        and rax, -8
        lea r9, [0+rax*4]
        lea r11, [rdx+r9]
        test r10b, 7
        je .L22
        vzeroupper
.L3:
        mov r8, r10
        sub r8, rax
        lea r12, [r8-1]
        cmp r12, 2
        jbe .L6
        vmovups xmm0, XMMWORD PTR [rdi+rax*4]
        vmovups xmm2, XMMWORD PTR [rcx+rax*4]
        vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+rax*4]
        vmovups XMMWORD PTR [rdx+r9], xmm0
        test r8b, 3
        je .L1
        and r8, -4
        add rax, r8
        lea r11, [r11+r8*4]
        lea r9, [0+rax*4]
...

But it does not optimize the vector or array versions of the function (fma2 and fma3).

Note: For smaller N, fma3 is optimized, but for larger values such as 1024 (as in the example) it is not.

Compiler Explorer code: https://godbolt.org/z/v8dnx5aMo