https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116590
--- Comment #4 from nihui <shuizhuyuanluo at gmail dot com> ---
Some more detailed investigation
For vfrec7, I can use vfdiv instead, which works, but the vmv1r/vmv8r
instructions are hard to avoid in the code, as they seem to be inserted
automatically by the compiler.
I wrote 3 test cases: 1 that passes (no vmv8r generated) at every optimization
level, 1 that breaks at -O0, and 1 that breaks at -O1 and -O2/-O3.
The failing cases differ from the passing one only by inlining the _a variable
or by changing its scope.
// all pass
// Reproducer variant 1: the 0.5f splat is stored in a named local (_a) that is
// declared inside the loop body and then passed to vfmacc as its first operand.
// Updates the n floats at ptr in place, handling vl elements per iteration.
// NOTE(review): the name says "gen_vmv8r" but the result label for this case
// reads "all pass" — the name may be swapped with no_vmv8r; confirm.
void gen_vmv8r(float* ptr, int n)
{
while (n > 0)
{
// vl is the element count returned by vsetvl for the remaining n
size_t vl = __riscv_vsetvl_e32m8(n);
vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
// splat of 0.5f kept in a loop-local variable (the property under test)
vfloat32m8_t _a = __riscv_vfmv_v_f_f32m8(0.5f, vl);
// per the RVV intrinsics spec this should yield _a + 1.4f * _p — TODO confirm
_p = __riscv_vfmacc_vf_f32m8(_a, 1.4f, _p, vl);
__riscv_vse32_v_f32m8(ptr, _p, vl);
// step past the vl elements just stored
ptr += vl;
n -= vl;
}
}
// -O0 gen vmv8r
// -O1 pass
// -O2/-O3 pass
// Reproducer variant 2: same loop as the named-local variant, except the 0.5f
// splat is written inline as the first argument of vfmacc instead of being
// stored in a variable first.
// Updates the n floats at ptr in place, handling vl elements per iteration.
void gen_vmv8r(float* ptr, int n)
{
while (n > 0)
{
// vl is the element count returned by vsetvl for the remaining n
size_t vl = __riscv_vsetvl_e32m8(n);
vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
// splat is a temporary here (the property under test); per the RVV
// intrinsics spec the result should be 0.5f + 1.4f * _p — TODO confirm
_p = __riscv_vfmacc_vf_f32m8(__riscv_vfmv_v_f_f32m8(0.5f, vl), 1.4f,
_p, vl);
__riscv_vse32_v_f32m8(ptr, _p, vl);
// step past the vl elements just stored
ptr += vl;
n -= vl;
}
}
// -O0 pass
// -O1 gen vmv8r
// -O2/-O3 gen two vmv8r
// Reproducer variant 3: the 0.5f splat (_a) is created once before the loop,
// using vl0 obtained from vsetvl on the initial n, and the same _a is passed
// to vfmacc on every iteration.
// Updates the n floats at ptr in place, handling vl elements per iteration.
// NOTE(review): the name says "no_vmv8r" but the result label for this case
// reports vmv8r being generated at -O1 and -O2/-O3 — the name may be swapped
// with gen_vmv8r; confirm.
void no_vmv8r(float* ptr, int n)
{
// vl0 is only used to size the one-time splat below
size_t vl0 = __riscv_vsetvl_e32m8(n);
// splat hoisted out of the loop (the property under test)
vfloat32m8_t _a = __riscv_vfmv_v_f_f32m8(0.5f, vl0);
while (n > 0)
{
// vl is the element count returned by vsetvl for the remaining n
size_t vl = __riscv_vsetvl_e32m8(n);
vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
// _a is reused as the first operand each iteration; per the RVV intrinsics
// spec the result should be _a + 1.4f * _p — TODO confirm
_p = __riscv_vfmacc_vf_f32m8(_a, 1.4f, _p, vl);
__riscv_vse32_v_f32m8(ptr, _p, vl);
// step past the vl elements just stored
ptr += vl;
n -= vl;
}
}