Hi,
I found a performance problem with some SIMD intrinsics (vld2_dup_*)
on aarch64-none-elf. Currently the vld2_dup_* intrinsics are defined
as follows:
#define __LD2R_FUNC(rettype, structtype, ptrtype,                      \
                    regsuffix, funcsuffix, Q)                          \
  __extension__ static __inline rettype                                \
  __attribute__ ((__always_inline__))                                  \
  vld2 ## Q ## _dup_ ## funcsuffix (const ptrtype *ptr)                \
  {                                                                    \
    rettype result;                                                    \
    __asm__ ("ld2r {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t"   \
             "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t"    \
             : "=Q"(result)                                            \
             : "Q"(*(const structtype *)ptr)                           \
             : "memory", "v16", "v17");                                \
    return result;                                                     \
  }
It loads from memory into registers and then stores the register
contents back to memory to produce the result. This performs poorly
because of the redundant memory round trip, and because hard-coding
v16/v17 restricts register allocation.
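For example, in a caller that consumes the loaded values immediately,
the data still makes a round trip through a stack temporary even
though ld2r has already put it into v16/v17 (use_ld2_dup below is just
a made-up caller for illustration):

#include <arm_neon.h>

/* Illustration only: with the asm-based definition above, the values that
   ld2r loads into v16/v17 are first stored to a stack temporary by st1 and
   then reloaded before the vadd_s16, instead of being used directly from
   the registers.  */
int16x4_t
use_ld2_dup (const int16_t *p)
{
  int16x4x2_t v = vld2_dup_s16 (p);
  return vadd_s16 (v.val[0], v.val[1]);
}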
Some intrinsics such as vld2_* used to be implemented in a similar
way, but they are now realized as builtin functions:
__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
vld2_s16 (const int16_t * __a)
{
int16x4x2_t ret;
__builtin_aarch64_simd_oi __o;
__o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
return ret;
}
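Following the same pattern, a builtin-based vld2_dup_s16 might look
roughly like the sketch below; note that __builtin_aarch64_ld2rv4hi is
a hypothetical name for a builtin that does not exist yet and would
have to be added, expanding to a single ld2r:

/* Sketch only: __builtin_aarch64_ld2rv4hi is hypothetical, modeled on
   __builtin_aarch64_ld2v4hi but expanding to ld2r instead of ld2.  */
__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
vld2_dup_s16 (const int16_t * __a)
{
  int16x4x2_t ret;
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
  return ret;
}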
Could the vld2_dup_* intrinsics also be written as builtins? If not, I
think the inline assembly could at least be optimized as follows:
#define __LD2R_FUNC(rettype, structtype, ptrtype,                      \
                    regsuffix, funcsuffix, Q)                          \
  __extension__ static __inline rettype                                \
  __attribute__ ((__always_inline__))                                  \
  vld2 ## Q ## _dup_ ## funcsuffix (const ptrtype *ptr)                \
  {                                                                    \
    rettype result;                                                    \
    __asm__ ("ld2r {%0." #regsuffix ", %1." #regsuffix "}, %2"         \
             : "=V16"(result.val[0]), "=V17"(result.val[1])            \
             : "Q"(*(const structtype *)ptr)                           \
             : "memory");                                              \
    return result;                                                     \
  }
This requires adding register classes for v16 and v17 and defining the
constraints V16 and V17 for them, which means modifying aarch64.h,
aarch64.c and constraints.md.
--
Shanyao Chen