The compiler cannot carry-chain more than two additions together, so use inline assembly (ADC/SBB on x86_64) for chains of 3 or 4 additions or subtractions.
Signed-off-by: Richard Henderson <richard.hender...@linaro.org> --- include/fpu/softfloat-macros.h | 18 ++++++++++++++++-- fpu/softfloat.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h index 95d88d05b8..99fa124e56 100644 --- a/include/fpu/softfloat-macros.h +++ b/include/fpu/softfloat-macros.h @@ -436,6 +436,13 @@ static inline void uint64_t *z2Ptr ) { +#ifdef __x86_64__ + asm("add %5, %2\n\t" + "adc %4, %1\n\t" + "adc %3, %0" + : "=&r"(*z0Ptr), "=&r"(*z1Ptr), "=&r"(*z2Ptr) + : "rm"(b0), "rm"(b1), "rm"(b2), "0"(a0), "1"(a1), "2"(a2)); +#else uint64_t z0, z1, z2; int8_t carry0, carry1; @@ -450,7 +457,7 @@ static inline void *z2Ptr = z2; *z1Ptr = z1; *z0Ptr = z0; - +#endif } /*---------------------------------------------------------------------------- @@ -494,6 +501,13 @@ static inline void uint64_t *z2Ptr ) { +#ifdef __x86_64__ + asm("sub %5, %2\n\t" + "sbb %4, %1\n\t" + "sbb %3, %0" + : "=&r"(*z0Ptr), "=&r"(*z1Ptr), "=&r"(*z2Ptr) + : "rm"(b0), "rm"(b1), "rm"(b2), "0"(a0), "1"(a1), "2"(a2)); +#else uint64_t z0, z1, z2; int8_t borrow0, borrow1; @@ -508,7 +522,7 @@ static inline void *z2Ptr = z2; *z1Ptr = z1; *z0Ptr = z0; - +#endif } /*---------------------------------------------------------------------------- diff --git a/fpu/softfloat.c b/fpu/softfloat.c index 5b714fbd82..d8e5d90fd7 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -7297,6 +7297,15 @@ static void shift256RightJamming(uint64_t p[4], int count) /* R = A - B */ static void sub256(uint64_t r[4], uint64_t a[4], uint64_t b[4]) { +#if defined(__x86_64__) + asm("sub %7, %3\n\t" + "sbb %6, %2\n\t" + "sbb %5, %1\n\t" + "sbb %4, %0" + : "=&r"(r[0]), "=&r"(r[1]), "=&r"(r[2]), "=&r"(r[3]) + : "rme"(b[0]), "rme"(b[1]), "rme"(b[2]), "rme"(b[3]), + "0"(a[0]), "1"(a[1]), "2"(a[2]), "3"(a[3])); +#else bool borrow = false; for (int i = 3; i >= 0; --i) { @@ -7308,11 +7317,20 @@ static void 
sub256(uint64_t r[4], uint64_t a[4], uint64_t b[4]) r[i] = a[i] - b[i]; } } +#endif } /* A = -A */ static void neg256(uint64_t a[4]) { +#if defined(__x86_64__) + asm("negq %3\n\t" + "sbb %6, %2\n\t" + "sbb %5, %1\n\t" + "sbb %4, %0" + : "=&r"(a[0]), "=&r"(a[1]), "=&r"(a[2]), "+rm"(a[3]) + : "rme"(a[0]), "rme"(a[1]), "rme"(a[2]), "0"(0), "1"(0), "2"(0)); +#else a[3] = -a[3]; if (likely(a[3])) { goto not2; @@ -7333,11 +7351,20 @@ static void neg256(uint64_t a[4]) a[1] = ~a[1]; not0: a[0] = ~a[0]; +#endif } /* A += B */ static void add256(uint64_t a[4], uint64_t b[4]) { +#if defined(__x86_64__) + asm("add %7, %3\n\t" + "adc %6, %2\n\t" + "adc %5, %1\n\t" + "adc %4, %0" + : "+r"(a[0]), "+r"(a[1]), "+r"(a[2]), "+r"(a[3]) + : "rme"(b[0]), "rme"(b[1]), "rme"(b[2]), "rme"(b[3])); +#else bool carry = false; for (int i = 3; i >= 0; --i) { @@ -7350,6 +7377,7 @@ static void add256(uint64_t a[4], uint64_t b[4]) } a[i] = t; } +#endif } float128 float128_muladd(float128 a_f, float128 b_f, float128 c_f, -- 2.25.1