New patch attached.
This patch fixes ARM functions and adds a check in `fpu_fma()` for potential
NaN or INF results.
------------------
Best regards,
lh_mouse
2017-01-19
From 3c55daec84dac190b9e3cb032371960e1acbc38f Mon Sep 17 00:00:00 2001
From: LH_Mouse <lh_mo...@126.com>
Date: Wed, 18 Jan 2017 19:35:43 +0800
Subject: [PATCH] mingw-w64-crt/math/fma{,f,l}.c: Implement fused multiply-add
(FMA) funcitons for x86 families properly. mingw-w64-crt/Makefile.am:
Likewise. mingw-w64-crt/math/fma{,f}.S: Merge into corresponding C files with
the same names, respectively.
---
mingw-w64-crt/Makefile.am | 4 +-
mingw-w64-crt/math/fma.S | 42 -------------
mingw-w64-crt/math/fma.c | 31 ++++++++++
mingw-w64-crt/math/fmaf.S | 43 --------------
mingw-w64-crt/math/fmaf.c | 31 ++++++++++
mingw-w64-crt/math/fmal.c | 146 ++++++++++++++++++++++++++++++++++++++++++++--
6 files changed, 205 insertions(+), 92 deletions(-)
delete mode 100644 mingw-w64-crt/math/fma.S
create mode 100644 mingw-w64-crt/math/fma.c
delete mode 100644 mingw-w64-crt/math/fmaf.S
create mode 100644 mingw-w64-crt/math/fmaf.c
diff --git a/mingw-w64-crt/Makefile.am b/mingw-w64-crt/Makefile.am
index 44360db..5eba234 100644
--- a/mingw-w64-crt/Makefile.am
+++ b/mingw-w64-crt/Makefile.am
@@ -227,7 +227,6 @@ src_libmingwex=\
\
math/_chgsignl.S math/ceil.S math/ceilf.S math/ceill.S
math/copysignl.S \
math/floor.S math/floorf.S math/floorl.S \
- math/fma.S math/fmaf.S \
math/nearbyint.S math/nearbyintf.S math/nearbyintl.S \
math/trunc.S math/truncf.S \
math/cbrt.c \
@@ -235,7 +234,8 @@ src_libmingwex=\
math/coshf.c math/coshl.c math/erfl.c \
math/expf.c \
math/fabs.c math/fabsf.c math/fabsl.c math/fdim.c
math/fdimf.c math/fdiml.c \
- math/fmal.c math/fmax.c math/fmaxf.c math/fmaxl.c
math/fmin.c math/fminf.c \
+ math/fma.c math/fmaf.c math/fmal.c \
+ math/fmax.c math/fmaxf.c math/fmaxl.c math/fmin.c
math/fminf.c \
math/fminl.c math/fp_consts.c math/fp_constsf.c \
math/fp_constsl.c math/fpclassify.c math/fpclassifyf.c
math/fpclassifyl.c math/frexpf.c \
math/hypotf.c math/hypot.c math/hypotl.c math/isnan.c
math/isnanf.c math/isnanl.c \
diff --git a/mingw-w64-crt/math/fma.S b/mingw-w64-crt/math/fma.S
deleted file mode 100644
index 74becde..0000000
--- a/mingw-w64-crt/math/fma.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * This file has no copyright assigned and is placed in the Public Domain.
- * This file is part of the mingw-w64 runtime package.
- * No warranty is given; refer to the file DISCLAIMER.PD within this package.
- */
-#include <_mingw_mac.h>
-
- .file "fma.S"
- .text
-#ifdef __x86_64__
- .align 8
-#else
- .align 4
-#endif
- .p2align 4,,15
- .globl __MINGW_USYMBOL(fma)
- .def __MINGW_USYMBOL(fma); .scl 2; .type 32; .endef
-__MINGW_USYMBOL(fma):
-#if defined(_AMD64_) || defined(__x86_64__)
- subq $56, %rsp
- movsd %xmm0,(%rsp)
- movsd %xmm1,16(%rsp)
- movsd %xmm2,32(%rsp)
- fldl (%rsp)
- fmull 16(%rsp)
- fldl 32(%rsp)
- faddp
- fstpl (%rsp)
- movsd (%rsp),%xmm0
- addq $56, %rsp
- ret
-#elif defined(_ARM_) || defined(__arm__)
- fmacd d2, d0, d1
- fcpyd d0, d2
- bx lr
-#elif defined(_X86_) || defined(__i386__)
- fldl 4(%esp)
- fmull 12(%esp)
- fldl 20(%esp)
- faddp
- ret
-#endif
diff --git a/mingw-w64-crt/math/fma.c b/mingw-w64-crt/math/fma.c
new file mode 100644
index 0000000..00f100c
--- /dev/null
+++ b/mingw-w64-crt/math/fma.c
@@ -0,0 +1,31 @@
+/**
+ * This file has no copyright assigned and is placed in the Public Domain.
+ * This file is part of the mingw-w64 runtime package.
+ * No warranty is given; refer to the file DISCLAIMER.PD within this package.
+ */
+double fma(double x, double y, double z);
+
+#if defined(_AMD64_) || defined(__x86_64__) || defined(_X86_) ||
defined(__i386__)
+
+long double fmal(long double x, long double y, long double z);
+
+double fma(double x, double y, double z){
+ return (double)fmal(x, y, z);
+}
+
+#elif defined(_ARM_) || defined(__arm__)
+
+double fma(double x, double y, double z){
+ __asm__ (
+ "fmacd %0, %1, %2 \n"
+ : "+w"(z)
+ : "w"(x), "w"(y)
+ );
+ return z;
+}
+
+#else
+
+#error This platform is not supported.
+
+#endif
diff --git a/mingw-w64-crt/math/fmaf.S b/mingw-w64-crt/math/fmaf.S
deleted file mode 100644
index 6bc7ef0..0000000
--- a/mingw-w64-crt/math/fmaf.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * This file has no copyright assigned and is placed in the Public Domain.
- * This file is part of the mingw-w64 runtime package.
- * No warranty is given; refer to the file DISCLAIMER.PD within this package.
- */
-#include <_mingw_mac.h>
-
- .file "fmaf.S"
- .text
-#ifdef __x86_64__
- .align 8
-#else
- .align 2
-#endif
- .p2align 4,,15
- .globl __MINGW_USYMBOL(fmaf)
- .def __MINGW_USYMBOL(fmaf); .scl 2; .type 32; .endef
-__MINGW_USYMBOL(fmaf):
-#if defined(_AMD64_) || defined(__x86_64__)
- subq $56, %rsp
- movss %xmm0,(%rsp)
- movss %xmm1,16(%rsp)
- movss %xmm2,32(%rsp)
- flds (%rsp)
- fmuls 16(%rsp)
- flds 32(%rsp)
- faddp
- fstps (%rsp)
- movss (%rsp),%xmm0
- addq $56, %rsp
- ret
-#elif defined(_ARM_) || defined(__arm__)
- fmacs s2, s0, s1
- fcpys s0, s2
- bx lr
-#elif defined(_X86_) || defined(__i386__)
- flds 4(%esp)
- fmuls 8(%esp)
- flds 12(%esp)
- faddp
- ret
-#endif
-
diff --git a/mingw-w64-crt/math/fmaf.c b/mingw-w64-crt/math/fmaf.c
new file mode 100644
index 0000000..d9bea0d
--- /dev/null
+++ b/mingw-w64-crt/math/fmaf.c
@@ -0,0 +1,31 @@
+/**
+ * This file has no copyright assigned and is placed in the Public Domain.
+ * This file is part of the mingw-w64 runtime package.
+ * No warranty is given; refer to the file DISCLAIMER.PD within this package.
+ */
+float fmaf(float x, float y, float z);
+
+#if defined(_AMD64_) || defined(__x86_64__) || defined(_X86_) ||
defined(__i386__)
+
+long double fmal(long double x, long double y, long double z);
+
+float fmaf(float x, float y, float z){
+ return (float)fmal(x, y, z);
+}
+
+#elif defined(_ARM_) || defined(__arm__)
+
+float fmaf(float x, float y, float z){
+ __asm__ (
+ "fmacs %0, %1, %2 \n"
+ : "+t"(z)
+ : "t"(x), "t"(y)
+ );
+ return z;
+}
+
+#else
+
+#error This platform is not supported.
+
+#endif
diff --git a/mingw-w64-crt/math/fmal.c b/mingw-w64-crt/math/fmal.c
index 3d0eb02..65d5edf 100644
--- a/mingw-w64-crt/math/fmal.c
+++ b/mingw-w64-crt/math/fmal.c
@@ -3,10 +3,146 @@
* This file is part of the mingw-w64 runtime package.
* No warranty is given; refer to the file DISCLAIMER.PD within this package.
*/
-long double fmal ( long double _x, long double _y, long double _z);
+long double fmal(long double x, long double y, long double z);
-long double
-fmal ( long double _x, long double _y, long double _z)
-{
- return ((_x * _y) + _z);
+#if defined(_AMD64_) || defined(__x86_64__) || defined(_X86_) ||
defined(__i386__)
+
+/**
+ * X87-specific software-emulated FMA by LH_Mouse (lh_mouse at 126 dot com).
+ * This file is donated to the mingw-w64 project.
+ * Note: This file requires C99 support to compile.
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include <limits.h>
+#include <stdbool.h>
+
+/*
https://en.wikipedia.org/wiki/Extended_precision#x86_Extended_Precision_Format
*/
+typedef union x87reg_ {
+ struct __attribute__((__packed__)) {
+ union {
+ uint64_t f64;
+ struct {
+ uint32_t flo;
+ uint32_t fhi;
+ };
+ };
+ uint16_t exp : 15;
+ uint16_t sgn : 1;
+ };
+ long double f;
+} x87reg;
+
+static inline void break_down(x87reg *restrict lo, x87reg *restrict hi, long
double x){
+ hi->f = x;
+ const uint32_t flo = hi->flo;
+ const long exp = hi->exp;
+ const bool sgn = hi->sgn;
+ /* Erase low-order significant bits. `hi->f` now has only 32 significant
bits. */
+ hi->flo = 0;
+
+ if(flo == 0){
+ /* If the low-order significant bits are all zeroes, return zero in
`lo->f`. */
+ lo->f64 = 0;
+ lo->exp = 0;
+ } else {
+ /* How many bits should we shift to normalize the floating point value? */
+ const long shn = __builtin_clzl(flo) - (sizeof(long) - sizeof(uint32_t)) *
CHAR_BIT + 32;
+#if 0 /* Naive implementation */
+ if(shn < exp){
+ /* `x` can be normalized, normalize it. */
+ lo->f64 = (uint64_t)flo << shn;
+ lo->exp = (exp - shn) & 0x7FFF;
+ } else {
+ /* Otherwise, go with a denormal number. */
+ if(exp > 0){
+ /* Denormalize the source normal number. */
+ lo->f64 = (uint64_t)flo << (exp - 1);
+ } else {
+ /* Leave the source denormal number as is. */
+ lo->f64 = flo;
+ }
+ lo->exp = 0;
+ }
+#else /* Optimal implementation */
+ const long mask = (shn - exp) >> 31; /* mask = (shn < exp) ? -1 : 0 */
+ long expm1 = exp - 1;
+ expm1 &= ~(expm1 >> 31); /* expm1 = (exp - 1 >= 0) ? (exp - 1)
: 0 */
+ lo->f64 = (uint64_t)flo << (((shn ^ expm1) & mask) ^ expm1);
+ /* f64 = flo << ((shn < exp) ? shn :
expm1) */
+ lo->exp = (exp - shn) & mask; /* exp = (shn < exp) ? (exp - shn) :
0 */
+#endif
+ }
+ lo->sgn = sgn;
+}
+static inline long double fpu_fma(long double x, long double y, long double z){
+ /*
+ POSIX-2013:
+ 1. If x or y are NaN, a NaN shall be returned.
+ 2. If x multiplied by y is an exact infinity and z is also an infinity
+ but with the opposite sign, a domain error shall occur, and either a NaN
+ (if supported), or an implementation-defined value shall be returned.
+ 3. If one of x and y is infinite, the other is zero, and z is not a NaN,
+ a domain error shall occur, and either a NaN (if supported), or an
+ implementation-defined value shall be returned.
+ 4. If one of x and y is infinite, the other is zero, and z is a NaN, a NaN
+ shall be returned and a domain error may occur.
+ 5. If x* y is not 0*Inf nor Inf*0 and z is a NaN, a NaN shall be returned.
+ */
+ if(__fpclassifyl(x) == FP_NAN){
+ return x; /* Handle case 1. */
+ }
+ if(__fpclassifyl(y) == FP_NAN){
+ return y; /* Handle case 1. */
+ }
+ /* Handle case 2, 3 and 4 universally. Thanks to x87 a NaN is generated
+ if an Inf is multiplied with zero, saving us a huge amount of work. */
+ const long double xy = x * y;
+ if(__fpclassifyl(xy) == FP_NAN){
+ return xy; /* Handle case 2, 3 and 4. */
+ }
+ if(__fpclassifyl(z) == FP_NAN){
+ return z; /* Handle case 5. */
+ }
+ const long double xy = x * y;
+ if(__fpclassifyl(xy) == FP_NAN){
+ return xy; /* Handle case 2, 3 and 4. */
+ }
+
+ /* Check whecher the result is finite. */
+ const long double xyz = xy + z;
+ const int cxyz = __fpclassifyl(xyz);
+ if((cxyz == FP_NAN) || (cxyz == FP_INFINITY)){
+ return xyz; /* If this naive check doesn't yield a finite value, the FMA
isn't
+ likely to return one either. Forward the value as is. */
+ }
+
+ long double ret;
+ x87reg xlo, xhi, ylo, yhi;
+ break_down(&xlo, &xhi, x);
+ break_down(&ylo, &yhi, y);
+ /* The order of these four statements is essential. Don't move them around.
*/
+ ret = z;
+ ret += xhi.f * yhi.f; /* The most significant item comes
first. */
+ ret += xhi.f * ylo.f + xlo.f * yhi.f; /* They are equally significant. */
+ ret += xlo.f * ylo.f; /* The least significant item comes
last. */
+ return ret;
+}
+
+long double fmal(long double x, long double y, long double z){
+ return fpu_fma(x, y, z);
}
+
+#elif defined(_ARM_) || defined(__arm__)
+
+long double fmal(long double x, long double y, long double z){
+ __asm__ (
+ "fmacd %0, %1, %2 \n"
+ : "+w"(z)
+ : "w"(x), "w"(y)
+ );
+ return z;
+}
+
+#endif
--
2.10.2
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public