Re: [Mingw-w64-public] Implement fused multiply-add (FMA) funcitons for x86 families properly

lhmouse Fri, 20 Jan 2017 01:59:50 -0800

The mail has been being rejected for spamming for a few hours.
Hope it wouldn't be this time.



-- 
Best regards,
lh_mouse





From 82fd24e992a402ff2f7c55780fd76945ef83e094 Mon Sep 17 00:00:00 2001
From: LH_Mouse <lh_mo...@126.com>
Date: Wed, 18 Jan 2017 19:35:43 +0800
Subject: [PATCH] mingw-w64-crt/math/fma{,f,l}.c: Implement fused multiply-add
 (FMA) funcitons for x86 families properly. mingw-w64-crt/Makefile.am:
 Likewise. mingw-w64-crt/math/fma{,f}.S: Merge into corresponding C files with
 the same names, respectively.

---
 mingw-w64-crt/Makefile.am |   4 +-
 mingw-w64-crt/math/fma.S  |  42 --------------
 mingw-w64-crt/math/fma.c  |  29 ++++++++++
 mingw-w64-crt/math/fmaf.S |  43 --------------
 mingw-w64-crt/math/fmaf.c |  29 ++++++++++
 mingw-w64-crt/math/fmal.c | 143 ++++++++++++++++++++++++++++++++++++++++++++--
 6 files changed, 198 insertions(+), 92 deletions(-)
 delete mode 100644 mingw-w64-crt/math/fma.S
 create mode 100644 mingw-w64-crt/math/fma.c
 delete mode 100644 mingw-w64-crt/math/fmaf.S
 create mode 100644 mingw-w64-crt/math/fmaf.c

diff --git a/mingw-w64-crt/Makefile.am b/mingw-w64-crt/Makefile.am
index 44360db..5eba234 100644
--- a/mingw-w64-crt/Makefile.am
+++ b/mingw-w64-crt/Makefile.am
@@ -227,7 +227,6 @@ src_libmingwex=\
   \
   math/_chgsignl.S      math/ceil.S        math/ceilf.S          math/ceill.S  
       math/copysignl.S     \
   math/floor.S          math/floorf.S      math/floorl.S         \
-  math/fma.S            math/fmaf.S        \
   math/nearbyint.S      math/nearbyintf.S  math/nearbyintl.S     \
   math/trunc.S          math/truncf.S      \
   math/cbrt.c           \
@@ -235,7 +234,8 @@ src_libmingwex=\
   math/coshf.c          math/coshl.c       math/erfl.c           \
   math/expf.c           \
   math/fabs.c           math/fabsf.c       math/fabsl.c          math/fdim.c   
       math/fdimf.c         math/fdiml.c         \
-  math/fmal.c           math/fmax.c        math/fmaxf.c          math/fmaxl.c  
       math/fmin.c          math/fminf.c         \
+  math/fma.c            math/fmaf.c        math/fmal.c           \
+  math/fmax.c           math/fmaxf.c       math/fmaxl.c          math/fmin.c   
       math/fminf.c         \
   math/fminl.c          math/fp_consts.c   math/fp_constsf.c     \
   math/fp_constsl.c     math/fpclassify.c  math/fpclassifyf.c    
math/fpclassifyl.c   math/frexpf.c        \
   math/hypotf.c         math/hypot.c  math/hypotl.c      math/isnan.c          
math/isnanf.c        math/isnanl.c        \
diff --git a/mingw-w64-crt/math/fma.S b/mingw-w64-crt/math/fma.S
deleted file mode 100644
index 74becde..0000000
--- a/mingw-w64-crt/math/fma.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * This file has no copyright assigned and is placed in the Public Domain.
- * This file is part of the mingw-w64 runtime package.
- * No warranty is given; refer to the file DISCLAIMER.PD within this package.
- */
-#include <_mingw_mac.h>
-
-       .file   "fma.S"
-       .text
-#ifdef __x86_64__
-       .align 8
-#else
-       .align 4
-#endif
-       .p2align 4,,15
-       .globl __MINGW_USYMBOL(fma)
-       .def    __MINGW_USYMBOL(fma);   .scl    2;      .type   32;     .endef
-__MINGW_USYMBOL(fma):
-#if defined(_AMD64_) || defined(__x86_64__)
-       subq    $56, %rsp
-       movsd   %xmm0,(%rsp)
-       movsd   %xmm1,16(%rsp)
-       movsd   %xmm2,32(%rsp)
-       fldl    (%rsp)
-       fmull   16(%rsp)
-       fldl    32(%rsp)
-       faddp
-       fstpl   (%rsp)
-       movsd   (%rsp),%xmm0
-       addq    $56, %rsp
-       ret
-#elif defined(_ARM_) || defined(__arm__)
-       fmacd d2, d0, d1
-       fcpyd d0, d2
-       bx      lr
-#elif defined(_X86_) || defined(__i386__)
-       fldl    4(%esp)
-       fmull   12(%esp)
-       fldl    20(%esp)
-       faddp
-       ret
-#endif
diff --git a/mingw-w64-crt/math/fma.c b/mingw-w64-crt/math/fma.c
new file mode 100644
index 0000000..645a3d1
--- /dev/null
+++ b/mingw-w64-crt/math/fma.c
@@ -0,0 +1,29 @@
+/**
+ * This file has no copyright assigned and is placed in the Public Domain.
+ * This file is part of the mingw-w64 runtime package.
+ * No warranty is given; refer to the file DISCLAIMER.PD within this package.
+ */
+double fma(double x, double y, double z);
+
+#if defined(_ARM_) || defined(__arm__)
+
+/* Use hardware FMA on ARM. */
+double fma(double x, double y, double z){
+  __asm__ (
+    "fmacd %0, %1, %2 \n"
+    : "+w"(z)
+    : "w"(x), "w"(y)
+  );
+  return z;
+}
+
+#else
+
+long double fmal(long double x, long double y, long double z);
+
+/* For platforms that don't have hardware FMA, emulate it. */
+double fma(double x, double y, double z){
+  return (double)fmal(x, y, z);
+}
+
+#endif
diff --git a/mingw-w64-crt/math/fmaf.S b/mingw-w64-crt/math/fmaf.S
deleted file mode 100644
index 6bc7ef0..0000000
--- a/mingw-w64-crt/math/fmaf.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * This file has no copyright assigned and is placed in the Public Domain.
- * This file is part of the mingw-w64 runtime package.
- * No warranty is given; refer to the file DISCLAIMER.PD within this package.
- */
-#include <_mingw_mac.h>
-
-       .file   "fmaf.S"
-       .text
-#ifdef __x86_64__
-       .align 8
-#else
-       .align 2
-#endif
-       .p2align 4,,15
-       .globl __MINGW_USYMBOL(fmaf)
-       .def    __MINGW_USYMBOL(fmaf);  .scl    2;      .type   32;     .endef
-__MINGW_USYMBOL(fmaf):
-#if defined(_AMD64_) || defined(__x86_64__)
-       subq    $56, %rsp
-       movss   %xmm0,(%rsp)
-       movss   %xmm1,16(%rsp)
-       movss   %xmm2,32(%rsp)
-       flds    (%rsp)
-       fmuls   16(%rsp)
-       flds    32(%rsp)
-       faddp
-       fstps   (%rsp)
-       movss   (%rsp),%xmm0
-       addq    $56, %rsp
-       ret
-#elif defined(_ARM_) || defined(__arm__)
-       fmacs s2, s0, s1
-       fcpys s0, s2
-       bx      lr
-#elif defined(_X86_) || defined(__i386__)
-       flds    4(%esp)
-       fmuls   8(%esp)
-       flds    12(%esp)
-       faddp
-       ret
-#endif
-
diff --git a/mingw-w64-crt/math/fmaf.c b/mingw-w64-crt/math/fmaf.c
new file mode 100644
index 0000000..9a0971d
--- /dev/null
+++ b/mingw-w64-crt/math/fmaf.c
@@ -0,0 +1,29 @@
+/**
+ * This file has no copyright assigned and is placed in the Public Domain.
+ * This file is part of the mingw-w64 runtime package.
+ * No warranty is given; refer to the file DISCLAIMER.PD within this package.
+ */
+float fmaf(float x, float y, float z);
+
+#if defined(_ARM_) || defined(__arm__)
+
+/* Use hardware FMA on ARM. */
+float fmaf(float x, float y, float z){
+  __asm__ (
+    "fmacs %0, %1, %2 \n"
+    : "+t"(z)
+    : "t"(x), "t"(y)
+  );
+  return z;
+}
+
+#else
+
+long double fmal(long double x, long double y, long double z);
+
+/* For platforms that don't have hardware FMA, emulate it. */
+float fmaf(float x, float y, float z){
+  return (float)fmal(x, y, z);
+}
+
+#endif
diff --git a/mingw-w64-crt/math/fmal.c b/mingw-w64-crt/math/fmal.c
index 3d0eb02..f1d575d 100644
--- a/mingw-w64-crt/math/fmal.c
+++ b/mingw-w64-crt/math/fmal.c
@@ -3,10 +3,143 @@
  * This file is part of the mingw-w64 runtime package.
  * No warranty is given; refer to the file DISCLAIMER.PD within this package.
  */
-long double fmal ( long double _x,  long double _y,  long double _z);
+long double fmal(long double x, long double y, long double z);
 
-long double
-fmal ( long double _x,  long double _y,  long double _z)
-{
-  return ((_x * _y) + _z);
+#if defined(_ARM_) || defined(__arm__)
+
+double fma(double x, double y, double z);
+
+/* On ARM `long double` is 64 bits. And ARM has hardware FMA. */
+long double fmal(long double x, long double y, long double z){
+  return fma(x, y, z);
+}
+
+#elif defined(_AMD64_) || defined(__x86_64__) || defined(_X86_) || 
defined(__i386__)
+
+/**
+ * x87-specific software-emulated FMA by LH_Mouse (lh_mouse at 126 dot com).
+ * This file is donated to the mingw-w64 project.
+ * Note: This file requires C99 support to compile.
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include <limits.h>
+#include <stdbool.h>
+
+/* 
https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format 
*/
+typedef union x87reg_ {
+  struct __attribute__((__packed__)) {
+    union {
+      uint64_t f64;
+      struct {
+        uint32_t flo;
+        uint32_t fhi;
+      };
+    };
+    uint16_t exp : 15;
+    uint16_t sgn :  1;
+  };
+  long double f;
+} x87reg;
+
+static inline void break_down(x87reg *restrict lo, x87reg *restrict hi, long 
double x){
+  hi->f = x;
+  const uint32_t flo = hi->flo;
+  const long     exp = hi->exp;
+  const bool     sgn = hi->sgn;
+  /* Erase low-order significant bits. `hi->f` now has only 32 significant 
bits. */
+  hi->flo = 0;
+
+  if(flo == 0){
+    /* If the low-order significant bits are all zeroes, return zero in 
`lo->f`. */
+    lo->f64 = 0;
+    lo->exp = 0;
+  } else {
+    /* How many bits should we shift to normalize the floating point value? */
+    const long shn = __builtin_clzl(flo) - (sizeof(long) - sizeof(uint32_t)) * 
CHAR_BIT + 32;
+#if 0 /* Naive implementation */
+    if(shn < exp){
+      /* `x` can be normalized, normalize it. */
+      lo->f64 = (uint64_t)flo << shn;
+      lo->exp = (exp - shn) & 0x7FFF;
+    } else {
+      /* Otherwise, go with a denormal number. */
+      if(exp > 0){
+        /* Denormalize the source normal number. */
+        lo->f64 = (uint64_t)flo << (exp - 1);
+      } else {
+        /* Leave the source denormal number as is. */
+        lo->f64 = flo;
+      }
+      lo->exp = 0;
+    }
+#else /* Optimal implementation */
+    const long mask = (shn - exp) >> 31; /* mask = (shn < exp) ? -1 : 0 */
+    long expm1 = exp - 1;
+    expm1 &= ~(expm1 >> 31);             /* expm1 = (exp - 1 >= 0) ? (exp - 1) 
: 0 */
+    lo->f64 = (uint64_t)flo << (((shn ^ expm1) & mask) ^ expm1);
+                                         /* f64  = flo << ((shn < exp) ? shn : 
expm1) */
+    lo->exp = (exp - shn) & mask;        /* exp  = (shn < exp) ? (exp - shn) : 
0 */
+#endif
+  }
+  lo->sgn = sgn;
 }
+static inline long double fpu_fma(long double x, long double y, long double z){
+  /*
+    POSIX-2013:
+    1. If x or y are NaN, a NaN shall be returned.
+    2. If x multiplied by y is an exact infinity and z is also an infinity
+       but with the opposite sign, a domain error shall occur, and either a NaN
+       (if supported), or an implementation-defined value shall be returned.
+    3. If one of x and y is infinite, the other is zero, and z is not a NaN,
+       a domain error shall occur, and either a NaN (if supported), or an
+       implementation-defined value shall be returned.
+    4. If one of x and y is infinite, the other is zero, and z is a NaN, a NaN
+       shall be returned and a domain error may occur.
+    5. If x* y is not 0*Inf nor Inf*0 and z is a NaN, a NaN shall be returned.
+  */
+  if(__fpclassifyl(x) == FP_NAN){
+    return x; /* Handle case 1. */
+  }
+  if(__fpclassifyl(y) == FP_NAN){
+    return y; /* Handle case 1. */
+  }
+  /* Handle case 2, 3 and 4 universally. Thanks to x87 a NaN is generated
+     if an INF is multiplied with zero, saving us a huge amount of work. */
+  const long double xy = x * y;
+  if(__fpclassifyl(xy) == FP_NAN){
+    return xy; /* Handle case 2, 3 and 4. */
+  }
+  if(__fpclassifyl(z) == FP_NAN){
+    return z; /* Handle case 5. */
+  }
+  /* Check whether the result is finite. */
+  const long double xyz = xy + z;
+  const int cxyz = __fpclassifyl(xyz);
+  if((cxyz == FP_NAN) || (cxyz == FP_INFINITE)){
+    return xyz; /* If this naive check doesn't yield a finite value, the FMA 
isn't
+                   likely to return one either. Forward the value as is. */
+  }
+
+  long double ret;
+  x87reg xlo, xhi, ylo, yhi;
+  break_down(&xlo, &xhi, x);
+  break_down(&ylo, &yhi, y);
+  /* The order of these four statements is essential. Don't move them around. 
*/
+  ret = z;
+  ret += xhi.f * yhi.f;                 /* The most significant item comes 
first. */
+  ret += xhi.f * ylo.f + xlo.f * yhi.f; /* They are equally significant. */
+  ret += xlo.f * ylo.f;                 /* The least significant item comes 
last. */
+  return ret;
+}
+
+long double fmal(long double x, long double y, long double z){
+  return fpu_fma(x, y, z);
+}
+
+#else
+
+#error Please add FMA implementation for this platform.
+
+#endif
-- 
2.10.2

------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot

_______________________________________________
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public

Re: [Mingw-w64-public] Implement fused multiply-add (FMA) funcitons for x86 families properly

Reply via email to