Re: [Mingw-w64-public] Implement fused multiply-add (FMA) functions for x86 families properly

2017-01-22 Thread lhmouse
On 2017/1/23 9:08, David Wohlferd wrote:
> Hmm.
>
> It seems a bit backwards to have the function that takes a 'long double'
> calling the function that takes a 'double.'  Yes, they are both the same
> size on ARM, but I think I would have gone the other way.  Plus I kinda
> like having all the implementations in one file (fmal.c).
I prefer that too. At the moment I have to follow what mingw-w64 has
been doing, that is, keeping separate functions for {f,,l} in different
files.

> Other than that, this looks ok to me.  Building for ARM with clang seems
> to work (although I have no way to run it).
Thanks for testing.

-- 
Best regards,
LH_Mouse


--
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
___
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public


Re: [Mingw-w64-public] Implement fused multiply-add (FMA) functions for x86 families properly

2017-01-22 Thread David Wohlferd
Hmm.

It seems a bit backwards to have the function that takes a 'long double' 
calling the function that takes a 'double.'  Yes, they are both the same 
size on ARM, but I think I would have gone the other way.  Plus I kinda 
like having all the implementations in one file (fmal.c).

Other than that, this looks ok to me.  Building for ARM with clang seems 
to work (although I have no way to run it).

dw

On 1/20/2017 1:57 AM, lhmouse wrote:
> The mail has been rejected as spam for the past few hours.
> Hopefully it won't be this time.


--
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
___
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public


Re: [Mingw-w64-public] Implement fused multiply-add (FMA) functions for x86 families properly

2017-01-20 Thread lhmouse
The mail has been rejected as spam for the past few hours.
Hopefully it won't be this time.


-- 
Best regards,
lh_mouse





From 82fd24e992a402ff2f7c55780fd76945ef83e094 Mon Sep 17 00:00:00 2001
From: LH_Mouse 
Date: Wed, 18 Jan 2017 19:35:43 +0800
Subject: [PATCH] mingw-w64-crt/math/fma{,f,l}.c: Implement fused multiply-add
 (FMA) funcitons for x86 families properly. mingw-w64-crt/Makefile.am:
 Likewise. mingw-w64-crt/math/fma{,f}.S: Merge into corresponding C files with
 the same names, respectively.

---
 mingw-w64-crt/Makefile.am |   4 +-
 mingw-w64-crt/math/fma.S  |  42 --
 mingw-w64-crt/math/fma.c  |  29 ++
 mingw-w64-crt/math/fmaf.S |  43 --
 mingw-w64-crt/math/fmaf.c |  29 ++
 mingw-w64-crt/math/fmal.c | 143 --
 6 files changed, 198 insertions(+), 92 deletions(-)
 delete mode 100644 mingw-w64-crt/math/fma.S
 create mode 100644 mingw-w64-crt/math/fma.c
 delete mode 100644 mingw-w64-crt/math/fmaf.S
 create mode 100644 mingw-w64-crt/math/fmaf.c

diff --git a/mingw-w64-crt/Makefile.am b/mingw-w64-crt/Makefile.am
index 44360db..5eba234 100644
--- a/mingw-w64-crt/Makefile.am
+++ b/mingw-w64-crt/Makefile.am
@@ -227,7 +227,6 @@ src_libmingwex=\
   \
   math/_chgsignl.S  math/ceil.Smath/ceilf.S  math/ceill.S  
   math/copysignl.S \
   math/floor.S  math/floorf.S  math/floorl.S \
-  math/fma.Smath/fmaf.S\
   math/nearbyint.S  math/nearbyintf.S  math/nearbyintl.S \
   math/trunc.S  math/truncf.S  \
   math/cbrt.c   \
@@ -235,7 +234,8 @@ src_libmingwex=\
   math/coshf.c  math/coshl.c   math/erfl.c   \
   math/expf.c   \
   math/fabs.c   math/fabsf.c   math/fabsl.c  math/fdim.c   
   math/fdimf.c math/fdiml.c \
-  math/fmal.c   math/fmax.cmath/fmaxf.c  math/fmaxl.c  
   math/fmin.c  math/fminf.c \
+  math/fma.cmath/fmaf.cmath/fmal.c   \
+  math/fmax.c   math/fmaxf.c   math/fmaxl.c  math/fmin.c   
   math/fminf.c \
   math/fminl.c  math/fp_consts.c   math/fp_constsf.c \
   math/fp_constsl.c math/fpclassify.c  math/fpclassifyf.c
math/fpclassifyl.c   math/frexpf.c\
   math/hypotf.c math/hypot.c  math/hypotl.c  math/isnan.c  
math/isnanf.cmath/isnanl.c\
diff --git a/mingw-w64-crt/math/fma.S b/mingw-w64-crt/math/fma.S
deleted file mode 100644
index 74becde..000
--- a/mingw-w64-crt/math/fma.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * This file has no copyright assigned and is placed in the Public Domain.
- * This file is part of the mingw-w64 runtime package.
- * No warranty is given; refer to the file DISCLAIMER.PD within this package.
- */
-#include <_mingw_mac.h>
-
-   .file   "fma.S"
-   .text
-#ifdef __x86_64__
-   .align 8
-#else
-   .align 4
-#endif
-   .p2align 4,,15
-   .globl __MINGW_USYMBOL(fma)
-   .def__MINGW_USYMBOL(fma);   .scl2;  .type   32; .endef
-__MINGW_USYMBOL(fma):
-#if defined(_AMD64_) || defined(__x86_64__)
-   subq$56, %rsp
-   movsd   %xmm0,(%rsp)
-   movsd   %xmm1,16(%rsp)
-   movsd   %xmm2,32(%rsp)
-   fldl(%rsp)
-   fmull   16(%rsp)
-   fldl32(%rsp)
-   faddp
-   fstpl   (%rsp)
-   movsd   (%rsp),%xmm0
-   addq$56, %rsp
-   ret
-#elif defined(_ARM_) || defined(__arm__)
-   fmacd d2, d0, d1
-   fcpyd d0, d2
-   bx  lr
-#elif defined(_X86_) || defined(__i386__)
-   fldl4(%esp)
-   fmull   12(%esp)
-   fldl20(%esp)
-   faddp
-   ret
-#endif
diff --git a/mingw-w64-crt/math/fma.c b/mingw-w64-crt/math/fma.c
new file mode 100644
index 000..645a3d1
--- /dev/null
+++ b/mingw-w64-crt/math/fma.c
@@ -0,0 +1,29 @@
+/**
+ * This file has no copyright assigned and is placed in the Public Domain.
+ * This file is part of the mingw-w64 runtime package.
+ * No warranty is given; refer to the file DISCLAIMER.PD within this package.
+ */
+double fma(double x, double y, double z);
+
+#if defined(_ARM_) || defined(__arm__)
+
+/* Use hardware FMA on ARM. */
+double fma(double x, double y, double z){
+  __asm__ (
+"fmacd %0, %1, %2 \n"
+: "+w"(z)
+: "w"(x), "w"(y)
+  );
+  return z;
+}
+
+#else
+
+long double fmal(long double x, long double y, long double z);
+
+/* For platforms that don't have hardware FMA, emulate it. */
+double fma(double x, double y, double z){
+  return (double)fmal(x, y, z);
+}
+
+#endif
diff --git a/mingw-w64-crt/math/fmaf.S b/mingw-w64-crt/math/fmaf.S
deleted file mode 100644
index 6bc7ef0..000
--- a/mingw-w64-crt/math/fmaf.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * This file has no copyright assigned and is placed in the Public Domain.
- * This file is part of the mingw-w64 runtime 

Re: [Mingw-w64-public] Implement fused multiply-add (FMA) functions for x86 families properly

2017-01-19 Thread lhmouse
> So you have decided that __builtins can't be used then?  That's too bad.
Yes, it results in a call to `fma()` on x64. I can't test it on ARM though.

> I know almost nothing about the guts of floating point, so I'm prepared
> to defer to your judgement, but here's what I think:
> 
> Let me propose an alternative for fma.c:
> ... ...
> In other words, remove all the platform specific code.  This (greatly)
> simplifies this file.  You were already using fmal for x86.  And it
> doesn't lose anything for ARM, since both fma() and fmal() use the exact
> same inline asm.  Why have the exact same (hard to maintain) code in 2
> places?
Keeping asm code in fmaf.c but not in fma.c seems like a style inconsistency.
However, the converse is doable: in the case of ARM, call `fma()` from `fmal()`.

> As for fmaf, what about:
> ... ...
> The case here is less compelling, but I assert that if fmal is
> supported, it can always be used to calculate fmaf.  If there is a
> shorter/more efficient method (such as there is with ARM), it can be
> added here.
Fair enough. Updated.

> As for fmal, I have a question about your code.  Not the implementation,
> but the design.  Looking at https://en.wikipedia.org/wiki/Long_double,
> it says "Microsoft Windows with Visual C++ also sets the processor in
> double-precision mode by default."  Since (it appears?) you aren't
> following _controlfp_s, won't this give us a different answer than fmal
> from msvcr120.dll?
MSVC doesn't support 80-bit `long double` (it is 64 bits there), so
the results can't be equal unless the result fits into 64 bits exactly.
My FMA algorithm basically splits each operand into two 32-bit parts,
multiplies them using elementary arithmetic, then adds the four 64-bit
partial results together: (a+b)(c+d) = ac+(bc+ad)+bd. So the precision of x87
does indeed affect the result.
I doubt whether it is necessary to save the x87 control word and set it to
64-bit precision before the calculation and restore it afterwards. MinGW-w64
already sets it to 64-bit precision during CRT initialization, and if people
set it lower they aren't going to need `fma()` either.

An interesting look at https://msdn.microsoft.com/en-us/library/c9676k6h.aspx
reminds me that _PC_64 isn't supported on x64. Sounds incredible, no? Does
`_controlfp_s()` return an error if we try to set _PC_64 on x64? I have no
idea. Nevertheless the precision flags can be set and restored using inline
assembly - yet another dirty solution.

> More nits:
>
> s/whecher/whether
> s/#x86_Extended_Precision_Format/#x86_extended_precision_format
Fixed. The Wikipedia bookmark was copied from my browser at least half a year
ago, and it has probably been modified since.

--   
Best regards,
lh_mouse
2017-01-20
--
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
___
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public


Re: [Mingw-w64-public] Implement fused multiply-add (FMA) functions for x86 families properly

2017-01-19 Thread David Wohlferd
So you have decided that __builtins can't be used then?  That's too bad.

I know almost nothing about the guts of floating point, so I'm prepared 
to defer to your judgement, but here's what I think:

Let me propose an alternative for fma.c:



/**
  * This file has no copyright assigned and is placed in the Public Domain.
  * This file is part of the mingw-w64 runtime package.
  * No warranty is given; refer to the file DISCLAIMER.PD within this 
package.
  */
double fma(double x, double y, double z);

long double fmal(long double x, long double y, long double z);

double fma(double x, double y, double z){
   return (double)fmal(x, y, z);
}



In other words, remove all the platform specific code.  This (greatly) 
simplifies this file.  You were already using fmal for x86.  And it 
doesn't lose anything for ARM, since both fma() and fmal() use the exact 
same inline asm.  Why have the exact same (hard to maintain) code in 2 
places?

As for fmaf, what about:



/**
  * This file has no copyright assigned and is placed in the Public Domain.
  * This file is part of the mingw-w64 runtime package.
  * No warranty is given; refer to the file DISCLAIMER.PD within this 
package.
  */
float fmaf(float x, float y, float z);

#if defined(_ARM_) || defined(__arm__)

/* ARM variant of fmaf(): a single inline-asm instruction that accumulates
 * into z and returns it.  z is the only read-write operand ("+"); x and y
 * are read-only inputs. */
float fmaf(float x, float y, float z){
   __asm__ (
 /* Operand map: %0 = z (accumulator and result), %1 = x, %2 = y.
  * Presumably computes z + x * y, by analogy with how fmacd is described
  * elsewhere in this thread -- confirm against the ARM documentation. */
 "fmacs %0, %1, %2 \n"
 : "+t"(z)
 : "t"(x), "t"(y)
   );
   /* NOTE(review): fmacs appears to be the pre-UAL spelling of VMLA.F32,
    * which may round the product before the addition; a single-rounding
    * (truly fused) result would need VFMA.F32 -- TODO confirm. */
   return z;
}

#else

long double fmal(long double x, long double y, long double z);

float fmaf(float x, float y, float z){
   return (float)fmal(x, y, z);
}

#endif



The case here is less compelling, but I assert that if fmal is 
supported, it can always be used to calculate fmaf.  If there is a 
shorter/more efficient method (such as there is with ARM), it can be 
added here.

As for fmal, I have a question about your code.  Not the implementation, 
but the design.  Looking at https://en.wikipedia.org/wiki/Long_double, 
it says "Microsoft Windows with Visual C++ also sets the processor in 
double-precision mode by default."  Since (it appears?) you aren't 
following _controlfp_s, won't this give us a different answer than fmal
from msvcr120.dll?

More nits:

s/whecher/whether
s/#x86_Extended_Precision_Format/#x86_extended_precision_format

dw


--
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
___
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public


Re: [Mingw-w64-public] Implement fused multiply-add (FMA) functions for x86 families properly

2017-01-19 Thread lhmouse
New patch attached.
This patch fixes ARM functions and adds a check in `fpu_fma()` for potential 
NaN or INF results.


--   
Best regards,
lh_mouse
2017-01-19






From 3c55daec84dac190b9e3cb032371960e1acbc38f Mon Sep 17 00:00:00 2001
From: LH_Mouse 
Date: Wed, 18 Jan 2017 19:35:43 +0800
Subject: [PATCH] mingw-w64-crt/math/fma{,f,l}.c: Implement fused multiply-add
 (FMA) funcitons for x86 families properly. mingw-w64-crt/Makefile.am:
 Likewise. mingw-w64-crt/math/fma{,f}.S: Merge into corresponding C files with
 the same names, respectively.

---
 mingw-w64-crt/Makefile.am |   4 +-
 mingw-w64-crt/math/fma.S  |  42 -
 mingw-w64-crt/math/fma.c  |  31 ++
 mingw-w64-crt/math/fmaf.S |  43 --
 mingw-w64-crt/math/fmaf.c |  31 ++
 mingw-w64-crt/math/fmal.c | 146 --
 6 files changed, 205 insertions(+), 92 deletions(-)
 delete mode 100644 mingw-w64-crt/math/fma.S
 create mode 100644 mingw-w64-crt/math/fma.c
 delete mode 100644 mingw-w64-crt/math/fmaf.S
 create mode 100644 mingw-w64-crt/math/fmaf.c

diff --git a/mingw-w64-crt/Makefile.am b/mingw-w64-crt/Makefile.am
index 44360db..5eba234 100644
--- a/mingw-w64-crt/Makefile.am
+++ b/mingw-w64-crt/Makefile.am
@@ -227,7 +227,6 @@ src_libmingwex=\
   \
   math/_chgsignl.S  math/ceil.Smath/ceilf.S  math/ceill.S  
   math/copysignl.S \
   math/floor.S  math/floorf.S  math/floorl.S \
-  math/fma.Smath/fmaf.S\
   math/nearbyint.S  math/nearbyintf.S  math/nearbyintl.S \
   math/trunc.S  math/truncf.S  \
   math/cbrt.c   \
@@ -235,7 +234,8 @@ src_libmingwex=\
   math/coshf.c  math/coshl.c   math/erfl.c   \
   math/expf.c   \
   math/fabs.c   math/fabsf.c   math/fabsl.c  math/fdim.c   
   math/fdimf.c math/fdiml.c \
-  math/fmal.c   math/fmax.cmath/fmaxf.c  math/fmaxl.c  
   math/fmin.c  math/fminf.c \
+  math/fma.cmath/fmaf.cmath/fmal.c   \
+  math/fmax.c   math/fmaxf.c   math/fmaxl.c  math/fmin.c   
   math/fminf.c \
   math/fminl.c  math/fp_consts.c   math/fp_constsf.c \
   math/fp_constsl.c math/fpclassify.c  math/fpclassifyf.c
math/fpclassifyl.c   math/frexpf.c\
   math/hypotf.c math/hypot.c  math/hypotl.c  math/isnan.c  
math/isnanf.cmath/isnanl.c\
diff --git a/mingw-w64-crt/math/fma.S b/mingw-w64-crt/math/fma.S
deleted file mode 100644
index 74becde..000
--- a/mingw-w64-crt/math/fma.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * This file has no copyright assigned and is placed in the Public Domain.
- * This file is part of the mingw-w64 runtime package.
- * No warranty is given; refer to the file DISCLAIMER.PD within this package.
- */
-#include <_mingw_mac.h>
-
-   .file   "fma.S"
-   .text
-#ifdef __x86_64__
-   .align 8
-#else
-   .align 4
-#endif
-   .p2align 4,,15
-   .globl __MINGW_USYMBOL(fma)
-   .def__MINGW_USYMBOL(fma);   .scl2;  .type   32; .endef
-__MINGW_USYMBOL(fma):
-#if defined(_AMD64_) || defined(__x86_64__)
-   subq$56, %rsp
-   movsd   %xmm0,(%rsp)
-   movsd   %xmm1,16(%rsp)
-   movsd   %xmm2,32(%rsp)
-   fldl(%rsp)
-   fmull   16(%rsp)
-   fldl32(%rsp)
-   faddp
-   fstpl   (%rsp)
-   movsd   (%rsp),%xmm0
-   addq$56, %rsp
-   ret
-#elif defined(_ARM_) || defined(__arm__)
-   fmacd d2, d0, d1
-   fcpyd d0, d2
-   bx  lr
-#elif defined(_X86_) || defined(__i386__)
-   fldl4(%esp)
-   fmull   12(%esp)
-   fldl20(%esp)
-   faddp
-   ret
-#endif
diff --git a/mingw-w64-crt/math/fma.c b/mingw-w64-crt/math/fma.c
new file mode 100644
index 000..00f100c
--- /dev/null
+++ b/mingw-w64-crt/math/fma.c
@@ -0,0 +1,31 @@
+/**
+ * This file has no copyright assigned and is placed in the Public Domain.
+ * This file is part of the mingw-w64 runtime package.
+ * No warranty is given; refer to the file DISCLAIMER.PD within this package.
+ */
+double fma(double x, double y, double z);
+
+#if defined(_AMD64_) || defined(__x86_64__) || defined(_X86_) || 
defined(__i386__)
+
+long double fmal(long double x, long double y, long double z);
+
+double fma(double x, double y, double z){
+  return (double)fmal(x, y, z);
+}
+
+#elif defined(_ARM_) || defined(__arm__)
+
+double fma(double x, double y, double z){
+  __asm__ (
+"fmacd %0, %1, %2 \n"
+: "+w"(z)
+: "w"(x), "w"(y)
+  );
+  return z;
+}
+
+#else
+
+#error This platform is not supported.
+
+#endif
diff --git a/mingw-w64-crt/math/fmaf.S b/mingw-w64-crt/math/fmaf.S
deleted file mode 100644
index 6bc7ef0..000
--- a/mingw-w64-crt/math/fmaf.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * This 

Re: [Mingw-w64-public] Implement fused multiply-add (FMA) functions for x86 families properly

2017-01-18 Thread lhmouse
> I see that you have replaced the x86 parts for fma and fmaf with C 
> code.  That seems like a good thing.  Is there some reason you can't do 
> that with the ARM versions too?
ARM has hardware FMA and software emulation is not optimal.

> Reducing the amount of platform-specific code also seems like a good thing.
The x87 80-bit floating point format is already platform-specific.

> There are a number of reasons not to use inline asm (for example 
> https://gcc.gnu.org/wiki/DontUseInlineAsm ).  Are you sure this is a 
> good idea?
I am not sure about the inline asm itself. The primary reason I did that
is that if we have `fma.S` and `fma.c` in the same directory, they will both
compile to the same file `fma.o`, and `make` complains about that.

Inline asm is indeed hard to maintain and I am aware of it. Personally
I only write asm statements that contain very few instructions, simulating
builtin functions or intrinsics for use in C code.

> Yup, that's one of the downsides to using inline asm.
> 
> I'm no ARM expert, but I'm not sure about this ARM code for fmal:
> 
> +long double fmal(long double x, long double y, long double z){
> +  __asm__ (
> +"fmacd %2, %0, %1 \n"
> +"fcpyd %0, %2 \n"
> +: "+&w"(z)
> +: "w"(x), "w"(y)
> +  );
> 
> Doesn't fmacd modify %2?  That would be (y), which is listed as an input 
> parameter (and therefore is read-only).  What's more, I thought fmacd 
> was calculating "Fd + Fn * Fm" where the parameters were "fmacd Fd, Fn, 
> Fm".  Such being the case, I would have expected "fmacd %0, %1, %2"?  I
> don't have a way to run this either, but this looks wrong.
Thanks for pointing it out. That is a mistake. I forgot to fix it after
copying it from the asm code. The `fma()` function was the correct one.

> Under the nit-picky heading:
> 
> +double fma(double x, double y, double z){
> +  __asm__ (
> +"fmacd %0, %1, %2 \n"
> +: "+&w"(z)
> +: "w"(x), "w"(y)
> +  );
> 
> The \n is redundant.  And doesn't the + make the & redundant as well?
I just prefer to terminate every line of asm code with \n.

I believe the & is redundant not only because of the +, but also because
there is only one instruction, so nothing can be written before
the other operands are read.

> Lastly I gotta ask: Can we use __builtin_fmal?  Or is mingw-w64 the one 
> providing the implementations for these?
We have to ask a GCC developer for sure. According to my experience this
function is something guaranteed to be semantically equivalent to the one
without the __builtin_ prefix in the standard library. Sometimes
the compiler cannot assume all functions from the standard C library are
available and have the specified behavior e.g. when compiling the Linux
kernel. The `__builtin_fmal()` function is then considered to be
a standard FMA, suitable for constant folding. It may result in an inline
instruction where possible, but could also result in a call to the `fmal()`
external function, resulting in infinite recursion if used in `fmal()`.

--   
Best regards,
lh_mouse
2017-01-19



--
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
___
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public


Re: [Mingw-w64-public] Implement fused multiply-add (FMA) functions for x86 families properly

2017-01-18 Thread David Wohlferd
On 1/18/2017 5:14 AM, lhmouse wrote:
> Patch is attached.

I see that you have replaced the x86 parts for fma and fmaf with C 
code.  That seems like a good thing.  Is there some reason you can't do 
that with the ARM versions too?

Reducing the amount of platform-specific code also seems like a good thing.

> This patch removes assembly files that implement FMA on ARM and merges
> them into the corresponding C files with the same name using inline assembly.

Umm.  Hmm.

There are a number of reasons not to use inline asm (for example 
https://gcc.gnu.org/wiki/DontUseInlineAsm ).  Are you sure this is a 
good idea?

> I don't have any knowledge about ARM assembly. Those functions for ARM
> were created using my x86 assembly knowledge and the actual instructions
> are copy-n-paste'd from old .S files. I don't have an ARM compiler to test
> those functions. Please fix them should they be broken.

Yup, that's one of the downsides to using inline asm.

I'm no ARM expert, but I'm not sure about this ARM code for fmal:

+long double fmal(long double x, long double y, long double z){
+  __asm__ (
+"fmacd %2, %0, %1 \n"
+"fcpyd %0, %2 \n"
+: "+&w"(z)
+: "w"(x), "w"(y)
+  );

Doesn't fmacd modify %2?  That would be (y), which is listed as an input 
parameter (and therefore is read-only).  What's more, I thought fmacd 
was calculating "Fd + Fn * Fm" where the parameters were "fmacd Fd, Fn, 
Fm".  Such being the case, I would have expected "fmacd %0, %1, %2"?  I
don't have a way to run this either, but this looks wrong.

Under the nit-picky heading:

+double fma(double x, double y, double z){
+  __asm__ (
+"fmacd %0, %1, %2 \n"
+: "+&w"(z)
+: "w"(x), "w"(y)
+  );

The \n is redundant.  And doesn't the + make the & redundant as well?

+float fmaf(float x, float y, float z){
+  __asm__ (
+"fmacs %0, %1, %2 \n"
+: "+&t"(z)
+: "t"(x), "t"(y)

The \n is redundant.  And doesn't the + make the & redundant as well?

Lastly I gotta ask: Can we use __builtin_fmal?  Or is mingw-w64 the one 
providing the implementations for these?

dw

--
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
___
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public


Re: [Mingw-w64-public] Implement fused multiply-add (FMA) functions for x86 families properly

2017-01-18 Thread lhmouse
The correctness of the fma() function can be verified using the following program:
---
/*
 * Demonstrates that fma() rounds only once.
 *
 *   x = 2^52 + 3 and y = 2^52 + 5, so the exact product is
 *   2^104 + 2^55 + 15.  z = -(2^104 + 2^55) cancels everything except
 *   the low-order 15.
 *
 * Rounding x * y to double drops the 15, so the plain expression is
 * expected to print 0, while a correctly fused multiply-add prints 15.
 *
 * The operands are volatile so that they are read at run time rather than
 * being folded into a constant by the compiler.
 */
#include <stdio.h>
#include <math.h>

volatile double x = 0x1.0000000000003p52;
volatile double y = 0x1.0000000000005p52;
volatile double z = -0x1.0000000000008p104;

int main(void){
    /* Separate multiply and add: the product is rounded before the add. */
    printf("x * y + z= %f\n", x * y + z);
    /* Fused multiply-add: a single rounding of the exact x * y + z. */
    printf("fma(x, y, z) = %f\n", fma(x, y, z));
    return 0;
}
---
A naive multiply-then-add loses some LSBs during the multiplication and
yields zero when the MSBs are cancelled out by the negative addend.
A true FMA function yields 15 in this example.


--   
Best regards,
lh_mouse
2017-01-18


--
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
___
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public


[Mingw-w64-public] Implement fused multiply-add (FMA) functions for x86 families properly

2017-01-18 Thread lhmouse
Patch is attached.
This patch removes assembly files that implement FMA on ARM and merges
them into the corresponding C files with the same name using inline assembly.
A re-generation of Makefile.in is required.

I don't have any knowledge about ARM assembly. Those functions for ARM
were created using my x86 assembly knowledge and the actual instructions
are copy-n-paste'd from old .S files. I don't have an ARM compiler to test
those functions. Please fix them should they be broken.
  
-- 
Best regards, 
lh_mouse 
2017-01-18 


From 0534577644f12e94cc408d37083277f133d1ca47 Mon Sep 17 00:00:00 2001
From: LH_Mouse 
Date: Wed, 18 Jan 2017 19:35:43 +0800
Subject: [PATCH] mingw-w64-crt/math/fma{,f,l}.c: Implement fused multiply-add
 (FMA) funcitons for x86 families properly. mingw-w64-crt/Makefile.am:
 Likewise. mingw-w64-crt/math/fma{,f}.S: Merge into corresponding C files with
 the same names, respectively.

---
 mingw-w64-crt/Makefile.am |   4 +-
 mingw-w64-crt/math/fma.S  |  42 ---
 mingw-w64-crt/math/fma.c  |  31 +++
 mingw-w64-crt/math/fmaf.S |  43 ---
 mingw-w64-crt/math/fmaf.c |  31 +++
 mingw-w64-crt/math/fmal.c | 135 --
 6 files changed, 194 insertions(+), 92 deletions(-)
 delete mode 100644 mingw-w64-crt/math/fma.S
 create mode 100644 mingw-w64-crt/math/fma.c
 delete mode 100644 mingw-w64-crt/math/fmaf.S
 create mode 100644 mingw-w64-crt/math/fmaf.c

diff --git a/mingw-w64-crt/Makefile.am b/mingw-w64-crt/Makefile.am
index 44360db..5eba234 100644
--- a/mingw-w64-crt/Makefile.am
+++ b/mingw-w64-crt/Makefile.am
@@ -227,7 +227,6 @@ src_libmingwex=\
   \
   math/_chgsignl.S  math/ceil.Smath/ceilf.S  math/ceill.S  
   math/copysignl.S \
   math/floor.S  math/floorf.S  math/floorl.S \
-  math/fma.Smath/fmaf.S\
   math/nearbyint.S  math/nearbyintf.S  math/nearbyintl.S \
   math/trunc.S  math/truncf.S  \
   math/cbrt.c   \
@@ -235,7 +234,8 @@ src_libmingwex=\
   math/coshf.c  math/coshl.c   math/erfl.c   \
   math/expf.c   \
   math/fabs.c   math/fabsf.c   math/fabsl.c  math/fdim.c   
   math/fdimf.c math/fdiml.c \
-  math/fmal.c   math/fmax.cmath/fmaxf.c  math/fmaxl.c  
   math/fmin.c  math/fminf.c \
+  math/fma.cmath/fmaf.cmath/fmal.c   \
+  math/fmax.c   math/fmaxf.c   math/fmaxl.c  math/fmin.c   
   math/fminf.c \
   math/fminl.c  math/fp_consts.c   math/fp_constsf.c \
   math/fp_constsl.c math/fpclassify.c  math/fpclassifyf.c
math/fpclassifyl.c   math/frexpf.c\
   math/hypotf.c math/hypot.c  math/hypotl.c  math/isnan.c  
math/isnanf.cmath/isnanl.c\
diff --git a/mingw-w64-crt/math/fma.S b/mingw-w64-crt/math/fma.S
deleted file mode 100644
index 74becde..000
--- a/mingw-w64-crt/math/fma.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * This file has no copyright assigned and is placed in the Public Domain.
- * This file is part of the mingw-w64 runtime package.
- * No warranty is given; refer to the file DISCLAIMER.PD within this package.
- */
-#include <_mingw_mac.h>
-
-   .file   "fma.S"
-   .text
-#ifdef __x86_64__
-   .align 8
-#else
-   .align 4
-#endif
-   .p2align 4,,15
-   .globl __MINGW_USYMBOL(fma)
-   .def__MINGW_USYMBOL(fma);   .scl2;  .type   32; .endef
-__MINGW_USYMBOL(fma):
-#if defined(_AMD64_) || defined(__x86_64__)
-   subq$56, %rsp
-   movsd   %xmm0,(%rsp)
-   movsd   %xmm1,16(%rsp)
-   movsd   %xmm2,32(%rsp)
-   fldl(%rsp)
-   fmull   16(%rsp)
-   fldl32(%rsp)
-   faddp
-   fstpl   (%rsp)
-   movsd   (%rsp),%xmm0
-   addq$56, %rsp
-   ret
-#elif defined(_ARM_) || defined(__arm__)
-   fmacd d2, d0, d1
-   fcpyd d0, d2
-   bx  lr
-#elif defined(_X86_) || defined(__i386__)
-   fldl4(%esp)
-   fmull   12(%esp)
-   fldl20(%esp)
-   faddp
-   ret
-#endif
diff --git a/mingw-w64-crt/math/fma.c b/mingw-w64-crt/math/fma.c
new file mode 100644
index 000..98249aa
--- /dev/null
+++ b/mingw-w64-crt/math/fma.c
@@ -0,0 +1,31 @@
+/**
+ * This file has no copyright assigned and is placed in the Public Domain.
+ * This file is part of the mingw-w64 runtime package.
+ * No warranty is given; refer to the file DISCLAIMER.PD within this package.
+ */
+double fma(double x, double y, double z);
+
+#if defined(_AMD64_) || defined(__x86_64__) || defined(_X86_) || 
defined(__i386__)
+
+long double fmal(long double x, long double y, long double z);
+
+double fma(double x, double y, double z){
+  return (double)fmal(x, y, z);
+}
+
+#elif defined(_ARM_) || defined(__arm__)
+
+double fma(double x, double y, double