While performing some tests of IEEE 128 float for PPC64LE, Michael
    Meissner noticed that __gcc_qsub is substantially slower than
    __gcc_qadd.  __gcc_qsub calls __gcc_qadd with the second operand
    negated.  Because the functions normally are invoked through
    libgcc shared object, the extra PLT overhead has a large impact
    on the overall time of the function.  This patch renames
    __gcc_qadd to ldouble_qadd_internal, a static inline function
    that both __gcc_qadd and __gcc_qsub now invoke directly.

    libgcc/ChangeLog:

            * config/rs6000/ibm-ldouble.c (ldouble_qadd_internal): Rename from
            __gcc_qadd.
            (__gcc_qadd): Call ldouble_qadd_internal.
            (__gcc_qsub): Call ldouble_qadd_internal with second long double
            argument negated.

diff --git a/libgcc/config/rs6000/ibm-ldouble.c
b/libgcc/config/rs6000/ibm-ldouble.c
index 4c13453f975..0b385aa940b 100644
--- a/libgcc/config/rs6000/ibm-ldouble.c
+++ b/libgcc/config/rs6000/ibm-ldouble.c
@@ -118,8 +118,8 @@ pack_ldouble (double dh, double dl)
 }

 /* Add two 'IBM128_TYPE' values and return the result. */
-IBM128_TYPE
-__gcc_qadd (double a, double aa, double c, double cc)
+static inline IBM128_TYPE
+ldouble_qadd_internal (double a, double aa, double c, double cc)
 {
   double xh, xl, z, q, zz;

@@ -158,9 +158,15 @@ __gcc_qadd (double a, double aa, double c, double cc)
 }

 IBM128_TYPE
-__gcc_qsub (double a, double b, double c, double d)
+__gcc_qadd (double a, double aa, double c, double cc)
+{
+  return ldouble_qadd_internal (a, aa, c, cc);
+}
+
+IBM128_TYPE
+__gcc_qsub (double a, double aa, double c, double cc)
 {
-  return __gcc_qadd (a, b, -c, -d);
+  return ldouble_qadd_internal (a, aa, -c, -cc);
 }

 #ifdef __NO_FPRS__

Reply via email to