https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/188209
This was originally ported from rocm device libs in 900bd7eb7f5426ad13f624cbf29716afe376c878. Merge in more recent changes. >From 7737106d8cc8061dde76888817cd5a3ac3f7e323 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <[email protected]> Date: Sat, 21 Mar 2026 18:17:33 +0100 Subject: [PATCH] libclc: Update expm1 This was originally ported from rocm device libs in 900bd7eb7f5426ad13f624cbf29716afe376c878. Merge in more recent changes. --- libclc/clc/lib/generic/math/clc_ep.cl | 1 + libclc/clc/lib/generic/math/clc_expm1.cl | 4 + libclc/clc/lib/generic/math/clc_expm1.inc | 227 ++++++++-------------- 3 files changed, 89 insertions(+), 143 deletions(-) diff --git a/libclc/clc/lib/generic/math/clc_ep.cl b/libclc/clc/lib/generic/math/clc_ep.cl index 0e152e8bcd7e9..c08f7ddc11560 100644 --- a/libclc/clc/lib/generic/math/clc_ep.cl +++ b/libclc/clc/lib/generic/math/clc_ep.cl @@ -15,6 +15,7 @@ #include "clc/math/clc_ldexp.h" #include "clc/math/clc_mad.h" #include "clc/math/clc_recip_fast.h" +#include "clc/math/clc_rint.h" #include "clc/math/clc_sqrt_fast.h" #include "clc/relational/clc_isinf.h" #include "clc/relational/clc_signbit.h" diff --git a/libclc/clc/lib/generic/math/clc_expm1.cl b/libclc/clc/lib/generic/math/clc_expm1.cl index 45ce92d70838f..4a1b02358fafc 100644 --- a/libclc/clc/lib/generic/math/clc_expm1.cl +++ b/libclc/clc/lib/generic/math/clc_expm1.cl @@ -9,9 +9,13 @@ #include "clc/clc_convert.h" #include "clc/float/definitions.h" #include "clc/internal/clc.h" +#include "clc/math/clc_exp2_fast.h" #include "clc/math/clc_exp_helper.h" +#include "clc/math/clc_fabs.h" #include "clc/math/clc_fma.h" +#include "clc/math/clc_ldexp.h" #include "clc/math/clc_mad.h" +#include "clc/math/clc_rint.h" #include "clc/math/math.h" #include "clc/math/tables.h" #include "clc/relational/clc_isnan.h" diff --git a/libclc/clc/lib/generic/math/clc_expm1.inc b/libclc/clc/lib/generic/math/clc_expm1.inc index 568fa4e0a36fa..d4a11026fb1c1 100644 --- a/libclc/clc/lib/generic/math/clc_expm1.inc +++ b/libclc/clc/lib/generic/math/clc_expm1.inc @@ -9,151 +9,88 @@ /* Refer to the exp routine for the underlying algorithm */ #if __CLC_FPSIZE == 32 -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_expm1(__CLC_GENTYPE x) { - // 128*log2 : 88.722839111673 - const __CLC_GENTYPE X_MAX = 0x1.62e42ep+6f; - // -149*log2 : -103.27892990343184 - const __CLC_GENTYPE X_MIN = -0x1.9d1da0p+6f; - // 64/log2 : 92.332482616893657 - const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f; - // log2/64 lead: 0.0108032227 - const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f; - // log2/64 tail: 0.0000272020388 - const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f; - - __CLC_INTN n = __CLC_CONVERT_INTN(x * R_64_BY_LOG2); - __CLC_GENTYPE fn = __CLC_CONVERT_GENTYPE(n); - - __CLC_INTN j = n & 0x3f; - __CLC_INTN m = n >> 6; - - __CLC_GENTYPE r = - __clc_mad(fn, -R_LOG2_BY_64_TL, __clc_mad(fn, -R_LOG2_BY_64_LD, x)); - - // Truncated Taylor series - __CLC_GENTYPE z2 = __clc_mad( - r * r, __clc_mad(r, __clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), 0.5f), - r); - - __CLC_GENTYPE m2 = __CLC_AS_GENTYPE((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); - __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j); - __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j); - - __CLC_GENTYPE two_to_jby64_h = exp_head * m2; - __CLC_GENTYPE two_to_jby64_t = exp_tail * m2; - __CLC_GENTYPE two_to_jby64 = two_to_jby64_h + two_to_jby64_t; - - z2 = __clc_mad(z2, two_to_jby64, two_to_jby64_t) + (two_to_jby64_h - 1.0f); - // Make subnormals work - z2 = x == 0.f ? x : z2; - z2 = x < X_MIN || m < -24 ? -1.0f : z2; - z2 = x > X_MAX ? __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32) : z2; - z2 = __clc_isnan(x) ? x : z2; - - return z2; +_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_FLOATN __clc_expm1(__CLC_FLOATN x) { + const __CLC_FLOATN x_max = 0x1.62e42ep+6f; // 64 * ln(2) + const __CLC_FLOATN ln2_tail = + -0x1.05c610p-29f; // small correction term for ln(2) + + const __CLC_FLOATN c1 = 0x1.a26762p-13f; // ~1.250000e-4 + const __CLC_FLOATN c2 = 0x1.6d2e00p-10f; // ~ 9.999999e-4 + const __CLC_FLOATN c3 = 0x1.110ff2p-7f; // ~8.333333e-3 + const __CLC_FLOATN c4 = 0x1.555502p-5f; // ~4.166667e-2 + const __CLC_FLOATN c5 = 0x1.555556p-3f; // ~1.666667e-1 + + __CLC_FLOATN fn = __clc_rint(x * (__CLC_FLOATN)M_LOG2E); + + __CLC_FLOATN t = + __clc_fma(-fn, ln2_tail, __clc_fma(-fn, (__CLC_FLOATN)M_LN2, x)); + + __CLC_FLOATN q0 = __clc_mad(t, c1, c2); + __CLC_FLOATN q1 = __clc_mad(t, q0, c3); + __CLC_FLOATN q2 = __clc_mad(t, q1, c4); + __CLC_FLOATN q3 = __clc_mad(t, q2, c5); + __CLC_FLOATN p = __clc_fma(t, t * __clc_mad(t, q3, 0.5f), t); + + __CLC_INTN e = fn == 128.0f ? 127 : __CLC_CONVERT_INTN(fn); + __CLC_FLOATN s = __clc_ldexp(__CLC_FP_LIT(1.0), e); + + __CLC_FLOATN z = __clc_fma(s, p, s - 1.0f); + z = fn == 128.0f ? 2.0f * z : z; + + z = x > x_max ? __CLC_GENTYPE_INF : z; + z = x < -17.0f ? -1.0f : z; + return z; } #elif __CLC_FPSIZE == 64 -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_expm1(__CLC_GENTYPE x) { - const __CLC_GENTYPE max_expm1_arg = 709.8; - const __CLC_GENTYPE min_expm1_arg = -37.42994775023704; - // 0x3FCC8FF7C79A9A22 = log(1+1/4) - const __CLC_GENTYPE log_OnePlus_OneByFour = 0.22314355131420976; - // 0xBFD269621134DB93 = log(1-1/4) - const __CLC_GENTYPE log_OneMinus_OneByFour = -0.28768207245178096; - const __CLC_GENTYPE sixtyfour_by_lnof2 = - 92.33248261689366; // 0x40571547652b82fe - const __CLC_GENTYPE lnof2_by_64_head = - 0.010830424696223417; // 0x3f862e42fefa0000 - const __CLC_GENTYPE lnof2_by_64_tail = - 2.5728046223276688e-14; // 0x3d1cf79abc9e3b39 - - // First, assume log(1-1/4) < x < log(1+1/4) i.e -0.28768 < x < 0.22314 - __CLC_GENTYPE u = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(x) & 0xffffffffff000000UL); - __CLC_GENTYPE v = x - u; - __CLC_GENTYPE y = u * u * 0.5; - __CLC_GENTYPE z = v * (x + u) * 0.5; - - __CLC_GENTYPE q = __clc_fma( - x, - __clc_fma( - x, - __clc_fma( - x, - __clc_fma( - x, - __clc_fma( - x, - __clc_fma(x, - __clc_fma(x, - __clc_fma(x, 2.4360682937111612e-8, - 2.7582184028154370e-7), - 2.7558212415361945e-6), - 2.4801576918453420e-5), - 1.9841269447671544e-4), - 1.3888888890687830e-3), - 8.3333333334012270e-3), - 4.1666666666665560e-2), - 1.6666666666666632e-1); - q *= x * x * x; - - __CLC_GENTYPE z1g = (u + y) + (q + (v + z)); - __CLC_GENTYPE z1 = x + (y + (q + z)); - z1 = y >= 0x1.0p-7 ? z1g : z1; - - // Now assume outside interval around 0 - __CLC_INTN n = __CLC_CONVERT_INTN(x * sixtyfour_by_lnof2); - __CLC_INTN j = n & 0x3f; - __CLC_INTN m = n >> 6; - - __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j); - __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j); - __CLC_GENTYPE f = f1 + f2; - - __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(-n); - __CLC_GENTYPE r = - __clc_fma(dn, lnof2_by_64_tail, __clc_fma(dn, lnof2_by_64_head, x)); - - q = __clc_fma(r, - __clc_fma(r, - __clc_fma(r, - __clc_fma(r, 1.38889490863777199667e-03, - 8.33336798434219616221e-03), - 4.16666666662260795726e-02), - 1.66666666665260878863e-01), - 5.00000000000000008883e-01); - q = __clc_fma(r * r, q, r); - - __CLC_GENTYPE twopm = __CLC_AS_GENTYPE(__CLC_CONVERT_LONGN(m + EXPBIAS_DP64) - << EXPSHIFTBITS_DP64); - __CLC_GENTYPE twopmm = __CLC_AS_GENTYPE(__CLC_CONVERT_LONGN(EXPBIAS_DP64 - m) - << EXPSHIFTBITS_DP64); - - // Computations for m > 52, including where result is close to Inf - __CLC_ULONGN uval = __CLC_AS_ULONGN(0x1.0p+1023 * (f1 + (f * q + (f2)))); - __CLC_INTN e = __CLC_CONVERT_INTN(uval >> EXPSHIFTBITS_DP64) + 1; - - __CLC_GENTYPE zme1024 = __CLC_AS_GENTYPE( - (__CLC_CONVERT_ULONGN(e) << EXPSHIFTBITS_DP64) | (uval & MANTBITS_DP64)); - zme1024 = __CLC_CONVERT_LONGN(e == 2047) - ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) - : zme1024; - - __CLC_GENTYPE zmg52 = twopm * (f1 + __clc_fma(f, q, f2 - twopmm)); - zmg52 = __CLC_CONVERT_LONGN(m == 1024) ? zme1024 : zmg52; - - // For m < 53 - __CLC_GENTYPE zml53 = - twopm * ((f1 - twopmm) + __clc_fma(f1, q, f2 * (1.0 + q))); - - // For m < -7 - __CLC_GENTYPE zmln7 = __clc_fma(twopm, f1 + __clc_fma(f, q, f2), -1.0); - - z = __CLC_CONVERT_LONGN(m < 53) ? zml53 : zmg52; - z = __CLC_CONVERT_LONGN(m < -7) ? zmln7 : z; - z = x > log_OneMinus_OneByFour && x < log_OnePlus_OneByFour ? z1 : z; - z = x > max_expm1_arg ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) : z; +_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_DOUBLEN __clc_expm1(__CLC_DOUBLEN x) { + const __CLC_DOUBLEN max_expm1_arg = 0x1.62e42fefa39efp+9; + const __CLC_DOUBLEN min_expm1_arg = -37.0; + + const __CLC_DOUBLEN lnof2_by_64_tail = 0x1.abc9e3b39803fp-56; + + __CLC_DOUBLEN dn = __clc_rint(x * M_LOG2E); + + // Reduced argument t = x - dn * (ln2/64), split into head and tail for + // precision + __CLC_DOUBLEN t = __clc_mad(-dn, lnof2_by_64_tail, __clc_mad(-dn, M_LN2, x)); + + const __CLC_DOUBLEN c0 = 0x1.1f32ea9d67f34p-29; + const __CLC_DOUBLEN c1 = 0x1.af4eb2a1b768bp-26; + const __CLC_DOUBLEN c2 = 0x1.27e500e0ac05bp-22; + const __CLC_DOUBLEN c3 = 0x1.71de01b889c29p-19; + const __CLC_DOUBLEN c4 = 0x1.a01a0197bcfd8p-16; + const __CLC_DOUBLEN c5 = 0x1.a01a01ac1a723p-13; + const __CLC_DOUBLEN c6 = 0x1.6c16c16c18931p-10; + const __CLC_DOUBLEN c7 = 0x1.1111111110056p-7; + const __CLC_DOUBLEN c8 = 0x1.5555555555552p-5; + const __CLC_DOUBLEN c9 = 0x1.5555555555557p-3; + const __CLC_DOUBLEN c10 = 0x1.0000000000000p-1; + + __CLC_DOUBLEN q0 = __clc_mad(t, c0, c1); + __CLC_DOUBLEN q1 = __clc_mad(t, q0, c2); + __CLC_DOUBLEN q2 = __clc_mad(t, q1, c3); + __CLC_DOUBLEN q3 = __clc_mad(t, q2, c4); + __CLC_DOUBLEN q4 = __clc_mad(t, q3, c5); + __CLC_DOUBLEN q5 = __clc_mad(t, q4, c6); + __CLC_DOUBLEN q6 = __clc_mad(t, q5, c7); + __CLC_DOUBLEN q7 = __clc_mad(t, q6, c8); + __CLC_DOUBLEN q8 = __clc_mad(t, q7, c9); + __CLC_DOUBLEN p = __clc_mad(t, t * __clc_mad(t, q8, c10), t); + + __CLC_INTN e = + __CLC_CONVERT_INTN(dn == 1024.0) ? 1023 : __CLC_CONVERT_INTN(dn); + + // s = 2^e + __CLC_DOUBLEN s = __clc_ldexp(__CLC_FP_LIT(1.0), e); + + // expm1(x) ~= s * p + (s - 1) + __CLC_DOUBLEN z = __clc_mad(s, p, s - 1.0); + + z = dn == 1024.0 ? 2.0 * z : z; + + z = x > max_expm1_arg ? __CLC_GENTYPE_INF : z; z = x < min_expm1_arg ? -1.0 : z; return z; @@ -161,8 +98,12 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_expm1(__CLC_GENTYPE x) { #elif __CLC_FPSIZE == 16 -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_expm1(__CLC_GENTYPE x) { - return __CLC_CONVERT_GENTYPE(__clc_expm1(__CLC_CONVERT_FLOATN(x))); +_CLC_OVERLOAD _CLC_DEF __CLC_HALFN __clc_expm1(__CLC_HALFN x) { + __CLC_FLOATN x_float = __CLC_CONVERT_FLOATN(x); + __CLC_FLOATN float_expm1 = + __clc_exp2_fast(x_float * (__CLC_FLOATN)M_LOG2E) - 1.0f; + __CLC_HALFN p = __clc_fma(x, x * __clc_fma(x, 0x1.555556p-3h, 0.5h), x); + return __clc_fabs(x) < 0x1.0p-6h ? p : __CLC_CONVERT_HALFN(float_expm1); } #endif _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
