Author: Joseph Huber Date: 2026-06-14T20:32:00-05:00 New Revision: 375b70bafedafa71fabfb0e8b1e759daeef0f703
URL: https://github.com/llvm/llvm-project/commit/375b70bafedafa71fabfb0e8b1e759daeef0f703 DIFF: https://github.com/llvm/llvm-project/commit/375b70bafedafa71fabfb0e8b1e759daeef0f703.diff LOG: [libclc] Improve performance and precision of reciprocal functions (#203805) Summary: Small change to improve the performance and output of functions using the reciprocal. This makes these functions *byte-for-byte* identical with their OCML counterparts in ROCm. Additionally ensure tanpi is correctly rounded to match ROCm. Added: libclc/clc/lib/amdgpu/math/clc_recip_fast.cl libclc/clc/lib/amdgpu/math/clc_recip_fast.inc Modified: libclc/clc/lib/amdgpu/CMakeLists.txt libclc/clc/lib/generic/CMakeLists.txt Removed: ################################################################################ diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt index 910a0cf1765df..1111dc64f605c 100644 --- a/libclc/clc/lib/amdgpu/CMakeLists.txt +++ b/libclc/clc/lib/amdgpu/CMakeLists.txt @@ -25,6 +25,7 @@ libclc_add_sources(${LIBCLC_CLC_TARGET} FILES math/clc_native_exp.cl math/clc_native_exp2.cl math/clc_native_log10.cl + math/clc_recip_fast.cl mem_fence/clc_mem_fence.cl subgroup/clc_subgroup.cl subgroup/clc_sub_group_broadcast.cl @@ -51,4 +52,5 @@ libclc_set_source_options(-fapprox-func math/clc_native_exp.cl math/clc_native_exp2.cl math/clc_native_log10.cl + math/clc_recip_fast.cl ) diff --git a/libclc/clc/lib/amdgpu/math/clc_recip_fast.cl b/libclc/clc/lib/amdgpu/math/clc_recip_fast.cl new file mode 100644 index 0000000000000..32a52cf97b8e2 --- /dev/null +++ b/libclc/clc/lib/amdgpu/math/clc_recip_fast.cl @@ -0,0 +1,14 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clc/internal/clc.h" + +#define __CLC_FUNCTION __clc_recip_fast +#define __CLC_BODY "clc_recip_fast.inc" + +#include "clc/math/gentype.inc" diff --git a/libclc/clc/lib/amdgpu/math/clc_recip_fast.inc b/libclc/clc/lib/amdgpu/math/clc_recip_fast.inc new file mode 100644 index 0000000000000..9d635cc700442 --- /dev/null +++ b/libclc/clc/lib/amdgpu/math/clc_recip_fast.inc @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// On AMDGPU the "fast" reciprocal is the hardware v_rcp_f32 approximation, +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_recip_fast(__CLC_GENTYPE x) { +#if defined(__CLC_SCALAR) && __CLC_FPSIZE == 32 + return __builtin_amdgcn_rcpf(x); +#else + return ((__CLC_GENTYPE)1.0) / x; +#endif +} diff --git a/libclc/clc/lib/generic/CMakeLists.txt b/libclc/clc/lib/generic/CMakeLists.txt index 40261545fce91..673c82b002b33 100644 --- a/libclc/clc/lib/generic/CMakeLists.txt +++ b/libclc/clc/lib/generic/CMakeLists.txt @@ -231,4 +231,5 @@ libclc_set_source_options(-fapprox-func libclc_set_source_options(-cl-fp32-correctly-rounded-divide-sqrt math/clc_div_cr.cl math/clc_sqrt_cr.cl + math/clc_tanpi.cl ) _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
