Author: Joseph Huber
Date: 2026-06-14T20:32:00-05:00
New Revision: 375b70bafedafa71fabfb0e8b1e759daeef0f703

URL: 
https://github.com/llvm/llvm-project/commit/375b70bafedafa71fabfb0e8b1e759daeef0f703
DIFF: 
https://github.com/llvm/llvm-project/commit/375b70bafedafa71fabfb0e8b1e759daeef0f703.diff

LOG: [libclc] Improve performance and precision of reciprocal functions 
(#203805)

Summary:
Small change to improve the performance and output of functions using
the reciprocal. This makes these functions *byte-for-byte* identical
with their OCML counterparts in ROCm.

Additionally, build tanpi with correctly rounded fp32 division to match ROCm.

Added: 
    libclc/clc/lib/amdgpu/math/clc_recip_fast.cl
    libclc/clc/lib/amdgpu/math/clc_recip_fast.inc

Modified: 
    libclc/clc/lib/amdgpu/CMakeLists.txt
    libclc/clc/lib/generic/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/libclc/clc/lib/amdgpu/CMakeLists.txt 
b/libclc/clc/lib/amdgpu/CMakeLists.txt
index 910a0cf1765df..1111dc64f605c 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -25,6 +25,7 @@ libclc_add_sources(${LIBCLC_CLC_TARGET} FILES
   math/clc_native_exp.cl
   math/clc_native_exp2.cl
   math/clc_native_log10.cl
+  math/clc_recip_fast.cl
   mem_fence/clc_mem_fence.cl
   subgroup/clc_subgroup.cl
   subgroup/clc_sub_group_broadcast.cl
@@ -51,4 +52,5 @@ libclc_set_source_options(-fapprox-func
   math/clc_native_exp.cl
   math/clc_native_exp2.cl
   math/clc_native_log10.cl
+  math/clc_recip_fast.cl
 )

diff  --git a/libclc/clc/lib/amdgpu/math/clc_recip_fast.cl 
b/libclc/clc/lib/amdgpu/math/clc_recip_fast.cl
new file mode 100644
index 0000000000000..32a52cf97b8e2
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_recip_fast.cl
@@ -0,0 +1,14 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/internal/clc.h"
+
+#define __CLC_FUNCTION __clc_recip_fast
+#define __CLC_BODY "clc_recip_fast.inc"
+
+#include "clc/math/gentype.inc"

diff  --git a/libclc/clc/lib/amdgpu/math/clc_recip_fast.inc 
b/libclc/clc/lib/amdgpu/math/clc_recip_fast.inc
new file mode 100644
index 0000000000000..9d635cc700442
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_recip_fast.inc
@@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// On AMDGPU, scalar f32 uses the hardware rcp builtin; other types use 1.0 / x.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_recip_fast(__CLC_GENTYPE x) {
+#if defined(__CLC_SCALAR) && __CLC_FPSIZE == 32
+  return __builtin_amdgcn_rcpf(x);
+#else
+  return ((__CLC_GENTYPE)1.0) / x;
+#endif
+}

diff  --git a/libclc/clc/lib/generic/CMakeLists.txt 
b/libclc/clc/lib/generic/CMakeLists.txt
index 40261545fce91..673c82b002b33 100644
--- a/libclc/clc/lib/generic/CMakeLists.txt
+++ b/libclc/clc/lib/generic/CMakeLists.txt
@@ -231,4 +231,5 @@ libclc_set_source_options(-fapprox-func
 libclc_set_source_options(-cl-fp32-correctly-rounded-divide-sqrt
   math/clc_div_cr.cl
   math/clc_sqrt_cr.cl
+  math/clc_tanpi.cl
 )


        
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to